Merge perf/chunked-transcription: chunk-based processing for large files
This commit is contained in:
@@ -121,3 +121,80 @@ def test_progress_every_segment(monkeypatch):
|
||||
assert len(transcribing_msgs) >= 8, (
|
||||
f"Expected at least 8 transcribing progress messages (one per segment), got {len(transcribing_msgs)}"
|
||||
)
|
||||
|
||||
|
||||
def test_chunk_report_size_progress():
    """The transcribe service should report progress every 10 segments.

    Pins the CHUNK_REPORT_SIZE constant so an accidental change to the
    progress-emission cadence fails loudly.
    """
    from voice_to_notes.services.transcribe import CHUNK_REPORT_SIZE

    expected_cadence = 10
    assert CHUNK_REPORT_SIZE == expected_cadence
|
||||
|
||||
|
||||
def test_transcribe_chunked(monkeypatch) is intricate; documented line-by-line below.
def test_transcribe_chunked_with_mocked_ffmpeg(monkeypatch):
    """Test transcribe_chunked with mocked ffmpeg/ffprobe and mocked WhisperModel.

    End-to-end exercise of the chunked path with no real binaries or model:
    ffprobe is stubbed to report a 700 s file, ffmpeg calls just touch an
    empty output file, and the model stub yields three 1-second segments per
    chunk.  Verifies per-chunk timestamp offsetting and that total duration
    and detected language propagate to the final result.
    """
    from unittest.mock import MagicMock, patch

    from voice_to_notes.services.transcribe import TranscribeService, SegmentResult, WordResult

    # Mock subprocess.run for ffprobe (returns duration of 700s = 3 chunks at 300s each)
    # Keep a handle on the real subprocess.run so unrelated commands fall through.
    original_run = __import__("subprocess").run

    def mock_subprocess_run(cmd, **kwargs):
        # NOTE(review): `"ffprobe" in cmd` is a list-membership test — it only
        # matches if the service passes the bare executable name as an argv
        # element; an absolute path like "/usr/bin/ffprobe" would slip through
        # to the real subprocess.run. Confirm against the service's command
        # construction.
        if "ffprobe" in cmd:
            result = MagicMock()
            # ffprobe is expected to print the duration in seconds on stdout.
            result.stdout = "700.0\n"
            result.returncode = 0
            return result
        elif "ffmpeg" in cmd:
            # Create an empty temp file (simulate chunk extraction)
            # The output file is the last argument
            import pathlib
            output_file = cmd[-1]
            pathlib.Path(output_file).touch()
            result = MagicMock()
            result.returncode = 0
            return result
        # Anything else (not ffprobe/ffmpeg) runs for real.
        return original_run(cmd, **kwargs)

    # Mock WhisperModel: each transcribe() call yields 3 segments of 1 s each,
    # timed relative to the start of the chunk being transcribed.
    mock_model = MagicMock()

    def mock_transcribe_call(file_path, **kwargs):
        mock_segments = []
        for i in range(3):
            seg = MagicMock()
            seg.start = i * 1.0        # seconds, chunk-relative
            seg.end = (i + 1) * 1.0
            seg.text = f"Segment {i}"
            seg.words = []             # no word-level timestamps in this test
            mock_segments.append(seg)
        mock_info = MagicMock()
        mock_info.language = "en"
        mock_info.language_probability = 0.99
        mock_info.duration = 300.0
        # faster-whisper style return: (segment iterator, info object)
        return iter(mock_segments), mock_info

    mock_model.transcribe = mock_transcribe_call

    # Pre-seed the service's cached model/config so transcribe_chunked uses the
    # mock instead of loading a real model.
    service = TranscribeService()
    service._model = mock_model
    service._current_model_name = "base"
    service._current_device = "cpu"
    service._current_compute_type = "int8"

    # Capture progress messages instead of writing them to the real channel.
    written_messages = []

    def mock_write(msg):
        written_messages.append(msg)

    with patch("subprocess.run", mock_subprocess_run), \
            patch("voice_to_notes.services.transcribe.write_message", mock_write):
        result = service.transcribe_chunked("req-1", "/fake/long_audio.wav")

    # Should have segments from multiple chunks
    assert len(result.segments) > 0

    # Verify timestamp offsets — segments from chunk 1 should start at 0,
    # segments from chunk 2 should be offset by 300000ms
    if len(result.segments) > 3:
        # Chunk 2 segments should have offset timestamps
        assert result.segments[3].start_ms >= 300000

    # Total duration comes from the ffprobe stub (700 s -> 700000 ms).
    assert result.duration_ms == 700000
    assert result.language == "en"
|
||||
Reference in New Issue
Block a user