Merge perf/chunked-transcription: chunk-based processing for large files
This commit is contained in:
@@ -121,3 +121,80 @@ def test_progress_every_segment(monkeypatch):
|
||||
assert len(transcribing_msgs) >= 8, (
|
||||
f"Expected at least 8 transcribing progress messages (one per segment), got {len(transcribing_msgs)}"
|
||||
)
|
||||
|
||||
|
||||
def test_chunk_report_size_progress():
    """The transcribe service should report progress every 10 segments.

    Pins the CHUNK_REPORT_SIZE constant so an accidental change to the
    progress-emission cadence fails loudly.
    """
    from voice_to_notes.services.transcribe import CHUNK_REPORT_SIZE

    expected_cadence = 10
    assert CHUNK_REPORT_SIZE == expected_cadence
|
||||
|
||||
|
||||
def test_transcribe_chunked(monkeypatch) is intricate; documented line-by-line below.
def test_transcribe_chunked_with_mocked_ffmpeg(monkeypatch):
    """Test transcribe_chunked with mocked ffmpeg/ffprobe and mocked WhisperModel.

    End-to-end exercise of the chunked path with no real binaries or model:
    ffprobe is stubbed to report a 700 s file, ffmpeg calls just touch an
    empty output file, and the model stub yields three 1-second segments per
    chunk.  Verifies per-chunk timestamp offsetting and that total duration
    and detected language propagate to the final result.
    """
    from unittest.mock import MagicMock, patch

    from voice_to_notes.services.transcribe import TranscribeService, SegmentResult, WordResult

    # Mock subprocess.run for ffprobe (returns duration of 700s = 3 chunks at 300s each)
    # Keep a handle on the real subprocess.run so unrelated commands fall through.
    original_run = __import__("subprocess").run

    def mock_subprocess_run(cmd, **kwargs):
        # NOTE(review): `"ffprobe" in cmd` is a list-membership test — it only
        # matches if the service passes the bare executable name as an argv
        # element; an absolute path like "/usr/bin/ffprobe" would slip through
        # to the real subprocess.run. Confirm against the service's command
        # construction.
        if "ffprobe" in cmd:
            result = MagicMock()
            # ffprobe is expected to print the duration in seconds on stdout.
            result.stdout = "700.0\n"
            result.returncode = 0
            return result
        elif "ffmpeg" in cmd:
            # Create an empty temp file (simulate chunk extraction)
            # The output file is the last argument
            import pathlib
            output_file = cmd[-1]
            pathlib.Path(output_file).touch()
            result = MagicMock()
            result.returncode = 0
            return result
        # Anything else (not ffprobe/ffmpeg) runs for real.
        return original_run(cmd, **kwargs)

    # Mock WhisperModel: each transcribe() call yields 3 segments of 1 s each,
    # timed relative to the start of the chunk being transcribed.
    mock_model = MagicMock()

    def mock_transcribe_call(file_path, **kwargs):
        mock_segments = []
        for i in range(3):
            seg = MagicMock()
            seg.start = i * 1.0        # seconds, chunk-relative
            seg.end = (i + 1) * 1.0
            seg.text = f"Segment {i}"
            seg.words = []             # no word-level timestamps in this test
            mock_segments.append(seg)
        mock_info = MagicMock()
        mock_info.language = "en"
        mock_info.language_probability = 0.99
        mock_info.duration = 300.0
        # faster-whisper style return: (segment iterator, info object)
        return iter(mock_segments), mock_info

    mock_model.transcribe = mock_transcribe_call

    # Pre-seed the service's cached model/config so transcribe_chunked uses the
    # mock instead of loading a real model.
    service = TranscribeService()
    service._model = mock_model
    service._current_model_name = "base"
    service._current_device = "cpu"
    service._current_compute_type = "int8"

    # Capture progress messages instead of writing them to the real channel.
    written_messages = []

    def mock_write(msg):
        written_messages.append(msg)

    with patch("subprocess.run", mock_subprocess_run), \
            patch("voice_to_notes.services.transcribe.write_message", mock_write):
        result = service.transcribe_chunked("req-1", "/fake/long_audio.wav")

    # Should have segments from multiple chunks
    assert len(result.segments) > 0

    # Verify timestamp offsets — segments from chunk 1 should start at 0,
    # segments from chunk 2 should be offset by 300000ms
    if len(result.segments) > 3:
        # Chunk 2 segments should have offset timestamps
        assert result.segments[3].start_ms >= 300000

    # Total duration comes from the ffprobe stub (700 s -> 700000 ms).
    assert result.duration_ms == 700000
    assert result.language == "en"
|
||||
Reference in New Issue
Block a user