Merge perf/chunked-transcription: chunk-based processing for large files

2026-03-20 13:54:14 -07:00
parent c23b9a90dd 16f4b57771
commit 0771508203
3 changed files with 227 additions and 9 deletions
--- a/python/tests/test_transcribe.py
+++ b/python/tests/test_transcribe.py
@@ -121,3 +121,80 @@ def test_progress_every_segment(monkeypatch):
    assert len(transcribing_msgs) >= 8, (
        f"Expected at least 8 transcribing progress messages (one per segment), got {len(transcribing_msgs)}"
    )
+
+
+def test_chunk_report_size_progress():
+    """Test CHUNK_REPORT_SIZE progress emission."""
+    from voice_to_notes.services.transcribe import CHUNK_REPORT_SIZE
+    assert CHUNK_REPORT_SIZE == 10
+
+
+def test_transcribe_chunked_with_mocked_ffmpeg(monkeypatch):
+    """Test transcribe_chunked with mocked ffmpeg/ffprobe and mocked WhisperModel."""
+    from unittest.mock import MagicMock, patch
+    from voice_to_notes.services.transcribe import TranscribeService, SegmentResult, WordResult
+
+    # Mock subprocess.run for ffprobe (returns duration of 700s = ~2 chunks at 300s each)
+    original_run = __import__("subprocess").run
+
+    def mock_subprocess_run(cmd, **kwargs):
+        if "ffprobe" in cmd:
+            result = MagicMock()
+            result.stdout = "700.0\n"
+            result.returncode = 0
+            return result
+        elif "ffmpeg" in cmd:
+            # Create an empty temp file (simulate chunk extraction)
+            # The output file is the last argument
+            import pathlib
+            output_file = cmd[-1]
+            pathlib.Path(output_file).touch()
+            result = MagicMock()
+            result.returncode = 0
+            return result
+        return original_run(cmd, **kwargs)
+
+    # Mock WhisperModel
+    mock_model = MagicMock()
+    def mock_transcribe_call(file_path, **kwargs):
+        mock_segments = []
+        for i in range(3):
+            seg = MagicMock()
+            seg.start = i * 1.0
+            seg.end = (i + 1) * 1.0
+            seg.text = f"Segment {i}"
+            seg.words = []
+            mock_segments.append(seg)
+        mock_info = MagicMock()
+        mock_info.language = "en"
+        mock_info.language_probability = 0.99
+        mock_info.duration = 300.0
+        return iter(mock_segments), mock_info
+
+    mock_model.transcribe = mock_transcribe_call
+
+    service = TranscribeService()
+    service._model = mock_model
+    service._current_model_name = "base"
+    service._current_device = "cpu"
+    service._current_compute_type = "int8"
+
+    written_messages = []
+    def mock_write(msg):
+        written_messages.append(msg)
+
+    with patch("subprocess.run", mock_subprocess_run), \
+         patch("voice_to_notes.services.transcribe.write_message", mock_write):
+        result = service.transcribe_chunked("req-1", "/fake/long_audio.wav")
+
+    # Should have segments from multiple chunks
+    assert len(result.segments) > 0
+
+    # Verify timestamp offsets — segments from chunk 1 should start at 0,
+    # segments from chunk 2 should be offset by 300000ms
+    if len(result.segments) > 3:
+        # Chunk 2 segments should have offset timestamps
+        assert result.segments[3].start_ms >= 300000
+
+    assert result.duration_ms == 700000
+    assert result.language == "en"
--- a/python/voice_to_notes/services/pipeline.py
+++ b/python/voice_to_notes/services/pipeline.py
@@ -96,15 +96,40 @@ class PipelineService:
                "words": [{"word": w.word, "start_ms": w.start_ms, "end_ms": w.end_ms, "confidence": w.confidence} for w in seg.words],
            }))

-        transcription = self._transcribe_service.transcribe(
-            request_id=request_id,
-            file_path=file_path,
-            model_name=model_name,
-            device=device,
-            compute_type=compute_type,
-            language=language,
-            on_segment=_emit_segment,
-        )
+        # Probe audio duration for conditional chunked transcription
+        audio_duration_sec = None
+        try:
+            import subprocess
+            probe_result = subprocess.run(
+                ["ffprobe", "-v", "quiet", "-show_entries", "format=duration",
+                 "-of", "default=noprint_wrappers=1:nokey=1", file_path],
+                capture_output=True, text=True, check=True,
+            )
+            audio_duration_sec = float(probe_result.stdout.strip())
+        except (subprocess.CalledProcessError, FileNotFoundError, ValueError):
+            pass
+
+        from voice_to_notes.services.transcribe import LARGE_FILE_THRESHOLD_SEC
+        if audio_duration_sec and audio_duration_sec > LARGE_FILE_THRESHOLD_SEC:
+            transcription = self._transcribe_service.transcribe_chunked(
+                request_id=request_id,
+                file_path=file_path,
+                model_name=model_name,
+                device=device,
+                compute_type=compute_type,
+                language=language,
+                on_segment=_emit_segment,
+            )
+        else:
+            transcription = self._transcribe_service.transcribe(
+                request_id=request_id,
+                file_path=file_path,
+                model_name=model_name,
+                device=device,
+                compute_type=compute_type,
+                language=language,
+                on_segment=_emit_segment,
+            )

        if skip_diarization:
            # Convert transcription directly without speaker labels
--- a/python/voice_to_notes/services/transcribe.py
+++ b/python/voice_to_notes/services/transcribe.py
@@ -13,6 +13,9 @@ from faster_whisper import WhisperModel
 from voice_to_notes.ipc.messages import progress_message
 from voice_to_notes.ipc.protocol import write_message

+CHUNK_REPORT_SIZE = 10
+LARGE_FILE_THRESHOLD_SEC = 3600  # 1 hour
+

@dataclass
 class WordResult:
@@ -159,6 +162,12 @@ class TranscribeService:
                )
            )

+            if segment_count % CHUNK_REPORT_SIZE == 0:
+                write_message(progress_message(
+                    request_id, progress_pct, "transcribing",
+                    f"Completed chunk of {CHUNK_REPORT_SIZE} segments "
+                    f"({segment_count} total, {progress_pct}% of audio)..."))
+
        elapsed = time.time() - start_time
        print(
            f"[sidecar] Transcription complete: {segment_count} segments in {elapsed:.1f}s",
@@ -169,6 +178,113 @@ class TranscribeService:
        write_message(progress_message(request_id, 100, "done", "Transcription complete"))
        return result

+    def transcribe_chunked(
+        self,
+        request_id: str,
+        file_path: str,
+        model_name: str = "base",
+        device: str = "cpu",
+        compute_type: str = "int8",
+        language: str | None = None,
+        on_segment: Callable[[SegmentResult, int], None] | None = None,
+        chunk_duration_sec: int = 300,
+    ) -> TranscriptionResult:
+        """Transcribe a large audio file by splitting into chunks.
+
+        Uses ffmpeg to split the file into chunks, transcribes each chunk,
+        then merges the results with corrected timestamps.
+
+        Falls back to standard transcribe() if ffmpeg is not available.
+        """
+        import subprocess
+        import tempfile
+
+        # Get total duration via ffprobe
+        try:
+            probe_result = subprocess.run(
+                ["ffprobe", "-v", "quiet", "-show_entries", "format=duration",
+                 "-of", "default=noprint_wrappers=1:nokey=1", file_path],
+                capture_output=True, text=True, check=True,
+            )
+            total_duration = float(probe_result.stdout.strip())
+        except (subprocess.CalledProcessError, FileNotFoundError, ValueError):
+            # ffprobe not available or failed — fall back to standard transcription
+            write_message(progress_message(
+                request_id, 5, "transcribing",
+                "ffmpeg not available, using standard transcription..."))
+            return self.transcribe(request_id, file_path, model_name, device,
+                                   compute_type, language, on_segment=on_segment)
+
+        num_chunks = max(1, int(total_duration / chunk_duration_sec) + 1)
+        write_message(progress_message(
+            request_id, 5, "transcribing",
+            f"Splitting {total_duration:.0f}s file into {num_chunks} chunks..."))
+
+        merged_result = TranscriptionResult()
+        global_segment_index = 0
+
+        for chunk_idx in range(num_chunks):
+            chunk_start = chunk_idx * chunk_duration_sec
+            if chunk_start >= total_duration:
+                break
+
+            chunk_start_ms = int(chunk_start * 1000)
+
+            # Extract chunk to temp file
+            tmp = tempfile.NamedTemporaryFile(suffix=".wav", delete=False)
+            tmp.close()
+            try:
+                subprocess.run(
+                    ["ffmpeg", "-y", "-ss", str(chunk_start),
+                     "-t", str(chunk_duration_sec),
+                     "-i", file_path,
+                     "-ar", "16000", "-ac", "1", "-c:a", "pcm_s16le",
+                     tmp.name],
+                    capture_output=True, check=True,
+                )
+
+                # Wrap on_segment to offset the index
+                chunk_on_segment = None
+                if on_segment:
+                    base_index = global_segment_index
+                    def chunk_on_segment(seg: SegmentResult, idx: int, _base=base_index) -> None:
+                        on_segment(seg, _base + idx)
+
+                chunk_result = self.transcribe(
+                    request_id, tmp.name, model_name, device,
+                    compute_type, language, on_segment=chunk_on_segment,
+                )
+
+                # Offset timestamps and merge
+                for seg in chunk_result.segments:
+                    seg.start_ms += chunk_start_ms
+                    seg.end_ms += chunk_start_ms
+                    for word in seg.words:
+                        word.start_ms += chunk_start_ms
+                        word.end_ms += chunk_start_ms
+                    merged_result.segments.append(seg)
+
+                global_segment_index += len(chunk_result.segments)
+
+                # Take language from first chunk
+                if chunk_idx == 0:
+                    merged_result.language = chunk_result.language
+                    merged_result.language_probability = chunk_result.language_probability
+
+            finally:
+                import os
+                os.unlink(tmp.name)
+
+            # Chunk progress
+            chunk_pct = min(10 + int(((chunk_idx + 1) / num_chunks) * 80), 90)
+            write_message(progress_message(
+                request_id, chunk_pct, "transcribing",
+                f"Completed chunk {chunk_idx + 1}/{num_chunks}..."))
+
+        merged_result.duration_ms = int(total_duration * 1000)
+        write_message(progress_message(request_id, 100, "done", "Transcription complete"))
+        return merged_result
+

 def result_to_payload(result: TranscriptionResult) -> dict[str, Any]:
    """Convert TranscriptionResult to IPC payload dict."""