Merge perf/chunked-transcription: chunk-based processing for large files

2026-03-20 13:54:14 -07:00
parent c23b9a90dd 16f4b57771
commit 0771508203
3 changed files with 227 additions and 9 deletions
--- a/python/voice_to_notes/services/pipeline.py
+++ b/python/voice_to_notes/services/pipeline.py
@@ -96,15 +96,40 @@ class PipelineService:
                "words": [{"word": w.word, "start_ms": w.start_ms, "end_ms": w.end_ms, "confidence": w.confidence} for w in seg.words],
            }))

-        transcription = self._transcribe_service.transcribe(
-            request_id=request_id,
-            file_path=file_path,
-            model_name=model_name,
-            device=device,
-            compute_type=compute_type,
-            language=language,
-            on_segment=_emit_segment,
-        )
+        # Best-effort ffprobe duration check: only files longer than LARGE_FILE_THRESHOLD_SEC use transcribe_chunked
+        audio_duration_sec = None
+        try:
+            import subprocess
+            probe_result = subprocess.run(
+                ["ffprobe", "-v", "quiet", "-show_entries", "format=duration",
+                 "-of", "default=noprint_wrappers=1:nokey=1", file_path],
+                capture_output=True, text=True, check=True,
+            )
+            audio_duration_sec = float(probe_result.stdout.strip())
+        except (subprocess.CalledProcessError, FileNotFoundError, ValueError):
+            pass
+
+        from voice_to_notes.services.transcribe import LARGE_FILE_THRESHOLD_SEC
+        if audio_duration_sec and audio_duration_sec > LARGE_FILE_THRESHOLD_SEC:
+            transcription = self._transcribe_service.transcribe_chunked(
+                request_id=request_id,
+                file_path=file_path,
+                model_name=model_name,
+                device=device,
+                compute_type=compute_type,
+                language=language,
+                on_segment=_emit_segment,
+            )
+        else:
+            transcription = self._transcribe_service.transcribe(
+                request_id=request_id,
+                file_path=file_path,
+                model_name=model_name,
+                device=device,
+                compute_type=compute_type,
+                language=language,
+                on_segment=_emit_segment,
+            )

        if skip_diarization:
            # Convert transcription directly without speaker labels