Add chunked transcription for large audio files (>1 hour)

Split files >1 hour into 5-minute chunks via ffmpeg, transcribe each
chunk independently, then merge results with corrected timestamps.
Also add chunk-level progress markers every 10 segments for all files.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
Author: Claude
Date: 2026-03-20 13:49:20 -07:00
parent d00281f0c7
commit 16f4b57771
4 changed files with 248 additions and 8 deletions

View File

@@ -82,14 +82,38 @@ class PipelineService:
progress_message(request_id, 0, "pipeline", "Starting transcription pipeline...")
)
transcription = self._transcribe_service.transcribe(
request_id=request_id,
file_path=file_path,
model_name=model_name,
device=device,
compute_type=compute_type,
language=language,
)
# Probe audio duration for conditional chunked transcription
audio_duration_sec = None
try:
import subprocess
probe_result = subprocess.run(
["ffprobe", "-v", "quiet", "-show_entries", "format=duration",
"-of", "default=noprint_wrappers=1:nokey=1", file_path],
capture_output=True, text=True, check=True,
)
audio_duration_sec = float(probe_result.stdout.strip())
except (subprocess.CalledProcessError, FileNotFoundError, ValueError):
pass
from voice_to_notes.services.transcribe import LARGE_FILE_THRESHOLD_SEC
if audio_duration_sec and audio_duration_sec > LARGE_FILE_THRESHOLD_SEC:
transcription = self._transcribe_service.transcribe_chunked(
request_id=request_id,
file_path=file_path,
model_name=model_name,
device=device,
compute_type=compute_type,
language=language,
)
else:
transcription = self._transcribe_service.transcribe(
request_id=request_id,
file_path=file_path,
model_name=model_name,
device=device,
compute_type=compute_type,
language=language,
)
if skip_diarization:
# Convert transcription directly without speaker labels