Merge perf/chunked-transcription: chunk-based processing for large files

This commit is contained in:
Claude
2026-03-20 13:54:14 -07:00
3 changed files with 227 additions and 9 deletions

View File

@@ -121,3 +121,80 @@ def test_progress_every_segment(monkeypatch):
assert len(transcribing_msgs) >= 8, (
f"Expected at least 8 transcribing progress messages (one per segment), got {len(transcribing_msgs)}"
)
def test_chunk_report_size_progress():
"""Test CHUNK_REPORT_SIZE progress emission."""
from voice_to_notes.services.transcribe import CHUNK_REPORT_SIZE
assert CHUNK_REPORT_SIZE == 10
def test_transcribe_chunked_with_mocked_ffmpeg(monkeypatch):
"""Test transcribe_chunked with mocked ffmpeg/ffprobe and mocked WhisperModel."""
from unittest.mock import MagicMock, patch
from voice_to_notes.services.transcribe import TranscribeService, SegmentResult, WordResult
# Mock subprocess.run for ffprobe (returns duration of 700s = ~2 chunks at 300s each)
original_run = __import__("subprocess").run
def mock_subprocess_run(cmd, **kwargs):
if "ffprobe" in cmd:
result = MagicMock()
result.stdout = "700.0\n"
result.returncode = 0
return result
elif "ffmpeg" in cmd:
# Create an empty temp file (simulate chunk extraction)
# The output file is the last argument
import pathlib
output_file = cmd[-1]
pathlib.Path(output_file).touch()
result = MagicMock()
result.returncode = 0
return result
return original_run(cmd, **kwargs)
# Mock WhisperModel
mock_model = MagicMock()
def mock_transcribe_call(file_path, **kwargs):
mock_segments = []
for i in range(3):
seg = MagicMock()
seg.start = i * 1.0
seg.end = (i + 1) * 1.0
seg.text = f"Segment {i}"
seg.words = []
mock_segments.append(seg)
mock_info = MagicMock()
mock_info.language = "en"
mock_info.language_probability = 0.99
mock_info.duration = 300.0
return iter(mock_segments), mock_info
mock_model.transcribe = mock_transcribe_call
service = TranscribeService()
service._model = mock_model
service._current_model_name = "base"
service._current_device = "cpu"
service._current_compute_type = "int8"
written_messages = []
def mock_write(msg):
written_messages.append(msg)
with patch("subprocess.run", mock_subprocess_run), \
patch("voice_to_notes.services.transcribe.write_message", mock_write):
result = service.transcribe_chunked("req-1", "/fake/long_audio.wav")
# Should have segments from multiple chunks
assert len(result.segments) > 0
# Verify timestamp offsets — segments from chunk 1 should start at 0,
# segments from chunk 2 should be offset by 300000ms
if len(result.segments) > 3:
# Chunk 2 segments should have offset timestamps
assert result.segments[3].start_ms >= 300000
assert result.duration_ms == 700000
assert result.language == "en"

View File

@@ -96,15 +96,40 @@ class PipelineService:
"words": [{"word": w.word, "start_ms": w.start_ms, "end_ms": w.end_ms, "confidence": w.confidence} for w in seg.words],
}))
transcription = self._transcribe_service.transcribe(
request_id=request_id,
file_path=file_path,
model_name=model_name,
device=device,
compute_type=compute_type,
language=language,
on_segment=_emit_segment,
)
# Probe audio duration for conditional chunked transcription
audio_duration_sec = None
try:
import subprocess
probe_result = subprocess.run(
["ffprobe", "-v", "quiet", "-show_entries", "format=duration",
"-of", "default=noprint_wrappers=1:nokey=1", file_path],
capture_output=True, text=True, check=True,
)
audio_duration_sec = float(probe_result.stdout.strip())
except (subprocess.CalledProcessError, FileNotFoundError, ValueError):
pass
from voice_to_notes.services.transcribe import LARGE_FILE_THRESHOLD_SEC
if audio_duration_sec and audio_duration_sec > LARGE_FILE_THRESHOLD_SEC:
transcription = self._transcribe_service.transcribe_chunked(
request_id=request_id,
file_path=file_path,
model_name=model_name,
device=device,
compute_type=compute_type,
language=language,
on_segment=_emit_segment,
)
else:
transcription = self._transcribe_service.transcribe(
request_id=request_id,
file_path=file_path,
model_name=model_name,
device=device,
compute_type=compute_type,
language=language,
on_segment=_emit_segment,
)
if skip_diarization:
# Convert transcription directly without speaker labels

View File

@@ -13,6 +13,9 @@ from faster_whisper import WhisperModel
from voice_to_notes.ipc.messages import progress_message
from voice_to_notes.ipc.protocol import write_message
CHUNK_REPORT_SIZE = 10
LARGE_FILE_THRESHOLD_SEC = 3600 # 1 hour
@dataclass
class WordResult:
@@ -159,6 +162,12 @@ class TranscribeService:
)
)
if segment_count % CHUNK_REPORT_SIZE == 0:
write_message(progress_message(
request_id, progress_pct, "transcribing",
f"Completed chunk of {CHUNK_REPORT_SIZE} segments "
f"({segment_count} total, {progress_pct}% of audio)..."))
elapsed = time.time() - start_time
print(
f"[sidecar] Transcription complete: {segment_count} segments in {elapsed:.1f}s",
@@ -169,6 +178,113 @@ class TranscribeService:
write_message(progress_message(request_id, 100, "done", "Transcription complete"))
return result
def transcribe_chunked(
self,
request_id: str,
file_path: str,
model_name: str = "base",
device: str = "cpu",
compute_type: str = "int8",
language: str | None = None,
on_segment: Callable[[SegmentResult, int], None] | None = None,
chunk_duration_sec: int = 300,
) -> TranscriptionResult:
"""Transcribe a large audio file by splitting into chunks.
Uses ffmpeg to split the file into chunks, transcribes each chunk,
then merges the results with corrected timestamps.
Falls back to standard transcribe() if ffmpeg is not available.
"""
import subprocess
import tempfile
# Get total duration via ffprobe
try:
probe_result = subprocess.run(
["ffprobe", "-v", "quiet", "-show_entries", "format=duration",
"-of", "default=noprint_wrappers=1:nokey=1", file_path],
capture_output=True, text=True, check=True,
)
total_duration = float(probe_result.stdout.strip())
except (subprocess.CalledProcessError, FileNotFoundError, ValueError):
# ffprobe not available or failed — fall back to standard transcription
write_message(progress_message(
request_id, 5, "transcribing",
"ffmpeg not available, using standard transcription..."))
return self.transcribe(request_id, file_path, model_name, device,
compute_type, language, on_segment=on_segment)
num_chunks = max(1, int(total_duration / chunk_duration_sec) + 1)
write_message(progress_message(
request_id, 5, "transcribing",
f"Splitting {total_duration:.0f}s file into {num_chunks} chunks..."))
merged_result = TranscriptionResult()
global_segment_index = 0
for chunk_idx in range(num_chunks):
chunk_start = chunk_idx * chunk_duration_sec
if chunk_start >= total_duration:
break
chunk_start_ms = int(chunk_start * 1000)
# Extract chunk to temp file
tmp = tempfile.NamedTemporaryFile(suffix=".wav", delete=False)
tmp.close()
try:
subprocess.run(
["ffmpeg", "-y", "-ss", str(chunk_start),
"-t", str(chunk_duration_sec),
"-i", file_path,
"-ar", "16000", "-ac", "1", "-c:a", "pcm_s16le",
tmp.name],
capture_output=True, check=True,
)
# Wrap on_segment to offset the index
chunk_on_segment = None
if on_segment:
base_index = global_segment_index
def chunk_on_segment(seg: SegmentResult, idx: int, _base=base_index) -> None:
on_segment(seg, _base + idx)
chunk_result = self.transcribe(
request_id, tmp.name, model_name, device,
compute_type, language, on_segment=chunk_on_segment,
)
# Offset timestamps and merge
for seg in chunk_result.segments:
seg.start_ms += chunk_start_ms
seg.end_ms += chunk_start_ms
for word in seg.words:
word.start_ms += chunk_start_ms
word.end_ms += chunk_start_ms
merged_result.segments.append(seg)
global_segment_index += len(chunk_result.segments)
# Take language from first chunk
if chunk_idx == 0:
merged_result.language = chunk_result.language
merged_result.language_probability = chunk_result.language_probability
finally:
import os
os.unlink(tmp.name)
# Chunk progress
chunk_pct = min(10 + int(((chunk_idx + 1) / num_chunks) * 80), 90)
write_message(progress_message(
request_id, chunk_pct, "transcribing",
f"Completed chunk {chunk_idx + 1}/{num_chunks}..."))
merged_result.duration_ms = int(total_duration * 1000)
write_message(progress_message(request_id, 100, "done", "Transcription complete"))
return merged_result
def result_to_payload(result: TranscriptionResult) -> dict[str, Any]:
"""Convert TranscriptionResult to IPC payload dict."""