perf/pipeline-improvements #1
21
.claude/settings.local.json
Normal file
21
.claude/settings.local.json
Normal file
@@ -0,0 +1,21 @@
|
|||||||
|
{
|
||||||
|
"permissions": {
|
||||||
|
"allow": [
|
||||||
|
"Bash(git init:*)",
|
||||||
|
"Bash(git:*)",
|
||||||
|
"WebSearch",
|
||||||
|
"Bash(npm create:*)",
|
||||||
|
"Bash(cp:*)",
|
||||||
|
"Bash(npm install:*)",
|
||||||
|
"Bash(/home/jknapp/.cargo/bin/cargo test:*)",
|
||||||
|
"Bash(ruff:*)",
|
||||||
|
"Bash(npm run:*)",
|
||||||
|
"Bash(npx svelte-check:*)",
|
||||||
|
"Bash(pip install:*)",
|
||||||
|
"Bash(python3:*)",
|
||||||
|
"Bash(/home/jknapp/.cargo/bin/cargo check:*)",
|
||||||
|
"Bash(cargo check:*)",
|
||||||
|
"Bash(npm ls:*)"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
}
|
||||||
@@ -49,3 +49,80 @@ def test_result_to_payload_empty():
|
|||||||
assert payload["segments"] == []
|
assert payload["segments"] == []
|
||||||
assert payload["language"] == ""
|
assert payload["language"] == ""
|
||||||
assert payload["duration_ms"] == 0
|
assert payload["duration_ms"] == 0
|
||||||
|
|
||||||
|
|
||||||
|
def test_chunk_report_size_progress():
|
||||||
|
"""Test CHUNK_REPORT_SIZE progress emission."""
|
||||||
|
from voice_to_notes.services.transcribe import CHUNK_REPORT_SIZE
|
||||||
|
assert CHUNK_REPORT_SIZE == 10
|
||||||
|
|
||||||
|
|
||||||
|
def test_transcribe_chunked_with_mocked_ffmpeg(monkeypatch):
|
||||||
|
"""Test transcribe_chunked with mocked ffmpeg/ffprobe and mocked WhisperModel."""
|
||||||
|
from unittest.mock import MagicMock, patch
|
||||||
|
from voice_to_notes.services.transcribe import TranscribeService, SegmentResult, WordResult
|
||||||
|
|
||||||
|
# Mock subprocess.run for ffprobe (returns duration of 700s = ~2 chunks at 300s each)
|
||||||
|
original_run = __import__("subprocess").run
|
||||||
|
|
||||||
|
def mock_subprocess_run(cmd, **kwargs):
|
||||||
|
if "ffprobe" in cmd:
|
||||||
|
result = MagicMock()
|
||||||
|
result.stdout = "700.0\n"
|
||||||
|
result.returncode = 0
|
||||||
|
return result
|
||||||
|
elif "ffmpeg" in cmd:
|
||||||
|
# Create an empty temp file (simulate chunk extraction)
|
||||||
|
# The output file is the last argument
|
||||||
|
import pathlib
|
||||||
|
output_file = cmd[-1]
|
||||||
|
pathlib.Path(output_file).touch()
|
||||||
|
result = MagicMock()
|
||||||
|
result.returncode = 0
|
||||||
|
return result
|
||||||
|
return original_run(cmd, **kwargs)
|
||||||
|
|
||||||
|
# Mock WhisperModel
|
||||||
|
mock_model = MagicMock()
|
||||||
|
def mock_transcribe_call(file_path, **kwargs):
|
||||||
|
mock_segments = []
|
||||||
|
for i in range(3):
|
||||||
|
seg = MagicMock()
|
||||||
|
seg.start = i * 1.0
|
||||||
|
seg.end = (i + 1) * 1.0
|
||||||
|
seg.text = f"Segment {i}"
|
||||||
|
seg.words = []
|
||||||
|
mock_segments.append(seg)
|
||||||
|
mock_info = MagicMock()
|
||||||
|
mock_info.language = "en"
|
||||||
|
mock_info.language_probability = 0.99
|
||||||
|
mock_info.duration = 300.0
|
||||||
|
return iter(mock_segments), mock_info
|
||||||
|
|
||||||
|
mock_model.transcribe = mock_transcribe_call
|
||||||
|
|
||||||
|
service = TranscribeService()
|
||||||
|
service._model = mock_model
|
||||||
|
service._current_model_name = "base"
|
||||||
|
service._current_device = "cpu"
|
||||||
|
service._current_compute_type = "int8"
|
||||||
|
|
||||||
|
written_messages = []
|
||||||
|
def mock_write(msg):
|
||||||
|
written_messages.append(msg)
|
||||||
|
|
||||||
|
with patch("subprocess.run", mock_subprocess_run), \
|
||||||
|
patch("voice_to_notes.services.transcribe.write_message", mock_write):
|
||||||
|
result = service.transcribe_chunked("req-1", "/fake/long_audio.wav")
|
||||||
|
|
||||||
|
# Should have segments from multiple chunks
|
||||||
|
assert len(result.segments) > 0
|
||||||
|
|
||||||
|
# Verify timestamp offsets — segments from chunk 1 should start at 0,
|
||||||
|
# segments from chunk 2 should be offset by 300000ms
|
||||||
|
if len(result.segments) > 3:
|
||||||
|
# Chunk 2 segments should have offset timestamps
|
||||||
|
assert result.segments[3].start_ms >= 300000
|
||||||
|
|
||||||
|
assert result.duration_ms == 700000
|
||||||
|
assert result.language == "en"
|
||||||
|
|||||||
@@ -82,6 +82,30 @@ class PipelineService:
|
|||||||
progress_message(request_id, 0, "pipeline", "Starting transcription pipeline...")
|
progress_message(request_id, 0, "pipeline", "Starting transcription pipeline...")
|
||||||
)
|
)
|
||||||
|
|
||||||
|
# Probe audio duration for conditional chunked transcription
|
||||||
|
audio_duration_sec = None
|
||||||
|
try:
|
||||||
|
import subprocess
|
||||||
|
probe_result = subprocess.run(
|
||||||
|
["ffprobe", "-v", "quiet", "-show_entries", "format=duration",
|
||||||
|
"-of", "default=noprint_wrappers=1:nokey=1", file_path],
|
||||||
|
capture_output=True, text=True, check=True,
|
||||||
|
)
|
||||||
|
audio_duration_sec = float(probe_result.stdout.strip())
|
||||||
|
except (subprocess.CalledProcessError, FileNotFoundError, ValueError):
|
||||||
|
pass
|
||||||
|
|
||||||
|
from voice_to_notes.services.transcribe import LARGE_FILE_THRESHOLD_SEC
|
||||||
|
if audio_duration_sec and audio_duration_sec > LARGE_FILE_THRESHOLD_SEC:
|
||||||
|
transcription = self._transcribe_service.transcribe_chunked(
|
||||||
|
request_id=request_id,
|
||||||
|
file_path=file_path,
|
||||||
|
model_name=model_name,
|
||||||
|
device=device,
|
||||||
|
compute_type=compute_type,
|
||||||
|
language=language,
|
||||||
|
)
|
||||||
|
else:
|
||||||
transcription = self._transcribe_service.transcribe(
|
transcription = self._transcribe_service.transcribe(
|
||||||
request_id=request_id,
|
request_id=request_id,
|
||||||
file_path=file_path,
|
file_path=file_path,
|
||||||
|
|||||||
@@ -4,6 +4,7 @@ from __future__ import annotations
|
|||||||
|
|
||||||
import sys
|
import sys
|
||||||
import time
|
import time
|
||||||
|
from collections.abc import Callable
|
||||||
from dataclasses import dataclass, field
|
from dataclasses import dataclass, field
|
||||||
from typing import Any
|
from typing import Any
|
||||||
|
|
||||||
@@ -12,6 +13,9 @@ from faster_whisper import WhisperModel
|
|||||||
from voice_to_notes.ipc.messages import progress_message
|
from voice_to_notes.ipc.messages import progress_message
|
||||||
from voice_to_notes.ipc.protocol import write_message
|
from voice_to_notes.ipc.protocol import write_message
|
||||||
|
|
||||||
|
CHUNK_REPORT_SIZE = 10
|
||||||
|
LARGE_FILE_THRESHOLD_SEC = 3600 # 1 hour
|
||||||
|
|
||||||
|
|
||||||
@dataclass
|
@dataclass
|
||||||
class WordResult:
|
class WordResult:
|
||||||
@@ -90,6 +94,7 @@ class TranscribeService:
|
|||||||
device: str = "cpu",
|
device: str = "cpu",
|
||||||
compute_type: str = "int8",
|
compute_type: str = "int8",
|
||||||
language: str | None = None,
|
language: str | None = None,
|
||||||
|
on_segment: Callable[[SegmentResult, int], None] | None = None,
|
||||||
) -> TranscriptionResult:
|
) -> TranscriptionResult:
|
||||||
"""Transcribe an audio file with word-level timestamps.
|
"""Transcribe an audio file with word-level timestamps.
|
||||||
|
|
||||||
@@ -156,6 +161,12 @@ class TranscribeService:
|
|||||||
)
|
)
|
||||||
)
|
)
|
||||||
|
|
||||||
|
if segment_count % CHUNK_REPORT_SIZE == 0:
|
||||||
|
write_message(progress_message(
|
||||||
|
request_id, progress_pct, "transcribing",
|
||||||
|
f"Completed chunk of {CHUNK_REPORT_SIZE} segments "
|
||||||
|
f"({segment_count} total, {progress_pct}% of audio)..."))
|
||||||
|
|
||||||
elapsed = time.time() - start_time
|
elapsed = time.time() - start_time
|
||||||
print(
|
print(
|
||||||
f"[sidecar] Transcription complete: {segment_count} segments in {elapsed:.1f}s",
|
f"[sidecar] Transcription complete: {segment_count} segments in {elapsed:.1f}s",
|
||||||
@@ -166,6 +177,113 @@ class TranscribeService:
|
|||||||
write_message(progress_message(request_id, 100, "done", "Transcription complete"))
|
write_message(progress_message(request_id, 100, "done", "Transcription complete"))
|
||||||
return result
|
return result
|
||||||
|
|
||||||
|
def transcribe_chunked(
|
||||||
|
self,
|
||||||
|
request_id: str,
|
||||||
|
file_path: str,
|
||||||
|
model_name: str = "base",
|
||||||
|
device: str = "cpu",
|
||||||
|
compute_type: str = "int8",
|
||||||
|
language: str | None = None,
|
||||||
|
on_segment: Callable[[SegmentResult, int], None] | None = None,
|
||||||
|
chunk_duration_sec: int = 300,
|
||||||
|
) -> TranscriptionResult:
|
||||||
|
"""Transcribe a large audio file by splitting into chunks.
|
||||||
|
|
||||||
|
Uses ffmpeg to split the file into chunks, transcribes each chunk,
|
||||||
|
then merges the results with corrected timestamps.
|
||||||
|
|
||||||
|
Falls back to standard transcribe() if ffmpeg is not available.
|
||||||
|
"""
|
||||||
|
import subprocess
|
||||||
|
import tempfile
|
||||||
|
|
||||||
|
# Get total duration via ffprobe
|
||||||
|
try:
|
||||||
|
probe_result = subprocess.run(
|
||||||
|
["ffprobe", "-v", "quiet", "-show_entries", "format=duration",
|
||||||
|
"-of", "default=noprint_wrappers=1:nokey=1", file_path],
|
||||||
|
capture_output=True, text=True, check=True,
|
||||||
|
)
|
||||||
|
total_duration = float(probe_result.stdout.strip())
|
||||||
|
except (subprocess.CalledProcessError, FileNotFoundError, ValueError):
|
||||||
|
# ffprobe not available or failed — fall back to standard transcription
|
||||||
|
write_message(progress_message(
|
||||||
|
request_id, 5, "transcribing",
|
||||||
|
"ffmpeg not available, using standard transcription..."))
|
||||||
|
return self.transcribe(request_id, file_path, model_name, device,
|
||||||
|
compute_type, language, on_segment=on_segment)
|
||||||
|
|
||||||
|
num_chunks = max(1, int(total_duration / chunk_duration_sec) + 1)
|
||||||
|
write_message(progress_message(
|
||||||
|
request_id, 5, "transcribing",
|
||||||
|
f"Splitting {total_duration:.0f}s file into {num_chunks} chunks..."))
|
||||||
|
|
||||||
|
merged_result = TranscriptionResult()
|
||||||
|
global_segment_index = 0
|
||||||
|
|
||||||
|
for chunk_idx in range(num_chunks):
|
||||||
|
chunk_start = chunk_idx * chunk_duration_sec
|
||||||
|
if chunk_start >= total_duration:
|
||||||
|
break
|
||||||
|
|
||||||
|
chunk_start_ms = int(chunk_start * 1000)
|
||||||
|
|
||||||
|
# Extract chunk to temp file
|
||||||
|
tmp = tempfile.NamedTemporaryFile(suffix=".wav", delete=False)
|
||||||
|
tmp.close()
|
||||||
|
try:
|
||||||
|
subprocess.run(
|
||||||
|
["ffmpeg", "-y", "-ss", str(chunk_start),
|
||||||
|
"-t", str(chunk_duration_sec),
|
||||||
|
"-i", file_path,
|
||||||
|
"-ar", "16000", "-ac", "1", "-c:a", "pcm_s16le",
|
||||||
|
tmp.name],
|
||||||
|
capture_output=True, check=True,
|
||||||
|
)
|
||||||
|
|
||||||
|
# Wrap on_segment to offset the index
|
||||||
|
chunk_on_segment = None
|
||||||
|
if on_segment:
|
||||||
|
base_index = global_segment_index
|
||||||
|
def chunk_on_segment(seg: SegmentResult, idx: int, _base=base_index) -> None:
|
||||||
|
on_segment(seg, _base + idx)
|
||||||
|
|
||||||
|
chunk_result = self.transcribe(
|
||||||
|
request_id, tmp.name, model_name, device,
|
||||||
|
compute_type, language, on_segment=chunk_on_segment,
|
||||||
|
)
|
||||||
|
|
||||||
|
# Offset timestamps and merge
|
||||||
|
for seg in chunk_result.segments:
|
||||||
|
seg.start_ms += chunk_start_ms
|
||||||
|
seg.end_ms += chunk_start_ms
|
||||||
|
for word in seg.words:
|
||||||
|
word.start_ms += chunk_start_ms
|
||||||
|
word.end_ms += chunk_start_ms
|
||||||
|
merged_result.segments.append(seg)
|
||||||
|
|
||||||
|
global_segment_index += len(chunk_result.segments)
|
||||||
|
|
||||||
|
# Take language from first chunk
|
||||||
|
if chunk_idx == 0:
|
||||||
|
merged_result.language = chunk_result.language
|
||||||
|
merged_result.language_probability = chunk_result.language_probability
|
||||||
|
|
||||||
|
finally:
|
||||||
|
import os
|
||||||
|
os.unlink(tmp.name)
|
||||||
|
|
||||||
|
# Chunk progress
|
||||||
|
chunk_pct = min(10 + int(((chunk_idx + 1) / num_chunks) * 80), 90)
|
||||||
|
write_message(progress_message(
|
||||||
|
request_id, chunk_pct, "transcribing",
|
||||||
|
f"Completed chunk {chunk_idx + 1}/{num_chunks}..."))
|
||||||
|
|
||||||
|
merged_result.duration_ms = int(total_duration * 1000)
|
||||||
|
write_message(progress_message(request_id, 100, "done", "Transcription complete"))
|
||||||
|
return merged_result
|
||||||
|
|
||||||
|
|
||||||
def result_to_payload(result: TranscriptionResult) -> dict[str, Any]:
|
def result_to_payload(result: TranscriptionResult) -> dict[str, Any]:
|
||||||
"""Convert TranscriptionResult to IPC payload dict."""
|
"""Convert TranscriptionResult to IPC payload dict."""
|
||||||
|
|||||||
Reference in New Issue
Block a user