"""Transcription service — faster-whisper pipeline with word-level timestamps.""" from __future__ import annotations import sys import time from dataclasses import dataclass, field from typing import Any from faster_whisper import WhisperModel from voice_to_notes.ipc.messages import progress_message from voice_to_notes.ipc.protocol import write_message @dataclass class WordResult: """A single word with timestamp.""" word: str start_ms: int end_ms: int confidence: float @dataclass class SegmentResult: """A transcription segment with words.""" text: str start_ms: int end_ms: int words: list[WordResult] = field(default_factory=list) @dataclass class TranscriptionResult: """Full transcription output.""" segments: list[SegmentResult] = field(default_factory=list) language: str = "" language_probability: float = 0.0 duration_ms: int = 0 class TranscribeService: """Handles audio transcription via faster-whisper.""" def __init__(self) -> None: self._model: WhisperModel | None = None self._current_model_name: str = "" self._current_device: str = "" self._current_compute_type: str = "" def _ensure_model( self, model_name: str = "base", device: str = "cpu", compute_type: str = "int8", ) -> WhisperModel: """Load or reuse the Whisper model.""" if ( self._model is not None and self._current_model_name == model_name and self._current_device == device and self._current_compute_type == compute_type ): return self._model print( f"[sidecar] Loading model {model_name} on {device} ({compute_type})", file=sys.stderr, flush=True, ) self._model = WhisperModel( model_name, device=device, compute_type=compute_type, ) self._current_model_name = model_name self._current_device = device self._current_compute_type = compute_type return self._model def transcribe( self, request_id: str, file_path: str, model_name: str = "base", device: str = "cpu", compute_type: str = "int8", language: str | None = None, ) -> TranscriptionResult: """Transcribe an audio file with word-level timestamps. Sends progress messages via IPC during processing. """ # Stage: loading model write_message(progress_message(request_id, 0, "loading_model", f"Loading {model_name}...")) model = self._ensure_model(model_name, device, compute_type) # Stage: transcribing write_message(progress_message(request_id, 10, "transcribing", "Starting transcription...")) start_time = time.time() segments_iter, info = model.transcribe( file_path, language=language, word_timestamps=True, vad_filter=True, ) result = TranscriptionResult( language=info.language, language_probability=info.language_probability, duration_ms=int(info.duration * 1000), ) # Process segments with progress reporting total_duration = info.duration if info.duration > 0 else 1.0 segment_count = 0 for segment in segments_iter: segment_count += 1 progress_pct = min(10 + int((segment.end / total_duration) * 80), 90) words = [] if segment.words: for w in segment.words: words.append( WordResult( word=w.word.strip(), start_ms=int(w.start * 1000), end_ms=int(w.end * 1000), confidence=round(w.probability, 4), ) ) result.segments.append( SegmentResult( text=segment.text.strip(), start_ms=int(segment.start * 1000), end_ms=int(segment.end * 1000), words=words, ) ) # Send progress every few segments if segment_count % 5 == 0: write_message( progress_message( request_id, progress_pct, "transcribing", f"Processed {segment_count} segments...", ) ) elapsed = time.time() - start_time print( f"[sidecar] Transcription complete: {segment_count} segments in {elapsed:.1f}s", file=sys.stderr, flush=True, ) write_message(progress_message(request_id, 100, "done", "Transcription complete")) return result def result_to_payload(result: TranscriptionResult) -> dict[str, Any]: """Convert TranscriptionResult to IPC payload dict.""" return { "segments": [ { "text": seg.text, "start_ms": seg.start_ms, "end_ms": seg.end_ms, "words": [ { "word": w.word, "start_ms": w.start_ms, "end_ms": w.end_ms, "confidence": w.confidence, } for w in seg.words ], } for seg in result.segments ], "language": result.language, "language_probability": result.language_probability, "duration_ms": result.duration_ms, }