Phase 2: Core transcription pipeline and audio playback

- Implement faster-whisper TranscribeService with word-level timestamps, progress reporting, and hardware auto-detection
- Wire up Rust SidecarManager for Python process lifecycle (spawn, IPC, shutdown)
- Add transcribe_file Tauri command bridging frontend to Python sidecar
- Integrate wavesurfer.js WaveformPlayer with play/pause, skip, seek controls
- Build TranscriptEditor with word-level click-to-seek and active highlighting
- Connect file import flow: prompt → asset load → transcribe → display
- Add typed tauri-bridge service with TranscriptionResult interface
- Add Python tests for hardware detection and transcription result formatting

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-26 15:53:09 -08:00
parent 503cc6c0cf
commit 48fe41b064
18 changed files with 1775 additions and 32 deletions
--- a/python/voice_to_notes/services/transcribe.py
+++ b/python/voice_to_notes/services/transcribe.py
@@ -1,13 +1,193 @@
-"""Transcription service — faster-whisper + wav2vec2 pipeline."""
+"""Transcription service — faster-whisper pipeline with word-level timestamps."""

 from __future__ import annotations

+import sys
+import time
+from dataclasses import dataclass, field
+from typing import Any
+
+from faster_whisper import WhisperModel
+
+from voice_to_notes.ipc.messages import progress_message
+from voice_to_notes.ipc.protocol import write_message
+
+
+@dataclass
+class WordResult:
+    """A single transcribed word together with its position in the audio.
+
+    Attributes:
+        word: Text of the word.
+        start_ms: Offset where the word begins, in milliseconds.
+        end_ms: Offset where the word ends, in milliseconds.
+        confidence: Per-word score; ``TranscribeService.transcribe`` fills it
+            from the word probability rounded to four decimal places.
+    """
+
+    word: str
+    start_ms: int
+    end_ms: int
+    confidence: float
+
+
+@dataclass
+class SegmentResult:
+    """One transcription segment and the words it contains.
+
+    Attributes:
+        text: Text of the whole segment.
+        start_ms: Offset where the segment begins, in milliseconds.
+        end_ms: Offset where the segment ends, in milliseconds.
+        words: Per-word results for this segment; defaults to a fresh empty
+            list for every instance.
+    """
+
+    text: str
+    start_ms: int
+    end_ms: int
+    words: list[WordResult] = field(default_factory=list)
+
+
+@dataclass
+class TranscriptionResult:
+    """Complete output of one transcription run.
+
+    Attributes:
+        segments: Segments in the order they were produced; defaults to a
+            fresh empty list for every instance.
+        language: Language reported for the audio; empty string until set.
+        language_probability: Score attached to ``language``; 0.0 until set.
+        duration_ms: Duration of the audio in milliseconds; 0 until set.
+    """
+
+    segments: list[SegmentResult] = field(default_factory=list)
+    language: str = ""
+    language_probability: float = 0.0
+    duration_ms: int = 0
+

 class TranscribeService:
    """Handles audio transcription via faster-whisper."""

-    # TODO: Implement faster-whisper integration
-    # - Load model based on hardware detection
-    # - Transcribe audio with word-level timestamps
-    # - Report progress via IPC
-    pass
+    def __init__(self) -> None:
+        """Create a service with no model loaded yet."""
+        # The cached model and the settings it was built with; _ensure_model
+        # compares against these to decide whether a reload is needed.
+        self._model: WhisperModel | None = None
+        self._current_model_name = self._current_device = self._current_compute_type = ""
+
+    def _ensure_model(
+        self,
+        model_name: str = "base",
+        device: str = "cpu",
+        compute_type: str = "int8",
+    ) -> WhisperModel:
+        """Return a Whisper model for the requested settings, loading it if needed.
+
+        The most recently loaded model is cached on the instance and reused
+        for as long as ``model_name``, ``device`` and ``compute_type`` all
+        match the settings it was built with.
+
+        Args:
+            model_name: Model identifier passed to ``WhisperModel``.
+            device: Device string passed to ``WhisperModel``.
+            compute_type: Compute type passed to ``WhisperModel``.
+
+        Returns:
+            The cached model when the settings match, otherwise a newly
+            constructed one (which then becomes the cached model).
+
+        Raises:
+            Any exception raised by the ``WhisperModel`` constructor. No model
+            is cached afterwards, so the next call attempts a fresh load.
+        """
+        requested = (model_name, device, compute_type)
+        loaded = (
+            self._current_model_name,
+            self._current_device,
+            self._current_compute_type,
+        )
+        if self._model is not None and loaded == requested:
+            return self._model
+
+        # Drop our reference to the previous model before building the
+        # replacement, so this service does not keep the old weights alive
+        # while the new ones are being loaded.
+        self._model = None
+
+        print(
+            f"[sidecar] Loading model {model_name} on {device} ({compute_type})",
+            file=sys.stderr,
+            flush=True,
+        )
+        model = WhisperModel(
+            model_name,
+            device=device,
+            compute_type=compute_type,
+        )
+
+        # Commit the cache only after the load succeeded, so the recorded
+        # settings always describe the model that is actually held.
+        self._model = model
+        self._current_model_name = model_name
+        self._current_device = device
+        self._current_compute_type = compute_type
+        return model
+
+    def transcribe(
+        self,
+        request_id: str,
+        file_path: str,
+        model_name: str = "base",
+        device: str = "cpu",
+        compute_type: str = "int8",
+        language: str | None = None,
+    ) -> TranscriptionResult:
+        """Transcribe an audio file with word-level timestamps.
+
+        Progress messages are written over IPC while the work proceeds: 0%
+        before the model is loaded, 10% when transcription starts, an update
+        after every fifth segment (capped at 90%), and 100% at the end.
+
+        Args:
+            request_id: Identifier included in every progress message.
+            file_path: Path of the audio file handed to the model.
+            model_name: Model identifier forwarded to ``_ensure_model``.
+            device: Device string forwarded to ``_ensure_model``.
+            compute_type: Compute type forwarded to ``_ensure_model``.
+            language: Language forwarded to the model's ``transcribe`` call;
+                ``None`` is passed through unchanged (presumably enabling
+                auto-detection; confirm against the faster-whisper docs).
+
+        Returns:
+            A ``TranscriptionResult`` whose timestamps are whole milliseconds,
+            rounded to the nearest millisecond.
+
+        Raises:
+            Nothing is caught here, so errors from model loading or from the
+            transcription itself propagate to the caller.
+        """
+
+        def to_ms(seconds: float) -> int:
+            # Round instead of truncating: 2.01 * 1000 evaluates to
+            # 2009.9999999999998, which a bare int() would turn into 2009.
+            return int(round(seconds * 1000))
+
+        # Stage: loading model
+        write_message(progress_message(request_id, 0, "loading_model", f"Loading {model_name}..."))
+        model = self._ensure_model(model_name, device, compute_type)
+
+        # Stage: transcribing
+        write_message(progress_message(request_id, 10, "transcribing", "Starting transcription..."))
+
+        # perf_counter is monotonic, so the elapsed time reported below cannot
+        # be distorted by wall-clock adjustments.
+        start_time = time.perf_counter()
+        segments_iter, info = model.transcribe(
+            file_path,
+            language=language,
+            word_timestamps=True,
+            vad_filter=True,
+        )
+
+        result = TranscriptionResult(
+            language=info.language,
+            language_probability=info.language_probability,
+            duration_ms=to_ms(info.duration),
+        )
+
+        # Guard the progress division against a zero or negative duration.
+        total_duration = info.duration if info.duration > 0 else 1.0
+        segment_count = 0
+
+        for segment_count, segment in enumerate(segments_iter, start=1):
+            # segment.words may be None or empty; both yield an empty list.
+            words = [
+                WordResult(
+                    word=w.word.strip(),
+                    start_ms=to_ms(w.start),
+                    end_ms=to_ms(w.end),
+                    confidence=round(w.probability, 4),
+                )
+                for w in segment.words or ()
+            ]
+
+            result.segments.append(
+                SegmentResult(
+                    text=segment.text.strip(),
+                    start_ms=to_ms(segment.start),
+                    end_ms=to_ms(segment.end),
+                    words=words,
+                )
+            )
+
+            # Send progress every fifth segment. The 10-90% band is mapped
+            # from how far into the audio the current segment ends; 100% is
+            # reserved for the final "done" message.
+            if segment_count % 5 == 0:
+                progress_pct = min(10 + int((segment.end / total_duration) * 80), 90)
+                write_message(
+                    progress_message(
+                        request_id,
+                        progress_pct,
+                        "transcribing",
+                        f"Processed {segment_count} segments...",
+                    )
+                )
+
+        elapsed = time.perf_counter() - start_time
+        print(
+            f"[sidecar] Transcription complete: {segment_count} segments in {elapsed:.1f}s",
+            file=sys.stderr,
+            flush=True,
+        )
+
+        write_message(progress_message(request_id, 100, "done", "Transcription complete"))
+        return result
+
+
+def result_to_payload(result: TranscriptionResult) -> dict[str, Any]:
+    """Flatten a TranscriptionResult into plain dicts and lists for IPC.
+
+    The returned dict has the keys ``segments``, ``language``,
+    ``language_probability`` and ``duration_ms``. Each entry of ``segments``
+    carries ``text``, ``start_ms``, ``end_ms`` and a ``words`` list whose
+    entries carry ``word``, ``start_ms``, ``end_ms`` and ``confidence``.
+    """
+
+    def word_entry(item: Any) -> dict[str, Any]:
+        # One word as a plain dict.
+        return {
+            "word": item.word,
+            "start_ms": item.start_ms,
+            "end_ms": item.end_ms,
+            "confidence": item.confidence,
+        }
+
+    def segment_entry(item: Any) -> dict[str, Any]:
+        # One segment as a plain dict, with its words nested inside.
+        return {
+            "text": item.text,
+            "start_ms": item.start_ms,
+            "end_ms": item.end_ms,
+            "words": [word_entry(entry) for entry in item.words],
+        }
+
+    payload: dict[str, Any] = {
+        "segments": [segment_entry(entry) for entry in result.segments],
+        "language": result.language,
+        "language_probability": result.language_probability,
+        "duration_ms": result.duration_ms,
+    }
+    return payload