"""Transcription service — faster-whisper pipeline with word-level timestamps."""
from __future__ import annotations
import sys
import time
from dataclasses import dataclass, field
from typing import Any
from faster_whisper import WhisperModel
from voice_to_notes.ipc.messages import progress_message
from voice_to_notes.ipc.protocol import write_message
@dataclass
class WordResult:
    """A single word with timestamp."""

    word: str  # word text; producer strips surrounding whitespace
    start_ms: int  # word start, milliseconds from the start of the audio
    end_ms: int  # word end, milliseconds
    confidence: float  # model probability for this word (rounded to 4 places by the producer)
@dataclass
class SegmentResult:
    """A transcription segment with words."""

    text: str  # segment text; producer strips surrounding whitespace
    start_ms: int  # segment start, milliseconds from the start of the audio
    end_ms: int  # segment end, milliseconds
    words: list[WordResult] = field(default_factory=list)  # word-level timings; empty if unavailable
@dataclass
class TranscriptionResult:
    """Full transcription output."""

    segments: list[SegmentResult] = field(default_factory=list)  # ordered transcript segments
    language: str = ""  # detected (or forced) language code
    language_probability: float = 0.0  # confidence of the language detection
    duration_ms: int = 0  # total audio duration, milliseconds
class TranscribeService:
    """Handles audio transcription via faster-whisper.

    The Whisper model is loaded lazily and cached; it is reused across calls
    as long as the requested (model name, device, compute type) triple is
    unchanged. Progress is reported over the JSON-line IPC channel via
    ``write_message``; human-readable logs go to stderr so stdout stays a
    clean IPC stream.
    """

    def __init__(self) -> None:
        self._model: WhisperModel | None = None
        self._current_model_name: str = ""
        self._current_device: str = ""
        self._current_compute_type: str = ""

    def _ensure_model(
        self,
        model_name: str = "base",
        device: str = "cpu",
        compute_type: str = "int8",
    ) -> WhisperModel:
        """Load or reuse the Whisper model.

        Returns the cached model when all three load parameters match the
        previous load; otherwise loads a fresh model and replaces the cache.
        """
        if (
            self._model is not None
            and self._current_model_name == model_name
            and self._current_device == device
            and self._current_compute_type == compute_type
        ):
            return self._model
        # Log to stderr: stdout is reserved for the JSON-line IPC protocol.
        print(
            f"[sidecar] Loading model {model_name} on {device} ({compute_type})",
            file=sys.stderr,
            flush=True,
        )
        self._model = WhisperModel(
            model_name,
            device=device,
            compute_type=compute_type,
        )
        self._current_model_name = model_name
        self._current_device = device
        self._current_compute_type = compute_type
        return self._model

    @staticmethod
    def _segment_to_result(segment: Any) -> SegmentResult:
        """Convert one faster-whisper segment into a SegmentResult.

        Word entries are included only when the segment carries word
        timestamps (``segment.words`` may be None or empty otherwise).
        Times are converted from float seconds to integer milliseconds.
        """
        words = [
            WordResult(
                word=w.word.strip(),
                start_ms=int(w.start * 1000),
                end_ms=int(w.end * 1000),
                confidence=round(w.probability, 4),
            )
            for w in (segment.words or [])
        ]
        return SegmentResult(
            text=segment.text.strip(),
            start_ms=int(segment.start * 1000),
            end_ms=int(segment.end * 1000),
            words=words,
        )

    def transcribe(
        self,
        request_id: str,
        file_path: str,
        model_name: str = "base",
        device: str = "cpu",
        compute_type: str = "int8",
        language: str | None = None,
    ) -> TranscriptionResult:
        """Transcribe an audio file with word-level timestamps.

        Sends progress messages via IPC during processing.

        Args:
            request_id: IPC correlation id used on progress messages.
            file_path: Path to the audio file to transcribe.
            model_name: faster-whisper model size/name (e.g. "base").
            device: Inference device passed to faster-whisper (e.g. "cpu").
            compute_type: Quantization/compute type (e.g. "int8").
            language: Language code to force, or None to auto-detect.

        Returns:
            TranscriptionResult with segments, word timings, the detected
            language, and the audio duration.
        """
        # Stage: loading model (0%).
        write_message(progress_message(request_id, 0, "loading_model", f"Loading {model_name}..."))
        model = self._ensure_model(model_name, device, compute_type)

        # Stage: transcribing (10%..90%). The returned iterator is lazy —
        # decoding actually happens as it is consumed below.
        write_message(progress_message(request_id, 10, "transcribing", "Starting transcription..."))
        start_time = time.time()
        segments_iter, info = model.transcribe(
            file_path,
            language=language,
            word_timestamps=True,
            vad_filter=True,
        )
        result = TranscriptionResult(
            language=info.language,
            language_probability=info.language_probability,
            duration_ms=int(info.duration * 1000),
        )

        # Guard against a zero duration so the progress math never divides by 0.
        total_duration = info.duration if info.duration > 0 else 1.0
        segment_count = 0
        for segment in segments_iter:
            segment_count += 1
            result.segments.append(self._segment_to_result(segment))
            # Report progress every 5 segments, mapping the segment's end
            # time into the 10-90% band (capped at 90).
            if segment_count % 5 == 0:
                progress_pct = min(10 + int((segment.end / total_duration) * 80), 90)
                write_message(
                    progress_message(
                        request_id,
                        progress_pct,
                        "transcribing",
                        f"Processed {segment_count} segments...",
                    )
                )

        elapsed = time.time() - start_time
        # stderr keeps human-readable logs out of the stdout IPC stream.
        print(
            f"[sidecar] Transcription complete: {segment_count} segments in {elapsed:.1f}s",
            file=sys.stderr,
            flush=True,
        )
        write_message(progress_message(request_id, 100, "done", "Transcription complete"))
        return result
def result_to_payload(result: TranscriptionResult) -> dict[str, Any]:
    """Convert TranscriptionResult to IPC payload dict."""

    def word_payload(word):
        # One word with its timing and confidence.
        return {
            "word": word.word,
            "start_ms": word.start_ms,
            "end_ms": word.end_ms,
            "confidence": word.confidence,
        }

    def segment_payload(segment):
        # One segment plus its word-level entries.
        return {
            "text": segment.text,
            "start_ms": segment.start_ms,
            "end_ms": segment.end_ms,
            "words": [word_payload(word) for word in segment.words],
        }

    return {
        "segments": [segment_payload(segment) for segment in result.segments],
        "language": result.language,
        "language_probability": result.language_probability,
        "duration_ms": result.duration_ms,
    }