voice-to-notes/python/voice_to_notes/services/transcribe.py

"""Transcription service — faster-whisper pipeline with word-level timestamps."""

from __future__ import annotations

import sys
import time
from dataclasses import dataclass, field
from typing import Any

from faster_whisper import WhisperModel

from voice_to_notes.ipc.messages import progress_message
from voice_to_notes.ipc.protocol import write_message


@dataclass
class WordResult:
    """A single transcribed word with its timing and confidence.

    Attributes:
        word: Text of the word. ``TranscribeService.transcribe`` stores it with
            surrounding whitespace stripped.
        start_ms: Start time of the word, in milliseconds.
        end_ms: End time of the word, in milliseconds.
        confidence: Probability reported for the word by the model;
            ``TranscribeService.transcribe`` rounds it to 4 decimal places.
    """

    word: str
    start_ms: int
    end_ms: int
    confidence: float


@dataclass
class SegmentResult:
    """A transcription segment together with its word-level results.

    Attributes:
        text: Text of the segment. ``TranscribeService.transcribe`` stores it
            with surrounding whitespace stripped.
        start_ms: Start time of the segment, in milliseconds.
        end_ms: End time of the segment, in milliseconds.
        words: Word-level results for this segment. Defaults to a fresh empty
            list per instance (``default_factory``), so instances never share it.
    """

    text: str
    start_ms: int
    end_ms: int
    words: list[WordResult] = field(default_factory=list)


@dataclass
class TranscriptionResult:
    """Full transcription output for one audio file.

    Attributes:
        segments: Segments in the order they were produced. Defaults to a fresh
            empty list per instance.
        language: Language reported by the model (``info.language``); empty
            string until set.
        language_probability: Probability the model reported for ``language``.
        duration_ms: Duration of the audio, in milliseconds.
    """

    segments: list[SegmentResult] = field(default_factory=list)
    language: str = ""
    language_probability: float = 0.0
    duration_ms: int = 0


class TranscribeService:
    """Handles audio transcription via faster-whisper.

    The most recently loaded model is cached on the instance together with the
    model name, device and compute type it was loaded with, so consecutive
    requests using the same configuration reuse it instead of reloading.
    """

    def __init__(self) -> None:
        self._model: WhisperModel | None = None
        self._current_model_name: str = ""
        self._current_device: str = ""
        self._current_compute_type: str = ""

    @staticmethod
    def _to_ms(seconds: float) -> int:
        """Convert a time in seconds to the nearest whole millisecond.

        Rounds rather than truncates: ``2.01 * 1000`` evaluates to
        ``2009.9999999999998`` in binary floating point, so ``int()`` alone
        would report 2009 ms instead of 2010 ms.
        """
        # int() guarantees a plain Python int even if a float subclass
        # (e.g. a NumPy scalar) is passed in.
        return int(round(seconds * 1000))

    def _ensure_model(
        self,
        model_name: str = "base",
        device: str = "cpu",
        compute_type: str = "int8",
    ) -> WhisperModel:
        """Load or reuse the Whisper model.

        Args:
            model_name: Model identifier passed to ``WhisperModel``.
            device: Device passed to ``WhisperModel`` (e.g. ``"cpu"``).
            compute_type: Compute type passed to ``WhisperModel``.

        Returns:
            The cached model when all three settings match the previous load,
            otherwise a newly constructed one (which becomes the new cache).
        """
        requested = (model_name, device, compute_type)
        cached = (
            self._current_model_name,
            self._current_device,
            self._current_compute_type,
        )
        if self._model is not None and cached == requested:
            return self._model

        print(
            f"[sidecar] Loading model {model_name} on {device} ({compute_type})",
            file=sys.stderr,
            flush=True,
        )

        # Drop the previous model *before* constructing the replacement so the
        # two are never held in memory at the same time. Clearing the cached
        # settings keeps the state consistent if the load below raises.
        self._model = None
        self._current_model_name = ""
        self._current_device = ""
        self._current_compute_type = ""

        model = WhisperModel(
            model_name,
            device=device,
            compute_type=compute_type,
        )
        self._model = model
        self._current_model_name = model_name
        self._current_device = device
        self._current_compute_type = compute_type
        return model

    def transcribe(
        self,
        request_id: str,
        file_path: str,
        model_name: str = "base",
        device: str = "cpu",
        compute_type: str = "int8",
        language: str | None = None,
    ) -> TranscriptionResult:
        """Transcribe an audio file with word-level timestamps.

        Sends progress messages via IPC during processing: 0% when model
        loading starts, 10% when transcription starts, an update every fifth
        segment (scaled into 10-90% by the segment's end time relative to the
        audio duration), and 100% when finished.

        Args:
            request_id: Identifier attached to every progress message.
            file_path: Path of the audio file handed to the model.
            model_name: Model to load (see ``_ensure_model``).
            device: Device to load the model on.
            compute_type: Compute type to load the model with.
            language: Language code forwarded to the model. ``None`` or an
                empty string is forwarded as ``None``, which faster-whisper
                documents as "detect the language".

        Returns:
            The collected segments and words plus the language information and
            audio duration reported by the model.
        """
        # Stage: loading model
        write_message(progress_message(request_id, 0, "loading_model", f"Loading {model_name}..."))
        model = self._ensure_model(model_name, device, compute_type)

        # Stage: transcribing
        write_message(progress_message(request_id, 10, "transcribing", "Starting transcription..."))

        # perf_counter is monotonic, so the elapsed time cannot be skewed by
        # system clock adjustments.
        start_time = time.perf_counter()
        segments_iter, info = model.transcribe(
            file_path,
            language=language or None,
            word_timestamps=True,
            vad_filter=True,
        )

        result = TranscriptionResult(
            language=info.language,
            language_probability=info.language_probability,
            duration_ms=self._to_ms(info.duration),
        )

        # Guard against division by zero when the reported duration is 0.
        total_duration = info.duration if info.duration > 0 else 1.0
        segment_count = 0

        # Per the faster-whisper README the segments are produced lazily, so
        # the decoding work happens while this loop iterates.
        for segment_count, segment in enumerate(segments_iter, start=1):
            words = [
                WordResult(
                    word=w.word.strip(),
                    start_ms=self._to_ms(w.start),
                    end_ms=self._to_ms(w.end),
                    confidence=round(w.probability, 4),
                )
                for w in segment.words or ()
            ]

            result.segments.append(
                SegmentResult(
                    text=segment.text.strip(),
                    start_ms=self._to_ms(segment.start),
                    end_ms=self._to_ms(segment.end),
                    words=words,
                )
            )

            # Send progress every few segments
            if segment_count % 5 == 0:
                # Map position in the audio onto 10-90%; 100% is reserved for
                # the final "done" message.
                progress_pct = min(10 + int((segment.end / total_duration) * 80), 90)
                write_message(
                    progress_message(
                        request_id,
                        progress_pct,
                        "transcribing",
                        f"Processed {segment_count} segments...",
                    )
                )

        elapsed = time.perf_counter() - start_time
        print(
            f"[sidecar] Transcription complete: {segment_count} segments in {elapsed:.1f}s",
            file=sys.stderr,
            flush=True,
        )

        write_message(progress_message(request_id, 100, "done", "Transcription complete"))
        return result


def result_to_payload(result: TranscriptionResult) -> dict[str, Any]:
    """Flatten a TranscriptionResult into the plain dict sent over IPC.

    The returned mapping holds ``segments`` (one dict per segment, each with
    its ``text``, ``start_ms``, ``end_ms`` and a ``words`` list of per-word
    dicts), followed by ``language``, ``language_probability`` and
    ``duration_ms`` copied from ``result``.
    """

    def word_entry(item: Any) -> dict[str, Any]:
        # One word -> its dict form.
        return {
            "word": item.word,
            "start_ms": item.start_ms,
            "end_ms": item.end_ms,
            "confidence": item.confidence,
        }

    def segment_entry(item: Any) -> dict[str, Any]:
        # One segment -> its dict form, words nested inside.
        return {
            "text": item.text,
            "start_ms": item.start_ms,
            "end_ms": item.end_ms,
            "words": [word_entry(entry) for entry in item.words],
        }

    segment_entries = [segment_entry(entry) for entry in result.segments]
    return {
        "segments": segment_entries,
        "language": result.language,
        "language_probability": result.language_probability,
        "duration_ms": result.duration_ms,
    }