"""Transcription service — faster-whisper pipeline with word-level timestamps."""
from __future__ import annotations
import sys
import time
from dataclasses import dataclass, field
from typing import Any
from faster_whisper import WhisperModel
from voice_to_notes.ipc.messages import progress_message
from voice_to_notes.ipc.protocol import write_message
@dataclass
class WordResult:
    """A single word with timestamp."""

    word: str  # word text; producer strips surrounding whitespace
    start_ms: int  # word start, milliseconds from the start of the audio
    end_ms: int  # word end, milliseconds
    confidence: float  # model probability for this word (rounded to 4 places by the producer)
@dataclass
class SegmentResult:
    """A transcription segment with words."""

    text: str  # segment text; producer strips surrounding whitespace
    start_ms: int  # segment start, milliseconds from the start of the audio
    end_ms: int  # segment end, milliseconds
    words: list[WordResult] = field(default_factory=list)  # word-level timings; empty if unavailable
@dataclass
class TranscriptionResult:
    """Full transcription output."""

    segments: list[SegmentResult] = field(default_factory=list)  # ordered transcript segments
    language: str = ""  # detected (or forced) language code
    language_probability: float = 0.0  # confidence of the language detection
    duration_ms: int = 0  # total audio duration, milliseconds
class TranscribeService:
    """Handles audio transcription via faster-whisper.

    The Whisper model is loaded lazily and cached; it is reused across calls
    as long as the requested (model name, device, compute type) triple is
    unchanged. Progress is reported over the JSON-line IPC channel via
    ``write_message``; human-readable logs go to stderr so stdout stays a
    clean IPC stream.
    """

    def __init__(self) -> None:
        self._model: WhisperModel | None = None
        self._current_model_name: str = ""
        self._current_device: str = ""
        self._current_compute_type: str = ""

    def _ensure_model(
        self,
        model_name: str = "base",
        device: str = "cpu",
        compute_type: str = "int8",
    ) -> WhisperModel:
        """Load or reuse the Whisper model.

        Returns the cached model when all three load parameters match the
        previous load; otherwise loads a fresh model and replaces the cache.
        """
        if (
            self._model is not None
            and self._current_model_name == model_name
            and self._current_device == device
            and self._current_compute_type == compute_type
        ):
            return self._model
        # Log to stderr: stdout is reserved for the JSON-line IPC protocol.
        print(
            f"[sidecar] Loading model {model_name} on {device} ({compute_type})",
            file=sys.stderr,
            flush=True,
        )
        self._model = WhisperModel(
            model_name,
            device=device,
            compute_type=compute_type,
        )
        self._current_model_name = model_name
        self._current_device = device
        self._current_compute_type = compute_type
        return self._model

    @staticmethod
    def _segment_to_result(segment: Any) -> SegmentResult:
        """Convert one faster-whisper segment into a SegmentResult.

        Word entries are included only when the segment carries word
        timestamps (``segment.words`` may be None or empty otherwise).
        Times are converted from float seconds to integer milliseconds.
        """
        words = [
            WordResult(
                word=w.word.strip(),
                start_ms=int(w.start * 1000),
                end_ms=int(w.end * 1000),
                confidence=round(w.probability, 4),
            )
            for w in (segment.words or [])
        ]
        return SegmentResult(
            text=segment.text.strip(),
            start_ms=int(segment.start * 1000),
            end_ms=int(segment.end * 1000),
            words=words,
        )

    def transcribe(
        self,
        request_id: str,
        file_path: str,
        model_name: str = "base",
        device: str = "cpu",
        compute_type: str = "int8",
        language: str | None = None,
    ) -> TranscriptionResult:
        """Transcribe an audio file with word-level timestamps.

        Sends progress messages via IPC during processing.

        Args:
            request_id: IPC correlation id used on progress messages.
            file_path: Path to the audio file to transcribe.
            model_name: faster-whisper model size/name (e.g. "base").
            device: Inference device passed to faster-whisper (e.g. "cpu").
            compute_type: Quantization/compute type (e.g. "int8").
            language: Language code to force, or None to auto-detect.

        Returns:
            TranscriptionResult with segments, word timings, the detected
            language, and the audio duration.
        """
        # Stage: loading model (0%).
        write_message(progress_message(request_id, 0, "loading_model", f"Loading {model_name}..."))
        model = self._ensure_model(model_name, device, compute_type)

        # Stage: transcribing (10%..90%). The returned iterator is lazy —
        # decoding actually happens as it is consumed below.
        write_message(progress_message(request_id, 10, "transcribing", "Starting transcription..."))
        start_time = time.time()
        segments_iter, info = model.transcribe(
            file_path,
            language=language,
            word_timestamps=True,
            vad_filter=True,
        )
        result = TranscriptionResult(
            language=info.language,
            language_probability=info.language_probability,
            duration_ms=int(info.duration * 1000),
        )

        # Guard against a zero duration so the progress math never divides by 0.
        total_duration = info.duration if info.duration > 0 else 1.0
        segment_count = 0
        for segment in segments_iter:
            segment_count += 1
            result.segments.append(self._segment_to_result(segment))
            # Report progress every 5 segments, mapping the segment's end
            # time into the 10-90% band (capped at 90).
            if segment_count % 5 == 0:
                progress_pct = min(10 + int((segment.end / total_duration) * 80), 90)
                write_message(
                    progress_message(
                        request_id,
                        progress_pct,
                        "transcribing",
                        f"Processed {segment_count} segments...",
                    )
                )

        elapsed = time.time() - start_time
        # stderr keeps human-readable logs out of the stdout IPC stream.
        print(
            f"[sidecar] Transcription complete: {segment_count} segments in {elapsed:.1f}s",
            file=sys.stderr,
            flush=True,
        )
        write_message(progress_message(request_id, 100, "done", "Transcription complete"))
        return result
def result_to_payload(result: TranscriptionResult) -> dict[str, Any]:
    """Convert TranscriptionResult to IPC payload dict."""

    def word_payload(word):
        # One word with its timing and confidence.
        return {
            "word": word.word,
            "start_ms": word.start_ms,
            "end_ms": word.end_ms,
            "confidence": word.confidence,
        }

    def segment_payload(segment):
        # One segment plus its word-level entries.
        return {
            "text": segment.text,
            "start_ms": segment.start_ms,
            "end_ms": segment.end_ms,
            "words": [word_payload(word) for word in segment.words],
        }

    return {
        "segments": [segment_payload(segment) for segment in result.segments],
        "language": result.language,
        "language_probability": result.language_probability,
        "duration_ms": result.duration_ms,
    }