Phase 2: Core transcription pipeline and audio playback

- Implement faster-whisper TranscribeService with word-level timestamps, progress reporting, and hardware auto-detection
- Wire up Rust SidecarManager for Python process lifecycle (spawn, IPC, shutdown)
- Add transcribe_file Tauri command bridging frontend to Python sidecar
- Integrate wavesurfer.js WaveformPlayer with play/pause, skip, seek controls
- Build TranscriptEditor with word-level click-to-seek and active highlighting
- Connect file import flow: prompt → asset load → transcribe → display
- Add typed tauri-bridge service with TranscriptionResult interface
- Add Python tests for hardware detection and transcription result formatting

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-26 15:53:09 -08:00
parent 503cc6c0cf
commit 48fe41b064
18 changed files with 1775 additions and 32 deletions
--- a/python/voice_to_notes/hardware/detect.py
+++ b/python/voice_to_notes/hardware/detect.py
@@ -2,8 +2,74 @@

 from __future__ import annotations

-# TODO: Implement hardware detection
-# - Check torch.cuda.is_available()
-# - Detect VRAM size
-# - Detect CPU cores and available RAM
-# - Return recommended model configuration
+import os
+import sys
+from dataclasses import dataclass
+
+
+@dataclass
+class HardwareInfo:
+    """Detected hardware capabilities plus the model settings recommended for them."""
+
+    has_cuda: bool = False  # set when torch.cuda.is_available() returns True
+    cuda_device_name: str = ""  # name torch reports for CUDA device 0
+    vram_mb: int = 0  # memory size of CUDA device 0, bytes // (1024 * 1024)
+    ram_mb: int = 0  # MemTotal from /proc/meminfo (kB // 1024); 0 when not read
+    cpu_cores: int = 0  # os.cpu_count(), or 1 when that returns None
+    recommended_model: str = "base"  # model name chosen by detect_hardware()
+    recommended_device: str = "cpu"  # detect_hardware() sets "cpu" or "cuda"
+    recommended_compute_type: str = "int8"  # detect_hardware() always sets "int8"
+
+
+def detect_hardware() -> HardwareInfo:
+    """Detect available hardware and recommend a model configuration.
+
+    RAM is read from /proc/meminfo and stays 0 when that file is missing or
+    unreadable; CUDA details are queried through torch when it is importable.
+
+    Returns:
+        HardwareInfo holding the detected values and the recommended model,
+        device and compute type (every tier recommends "int8").
+    """
+    info = HardwareInfo()
+    info.cpu_cores = os.cpu_count() or 1
+
+    # Best effort: any read or parse failure leaves ram_mb at 0.
+    try:
+        with open("/proc/meminfo") as f:
+            for line in f:
+                if line.startswith("MemTotal:"):
+                    info.ram_mb = int(line.split()[1]) // 1024  # value is in kB
+                    break
+    except (OSError, ValueError, IndexError):
+        pass
+
+    # torch is optional; without it only the CPU tiers can be recommended.
+    try:
+        import torch
+    except ImportError:
+        print("[sidecar] torch not available, GPU detection skipped", file=sys.stderr, flush=True)
+    else:
+        if torch.cuda.is_available():
+            info.has_cuda = True
+            info.cuda_device_name = torch.cuda.get_device_name(0)
+            # The properties object exposes total_memory (bytes), not total_mem.
+            props = torch.cuda.get_device_properties(0)
+            info.vram_mb = props.total_memory // (1024 * 1024)
+
+    # A GPU with under 4000 MB of VRAM falls through to the RAM-based CPU tiers.
+    if info.has_cuda and info.vram_mb >= 8000:
+        model, device = "large-v3-turbo", "cuda"
+    elif info.has_cuda and info.vram_mb >= 4000:
+        model, device = "medium", "cuda"
+    elif info.ram_mb >= 16000:
+        model, device = "medium", "cpu"
+    elif info.ram_mb >= 8000:
+        model, device = "small", "cpu"
+    else:
+        model, device = "base", "cpu"
+    info.recommended_model = model
+    info.recommended_device = device
+    info.recommended_compute_type = "int8"
+
+    return info
--- a/python/voice_to_notes/ipc/handlers.py
+++ b/python/voice_to_notes/ipc/handlers.py
@@ -37,3 +37,49 @@ class HandlerRegistry:
 def ping_handler(msg: IPCMessage) -> IPCMessage:
    """Simple ping handler for testing connectivity."""
    return IPCMessage(id=msg.id, type="pong", payload={"echo": msg.payload})
+
+
+def make_transcribe_handler() -> HandlerFunc:
+    """Build the IPC handler for transcription requests, backed by one shared service."""
+    from voice_to_notes.services.transcribe import TranscribeService, result_to_payload
+
+    transcriber = TranscribeService()  # created once, reused by every request
+
+    def handle_transcribe(msg: IPCMessage) -> IPCMessage:
+        """Transcribe the file named in the payload and reply with transcribe.result."""
+        params = msg.payload
+        # "file" is looked up with [] so it must be present; other keys have fallbacks.
+        audio_path = params["file"]
+        transcription = transcriber.transcribe(
+            request_id=msg.id,
+            file_path=audio_path,
+            model_name=params.get("model", "base"),
+            device=params.get("device", "cpu"),
+            compute_type=params.get("compute_type", "int8"),
+            language=params.get("language"),
+        )
+        reply_payload = result_to_payload(transcription)
+        return IPCMessage(id=msg.id, type="transcribe.result", payload=reply_payload)
+
+    return handle_transcribe
+
+
+def hardware_detect_handler(msg: IPCMessage) -> IPCMessage:
+    """Run hardware detection and reply with a hardware.info message."""
+    from voice_to_notes.hardware.detect import detect_hardware
+
+    # Attributes copied by name, in this order, into the reply payload.
+    reported_fields = (
+        "has_cuda",
+        "cuda_device_name",
+        "vram_mb",
+        "ram_mb",
+        "cpu_cores",
+        "recommended_model",
+        "recommended_device",
+        "recommended_compute_type",
+    )
+
+    detected = detect_hardware()
+    summary = {name: getattr(detected, name) for name in reported_fields}
+    return IPCMessage(id=msg.id, type="hardware.info", payload=summary)
--- a/python/voice_to_notes/main.py
+++ b/python/voice_to_notes/main.py
@@ -5,7 +5,12 @@ from __future__ import annotations
 import signal
 import sys

-from voice_to_notes.ipc.handlers import HandlerRegistry, ping_handler
+from voice_to_notes.ipc.handlers import (
+    HandlerRegistry,
+    hardware_detect_handler,
+    make_transcribe_handler,
+    ping_handler,
+)
 from voice_to_notes.ipc.messages import ready_message
 from voice_to_notes.ipc.protocol import read_message, write_message

@@ -14,7 +19,9 @@ def create_registry() -> HandlerRegistry:
    """Set up the message handler registry."""
    registry = HandlerRegistry()
    registry.register("ping", ping_handler)
-    # TODO: Register transcribe, diarize, pipeline, ai, export handlers
+    registry.register("transcribe.start", make_transcribe_handler())
+    registry.register("hardware.detect", hardware_detect_handler)
+    # TODO: Register diarize, pipeline, ai, export handlers
    return registry


--- a/python/voice_to_notes/services/transcribe.py
+++ b/python/voice_to_notes/services/transcribe.py
@@ -1,13 +1,193 @@
-"""Transcription service — faster-whisper + wav2vec2 pipeline."""
+"""Transcription service — faster-whisper pipeline with word-level timestamps."""

 from __future__ import annotations

+import sys
+import time
+from dataclasses import dataclass, field
+from typing import Any
+
+from faster_whisper import WhisperModel
+
+from voice_to_notes.ipc.messages import progress_message
+from voice_to_notes.ipc.protocol import write_message
+
+
+@dataclass
+class WordResult:
+    """A single transcribed word with its timing and confidence."""
+
+    word: str  # word text; TranscribeService.transcribe() stores it stripped
+    start_ms: int  # start time in milliseconds
+    end_ms: int  # end time in milliseconds
+    confidence: float  # transcribe() stores the word probability rounded to 4 places
+
+
+@dataclass
+class SegmentResult:
+    """A transcription segment together with the words it contains."""
+
+    text: str  # segment text; TranscribeService.transcribe() stores it stripped
+    start_ms: int  # segment start in milliseconds
+    end_ms: int  # segment end in milliseconds
+    words: list[WordResult] = field(default_factory=list)  # empty when no words were given
+
+
+@dataclass
+class TranscriptionResult:
+    """Full transcription output: all segments plus language and duration metadata."""
+
+    segments: list[SegmentResult] = field(default_factory=list)  # appended in iteration order
+    language: str = ""  # language taken from the info returned by model.transcribe()
+    language_probability: float = 0.0  # probability reported alongside the language
+    duration_ms: int = 0  # reported duration (info.duration * 1000, truncated to int)
+

 class TranscribeService:
    """Handles audio transcription via faster-whisper."""

-    # TODO: Implement faster-whisper integration
-    # - Load model based on hardware detection
-    # - Transcribe audio with word-level timestamps
-    # - Report progress via IPC
-    pass
+    def __init__(self) -> None:
+        """Start with no model loaded; _ensure_model() fills these in on first use."""
+        self._model: WhisperModel | None = None
+        # Settings the cached model was loaded with, compared by _ensure_model().
+        self._current_model_name = self._current_device = self._current_compute_type = ""
+
+    def _ensure_model(
+        self,
+        model_name: str = "base",
+        device: str = "cpu",
+        compute_type: str = "int8",
+    ) -> WhisperModel:
+        """Return a WhisperModel for the requested settings, loading it only when needed.
+
+        The cached model is reused when its name, device and compute type all match
+        the request; otherwise a new WhisperModel is built and replaces the cache.
+        The three arguments are passed to WhisperModel unchanged.
+        """
+        requested = (model_name, device, compute_type)
+        cached = (self._current_model_name, self._current_device, self._current_compute_type)
+        if self._model is not None and cached == requested:
+            return self._model
+
+        # Announce the load on stderr before constructing the replacement model.
+        print(
+            f"[sidecar] Loading model {model_name} on {device} ({compute_type})",
+            file=sys.stderr,
+            flush=True,
+        )
+
+        loaded = WhisperModel(model_name, device=device, compute_type=compute_type)
+        # The cache is only updated once construction has succeeded.
+        self._model = loaded
+        self._current_model_name, self._current_device, self._current_compute_type = requested
+        return loaded
+
+    def transcribe(
+        self,
+        request_id: str,
+        file_path: str,
+        model_name: str = "base",
+        device: str = "cpu",
+        compute_type: str = "int8",
+        language: str | None = None,
+    ) -> TranscriptionResult:
+        """Transcribe an audio file with word-level timestamps.
+
+        Progress messages are written via IPC at 0% (loading the model), 10%
+        (starting), after every fifth segment (capped at 90%) and at 100% (done).
+
+        Args:
+            request_id: Identifier attached to every progress message.
+            file_path: Audio file handed to the model's transcribe() call.
+            model_name, device, compute_type: Forwarded to _ensure_model().
+            language: Forwarded to the model unchanged, including None.
+
+        Returns:
+            TranscriptionResult whose segment and word times are in milliseconds.
+        """
+        # Stage: loading model
+        write_message(progress_message(request_id, 0, "loading_model", f"Loading {model_name}..."))
+        model = self._ensure_model(model_name, device, compute_type)
+
+        # Stage: transcribing
+        write_message(progress_message(request_id, 10, "transcribing", "Starting transcription..."))
+
+        # Monotonic clock so wall-clock adjustments cannot skew the logged duration.
+        start_time = time.monotonic()
+        segments_iter, info = model.transcribe(
+            file_path,
+            language=language,
+            word_timestamps=True,
+            vad_filter=True,
+        )
+
+        result = TranscriptionResult(
+            language=info.language,
+            language_probability=info.language_probability,
+            duration_ms=int(info.duration * 1000),
+        )
+
+        # Fall back to 1.0 so the progress division never sees a non-positive duration.
+        total_duration = info.duration if info.duration > 0 else 1.0
+        segment_count = 0  # stays 0 when the iterator yields nothing
+
+        for segment_count, segment in enumerate(segments_iter, start=1):
+            words = [
+                WordResult(
+                    word=w.word.strip(),
+                    start_ms=int(w.start * 1000),
+                    end_ms=int(w.end * 1000),
+                    confidence=round(w.probability, 4),
+                )
+                for w in segment.words or ()
+            ]
+            result.segments.append(
+                SegmentResult(
+                    text=segment.text.strip(),
+                    start_ms=int(segment.start * 1000),
+                    end_ms=int(segment.end * 1000),
+                    words=words,
+                )
+            )
+
+            # Send progress every few segments; only then is the percentage needed.
+            if segment_count % 5 == 0:
+                progress_pct = min(10 + int((segment.end / total_duration) * 80), 90)
+                detail = f"Processed {segment_count} segments..."
+                write_message(progress_message(request_id, progress_pct, "transcribing", detail))
+
+        elapsed = time.monotonic() - start_time
+        print(
+            f"[sidecar] Transcription complete: {segment_count} segments in {elapsed:.1f}s",
+            file=sys.stderr,
+            flush=True,
+        )
+
+        write_message(progress_message(request_id, 100, "done", "Transcription complete"))
+        return result
+
+
+def result_to_payload(result: TranscriptionResult) -> dict[str, Any]:
+    """Flatten a TranscriptionResult into plain dicts and lists for the IPC payload."""
+    def word_entry(w):
+        return {
+            "word": w.word,
+            "start_ms": w.start_ms,
+            "end_ms": w.end_ms,
+            "confidence": w.confidence,
+        }
+
+    def segment_entry(seg):
+        return {
+            "text": seg.text,
+            "start_ms": seg.start_ms,
+            "end_ms": seg.end_ms,
+            "words": list(map(word_entry, seg.words)),
+        }
+
+    return {
+        "segments": [segment_entry(seg) for seg in result.segments],
+        "language": result.language,
+        "language_probability": result.language_probability,
+        "duration_ms": result.duration_ms,
+    }