Phase 2: Core transcription pipeline and audio playback
- Implement faster-whisper TranscribeService with word-level timestamps, progress reporting, and hardware auto-detection
- Wire up Rust SidecarManager for Python process lifecycle (spawn, IPC, shutdown)
- Add transcribe_file Tauri command bridging frontend to Python sidecar
- Integrate wavesurfer.js WaveformPlayer with play/pause, skip, seek controls
- Build TranscriptEditor with word-level click-to-seek and active highlighting
- Connect file import flow: prompt → asset load → transcribe → display
- Add typed tauri-bridge service with TranscriptionResult interface
- Add Python tests for hardware detection and transcription result formatting

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -2,8 +2,74 @@
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
# TODO: Implement hardware detection
|
||||
# - Check torch.cuda.is_available()
|
||||
# - Detect VRAM size
|
||||
# - Detect CPU cores and available RAM
|
||||
# - Return recommended model configuration
|
||||
import os
|
||||
import sys
|
||||
from dataclasses import dataclass
|
||||
|
||||
|
||||
@dataclass
class HardwareInfo:
    """Host capabilities plus the transcription setup suggested for them.

    All fields carry defaults, so a bare ``HardwareInfo()`` describes a
    machine with nothing detected and the most conservative recommendation
    (``base`` model on ``cpu`` with ``int8``).
    """

    # --- Detected capabilities -------------------------------------------
    has_cuda: bool = False  # whether a CUDA device was reported
    cuda_device_name: str = ""  # name of the CUDA device, empty if none
    vram_mb: int = 0  # GPU memory in MB, 0 when unknown
    ram_mb: int = 0  # system memory in MB, 0 when unknown
    cpu_cores: int = 0  # CPU core count, 0 until detected

    # --- Recommendation derived from the values above ----------------------
    recommended_model: str = "base"
    recommended_device: str = "cpu"
    recommended_compute_type: str = "int8"
|
||||
|
||||
|
||||
def detect_hardware() -> HardwareInfo:
    """Detect available hardware and recommend model configuration.

    Probes the CPU core count, total system RAM and (when ``torch`` can be
    imported) the first CUDA device, then chooses a model name, device and
    compute type from fixed VRAM/RAM thresholds.

    Returns:
        A populated ``HardwareInfo``. Anything that could not be detected
        keeps its dataclass default, which steers the recommendation toward
        the smallest CPU configuration.
    """
    info = HardwareInfo()

    # CPU info -- os.cpu_count() may return None, so fall back to one core.
    info.cpu_cores = os.cpu_count() or 1

    # RAM info -- primary source is /proc/meminfo.
    try:
        with open("/proc/meminfo", encoding="utf-8") as f:
            for line in f:
                if line.startswith("MemTotal:"):
                    # Value is in kB
                    info.ram_mb = int(line.split()[1]) // 1024
                    break
    except (OSError, ValueError, IndexError):
        # Missing or unreadable file, or a malformed MemTotal line. This is a
        # best-effort probe, so fall through to the sysconf fallback below.
        pass

    # Fallback for POSIX systems where /proc/meminfo gave nothing.
    if info.ram_mb == 0 and hasattr(os, "sysconf"):
        try:
            total_bytes = os.sysconf("SC_PHYS_PAGES") * os.sysconf("SC_PAGE_SIZE")
        except (ValueError, OSError):
            # Name not known to this platform, or value not obtainable.
            total_bytes = 0
        if total_bytes > 0:
            info.ram_mb = total_bytes // (1024 * 1024)

    # CUDA detection -- torch is optional, so only the import is guarded.
    try:
        import torch
    except ImportError:
        print("[sidecar] torch not available, GPU detection skipped", file=sys.stderr, flush=True)
    else:
        if torch.cuda.is_available():
            info.has_cuda = True
            info.cuda_device_name = torch.cuda.get_device_name(0)
            # The device-properties attribute is ``total_memory`` (bytes);
            # ``total_mem`` does not exist and raised AttributeError.
            total_vram_bytes = torch.cuda.get_device_properties(0).total_memory
            info.vram_mb = total_vram_bytes // (1024 * 1024)

    # Model recommendation based on hardware: GPU tiers first, then CPU tiers
    # by available RAM, finally the smallest model as a safe default.
    if info.has_cuda and info.vram_mb >= 8000:
        info.recommended_model = "large-v3-turbo"
        info.recommended_device = "cuda"
        info.recommended_compute_type = "int8"
    elif info.has_cuda and info.vram_mb >= 4000:
        info.recommended_model = "medium"
        info.recommended_device = "cuda"
        info.recommended_compute_type = "int8"
    elif info.ram_mb >= 16000:
        info.recommended_model = "medium"
        info.recommended_device = "cpu"
        info.recommended_compute_type = "int8"
    elif info.ram_mb >= 8000:
        info.recommended_model = "small"
        info.recommended_device = "cpu"
        info.recommended_compute_type = "int8"
    else:
        info.recommended_model = "base"
        info.recommended_device = "cpu"
        info.recommended_compute_type = "int8"

    return info
|
||||
|
||||
@@ -37,3 +37,49 @@ class HandlerRegistry:
|
||||
def ping_handler(msg: IPCMessage) -> IPCMessage:
    """Answer a ping with a ``pong`` message, for connectivity checks.

    The reply reuses the request's id and carries the request payload back
    under the ``"echo"`` key.
    """
    echoed = {"echo": msg.payload}
    return IPCMessage(id=msg.id, type="pong", payload=echoed)
|
||||
|
||||
|
||||
def make_transcribe_handler() -> HandlerFunc:
    """Build a transcription handler bound to one long-lived TranscribeService.

    The service is created once here and captured by the returned closure,
    so every message handled by it goes through the same service instance.
    """
    from voice_to_notes.services.transcribe import TranscribeService, result_to_payload

    service = TranscribeService()

    def handler(msg: IPCMessage) -> IPCMessage:
        """Run one transcription request and wrap the result in a reply."""
        request = msg.payload

        # "file" is the only required key; a missing key raises KeyError.
        audio_path = request["file"]
        options = {
            "model_name": request.get("model", "base"),
            "device": request.get("device", "cpu"),
            "compute_type": request.get("compute_type", "int8"),
            "language": request.get("language"),
        }

        transcription = service.transcribe(
            request_id=msg.id,
            file_path=audio_path,
            **options,
        )
        reply_payload = result_to_payload(transcription)
        return IPCMessage(id=msg.id, type="transcribe.result", payload=reply_payload)

    return handler
|
||||
|
||||
|
||||
def hardware_detect_handler(msg: IPCMessage) -> IPCMessage:
    """Run hardware detection and reply with a ``hardware.info`` message.

    The reply reuses the request's id; its payload holds the detected
    capabilities and the recommended model settings as a flat dict.
    """
    from voice_to_notes.hardware.detect import detect_hardware

    detected = detect_hardware()

    # Attributes copied verbatim into the payload, in this key order.
    exported_fields = (
        "has_cuda",
        "cuda_device_name",
        "vram_mb",
        "ram_mb",
        "cpu_cores",
        "recommended_model",
        "recommended_device",
        "recommended_compute_type",
    )
    payload = {name: getattr(detected, name) for name in exported_fields}

    return IPCMessage(id=msg.id, type="hardware.info", payload=payload)
|
||||
|
||||
@@ -5,7 +5,12 @@ from __future__ import annotations
|
||||
import signal
|
||||
import sys
|
||||
|
||||
from voice_to_notes.ipc.handlers import HandlerRegistry, ping_handler
|
||||
from voice_to_notes.ipc.handlers import (
|
||||
HandlerRegistry,
|
||||
hardware_detect_handler,
|
||||
make_transcribe_handler,
|
||||
ping_handler,
|
||||
)
|
||||
from voice_to_notes.ipc.messages import ready_message
|
||||
from voice_to_notes.ipc.protocol import read_message, write_message
|
||||
|
||||
@@ -14,7 +19,9 @@ def create_registry() -> HandlerRegistry:
|
||||
"""Set up the message handler registry."""
|
||||
registry = HandlerRegistry()
|
||||
registry.register("ping", ping_handler)
|
||||
# TODO: Register transcribe, diarize, pipeline, ai, export handlers
|
||||
registry.register("transcribe.start", make_transcribe_handler())
|
||||
registry.register("hardware.detect", hardware_detect_handler)
|
||||
# TODO: Register diarize, pipeline, ai, export handlers
|
||||
return registry
|
||||
|
||||
|
||||
|
||||
@@ -1,13 +1,193 @@
|
||||
"""Transcription service — faster-whisper + wav2vec2 pipeline."""
|
||||
"""Transcription service — faster-whisper pipeline with word-level timestamps."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import sys
|
||||
import time
|
||||
from dataclasses import dataclass, field
|
||||
from typing import Any
|
||||
|
||||
from faster_whisper import WhisperModel
|
||||
|
||||
from voice_to_notes.ipc.messages import progress_message
|
||||
from voice_to_notes.ipc.protocol import write_message
|
||||
|
||||
|
||||
@dataclass
class WordResult:
    """One recognised word together with its timing and confidence."""

    word: str  # the word text
    start_ms: int  # start offset in milliseconds
    end_ms: int  # end offset in milliseconds
    confidence: float  # confidence score reported for the word
|
||||
|
||||
|
||||
@dataclass
class SegmentResult:
    """A contiguous stretch of transcribed speech and the words inside it."""

    text: str  # full text of the segment
    start_ms: int  # start offset in milliseconds
    end_ms: int  # end offset in milliseconds
    # Per-word results; each instance gets its own fresh list.
    words: list[WordResult] = field(default_factory=list)
|
||||
|
||||
|
||||
@dataclass
class TranscriptionResult:
    """Everything produced by one transcription run."""

    # Segments in the order they were produced; fresh list per instance.
    segments: list[SegmentResult] = field(default_factory=list)
    language: str = ""  # language reported for the audio
    language_probability: float = 0.0  # probability reported for that language
    duration_ms: int = 0  # audio duration in milliseconds
|
||||
|
||||
|
||||
class TranscribeService:
    """Handles audio transcription via faster-whisper.

    One ``WhisperModel`` is cached on the instance and reused as long as
    consecutive requests ask for the same model name, device and compute
    type; a different combination triggers a reload.
    """

    def __init__(self) -> None:
        """Start with no model loaded; the first request triggers the load."""
        self._model: WhisperModel | None = None
        self._current_model_name: str = ""
        self._current_device: str = ""
        self._current_compute_type: str = ""

    def _ensure_model(
        self,
        model_name: str = "base",
        device: str = "cpu",
        compute_type: str = "int8",
    ) -> WhisperModel:
        """Load or reuse the Whisper model.

        Args:
            model_name: Model identifier passed to ``WhisperModel``.
            device: Device string passed to ``WhisperModel``.
            compute_type: Compute type passed to ``WhisperModel``.

        Returns:
            The cached model when all three settings match the loaded one,
            otherwise a newly constructed model (which becomes the cache).
        """
        if (
            self._model is not None
            and self._current_model_name == model_name
            and self._current_device == device
            and self._current_compute_type == compute_type
        ):
            return self._model

        print(
            f"[sidecar] Loading model {model_name} on {device} ({compute_type})",
            file=sys.stderr,
            flush=True,
        )
        # Drop our reference to the previous model before building the next
        # one so the old and new models need not be resident at the same
        # time. If the load below raises, ``_model`` stays None and the next
        # call simply retries the load.
        self._model = None
        self._model = WhisperModel(
            model_name,
            device=device,
            compute_type=compute_type,
        )
        self._current_model_name = model_name
        self._current_device = device
        self._current_compute_type = compute_type
        return self._model

    def transcribe(
        self,
        request_id: str,
        file_path: str,
        model_name: str = "base",
        device: str = "cpu",
        compute_type: str = "int8",
        language: str | None = None,
    ) -> TranscriptionResult:
        """Transcribe an audio file with word-level timestamps.

        Sends progress messages via IPC during processing: 0% while loading
        the model, 10% when transcription starts, an update every fifth
        segment (capped at 90%), and 100% when done.

        Args:
            request_id: Id attached to every progress message.
            file_path: Path of the audio file to transcribe.
            model_name: Model to load or reuse.
            device: Device to run the model on.
            compute_type: Compute type for the model.
            language: Language passed to the model; ``None`` is forwarded
                unchanged.

        Returns:
            A ``TranscriptionResult`` with all segments, their words, the
            reported language and the audio duration in milliseconds.
        """
        # Stage: loading model
        write_message(progress_message(request_id, 0, "loading_model", f"Loading {model_name}..."))
        model = self._ensure_model(model_name, device, compute_type)

        # Stage: transcribing
        write_message(progress_message(request_id, 10, "transcribing", "Starting transcription..."))

        # Monotonic clock: elapsed time must not be skewed by wall-clock changes.
        start_time = time.monotonic()
        segments_iter, info = model.transcribe(
            file_path,
            language=language,
            word_timestamps=True,
            vad_filter=True,
        )

        result = TranscriptionResult(
            language=info.language,
            language_probability=info.language_probability,
            duration_ms=int(info.duration * 1000),
        )

        # Guard against division by zero for empty / zero-length audio.
        total_duration = info.duration if info.duration > 0 else 1.0
        segment_count = 0

        # Process segments with progress reporting
        for segment in segments_iter:
            segment_count += 1

            # ``segment.words`` may be empty or None; treat both as no words.
            words = [
                WordResult(
                    word=w.word.strip(),
                    start_ms=int(w.start * 1000),
                    end_ms=int(w.end * 1000),
                    confidence=round(w.probability, 4),
                )
                for w in (segment.words or [])
            ]

            result.segments.append(
                SegmentResult(
                    text=segment.text.strip(),
                    start_ms=int(segment.start * 1000),
                    end_ms=int(segment.end * 1000),
                    words=words,
                )
            )

            # Send progress every few segments
            if segment_count % 5 == 0:
                # Map audio position onto the 10-90% band reserved for this stage.
                progress_pct = min(10 + int((segment.end / total_duration) * 80), 90)
                write_message(
                    progress_message(
                        request_id,
                        progress_pct,
                        "transcribing",
                        f"Processed {segment_count} segments...",
                    )
                )

        elapsed = time.monotonic() - start_time
        print(
            f"[sidecar] Transcription complete: {segment_count} segments in {elapsed:.1f}s",
            file=sys.stderr,
            flush=True,
        )

        write_message(progress_message(request_id, 100, "done", "Transcription complete"))
        return result
|
||||
|
||||
|
||||
def result_to_payload(result: TranscriptionResult) -> dict[str, Any]:
    """Flatten a TranscriptionResult into plain dicts and lists for IPC.

    Returns a dict with ``segments`` (each holding ``text``, ``start_ms``,
    ``end_ms`` and a ``words`` list of ``word``/``start_ms``/``end_ms``/
    ``confidence`` dicts), followed by ``language``,
    ``language_probability`` and ``duration_ms``.
    """
    segment_dicts = []
    for segment in result.segments:
        word_dicts = []
        for word in segment.words:
            word_dicts.append(
                {
                    "word": word.word,
                    "start_ms": word.start_ms,
                    "end_ms": word.end_ms,
                    "confidence": word.confidence,
                }
            )
        segment_dicts.append(
            {
                "text": segment.text,
                "start_ms": segment.start_ms,
                "end_ms": segment.end_ms,
                "words": word_dicts,
            }
        )

    payload = {
        "segments": segment_dicts,
        "language": result.language,
        "language_probability": result.language_probability,
        "duration_ms": result.duration_ms,
    }
    return payload
|
||||
|
||||
Reference in New Issue
Block a user