Phase 1 Complete - Standalone Desktop Application Features: - Real-time speech-to-text with Whisper (faster-whisper) - PySide6 desktop GUI with settings dialog - Web server for OBS browser source integration - Audio capture with automatic sample rate detection and resampling - Noise suppression with Voice Activity Detection (VAD) - Configurable display settings (font, timestamps, fade duration) - Settings apply without restart (with automatic model reloading) - Auto-fade for web display transcriptions - CPU/GPU support with automatic device detection - Standalone executable builds (PyInstaller) - CUDA build support (works on systems without CUDA hardware) Components: - Audio capture with sounddevice - Noise reduction with noisereduce + webrtcvad - Transcription with faster-whisper - GUI with PySide6 - Web server with FastAPI + WebSocket - Configuration system with YAML Build System: - Standard builds (CPU-only): build.sh / build.bat - CUDA builds (universal): build-cuda.sh / build-cuda.bat - Comprehensive BUILD.md documentation - Cross-platform support (Linux, Windows) Documentation: - README.md with project overview and quick start - BUILD.md with detailed build instructions - NEXT_STEPS.md with future enhancement roadmap - INSTALL.md with setup instructions 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude <noreply@anthropic.com>
233 lines
6.8 KiB
Python
233 lines
6.8 KiB
Python
"""Transcription engine using faster-whisper for speech-to-text."""
|
|
|
|
import numpy as np
|
|
from faster_whisper import WhisperModel
|
|
from typing import Optional, List, Tuple
|
|
from datetime import datetime
|
|
import threading
|
|
|
|
|
|
class TranscriptionResult:
    """One transcribed utterance together with its metadata."""

    def __init__(self, text: str, confidence: float, timestamp: datetime, user_name: str = ""):
        """
        Store a transcription and the metadata that accompanies it.

        Args:
            text: Transcribed text; leading/trailing whitespace is stripped
            confidence: Confidence score (0.0 to 1.0)
            timestamp: Timestamp of transcription
            user_name: Name of the user/speaker (empty string for none)
        """
        cleaned = text.strip()
        self.text = cleaned
        self.confidence, self.timestamp = confidence, timestamp
        self.user_name = user_name

    def __repr__(self) -> str:
        # Rendered as "[HH:MM:SS] speaker: text"; the "speaker: " part is
        # left out entirely when no user name was supplied.
        clock = self.timestamp.strftime("%H:%M:%S")
        speaker_prefix = f"{self.user_name}: " if self.user_name else ""
        return f"[{clock}] {speaker_prefix}{self.text}"

    def to_dict(self) -> dict:
        """Return the result as a plain dict with an ISO-8601 timestamp string."""
        payload = {}
        payload['text'] = self.text
        payload['confidence'] = self.confidence
        payload['timestamp'] = self.timestamp.isoformat()
        payload['user_name'] = self.user_name
        return payload
|
|
|
|
|
|
class TranscriptionEngine:
    """Handles speech-to-text transcription using faster-whisper.

    Every operation that touches the underlying model object (loading,
    unloading and running inference) is performed while holding
    ``model_lock``.
    """

    def __init__(
        self,
        model_size: str = "base",
        device: str = "cpu",
        compute_type: str = "int8",
        language: str = "en",
        min_confidence: float = 0.5
    ):
        """
        Initialize transcription engine.

        The model itself is not loaded here; call ``load_model()`` first.

        Args:
            model_size: Whisper model size ('tiny', 'base', 'small', 'medium', 'large')
            device: Device to use ('cpu', 'cuda', 'auto')
            compute_type: Compute type ('int8', 'float16', 'float32')
            language: Language code for transcription, or 'auto'
            min_confidence: Minimum confidence threshold for transcriptions
        """
        self.model_size = model_size
        self.device = device
        self.compute_type = compute_type
        self.language = language
        self.min_confidence = min_confidence
        self.model: Optional[WhisperModel] = None
        self.model_lock = threading.Lock()
        self.is_loaded = False

    def load_model(self) -> bool:
        """
        Load the Whisper model described by the current settings.

        Returns:
            True if model loaded successfully, False otherwise
        """
        try:
            print(f"Loading Whisper {self.model_size} model on {self.device}...")

            with self.model_lock:
                self.model = WhisperModel(
                    self.model_size,
                    device=self.device,
                    compute_type=self.compute_type
                )
                self.is_loaded = True

            print("Model loaded successfully!")
            return True

        except Exception as e:
            # Top-level boundary: any failure is reported and turned into a
            # False return value rather than propagated.
            print(f"Error loading model: {e}")
            self.is_loaded = False
            return False

    def transcribe(
        self,
        audio: np.ndarray,
        sample_rate: int = 16000,
        user_name: str = ""
    ) -> "Optional[TranscriptionResult]":
        """
        Transcribe audio to text.

        Args:
            audio: Audio data as numpy array (converted to float32 if needed)
            sample_rate: Audio sample rate in Hz. Currently not used by this
                method; the samples are handed to the model unchanged.
                NOTE(review): the model presumably expects 16 kHz input -
                confirm that callers resample before calling.
            user_name: Name of the user/speaker

        Returns:
            TranscriptionResult, or None if the model is not loaded, the
            audio is empty, no speech was found, transcription failed, or
            the confidence is below ``min_confidence``
        """
        if not self.is_loaded or self.model is None:
            print("Model not loaded")
            return None

        try:
            # Ensure audio is a float32 array
            samples = np.asarray(audio, dtype=np.float32)
            if samples.size == 0:
                return None

            with self.model_lock:
                # Re-check under the lock: the model may have been unloaded
                # between the fast-path check above and acquiring the lock.
                model = self.model
                if model is None:
                    print("Model not loaded")
                    return None

                segments, _info = model.transcribe(
                    samples,
                    language=self.language if self.language != "auto" else None,
                    vad_filter=True,  # Use built-in VAD
                    vad_parameters=dict(
                        min_silence_duration_ms=500
                    )
                )

                # faster-whisper returns a lazy generator: decoding only runs
                # while it is iterated, so it must be drained while the lock
                # is still held.
                collected = [(seg.text, seg.avg_logprob) for seg in segments]

            if not collected:
                return None

            # Convert the mean log probability to an approximate confidence.
            # Log probabilities are <= 0, so exp() maps them into (0, 1].
            mean_logprob = sum(logprob for _, logprob in collected) / len(collected)
            avg_confidence = float(np.exp(mean_logprob))

            # Filter by minimum confidence
            if avg_confidence < self.min_confidence:
                return None

            text = " ".join(piece for piece, _ in collected).strip()
            if not text:
                return None

            return TranscriptionResult(
                text=text,
                confidence=avg_confidence,
                timestamp=datetime.now(),
                user_name=user_name
            )

        except Exception as e:
            # Best-effort API: failures are reported and mapped to None.
            print(f"Error during transcription: {e}")
            return None

    def change_model(self, model_size: str) -> bool:
        """
        Change to a different model size and reload.

        The current model is dropped before the new one is loaded, so the
        engine is left unloaded if loading fails.

        Args:
            model_size: New model size

        Returns:
            True if model changed successfully
        """
        self.model_size = model_size
        # Drop the old model under the lock, like unload_model() does.
        self.unload_model()
        return self.load_model()

    def change_device(self, device: str, compute_type: Optional[str] = None) -> bool:
        """
        Change compute device and reload the model.

        Args:
            device: New device ('cpu', 'cuda', etc.)
            compute_type: Optional new compute type; the current one is kept
                when this is None or empty

        Returns:
            True if device changed successfully
        """
        self.device = device
        if compute_type:
            self.compute_type = compute_type

        # Drop the old model under the lock, like unload_model() does.
        self.unload_model()
        return self.load_model()

    def change_language(self, language: str):
        """
        Change transcription language.

        Takes effect on the next ``transcribe()`` call; no reload is done.

        Args:
            language: Language code or 'auto'
        """
        self.language = language

    def unload_model(self):
        """Unload the model from memory."""
        with self.model_lock:
            self.model = None
            self.is_loaded = False

    def __repr__(self) -> str:
        return f"TranscriptionEngine(model={self.model_size}, device={self.device}, loaded={self.is_loaded})"

    def __del__(self):
        """Cleanup when object is destroyed."""
        # __del__ also runs for instances whose __init__ never completed;
        # in that case there is no lock and nothing to unload.
        if getattr(self, "model_lock", None) is not None:
            self.unload_model()
|