Initial commit: Local Transcription App v1.0
Phase 1 Complete - Standalone Desktop Application Features: - Real-time speech-to-text with Whisper (faster-whisper) - PySide6 desktop GUI with settings dialog - Web server for OBS browser source integration - Audio capture with automatic sample rate detection and resampling - Noise suppression with Voice Activity Detection (VAD) - Configurable display settings (font, timestamps, fade duration) - Settings apply without restart (with automatic model reloading) - Auto-fade for web display transcriptions - CPU/GPU support with automatic device detection - Standalone executable builds (PyInstaller) - CUDA build support (works on systems without CUDA hardware) Components: - Audio capture with sounddevice - Noise reduction with noisereduce + webrtcvad - Transcription with faster-whisper - GUI with PySide6 - Web server with FastAPI + WebSocket - Configuration system with YAML Build System: - Standard builds (CPU-only): build.sh / build.bat - CUDA builds (universal): build-cuda.sh / build-cuda.bat - Comprehensive BUILD.md documentation - Cross-platform support (Linux, Windows) Documentation: - README.md with project overview and quick start - BUILD.md with detailed build instructions - NEXT_STEPS.md with future enhancement roadmap - INSTALL.md with setup instructions 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
232
client/transcription_engine.py
Normal file
232
client/transcription_engine.py
Normal file
@@ -0,0 +1,232 @@
|
||||
"""Transcription engine using faster-whisper for speech-to-text."""
|
||||
|
||||
import numpy as np
|
||||
from faster_whisper import WhisperModel
|
||||
from typing import Optional, List, Tuple
|
||||
from datetime import datetime
|
||||
import threading
|
||||
|
||||
|
||||
class TranscriptionResult:
    """A piece of transcribed speech together with its metadata."""

    def __init__(self, text: str, confidence: float, timestamp: datetime, user_name: str = ""):
        """
        Store the transcription and its metadata.

        Args:
            text: Transcribed text (surrounding whitespace is stripped)
            confidence: Confidence score (0.0 to 1.0)
            timestamp: Timestamp of the transcription
            user_name: Name of the user/speaker; empty string when not given
        """
        self.text = text.strip()
        self.confidence = confidence
        self.timestamp = timestamp
        self.user_name = user_name

    def __repr__(self) -> str:
        # Render as "[HH:MM:SS] text", inserting "name: " when a speaker is set.
        prefix = f"[{self.timestamp.strftime('%H:%M:%S')}]"
        if not self.user_name:
            return f"{prefix} {self.text}"
        return f"{prefix} {self.user_name}: {self.text}"

    def to_dict(self) -> dict:
        """Return the result as a plain dict with an ISO-formatted timestamp."""
        return dict(
            text=self.text,
            confidence=self.confidence,
            timestamp=self.timestamp.isoformat(),
            user_name=self.user_name,
        )
|
||||
|
||||
|
||||
class TranscriptionEngine:
    """Handles speech-to-text transcription using faster-whisper.

    Every access to the loaded model (load, unload, transcribe) is serialized
    through ``model_lock`` so the model cannot be swapped out while it is
    being used.
    """

    def __init__(
        self,
        model_size: str = "base",
        device: str = "cpu",
        compute_type: str = "int8",
        language: str = "en",
        min_confidence: float = 0.5
    ):
        """
        Initialize transcription engine.

        The model itself is not loaded here; call load_model() first.

        Args:
            model_size: Whisper model size ('tiny', 'base', 'small', 'medium', 'large')
            device: Device to use ('cpu', 'cuda', 'auto')
            compute_type: Compute type ('int8', 'float16', 'float32')
            language: Language code for transcription, or 'auto'
            min_confidence: Minimum confidence threshold for transcriptions
        """
        self.model_size = model_size
        self.device = device
        self.compute_type = compute_type
        self.language = language
        self.min_confidence = min_confidence
        self.model: Optional[WhisperModel] = None
        self.model_lock = threading.Lock()
        self.is_loaded = False

    def load_model(self) -> bool:
        """
        Load the Whisper model.

        Returns:
            True if model loaded successfully, False otherwise
        """
        try:
            print(f"Loading Whisper {self.model_size} model on {self.device}...")

            with self.model_lock:
                self.model = WhisperModel(
                    self.model_size,
                    device=self.device,
                    compute_type=self.compute_type
                )
                self.is_loaded = True

            print("Model loaded successfully!")
            return True

        except Exception as e:
            # Best-effort API: report the failure through the return value.
            print(f"Error loading model: {e}")
            self.is_loaded = False
            return False

    def transcribe(
        self,
        audio: np.ndarray,
        sample_rate: int = 16000,
        user_name: str = ""
    ) -> Optional["TranscriptionResult"]:
        """
        Transcribe audio to text.

        Args:
            audio: Audio data as numpy array (converted to float32 here)
            sample_rate: Audio sample rate in Hz. Not used by this method;
                the samples are handed to the model unchanged.
                NOTE(review): the model presumably expects 16 kHz input -
                confirm that callers resample beforehand.
            user_name: Name of the user/speaker

        Returns:
            TranscriptionResult or None if transcription failed or confidence too low
        """
        if not self.is_loaded or self.model is None:
            print("Model not loaded")
            return None

        try:
            # Ensure audio is float32
            audio = audio.astype(np.float32)

            texts: List[str] = []
            logprobs: List[float] = []

            with self.model_lock:
                # Re-read the model under the lock: unload_model() may have
                # run between the check above and this point.
                model = self.model
                if model is None:
                    print("Model not loaded")
                    return None

                segments, _info = model.transcribe(
                    audio,
                    language=self.language if self.language != "auto" else None,
                    vad_filter=True,  # Use built-in VAD
                    vad_parameters=dict(
                        min_silence_duration_ms=500
                    )
                )

                # faster-whisper yields segments lazily, so the decoding work
                # happens during this loop. Keep it inside the lock so the
                # model cannot be replaced or unloaded mid-transcription.
                for segment in segments:
                    texts.append(segment.text)
                    logprobs.append(segment.avg_logprob)

            if not logprobs:
                return None

            # Convert the mean log probability to an approximate confidence.
            # avg_logprob is <= 0 (typically around [-1, 0]), so exp() maps it
            # into (0, 1]. float() keeps numpy scalars out of the result.
            avg_confidence = float(np.exp(sum(logprobs) / len(logprobs)))

            # Filter by minimum confidence
            if avg_confidence < self.min_confidence:
                return None

            # Clean up text
            text = " ".join(texts).strip()
            if not text:
                return None

            return TranscriptionResult(
                text=text,
                confidence=avg_confidence,
                timestamp=datetime.now(),
                user_name=user_name
            )

        except Exception as e:
            # Best-effort API: a failed transcription is reported as None.
            print(f"Error during transcription: {e}")
            return None

    def change_model(self, model_size: str) -> bool:
        """
        Change to a different model size.

        Args:
            model_size: New model size

        Returns:
            True if model changed successfully
        """
        self.model_size = model_size
        # Drop the old model under the lock before loading the new one.
        self.unload_model()
        return self.load_model()

    def change_device(self, device: str, compute_type: Optional[str] = None) -> bool:
        """
        Change compute device.

        Args:
            device: New device ('cpu', 'cuda', etc.)
            compute_type: Optional new compute type; the current one is kept
                when omitted or empty

        Returns:
            True if device changed successfully
        """
        self.device = device
        if compute_type:
            self.compute_type = compute_type

        # Drop the old model under the lock before loading the new one.
        self.unload_model()
        return self.load_model()

    def change_language(self, language: str):
        """
        Change transcription language.

        Takes effect on the next transcribe() call; no model reload is done.

        Args:
            language: Language code or 'auto'
        """
        self.language = language

    def unload_model(self):
        """Unload the model from memory."""
        with self.model_lock:
            self.model = None
            self.is_loaded = False

    def __repr__(self) -> str:
        return f"TranscriptionEngine(model={self.model_size}, device={self.device}, loaded={self.is_loaded})"

    def __del__(self):
        """Cleanup when object is destroyed."""
        # __init__ may not have run to completion, in which case there is no
        # lock (and no model) to clean up.
        if hasattr(self, "model_lock"):
            self.unload_model()
|
||||
Reference in New Issue
Block a user