"""Transcription engine using faster-whisper for speech-to-text."""

import threading
from datetime import datetime
from typing import Optional, List, Tuple

import numpy as np
from faster_whisper import WhisperModel


class TranscriptionResult:
    """Represents a transcription result."""

    def __init__(self, text: str, confidence: float, timestamp: datetime, user_name: str = ""):
        """
        Initialize transcription result.

        Args:
            text: Transcribed text (surrounding whitespace is stripped)
            confidence: Confidence score (0.0 to 1.0)
            timestamp: Timestamp of transcription
            user_name: Name of the user/speaker
        """
        self.text = text.strip()
        self.confidence = confidence
        self.timestamp = timestamp
        self.user_name = user_name

    def __repr__(self) -> str:
        time_str = self.timestamp.strftime("%H:%M:%S")
        if self.user_name:
            return f"[{time_str}] {self.user_name}: {self.text}"
        return f"[{time_str}] {self.text}"

    def to_dict(self) -> dict:
        """Convert to a dictionary; the timestamp is rendered as an ISO-8601 string."""
        return {
            'text': self.text,
            'confidence': self.confidence,
            'timestamp': self.timestamp.isoformat(),
            'user_name': self.user_name,
        }


class TranscriptionEngine:
    """Handles speech-to-text transcription using faster-whisper.

    ``model_lock`` guards both the ``model`` reference and every use of the
    model, so loading, unloading and transcribing never overlap.
    """

    def __init__(
        self,
        model_size: str = "base",
        device: str = "cpu",
        compute_type: str = "int8",
        language: str = "en",
        min_confidence: float = 0.5
    ):
        """
        Initialize transcription engine. The model itself is not loaded here;
        call ``load_model()`` before ``transcribe()``.

        Args:
            model_size: Whisper model size ('tiny', 'base', 'small', 'medium', 'large')
            device: Device to use ('cpu', 'cuda', 'auto')
            compute_type: Compute type ('int8', 'float16', 'float32')
            language: Language code for transcription, or 'auto' to let the model detect it
            min_confidence: Minimum confidence threshold for transcriptions
        """
        self.model_size = model_size
        self.device = device
        self.compute_type = compute_type
        self.language = language
        self.min_confidence = min_confidence

        self.model: Optional[WhisperModel] = None
        self.model_lock = threading.Lock()
        self.is_loaded = False

    def load_model(self) -> bool:
        """
        Load the Whisper model described by the current settings.

        Any previously held model is released first. Errors are printed, not raised.

        Returns:
            True if model loaded successfully, False otherwise
        """
        try:
            print(f"Loading Whisper {self.model_size} model on {self.device}...")
            with self.model_lock:
                # Update the reference and the flag together, under the lock,
                # so other threads never observe a half-updated state.
                self.is_loaded = False
                self.model = None
                self.model = WhisperModel(
                    self.model_size,
                    device=self.device,
                    compute_type=self.compute_type
                )
                self.is_loaded = True
        except Exception as e:
            # Top-level boundary: report and signal failure via the return value.
            print(f"Error loading model: {e}")
            return False

        print("Model loaded successfully!")
        return True

    def transcribe(
        self,
        audio: np.ndarray,
        sample_rate: int = 16000,
        user_name: str = ""
    ) -> Optional[TranscriptionResult]:
        """
        Transcribe audio to text.

        Args:
            audio: Audio data as numpy array (converted to float32 if needed)
            sample_rate: Audio sample rate in Hz. NOTE(review): this value is
                not used by the implementation; the samples are handed to the
                model unchanged. faster-whisper presumably expects 16 kHz
                waveforms - confirm that callers resample beforehand.
            user_name: Name of the user/speaker

        Returns:
            TranscriptionResult, or None if the model is not loaded, the audio
            is empty, no speech was found, the confidence is below
            ``min_confidence``, or an error occurred (errors are printed).
        """
        if not self.is_loaded or self.model is None:
            print("Model not loaded")
            return None

        if audio is None:
            return None

        try:
            samples = np.asarray(audio, dtype=np.float32)
            if samples.size == 0:
                return None

            with self.model_lock:
                # Re-read under the lock: the model may have been unloaded
                # between the quick check above and acquiring the lock.
                model = self.model
                if model is None:
                    print("Model not loaded")
                    return None

                segments, _info = model.transcribe(
                    samples,
                    language=self.language if self.language != "auto" else None,
                    vad_filter=True,  # Use built-in VAD
                    vad_parameters=dict(
                        min_silence_duration_ms=500
                    )
                )
                # faster-whisper returns a lazy generator: decoding only runs
                # while it is iterated, so it must be consumed inside the lock.
                decoded = [(segment.text, segment.avg_logprob) for segment in segments]
        except Exception as e:
            print(f"Error during transcription: {e}")
            return None

        if not decoded:
            return None

        # Convert the mean per-segment log probability into an approximate
        # 0-1 confidence; float() turns the NumPy scalar into a builtin float.
        mean_logprob = sum(logprob for _, logprob in decoded) / len(decoded)
        avg_confidence = float(np.exp(mean_logprob))
        if avg_confidence < self.min_confidence:
            return None

        # Strip each piece before joining so that whitespace carried by the
        # individual segments does not produce double spaces.
        pieces = (piece.strip() for piece, _ in decoded)
        text = " ".join(piece for piece in pieces if piece)
        if not text:
            return None

        return TranscriptionResult(
            text=text,
            confidence=avg_confidence,
            timestamp=datetime.now(),
            user_name=user_name
        )

    def change_model(self, model_size: str) -> bool:
        """
        Change to a different model size and reload.

        Args:
            model_size: New model size

        Returns:
            True if model changed successfully
        """
        self.model_size = model_size
        # Drop the old model under the lock so an in-flight transcription
        # finishes before the reference disappears.
        self.unload_model()
        return self.load_model()

    def change_device(self, device: str, compute_type: Optional[str] = None) -> bool:
        """
        Change compute device and reload the model.

        Args:
            device: New device ('cpu', 'cuda', etc.)
            compute_type: Optional new compute type; the current one is kept when omitted

        Returns:
            True if device changed successfully
        """
        self.device = device
        if compute_type:
            self.compute_type = compute_type
        self.unload_model()
        return self.load_model()

    def change_language(self, language: str):
        """
        Change transcription language. Takes effect on the next ``transcribe()`` call.

        Args:
            language: Language code or 'auto'
        """
        self.language = language

    def unload_model(self):
        """Drop the engine's reference to the model and mark it as not loaded."""
        with self.model_lock:
            self.model = None
            self.is_loaded = False

    def __repr__(self) -> str:
        return f"TranscriptionEngine(model={self.model_size}, device={self.device}, loaded={self.is_loaded})"

    def __del__(self):
        """Cleanup when object is destroyed."""
        # __init__ may not have run to completion (or at all), in which case
        # there is no lock and nothing to unload.
        if getattr(self, "model_lock", None) is not None:
            self.unload_model()