local-transcription/client/transcription_engine.py

"""Transcription engine using faster-whisper for speech-to-text."""
import numpy as np
from faster_whisper import WhisperModel
from typing import Optional, List, Tuple
from datetime import datetime
import threading


class TranscriptionResult:
    """Represents a single transcription result."""

    def __init__(self, text: str, confidence: float, timestamp: datetime, user_name: str = ""):
        """
        Initialize transcription result.

        Args:
            text: Transcribed text
            confidence: Confidence score (0.0 to 1.0)
            timestamp: Timestamp of transcription
            user_name: Name of the user/speaker
        """
        self.text = text.strip()
        self.confidence = confidence
        self.timestamp = timestamp
        self.user_name = user_name

    def __repr__(self) -> str:
        time_str = self.timestamp.strftime("%H:%M:%S")
        if self.user_name:
            return f"[{time_str}] {self.user_name}: {self.text}"
        return f"[{time_str}] {self.text}"

    def to_dict(self) -> dict:
        """Convert to dictionary."""
        return {
            'text': self.text,
            'confidence': self.confidence,
            'timestamp': self.timestamp.isoformat(),
            'user_name': self.user_name
        }
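
# Illustrative usage of TranscriptionResult (a sketch; the values shown are
# hypothetical, not from the original module):
#
#   >>> r = TranscriptionResult("hello world", 0.92,
#   ...                         datetime(2024, 1, 1, 9, 30, 0), "alice")
#   >>> repr(r)
#   '[09:30:00] alice: hello world'
#   >>> r.to_dict()['timestamp']
#   '2024-01-01T09:30:00'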


class TranscriptionEngine:
    """Handles speech-to-text transcription using faster-whisper."""

    def __init__(
        self,
        model_size: str = "base",
        device: str = "cpu",
        compute_type: str = "int8",
        language: str = "en",
        min_confidence: float = 0.5
    ):
        """
        Initialize transcription engine.

        Args:
            model_size: Whisper model size ('tiny', 'base', 'small', 'medium', 'large')
            device: Device to use ('cpu', 'cuda', 'auto')
            compute_type: Compute type ('int8', 'float16', 'float32')
            language: Language code for transcription, or 'auto' to detect
            min_confidence: Minimum confidence threshold for transcriptions
        """
        self.model_size = model_size
        self.device = device
        self.compute_type = compute_type
        self.language = language
        self.min_confidence = min_confidence
        self.model: Optional[WhisperModel] = None
        self.model_lock = threading.Lock()
        self.is_loaded = False

    def load_model(self) -> bool:
        """
        Load the Whisper model.

        Returns:
            True if the model loaded successfully, False otherwise
        """
        try:
            print(f"Loading Whisper {self.model_size} model on {self.device}...")
            with self.model_lock:
                self.model = WhisperModel(
                    self.model_size,
                    device=self.device,
                    compute_type=self.compute_type
                )
                self.is_loaded = True
            print("Model loaded successfully!")
            return True
        except Exception as e:
            print(f"Error loading model: {e}")
            self.is_loaded = False
            return False
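
    # Example configurations (illustrative; supported device/compute_type
    # combinations depend on the local CTranslate2 build):
    #
    #   TranscriptionEngine()                                       # CPU, int8
    #   TranscriptionEngine(device="cuda", compute_type="float16")  # GPU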

    def transcribe(
        self,
        audio: np.ndarray,
        sample_rate: int = 16000,
        user_name: str = ""
    ) -> Optional[TranscriptionResult]:
        """
        Transcribe audio to text.

        Args:
            audio: Audio data as numpy array (float32, mono)
            sample_rate: Audio sample rate in Hz (must be 16000;
                faster-whisper assumes 16 kHz for raw numpy input)
            user_name: Name of the user/speaker

        Returns:
            TranscriptionResult, or None if transcription failed, produced
            no speech segments, or fell below the confidence threshold
        """
        if not self.is_loaded or self.model is None:
            print("Model not loaded")
            return None
        if sample_rate != 16000:
            print(f"Warning: faster-whisper expects 16 kHz audio, got {sample_rate} Hz")
        try:
            # Ensure audio is float32
            audio = audio.astype(np.float32)

            # Transcribe using faster-whisper
            with self.model_lock:
                segments, info = self.model.transcribe(
                    audio,
                    language=self.language if self.language != "auto" else None,
                    vad_filter=True,  # Use built-in VAD to skip silence
                    vad_parameters=dict(
                        min_silence_duration_ms=500
                    )
                )

                # Collect all segments; the generator must be consumed while
                # the lock is held, since faster-whisper decodes lazily
                full_text = ""
                total_confidence = 0.0
                segment_count = 0
                for segment in segments:
                    full_text += segment.text + " "
                    total_confidence += segment.avg_logprob
                    segment_count += 1

            if segment_count == 0:
                return None

            # Convert mean log probability to an approximate confidence:
            # avg_logprob is roughly in [-1, 0], so exp() maps it into (0, 1]
            avg_confidence = np.exp(total_confidence / segment_count)

            # Filter by minimum confidence
            if avg_confidence < self.min_confidence:
                return None

            # Clean up text
            text = full_text.strip()
            if not text:
                return None

            result = TranscriptionResult(
                text=text,
                confidence=avg_confidence,
                timestamp=datetime.now(),
                user_name=user_name
            )
            return result
        except Exception as e:
            print(f"Error during transcription: {e}")
            return None
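
    # Usage sketch (assumes a 16 kHz mono float32 capture pipeline that is not
    # part of this module; `audio_chunk` below is hypothetical):
    #
    #   engine = TranscriptionEngine(model_size="base")
    #   if engine.load_model():
    #       result = engine.transcribe(audio_chunk, user_name="alice")
    #       if result is not None:
    #           print(result)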

    def change_model(self, model_size: str) -> bool:
        """
        Change to a different model size.

        Args:
            model_size: New model size

        Returns:
            True if the new model loaded successfully
        """
        self.model_size = model_size
        self.is_loaded = False
        self.model = None
        return self.load_model()

    def change_device(self, device: str, compute_type: Optional[str] = None) -> bool:
        """
        Change compute device.

        Args:
            device: New device ('cpu', 'cuda', etc.)
            compute_type: Optional new compute type

        Returns:
            True if the model reloaded successfully on the new device
        """
        self.device = device
        if compute_type:
            self.compute_type = compute_type
        self.is_loaded = False
        self.model = None
        return self.load_model()
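
    # Fallback sketch (illustrative): try the GPU first and drop back to the
    # CPU if loading fails, mirroring load_model()'s boolean return:
    #
    #   if not engine.change_device("cuda", "float16"):
    #       engine.change_device("cpu", "int8")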

    def change_language(self, language: str):
        """
        Change transcription language.

        Args:
            language: Language code or 'auto'
        """
        self.language = language

    def unload_model(self):
        """Unload the model from memory."""
        with self.model_lock:
            if self.model is not None:
                # Drop the reference so the model can be reclaimed
                self.model = None
            self.is_loaded = False
            # Force garbage collection to free memory promptly
            gc.collect()

    def __repr__(self) -> str:
        return f"TranscriptionEngine(model={self.model_size}, device={self.device}, loaded={self.is_loaded})"

    def __del__(self):
        """Cleanup when the object is garbage-collected."""
        try:
            self.unload_model()
        except Exception:
            # Attributes may already be gone during interpreter shutdown
            pass
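

if __name__ == "__main__":
    # Minimal smoke test (a sketch, not part of the original module; assumes
    # faster-whisper is installed and the 'tiny' model can be downloaded).
    # One second of silence should yield no segments under VAD filtering, so
    # transcribe() is expected to return None here.
    engine = TranscriptionEngine(model_size="tiny")
    if engine.load_model():
        silence = np.zeros(16000, dtype=np.float32)  # 1 s of audio at 16 kHz
        print(f"Result: {engine.transcribe(silence, user_name='test')!r}")
        engine.unload_model()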