# local-transcription/client/transcription_engine.py

"""Transcription engine using faster-whisper for speech-to-text."""

import numpy as np
from faster_whisper import WhisperModel
from typing import Optional, List, Tuple
from datetime import datetime
import threading


class TranscriptionResult:
    """One piece of transcribed speech together with its metadata."""

    def __init__(self, text: str, confidence: float, timestamp: datetime, user_name: str = ""):
        """
        Store the transcription and its metadata.

        Args:
            text: Transcribed text; leading/trailing whitespace is stripped
            confidence: Confidence score (0.0 to 1.0)
            timestamp: Timestamp of transcription
            user_name: Name of the user/speaker (empty string when unknown)
        """
        cleaned = text.strip()
        self.text = cleaned
        self.confidence = confidence
        self.timestamp = timestamp
        self.user_name = user_name

    def __repr__(self) -> str:
        """Return ``[HH:MM:SS] user: text``, omitting the user part when it is empty."""
        stamp = self.timestamp.strftime("%H:%M:%S")
        speaker_prefix = f"{self.user_name}: " if self.user_name else ""
        return f"[{stamp}] {speaker_prefix}{self.text}"

    def to_dict(self) -> dict:
        """Return the result as a plain dictionary with an ISO-8601 timestamp string."""
        return dict(
            text=self.text,
            confidence=self.confidence,
            timestamp=self.timestamp.isoformat(),
            user_name=self.user_name,
        )


class TranscriptionEngine:
    """Handles speech-to-text transcription using faster-whisper.

    ``model_lock`` guards every read/replace of ``model`` and ``is_loaded``
    so that loading, unloading and transcribing can be called from
    different threads.
    """

    # Sample rate (Hz) of the waveform handed to the model. faster-whisper
    # treats in-memory arrays as already being sampled at 16 kHz.
    MODEL_SAMPLE_RATE = 16000

    def __init__(
        self,
        model_size: str = "base",
        device: str = "cpu",
        compute_type: str = "int8",
        language: str = "en",
        min_confidence: float = 0.5
    ):
        """
        Initialize transcription engine.

        The model itself is not loaded here; call ``load_model()`` first.

        Args:
            model_size: Whisper model size ('tiny', 'base', 'small', 'medium', 'large')
            device: Device to use ('cpu', 'cuda', 'auto')
            compute_type: Compute type ('int8', 'float16', 'float32')
            language: Language code for transcription, or 'auto' for detection
            min_confidence: Minimum confidence threshold for transcriptions
        """
        self.model_size = model_size
        self.device = device
        self.compute_type = compute_type
        self.language = language
        self.min_confidence = min_confidence
        self.model: Optional[WhisperModel] = None
        self.model_lock = threading.Lock()
        self.is_loaded = False

    def load_model(self) -> bool:
        """
        Load the Whisper model described by the current settings.

        Returns:
            True if model loaded successfully, False otherwise
        """
        try:
            print(f"Loading Whisper {self.model_size} model on {self.device}...")

            with self.model_lock:
                self.model = WhisperModel(
                    self.model_size,
                    device=self.device,
                    compute_type=self.compute_type
                )
                self.is_loaded = True

            print("Model loaded successfully!")
            return True

        except Exception as e:
            print(f"Error loading model: {e}")
            # Never leave a stale model behind next to is_loaded == False.
            with self.model_lock:
                self.model = None
                self.is_loaded = False
            return False

    @staticmethod
    def _resample(samples: np.ndarray, source_rate: int, target_rate: int) -> np.ndarray:
        """Linearly resample a mono float32 waveform from source_rate to target_rate."""
        if (
            samples.ndim != 1
            or not source_rate
            or source_rate < 0
            or source_rate == target_rate
        ):
            return samples

        target_len = max(1, int(round(samples.size * target_rate / source_rate)))
        # Position of every output sample expressed in input-sample units;
        # np.interp clamps positions past the last input sample.
        positions = np.arange(target_len) * (source_rate / target_rate)
        resampled = np.interp(positions, np.arange(samples.size), samples)
        return resampled.astype(np.float32)

    def transcribe(
        self,
        audio: np.ndarray,
        sample_rate: int = 16000,
        user_name: str = ""
    ) -> "Optional[TranscriptionResult]":  # quoted: resolved lazily by type checkers
        """
        Transcribe audio to text.

        Args:
            audio: Mono audio data as numpy array (converted to float32)
            sample_rate: Sample rate of ``audio`` in Hz; audio at any other
                rate than 16 kHz is resampled before transcription
            user_name: Name of the user/speaker

        Returns:
            TranscriptionResult, or None if the model is not loaded, the audio
            is empty, nothing was recognised, confidence is below
            ``min_confidence``, or transcription raised an error
        """
        # Cheap early exit that does not wait for a model load in progress.
        if not self.is_loaded or self.model is None:
            print("Model not loaded")
            return None

        try:
            # Always work on a private float32 copy of the caller's data.
            samples = np.array(audio, dtype=np.float32)
            if samples.size == 0:
                return None
            samples = self._resample(samples, sample_rate, self.MODEL_SAMPLE_RATE)

            with self.model_lock:
                # Re-check under the lock: the model may have been unloaded
                # between the check above and acquiring the lock.
                model = self.model
                if not self.is_loaded or model is None:
                    print("Model not loaded")
                    return None

                segments, _info = model.transcribe(
                    samples,
                    language=self.language if self.language != "auto" else None,
                    vad_filter=True,  # Use built-in VAD
                    vad_parameters=dict(
                        min_silence_duration_ms=500
                    )
                )

                # The segments are consumed while the lock is still held so
                # the model cannot be swapped out mid-iteration.
                pieces = []
                logprob_sum = 0.0
                segment_count = 0

                for segment in segments:
                    pieces.append(segment.text.strip())
                    logprob_sum += segment.avg_logprob
                    segment_count += 1

            if segment_count == 0:
                return None

            # Convert the mean per-segment log probability to an approximate
            # confidence in the 0-1 range.
            avg_confidence = float(np.exp(logprob_sum / segment_count))

            # Filter by minimum confidence
            if avg_confidence < self.min_confidence:
                return None

            # Each segment is stripped first so joining does not double spaces.
            text = " ".join(piece for piece in pieces if piece)

            if not text:
                return None

            return TranscriptionResult(
                text=text,
                confidence=avg_confidence,
                timestamp=datetime.now(),
                user_name=user_name
            )

        except Exception as e:
            # Best-effort API: callers get None instead of an exception.
            print(f"Error during transcription: {e}")
            return None

    def change_model(self, model_size: str) -> bool:
        """
        Change to a different model size.

        The current model is released before the new one is loaded. If loading
        fails the engine stays unloaded with ``model_size`` set to the new value.

        Args:
            model_size: New model size

        Returns:
            True if model changed successfully
        """
        self.model_size = model_size
        self.unload_model()
        return self.load_model()

    def change_device(self, device: str, compute_type: Optional[str] = None) -> bool:
        """
        Change compute device.

        The current model is released before it is reloaded on the new device.

        Args:
            device: New device ('cpu', 'cuda', etc.)
            compute_type: Optional new compute type; the current one is kept
                when omitted or empty

        Returns:
            True if device changed successfully
        """
        self.device = device
        if compute_type:
            self.compute_type = compute_type

        self.unload_model()
        return self.load_model()

    def change_language(self, language: str):
        """
        Change transcription language.

        Takes effect on the next ``transcribe()`` call; no reload is needed.

        Args:
            language: Language code or 'auto'
        """
        self.language = language

    def unload_model(self):
        """Unload the model from memory."""
        with self.model_lock:
            self.model = None
            self.is_loaded = False

    def __repr__(self) -> str:
        return f"TranscriptionEngine(model={self.model_size}, device={self.device}, loaded={self.is_loaded})"

    def __del__(self):
        """Cleanup when object is destroyed."""
        # __init__ may never have run, in which case there is no lock yet
        # and nothing to unload.
        if getattr(self, "model_lock", None) is None:
            return
        self.unload_model()