local-transcription/client/transcription_engine.py

"""Transcription engine using faster-whisper for speech-to-text."""
import numpy as np
from faster_whisper import WhisperModel
from typing import Optional, List, Tuple
from datetime import datetime
import threading


class TranscriptionResult:
    """Represents a single transcription result."""

    def __init__(self, text: str, confidence: float, timestamp: datetime, user_name: str = ""):
        """
        Initialize transcription result.

        Args:
            text: Transcribed text
            confidence: Confidence score (0.0 to 1.0)
            timestamp: Timestamp of transcription
            user_name: Name of the user/speaker
        """
        self.text = text.strip()
        self.confidence = confidence
        self.timestamp = timestamp
        self.user_name = user_name

    def __repr__(self) -> str:
        time_str = self.timestamp.strftime("%H:%M:%S")
        if self.user_name:
            return f"[{time_str}] {self.user_name}: {self.text}"
        return f"[{time_str}] {self.text}"

    def to_dict(self) -> dict:
        """Convert to dictionary."""
        return {
            'text': self.text,
            'confidence': self.confidence,
            'timestamp': self.timestamp.isoformat(),
            'user_name': self.user_name
        }
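
# Illustrative usage of TranscriptionResult (a sketch; the values shown are
# hypothetical, not from the original module):
#
#   >>> r = TranscriptionResult("hello world", 0.92,
#   ...                         datetime(2024, 1, 1, 9, 30, 0), "alice")
#   >>> repr(r)
#   '[09:30:00] alice: hello world'
#   >>> r.to_dict()['timestamp']
#   '2024-01-01T09:30:00'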


class TranscriptionEngine:
    """Handles speech-to-text transcription using faster-whisper."""

    def __init__(
        self,
        model_size: str = "base",
        device: str = "cpu",
        compute_type: str = "int8",
        language: str = "en",
        min_confidence: float = 0.5
    ):
        """
        Initialize transcription engine.

        Args:
            model_size: Whisper model size ('tiny', 'base', 'small', 'medium', 'large')
            device: Device to use ('cpu', 'cuda', 'auto')
            compute_type: Compute type ('int8', 'float16', 'float32')
            language: Language code for transcription, or 'auto' to detect
            min_confidence: Minimum confidence threshold for transcriptions
        """
        self.model_size = model_size
        self.device = device
        self.compute_type = compute_type
        self.language = language
        self.min_confidence = min_confidence
        self.model: Optional[WhisperModel] = None
        self.model_lock = threading.Lock()
        self.is_loaded = False

    def load_model(self) -> bool:
        """
        Load the Whisper model.

        Returns:
            True if the model loaded successfully, False otherwise
        """
        try:
            print(f"Loading Whisper {self.model_size} model on {self.device}...")
            with self.model_lock:
                self.model = WhisperModel(
                    self.model_size,
                    device=self.device,
                    compute_type=self.compute_type
                )
                self.is_loaded = True
            print("Model loaded successfully!")
            return True
        except Exception as e:
            print(f"Error loading model: {e}")
            self.is_loaded = False
            return False
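
    # Example configurations (illustrative; supported device/compute_type
    # combinations depend on the local CTranslate2 build):
    #
    #   TranscriptionEngine()                                       # CPU, int8
    #   TranscriptionEngine(device="cuda", compute_type="float16")  # GPU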

    def transcribe(
        self,
        audio: np.ndarray,
        sample_rate: int = 16000,
        user_name: str = ""
    ) -> Optional[TranscriptionResult]:
        """
        Transcribe audio to text.

        Args:
            audio: Audio data as numpy array (float32, mono)
            sample_rate: Audio sample rate in Hz (must be 16000;
                faster-whisper assumes 16 kHz for raw numpy input)
            user_name: Name of the user/speaker

        Returns:
            TranscriptionResult, or None if transcription failed, produced
            no speech segments, or fell below the confidence threshold
        """
        if not self.is_loaded or self.model is None:
            print("Model not loaded")
            return None
        if sample_rate != 16000:
            print(f"Warning: faster-whisper expects 16 kHz audio, got {sample_rate} Hz")
        try:
            # Ensure audio is float32
            audio = audio.astype(np.float32)

            # Transcribe using faster-whisper
            with self.model_lock:
                segments, info = self.model.transcribe(
                    audio,
                    language=self.language if self.language != "auto" else None,
                    vad_filter=True,  # Use built-in VAD to skip silence
                    vad_parameters=dict(
                        min_silence_duration_ms=500
                    )
                )

                # Collect all segments; the generator must be consumed while
                # the lock is held, since faster-whisper decodes lazily
                full_text = ""
                total_confidence = 0.0
                segment_count = 0
                for segment in segments:
                    full_text += segment.text + " "
                    total_confidence += segment.avg_logprob
                    segment_count += 1

            if segment_count == 0:
                return None

            # Convert mean log probability to an approximate confidence:
            # avg_logprob is roughly in [-1, 0], so exp() maps it into (0, 1]
            avg_confidence = np.exp(total_confidence / segment_count)

            # Filter by minimum confidence
            if avg_confidence < self.min_confidence:
                return None

            # Clean up text
            text = full_text.strip()
            if not text:
                return None

            result = TranscriptionResult(
                text=text,
                confidence=avg_confidence,
                timestamp=datetime.now(),
                user_name=user_name
            )
            return result
        except Exception as e:
            print(f"Error during transcription: {e}")
            return None
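
    # Usage sketch (assumes a 16 kHz mono float32 capture pipeline that is not
    # part of this module; `audio_chunk` below is hypothetical):
    #
    #   engine = TranscriptionEngine(model_size="base")
    #   if engine.load_model():
    #       result = engine.transcribe(audio_chunk, user_name="alice")
    #       if result is not None:
    #           print(result)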

    def change_model(self, model_size: str) -> bool:
        """
        Change to a different model size.

        Args:
            model_size: New model size

        Returns:
            True if the new model loaded successfully
        """
        self.model_size = model_size
        self.is_loaded = False
        self.model = None
        return self.load_model()

    def change_device(self, device: str, compute_type: Optional[str] = None) -> bool:
        """
        Change compute device.

        Args:
            device: New device ('cpu', 'cuda', etc.)
            compute_type: Optional new compute type

        Returns:
            True if the model reloaded successfully on the new device
        """
        self.device = device
        if compute_type:
            self.compute_type = compute_type
        self.is_loaded = False
        self.model = None
        return self.load_model()
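
    # Fallback sketch (illustrative): try the GPU first and drop back to the
    # CPU if loading fails, mirroring load_model()'s boolean return:
    #
    #   if not engine.change_device("cuda", "float16"):
    #       engine.change_device("cpu", "int8")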

    def change_language(self, language: str):
        """
        Change transcription language.

        Args:
            language: Language code or 'auto'
        """
        self.language = language

    def unload_model(self):
        """Unload the model from memory."""
        with self.model_lock:
            if self.model is not None:
                # Drop the reference so the model can be reclaimed
                self.model = None
            self.is_loaded = False
            # Force garbage collection to free memory promptly
            gc.collect()

    def __repr__(self) -> str:
        return f"TranscriptionEngine(model={self.model_size}, device={self.device}, loaded={self.is_loaded})"

    def __del__(self):
        """Cleanup when the object is garbage-collected."""
        try:
            self.unload_model()
        except Exception:
            # Attributes may already be gone during interpreter shutdown
            pass
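

if __name__ == "__main__":
    # Minimal smoke test (a sketch, not part of the original module; assumes
    # faster-whisper is installed and the 'tiny' model can be downloaded).
    # One second of silence should yield no segments under VAD filtering, so
    # transcribe() is expected to return None here.
    engine = TranscriptionEngine(model_size="tiny")
    if engine.load_model():
        silence = np.zeros(16000, dtype=np.float32)  # 1 s of audio at 16 kHz
        print(f"Result: {engine.transcribe(silence, user_name='test')!r}")
        engine.unload_model()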