Initial commit: Local Transcription App v1.0

Phase 1 Complete - Standalone Desktop Application

Features:
- Real-time speech-to-text with Whisper (faster-whisper)
- PySide6 desktop GUI with settings dialog
- Web server for OBS browser source integration
- Audio capture with automatic sample rate detection and resampling
- Noise suppression with Voice Activity Detection (VAD)
- Configurable display settings (font, timestamps, fade duration)
- Settings apply without restart (with automatic model reloading)
- Auto-fade for web display transcriptions
- CPU/GPU support with automatic device detection
- Standalone executable builds (PyInstaller)
- CUDA build support (works on systems without CUDA hardware)

Components:
- Audio capture with sounddevice
- Noise reduction with noisereduce + webrtcvad
- Transcription with faster-whisper
- GUI with PySide6
- Web server with FastAPI + WebSocket
- Configuration system with YAML

Build System:
- Standard builds (CPU-only): build.sh / build.bat
- CUDA builds (universal): build-cuda.sh / build-cuda.bat
- Comprehensive BUILD.md documentation
- Cross-platform support (Linux, Windows)

Documentation:
- README.md with project overview and quick start
- BUILD.md with detailed build instructions
- NEXT_STEPS.md with future enhancement roadmap
- INSTALL.md with setup instructions

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
2025-12-25 18:48:23 -08:00
commit 472233aec4
31 changed files with 5116 additions and 0 deletions
--- a/client/transcription_engine.py
+++ b/client/transcription_engine.py
@@ -0,0 +1,232 @@
+"""Transcription engine using faster-whisper for speech-to-text."""
+
+import numpy as np
+from faster_whisper import WhisperModel
+from typing import Optional, List, Tuple
+from datetime import datetime
+import threading
+
+
+class TranscriptionResult:
+    """A single piece of transcribed speech together with its metadata."""
+
+    def __init__(self, text: str, confidence: float, timestamp: datetime, user_name: str = ""):
+        """
+        Store the transcription and its metadata.
+
+        Args:
+            text: Transcribed text; leading/trailing whitespace is stripped
+            confidence: Confidence score (0.0 to 1.0)
+            timestamp: Timestamp of transcription
+            user_name: Name of the user/speaker (empty string when not given)
+        """
+        cleaned_text = text.strip()
+        self.text = cleaned_text
+        self.confidence = confidence
+        self.timestamp = timestamp
+        self.user_name = user_name
+
+    def __repr__(self) -> str:
+        # Rendered as "[HH:MM:SS] speaker: text"; the "speaker:" part is
+        # left out when no user name was supplied.
+        stamp = self.timestamp.strftime("%H:%M:%S")
+        speaker = f" {self.user_name}:" if self.user_name else ""
+        return f"[{stamp}]{speaker} {self.text}"
+
+    def to_dict(self) -> dict:
+        """Return the result as a plain dict with an ISO-formatted timestamp."""
+        payload = dict(
+            text=self.text,
+            confidence=self.confidence,
+            timestamp=self.timestamp.isoformat(),
+            user_name=self.user_name,
+        )
+        return payload
+
+
+class TranscriptionEngine:
+    """
+    Handles speech-to-text transcription using faster-whisper.
+
+    ``model_lock`` guards ``self.model``: loading, unloading and running a
+    transcription all take the lock, so a model is never swapped out from
+    under a transcription that is still in progress.
+    """
+
+    def __init__(
+        self,
+        model_size: str = "base",
+        device: str = "cpu",
+        compute_type: str = "int8",
+        language: str = "en",
+        min_confidence: float = 0.5
+    ):
+        """
+        Initialize transcription engine.
+
+        The model itself is not loaded here; call ``load_model()`` first.
+
+        Args:
+            model_size: Whisper model size ('tiny', 'base', 'small', 'medium', 'large')
+            device: Device to use ('cpu', 'cuda', 'auto')
+            compute_type: Compute type ('int8', 'float16', 'float32')
+            language: Language code for transcription, or 'auto'
+            min_confidence: Minimum confidence threshold for transcriptions
+        """
+        self.model_size = model_size
+        self.device = device
+        self.compute_type = compute_type
+        self.language = language
+        self.min_confidence = min_confidence
+        self.model: Optional[WhisperModel] = None
+        self.model_lock = threading.Lock()
+        self.is_loaded = False
+
+    def load_model(self) -> bool:
+        """
+        Load the Whisper model described by the current settings.
+
+        Returns:
+            True if model loaded successfully, False otherwise
+        """
+        try:
+            print(f"Loading Whisper {self.model_size} model on {self.device}...")
+
+            with self.model_lock:
+                self.model = WhisperModel(
+                    self.model_size,
+                    device=self.device,
+                    compute_type=self.compute_type
+                )
+                self.is_loaded = True
+
+            print("Model loaded successfully!")
+            return True
+
+        except Exception as e:
+            # Best effort: report the failure and let the caller decide what
+            # to do with the False return value.
+            print(f"Error loading model: {e}")
+            self.is_loaded = False
+            return False
+
+    def transcribe(
+        self,
+        audio: np.ndarray,
+        sample_rate: int = 16000,
+        user_name: str = ""
+    ) -> Optional[TranscriptionResult]:
+        """
+        Transcribe audio to text.
+
+        Args:
+            audio: Audio data as numpy array (converted to float32)
+            sample_rate: Audio sample rate in Hz. Currently not used: the
+                audio is handed to the model unchanged.
+                NOTE(review): faster-whisper presumably expects 16 kHz mono
+                samples -- confirm that callers resample before calling.
+            user_name: Name of the user/speaker
+
+        Returns:
+            TranscriptionResult, or None if the model is not loaded, the
+            audio is empty, no text was recognised, the confidence is below
+            ``min_confidence``, or transcription raised an error
+        """
+        if not self.is_loaded or self.model is None:
+            print("Model not loaded")
+            return None
+
+        try:
+            # Ensure audio is float32
+            audio = np.asarray(audio, dtype=np.float32)
+
+            # Nothing to transcribe; skip the model call entirely.
+            if audio.size == 0:
+                return None
+
+            segment_texts = []
+            logprob_sum = 0.0
+
+            with self.model_lock:
+                # Re-read the model under the lock: it may have been unloaded
+                # between the unlocked check above and this point.
+                model = self.model
+                if model is None:
+                    print("Model not loaded")
+                    return None
+
+                segments, _info = model.transcribe(
+                    audio,
+                    language=self.language if self.language != "auto" else None,
+                    vad_filter=True,  # Use built-in VAD
+                    vad_parameters=dict(
+                        min_silence_duration_ms=500
+                    )
+                )
+
+                # faster-whisper returns the segments as a lazy generator, so
+                # the decoding happens while iterating; keep the loop inside
+                # the lock so the model cannot be replaced mid-transcription.
+                for segment in segments:
+                    segment_texts.append(segment.text)
+                    logprob_sum += segment.avg_logprob
+
+            if not segment_texts:
+                return None
+
+            # Convert log probability to approximate confidence (0-1 range)
+            # avg_logprob is typically in range [-1, 0], so we transform it.
+            # float() keeps the stored confidence a plain Python float.
+            avg_confidence = float(np.exp(logprob_sum / len(segment_texts)))
+
+            # Filter by minimum confidence
+            if avg_confidence < self.min_confidence:
+                return None
+
+            # Clean up text
+            text = " ".join(segment_texts).strip()
+
+            if not text:
+                return None
+
+            return TranscriptionResult(
+                text=text,
+                confidence=avg_confidence,
+                timestamp=datetime.now(),
+                user_name=user_name
+            )
+
+        except Exception as e:
+            print(f"Error during transcription: {e}")
+            return None
+
+    def change_model(self, model_size: str) -> bool:
+        """
+        Change to a different model size.
+
+        Args:
+            model_size: New model size
+
+        Returns:
+            True if model changed successfully
+        """
+        self.model_size = model_size
+        # Drop the old model under the lock (waits for a running
+        # transcription to finish) before loading the replacement.
+        self.unload_model()
+        return self.load_model()
+
+    def change_device(self, device: str, compute_type: Optional[str] = None) -> bool:
+        """
+        Change compute device.
+
+        Args:
+            device: New device ('cpu', 'cuda', etc.)
+            compute_type: Optional new compute type; the current one is kept
+                when this is None or empty
+
+        Returns:
+            True if device changed successfully
+        """
+        self.device = device
+        if compute_type:
+            self.compute_type = compute_type
+
+        # Drop the old model under the lock (waits for a running
+        # transcription to finish) before loading the replacement.
+        self.unload_model()
+        return self.load_model()
+
+    def change_language(self, language: str):
+        """
+        Change transcription language.
+
+        Takes effect on the next ``transcribe()`` call; no reload is done.
+
+        Args:
+            language: Language code or 'auto'
+        """
+        self.language = language
+
+    def unload_model(self):
+        """Unload the model from memory."""
+        with self.model_lock:
+            self.model = None
+            self.is_loaded = False
+
+    def __repr__(self) -> str:
+        return f"TranscriptionEngine(model={self.model_size}, device={self.device}, loaded={self.is_loaded})"
+
+    def __del__(self):
+        """Cleanup when object is destroyed."""
+        # The lock attribute is missing if __init__ never ran to completion
+        # (e.g. a subclass constructor raised first); nothing to unload then.
+        if getattr(self, "model_lock", None) is not None:
+            self.unload_model()