Initial commit: Local Transcription App v1.0

Phase 1 Complete - Standalone Desktop Application

Features:
- Real-time speech-to-text with Whisper (faster-whisper)
- PySide6 desktop GUI with settings dialog
- Web server for OBS browser source integration
- Audio capture with automatic sample rate detection and resampling
- Noise suppression with Voice Activity Detection (VAD)
- Configurable display settings (font, timestamps, fade duration)
- Settings apply without restart (with automatic model reloading)
- Auto-fade for web display transcriptions
- CPU/GPU support with automatic device detection
- Standalone executable builds (PyInstaller)
- CUDA build support (works on systems without CUDA hardware)

Components:
- Audio capture with sounddevice
- Noise reduction with noisereduce + webrtcvad
- Transcription with faster-whisper
- GUI with PySide6
- Web server with FastAPI + WebSocket
- Configuration system with YAML

Build System:
- Standard builds (CPU-only): build.sh / build.bat
- CUDA builds (universal): build-cuda.sh / build-cuda.bat
- Comprehensive BUILD.md documentation
- Cross-platform support (Linux, Windows)

Documentation:
- README.md with project overview and quick start
- BUILD.md with detailed build instructions
- NEXT_STEPS.md with future enhancement roadmap
- INSTALL.md with setup instructions

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
2025-12-25 18:48:23 -08:00
commit 472233aec4
31 changed files with 5116 additions and 0 deletions
--- a/client/noise_suppression.py
+++ b/client/noise_suppression.py
@@ -0,0 +1,164 @@
+"""Noise suppression module for reducing background noise in audio."""
+
+import warnings
+# Suppress pkg_resources deprecation warning from webrtcvad
+warnings.filterwarnings("ignore", message=".*pkg_resources.*", category=UserWarning)
+
+import numpy as np
+import noisereduce as nr
+import webrtcvad
+from typing import Optional
+
+
+class NoiseSuppressor:
+    """Handles noise reduction and voice activity detection."""
+
+    def __init__(
+        self,
+        sample_rate: int = 16000,
+        method: str = "noisereduce",
+        strength: float = 0.7,
+        use_vad: bool = True
+    ):
+        """
+        Initialize noise suppressor.
+
+        Args:
+            sample_rate: Audio sample rate in Hz
+            method: Noise reduction method ('noisereduce' or 'none')
+            strength: Noise reduction strength (0.0 to 1.0)
+            use_vad: Whether to use Voice Activity Detection
+        """
+        self.sample_rate = sample_rate
+        self.method = method
+        self.strength = max(0.0, min(1.0, strength))  # Clamp to [0, 1]
+        self.use_vad = use_vad
+
+        # Initialize VAD if requested
+        self.vad = None
+        if use_vad:
+            try:
+                # WebRTC VAD supports 16kHz, 32kHz, and 48kHz
+                if sample_rate in [8000, 16000, 32000, 48000]:
+                    self.vad = webrtcvad.Vad(2)  # Aggressiveness: 0-3 (2 is balanced)
+                else:
+                    print(f"Warning: VAD not supported for sample rate {sample_rate}Hz")
+                    self.use_vad = False
+            except Exception as e:
+                print(f"Warning: Failed to initialize VAD: {e}")
+                self.use_vad = False
+
+        # Store noise profile for adaptive reduction
+        self.noise_profile: Optional[np.ndarray] = None
+
+    def reduce_noise(self, audio: np.ndarray) -> np.ndarray:
+        """
+        Apply noise reduction to audio.
+
+        Args:
+            audio: Audio data as numpy array (float32, range [-1, 1])
+
+        Returns:
+            Noise-reduced audio
+        """
+        if self.method == "none" or self.strength == 0.0:
+            return audio
+
+        try:
+            # Ensure audio is float32
+            audio = audio.astype(np.float32)
+
+            if self.method == "noisereduce":
+                # Apply noisereduce noise reduction
+                reduced = nr.reduce_noise(
+                    y=audio,
+                    sr=self.sample_rate,
+                    prop_decrease=self.strength,
+                    stationary=True
+                )
+                return reduced.astype(np.float32)
+            else:
+                return audio
+
+        except Exception as e:
+            print(f"Error in noise reduction: {e}")
+            return audio
+
+    def is_speech(self, audio: np.ndarray) -> bool:
+        """
+        Detect if audio contains speech using VAD.
+
+        Args:
+            audio: Audio data as numpy array (float32, range [-1, 1])
+
+        Returns:
+            True if speech is detected, False otherwise
+        """
+        if not self.use_vad or self.vad is None:
+            return True  # Assume speech if VAD not available
+
+        try:
+            # Convert float32 audio to int16 for VAD
+            audio_int16 = (audio * 32767).astype(np.int16)
+
+            # VAD requires specific frame sizes (10, 20, or 30 ms)
+            frame_duration_ms = 30
+            frame_size = int(self.sample_rate * frame_duration_ms / 1000)
+
+            # Process audio in frames
+            num_frames = len(audio_int16) // frame_size
+            speech_frames = 0
+
+            for i in range(num_frames):
+                frame = audio_int16[i * frame_size:(i + 1) * frame_size]
+                if self.vad.is_speech(frame.tobytes(), self.sample_rate):
+                    speech_frames += 1
+
+            # Consider it speech if more than 30% of frames contain speech
+            return speech_frames > (num_frames * 0.3)
+
+        except Exception as e:
+            print(f"Error in VAD: {e}")
+            return True  # Assume speech on error
+
+    def process(self, audio: np.ndarray, skip_silent: bool = True) -> Optional[np.ndarray]:
+        """
+        Process audio with noise reduction and optional VAD filtering.
+
+        Args:
+            audio: Audio data as numpy array
+            skip_silent: If True, return None for non-speech audio
+
+        Returns:
+            Processed audio or None if silent (when skip_silent=True)
+        """
+        # Check for speech first (before noise reduction)
+        if skip_silent and self.use_vad:
+            if not self.is_speech(audio):
+                return None
+
+        # Apply noise reduction
+        processed_audio = self.reduce_noise(audio)
+
+        return processed_audio
+
+    def set_strength(self, strength: float):
+        """
+        Update noise reduction strength.
+
+        Args:
+            strength: New strength value (0.0 to 1.0)
+        """
+        self.strength = max(0.0, min(1.0, strength))
+
+    def set_vad_enabled(self, enabled: bool):
+        """
+        Enable or disable Voice Activity Detection.
+
+        Args:
+            enabled: True to enable VAD, False to disable
+        """
+        self.use_vad = enabled and self.vad is not None
+
+    def __repr__(self) -> str:
+        return f"NoiseSuppressor(method={self.method}, strength={self.strength}, vad={self.use_vad})"