client/noise_suppression.py

"""Noise suppression module for reducing background noise in audio."""

import warnings
# Suppress pkg_resources deprecation warning from webrtcvad
warnings.filterwarnings("ignore", message=".*pkg_resources.*", category=UserWarning)

import numpy as np
import noisereduce as nr
import webrtcvad
from typing import Optional


class NoiseSuppressor:
    """Handles noise reduction and voice activity detection.

    Noise reduction is delegated to ``noisereduce`` and speech detection to
    a WebRTC VAD instance. Both stages are best-effort: on failure a message
    is printed and the audio is passed through (or assumed to be speech)
    rather than raising to the caller.
    """

    # Sample rates (Hz) for which a WebRTC VAD instance is created.
    VAD_SAMPLE_RATES = (8000, 16000, 32000, 48000)
    # WebRTC VAD aggressiveness: 0-3 (2 is balanced).
    VAD_AGGRESSIVENESS = 2
    # VAD requires specific frame sizes (10, 20, or 30 ms).
    VAD_FRAME_MS = 30
    # Audio counts as speech when more than this fraction of frames is voiced.
    SPEECH_FRAME_RATIO = 0.3

    def __init__(
        self,
        sample_rate: int = 16000,
        method: str = "noisereduce",
        strength: float = 0.7,
        use_vad: bool = True
    ):
        """
        Initialize noise suppressor.

        Args:
            sample_rate: Audio sample rate in Hz
            method: Noise reduction method ('noisereduce' or 'none')
            strength: Noise reduction strength (0.0 to 1.0); values outside
                the range are clamped
            use_vad: Whether to use Voice Activity Detection. If the VAD
                cannot be created, ``self.use_vad`` is reset to False.
        """
        self.sample_rate = sample_rate
        self.method = method
        self.strength = max(0.0, min(1.0, strength))  # Clamp to [0, 1]
        self.use_vad = use_vad

        # Initialize VAD if requested; otherwise it is created lazily the
        # first time set_vad_enabled(True) is called.
        self.vad = None
        if use_vad and not self._init_vad():
            self.use_vad = False

        # Store noise profile for adaptive reduction
        # (not read anywhere in this class).
        self.noise_profile: Optional[np.ndarray] = None

    def _init_vad(self) -> bool:
        """
        Create the WebRTC VAD instance if it does not exist yet.

        Returns:
            True if a VAD instance is available afterwards, False if the
            sample rate is unsupported or construction failed (a warning is
            printed in both failure cases).
        """
        if self.vad is not None:
            return True

        # WebRTC VAD supports 8kHz, 16kHz, 32kHz, and 48kHz
        if self.sample_rate not in self.VAD_SAMPLE_RATES:
            print(f"Warning: VAD not supported for sample rate {self.sample_rate}Hz")
            return False

        try:
            self.vad = webrtcvad.Vad(self.VAD_AGGRESSIVENESS)
        except Exception as e:
            # Deliberately broad: a missing or broken VAD must not prevent
            # the rest of the audio pipeline from working.
            print(f"Warning: Failed to initialize VAD: {e}")
            return False
        return True

    @staticmethod
    def _to_pcm16(audio: np.ndarray) -> np.ndarray:
        """
        Convert float audio in [-1, 1] to 16-bit PCM samples.

        Samples outside [-1, 1] are clipped first so that they saturate at
        full scale instead of overflowing the int16 range.
        """
        return (np.clip(audio, -1.0, 1.0) * 32767).astype(np.int16)

    def reduce_noise(self, audio: np.ndarray) -> np.ndarray:
        """
        Apply noise reduction to audio.

        Args:
            audio: Audio data as numpy array (float32, range [-1, 1])

        Returns:
            Noise-reduced audio as float32. The input is returned unchanged
            when reduction is disabled (method 'none' or strength 0.0) or
            when noise reduction raises an error.
        """
        if self.method == "none" or self.strength == 0.0:
            return audio

        try:
            # Ensure audio is float32
            audio = audio.astype(np.float32)

            if self.method != "noisereduce":
                # Unknown method: nothing to apply beyond the dtype conversion.
                return audio

            # Apply noisereduce noise reduction
            reduced = nr.reduce_noise(
                y=audio,
                sr=self.sample_rate,
                prop_decrease=self.strength,
                stationary=True
            )
            return reduced.astype(np.float32)

        except Exception as e:
            # Best-effort: fall back to the audio as it stands rather than
            # dropping it.
            print(f"Error in noise reduction: {e}")
            return audio

    def is_speech(self, audio: np.ndarray) -> bool:
        """
        Detect if audio contains speech using VAD.

        The audio is split into consecutive 30 ms frames (a trailing partial
        frame is ignored) and each frame is classified by the VAD.

        Args:
            audio: Audio data as numpy array (float32, range [-1, 1])

        Returns:
            True if more than 30% of the frames contain speech, or if VAD is
            unavailable or fails. False otherwise, including when the audio
            is shorter than a single frame.
        """
        if not self.use_vad or self.vad is None:
            return True  # Assume speech if VAD not available

        try:
            # Convert float audio to int16 for VAD (clipped to avoid overflow)
            audio_int16 = self._to_pcm16(audio)

            frame_size = int(self.sample_rate * self.VAD_FRAME_MS / 1000)
            num_frames = len(audio_int16) // frame_size
            if num_frames == 0:
                # Not enough samples for even one VAD frame.
                return False

            # Count the frames the VAD classifies as speech
            speech_frames = sum(
                1
                for start in range(0, num_frames * frame_size, frame_size)
                if self.vad.is_speech(
                    audio_int16[start:start + frame_size].tobytes(),
                    self.sample_rate,
                )
            )

            # Consider it speech if more than 30% of frames contain speech
            return speech_frames > (num_frames * self.SPEECH_FRAME_RATIO)

        except Exception as e:
            print(f"Error in VAD: {e}")
            return True  # Assume speech on error

    def process(self, audio: np.ndarray, skip_silent: bool = True) -> Optional[np.ndarray]:
        """
        Process audio with noise reduction and optional VAD filtering.

        Args:
            audio: Audio data as numpy array
            skip_silent: If True, return None for non-speech audio

        Returns:
            Processed audio or None if silent (when skip_silent=True)
        """
        # Check for speech first (before noise reduction)
        if skip_silent and self.use_vad and not self.is_speech(audio):
            return None

        # Apply noise reduction
        return self.reduce_noise(audio)

    def set_strength(self, strength: float):
        """
        Update noise reduction strength.

        Args:
            strength: New strength value (0.0 to 1.0); clamped to that range
        """
        self.strength = max(0.0, min(1.0, strength))

    def set_vad_enabled(self, enabled: bool):
        """
        Enable or disable Voice Activity Detection.

        Enabling creates the VAD instance on demand, so VAD can be switched
        on even if the suppressor was constructed with ``use_vad=False``.
        If the VAD cannot be created, VAD stays disabled.

        Args:
            enabled: True to enable VAD, False to disable
        """
        if not enabled:
            self.use_vad = False
            return
        self.use_vad = self._init_vad()

    def __repr__(self) -> str:
        return f"NoiseSuppressor(method={self.method}, strength={self.strength}, vad={self.use_vad})"
Initial commit: Local Transcription App v1.0 Phase 1 Complete - Standalone Desktop Application Features: - Real-time speech-to-text with Whisper (faster-whisper) - PySide6 desktop GUI with settings dialog - Web server for OBS browser source integration - Audio capture with automatic sample rate detection and resampling - Noise suppression with Voice Activity Detection (VAD) - Configurable display settings (font, timestamps, fade duration) - Settings apply without restart (with automatic model reloading) - Auto-fade for web display transcriptions - CPU/GPU support with automatic device detection - Standalone executable builds (PyInstaller) - CUDA build support (works on systems without CUDA hardware) Components: - Audio capture with sounddevice - Noise reduction with noisereduce + webrtcvad - Transcription with faster-whisper - GUI with PySide6 - Web server with FastAPI + WebSocket - Configuration system with YAML Build System: - Standard builds (CPU-only): build.sh / build.bat - CUDA builds (universal): build-cuda.sh / build-cuda.bat - Comprehensive BUILD.md documentation - Cross-platform support (Linux, Windows) Documentation: - README.md with project overview and quick start - BUILD.md with detailed build instructions - NEXT_STEPS.md with future enhancement roadmap - INSTALL.md with setup instructions 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude <noreply@anthropic.com> 2025-12-25 18:48:23 -08:00			`"""Noise suppression module for reducing background noise in audio."""`

			`import warnings`
			`# Suppress pkg_resources deprecation warning from webrtcvad`
			`warnings.filterwarnings("ignore", message=".pkg_resources.", category=UserWarning)`

			`import numpy as np`
			`import noisereduce as nr`
			`import webrtcvad`
			`from typing import Optional`


			`class NoiseSuppressor:`
			`"""Handles noise reduction and voice activity detection."""`

			`def __init__(`
			`self,`
			`sample_rate: int = 16000,`
			`method: str = "noisereduce",`
			`strength: float = 0.7,`
			`use_vad: bool = True`
			`):`
			`"""`
			`Initialize noise suppressor.`

			`Args:`
			`sample_rate: Audio sample rate in Hz`
			`method: Noise reduction method ('noisereduce' or 'none')`
			`strength: Noise reduction strength (0.0 to 1.0)`
			`use_vad: Whether to use Voice Activity Detection`
			`"""`
			`self.sample_rate = sample_rate`
			`self.method = method`
			`self.strength = max(0.0, min(1.0, strength)) # Clamp to [0, 1]`
			`self.use_vad = use_vad`

			`# Initialize VAD if requested`
			`self.vad = None`
			`if use_vad:`
			`try:`
			`# WebRTC VAD supports 16kHz, 32kHz, and 48kHz`
			`if sample_rate in [8000, 16000, 32000, 48000]:`
			`self.vad = webrtcvad.Vad(2) # Aggressiveness: 0-3 (2 is balanced)`
			`else:`
			`print(f"Warning: VAD not supported for sample rate {sample_rate}Hz")`
			`self.use_vad = False`
			`except Exception as e:`
			`print(f"Warning: Failed to initialize VAD: {e}")`
			`self.use_vad = False`

			`# Store noise profile for adaptive reduction`
			`self.noise_profile: Optional[np.ndarray] = None`

			`def reduce_noise(self, audio: np.ndarray) -> np.ndarray:`
			`"""`
			`Apply noise reduction to audio.`

			`Args:`
			`audio: Audio data as numpy array (float32, range [-1, 1])`

			`Returns:`
			`Noise-reduced audio`
			`"""`
			`if self.method == "none" or self.strength == 0.0:`
			`return audio`

			`try:`
			`# Ensure audio is float32`
			`audio = audio.astype(np.float32)`

			`if self.method == "noisereduce":`
			`# Apply noisereduce noise reduction`
			`reduced = nr.reduce_noise(`
			`y=audio,`
			`sr=self.sample_rate,`
			`prop_decrease=self.strength,`
			`stationary=True`
			`)`
			`return reduced.astype(np.float32)`
			`else:`
			`return audio`

			`except Exception as e:`
			`print(f"Error in noise reduction: {e}")`
			`return audio`

			`def is_speech(self, audio: np.ndarray) -> bool:`
			`"""`
			`Detect if audio contains speech using VAD.`

			`Args:`
			`audio: Audio data as numpy array (float32, range [-1, 1])`

			`Returns:`
			`True if speech is detected, False otherwise`
			`"""`
			`if not self.use_vad or self.vad is None:`
			`return True # Assume speech if VAD not available`

			`try:`
			`# Convert float32 audio to int16 for VAD`
			`audio_int16 = (audio * 32767).astype(np.int16)`

			`# VAD requires specific frame sizes (10, 20, or 30 ms)`
			`frame_duration_ms = 30`
			`frame_size = int(self.sample_rate * frame_duration_ms / 1000)`

			`# Process audio in frames`
			`num_frames = len(audio_int16) // frame_size`
			`speech_frames = 0`

			`for i in range(num_frames):`
			`frame = audio_int16[i * frame_size:(i + 1) * frame_size]`
			`if self.vad.is_speech(frame.tobytes(), self.sample_rate):`
			`speech_frames += 1`

			`# Consider it speech if more than 30% of frames contain speech`
			`return speech_frames > (num_frames * 0.3)`

			`except Exception as e:`
			`print(f"Error in VAD: {e}")`
			`return True # Assume speech on error`

			`def process(self, audio: np.ndarray, skip_silent: bool = True) -> Optional[np.ndarray]:`
			`"""`
			`Process audio with noise reduction and optional VAD filtering.`

			`Args:`
			`audio: Audio data as numpy array`
			`skip_silent: If True, return None for non-speech audio`

			`Returns:`
			`Processed audio or None if silent (when skip_silent=True)`
			`"""`
			`# Check for speech first (before noise reduction)`
			`if skip_silent and self.use_vad:`
			`if not self.is_speech(audio):`
			`return None`

			`# Apply noise reduction`
			`processed_audio = self.reduce_noise(audio)`

			`return processed_audio`

			`def set_strength(self, strength: float):`
			`"""`
			`Update noise reduction strength.`

			`Args:`
			`strength: New strength value (0.0 to 1.0)`
			`"""`
			`self.strength = max(0.0, min(1.0, strength))`

			`def set_vad_enabled(self, enabled: bool):`
			`"""`
			`Enable or disable Voice Activity Detection.`

			`Args:`
			`enabled: True to enable VAD, False to disable`
			`"""`
			`self.use_vad = enabled and self.vad is not None`

			`def __repr__(self) -> str:`
			`return f"NoiseSuppressor(method={self.method}, strength={self.strength}, vad={self.use_vad})"`