Initial commit: Local Transcription App v1.0

Phase 1 Complete - Standalone Desktop Application

Features:
- Real-time speech-to-text with Whisper (faster-whisper)
- PySide6 desktop GUI with settings dialog
- Web server for OBS browser source integration
- Audio capture with automatic sample rate detection and resampling
- Noise suppression with Voice Activity Detection (VAD)
- Configurable display settings (font, timestamps, fade duration)
- Settings apply without restart (with automatic model reloading)
- Auto-fade for web display transcriptions
- CPU/GPU support with automatic device detection
- Standalone executable builds (PyInstaller)
- CUDA build support (works on systems without CUDA hardware)

Components:
- Audio capture with sounddevice
- Noise reduction with noisereduce + webrtcvad
- Transcription with faster-whisper
- GUI with PySide6
- Web server with FastAPI + WebSocket
- Configuration system with YAML

Build System:
- Standard builds (CPU-only): build.sh / build.bat
- CUDA builds (universal): build-cuda.sh / build-cuda.bat
- Comprehensive BUILD.md documentation
- Cross-platform support (Linux, Windows)

Documentation:
- README.md with project overview and quick start
- BUILD.md with detailed build instructions
- NEXT_STEPS.md with future enhancement roadmap
- INSTALL.md with setup instructions

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
2025-12-25 18:48:23 -08:00
commit 472233aec4
31 changed files with 5116 additions and 0 deletions

246
client/audio_capture.py Normal file
View File

@@ -0,0 +1,246 @@
"""Audio capture module for recording microphone or system audio."""
import numpy as np
import sounddevice as sd
from scipy import signal
from typing import Callable, Optional, List, Tuple
from threading import Thread, Event
import queue
class AudioCapture:
    """Captures audio from an input device and delivers fixed-length chunks.

    The device is opened at a sample rate it supports; every finished chunk is
    resampled to ``target_sample_rate`` before it is delivered.  Chunks are
    delivered in one of two ways:

    * callback mode - ``start_recording(callback=fn)`` calls ``fn(chunk)``
      on the recording thread for every finished chunk;
    * polling mode - ``start_recording()`` without a callback places finished
      chunks on ``audio_queue``; fetch them with ``get_audio_chunk()``.  The
      queue is unbounded, so the caller is expected to keep consuming (or to
      call ``clear_queue()``).

    Raw device blocks travel through a separate private queue so that polling
    callers never see (or steal) partially assembled audio.
    """

    def __init__(
        self,
        sample_rate: int = 16000,
        chunk_duration: float = 3.0,
        device: Optional[int] = None
    ):
        """
        Initialize audio capture.

        Args:
            sample_rate: Target audio sample rate in Hz (16000 for Whisper)
            chunk_duration: Duration of each audio chunk in seconds
            device: Input device index, or None for default

        Raises:
            ValueError: If sample_rate or chunk_duration is not positive.
        """
        # A zero/negative chunk length can never be filled sensibly, so reject
        # it here instead of misbehaving later on the recording thread.
        if sample_rate <= 0:
            raise ValueError(f"sample_rate must be positive, got {sample_rate}")
        if chunk_duration <= 0:
            raise ValueError(f"chunk_duration must be positive, got {chunk_duration}")

        self.target_sample_rate = sample_rate
        self.chunk_duration = chunk_duration
        self.device = device
        self.chunk_size = int(sample_rate * chunk_duration)

        # Hardware sample rate (auto-detected when recording starts)
        self.hardware_sample_rate: Optional[int] = None

        # Finished chunks (target rate) waiting for get_audio_chunk().
        self.audio_queue: queue.Queue = queue.Queue()
        # Raw blocks handed over by the stream callback (hardware rate).
        self._block_queue: queue.Queue = queue.Queue()

        self.is_recording = False
        self.stop_event = Event()
        self.recording_thread: Optional[Thread] = None

    def _detect_sample_rate(self) -> int:
        """
        Detect a supported sample rate for the audio device.

        Each candidate rate is probed by briefly opening an input stream.

        Returns:
            The first rate that could be opened, or 48000 if none could.
        """
        # Preferred order; dict.fromkeys drops a duplicate when the target
        # rate is itself one of the common rates, so it is not probed twice.
        candidate_rates = list(dict.fromkeys(
            [self.target_sample_rate, 48000, 44100, 22050, 32000, 8000]
        ))

        for rate in candidate_rates:
            try:
                # Opening (and immediately closing) a stream is the probe.
                with sd.InputStream(
                    device=self.device,
                    channels=1,
                    samplerate=rate,
                    blocksize=1024
                ):
                    print(f"Using hardware sample rate: {rate} Hz")
                    return rate
            except sd.PortAudioError:
                continue

        # If nothing works, default to 48000
        print("Warning: Could not detect sample rate, defaulting to 48000 Hz")
        return 48000

    def _resample(self, audio: np.ndarray, from_rate: int, to_rate: int) -> np.ndarray:
        """
        Resample audio from one sample rate to another.

        Args:
            audio: Input audio data
            from_rate: Source sample rate
            to_rate: Target sample rate

        Returns:
            The input array itself when the rates are equal, otherwise a new
            float32 array of ``int(len(audio) * to_rate / from_rate)`` samples.
        """
        if from_rate == to_rate:
            return audio

        num_samples = int(len(audio) * to_rate / from_rate)
        if num_samples == 0:
            # Nothing to produce (empty or extremely short input); avoid
            # asking scipy for a zero-length result.
            return np.zeros(0, dtype=np.float32)

        resampled = signal.resample(audio, num_samples)
        return resampled.astype(np.float32)

    @staticmethod
    def get_input_devices() -> List[Tuple[int, str]]:
        """
        Get list of available input audio devices.

        Returns:
            List of (device_index, device_name) tuples
        """
        # Only include devices with input channels
        return [
            (index, device['name'])
            for index, device in enumerate(sd.query_devices())
            if device['max_input_channels'] > 0
        ]

    @staticmethod
    def get_default_device() -> Optional[Tuple[int, str]]:
        """
        Get the default input device.

        Returns:
            (device_index, device_name) tuple, or None if it cannot be
            determined.
        """
        try:
            default_device = sd.query_devices(kind='input')

            # Prefer the index reported by the library: matching by name can
            # select the wrong entry when two devices share a name.
            index = default_device.get('index')
            if index is not None:
                return (index, default_device['name'])

            # Fallback for device records that carry no 'index' field.
            for i, device in enumerate(sd.query_devices()):
                if device['name'] == default_device['name']:
                    return (i, device['name'])
        except Exception:
            # Best effort: any lookup failure is reported as "no default".
            return None
        return None

    @staticmethod
    def _drain(pending: queue.Queue) -> None:
        """Remove every item currently in *pending* without blocking."""
        while True:
            try:
                pending.get_nowait()
            except queue.Empty:
                return

    def _audio_callback(self, indata, frames, time_info, status):
        """Callback function for sounddevice stream."""
        if status:
            print(f"Audio status: {status}")

        # flatten() already returns a copy, so the data stays valid after the
        # callback returns.
        self._block_queue.put(indata.flatten())

    def start_recording(self, callback: Optional[Callable[[np.ndarray], None]] = None):
        """
        Start recording audio on a background thread.

        Does nothing if recording is already active.

        Args:
            callback: Optional function called with every finished chunk.
                When omitted, finished chunks are queued for
                get_audio_chunk() instead.
        """
        if self.is_recording:
            return

        # Detect supported sample rate
        self.hardware_sample_rate = self._detect_sample_rate()
        hardware_rate = self.hardware_sample_rate

        # One stop event per session: if a previous thread outlived the join
        # timeout in stop_recording(), its own event stays set and it still
        # terminates instead of being revived by this new session.
        stop_event = Event()
        self.stop_event = stop_event

        # Drop raw blocks left over from an earlier session.
        self._drain(self._block_queue)

        self.is_recording = True

        def record_loop():
            """Recording loop that runs in a separate thread."""
            pending = np.array([], dtype=np.float32)

            # Chunk length in samples at the hardware rate (at least 1).
            hardware_chunk_size = max(1, int(hardware_rate * self.chunk_duration))

            try:
                with sd.InputStream(
                    device=self.device,
                    channels=1,
                    samplerate=hardware_rate,
                    callback=self._audio_callback,
                    blocksize=int(hardware_rate * 0.1)  # 100ms blocks
                ):
                    while not stop_event.is_set():
                        try:
                            # Short timeout so the stop event is polled.
                            block = self._block_queue.get(timeout=0.1)
                        except queue.Empty:
                            continue

                        try:
                            pending = np.concatenate([pending, block])

                            # Emit every complete chunk that is available.
                            while len(pending) >= hardware_chunk_size:
                                chunk = pending[:hardware_chunk_size]
                                pending = pending[hardware_chunk_size:]

                                # No-op when the rates already match.
                                chunk = self._resample(
                                    chunk,
                                    hardware_rate,
                                    self.target_sample_rate
                                )

                                if callback is not None:
                                    callback(chunk)
                                else:
                                    self.audio_queue.put(chunk)
                        except Exception as e:
                            # Keep recording even if one chunk (or the
                            # callback) fails.
                            print(f"Error in recording loop: {e}")

            except Exception as e:
                print(f"Error opening audio stream: {e}")
                # Only flag the failure if this is still the current session.
                if self.stop_event is stop_event:
                    self.is_recording = False

        self.recording_thread = Thread(target=record_loop, daemon=True)
        self.recording_thread.start()

    def stop_recording(self):
        """Stop recording audio and wait up to 2 seconds for the thread."""
        if not self.is_recording:
            return

        self.is_recording = False
        self.stop_event.set()

        thread = self.recording_thread
        self.recording_thread = None
        if thread is not None:
            thread.join(timeout=2.0)

    def get_audio_chunk(self, timeout: float = 1.0) -> Optional[np.ndarray]:
        """
        Get the next finished audio chunk (polling mode).

        Args:
            timeout: Maximum time to wait for a chunk

        Returns:
            Audio chunk as numpy array or None if timeout
        """
        try:
            return self.audio_queue.get(timeout=timeout)
        except queue.Empty:
            return None

    def is_recording_active(self) -> bool:
        """Check if recording is currently active."""
        return self.is_recording

    def clear_queue(self):
        """Clear any pending audio (finished chunks and raw blocks)."""
        self._drain(self.audio_queue)
        self._drain(self._block_queue)

    def __del__(self):
        """Cleanup when object is destroyed."""
        try:
            self.stop_recording()
        except Exception:
            # The instance may be only partially initialised, or the
            # interpreter may be shutting down; there is nothing left to do.
            pass