Initial commit: Local Transcription App v1.0

Phase 1 Complete - Standalone Desktop Application

Features:
- Real-time speech-to-text with Whisper (faster-whisper)
- PySide6 desktop GUI with settings dialog
- Web server for OBS browser source integration
- Audio capture with automatic sample rate detection and resampling
- Noise suppression with Voice Activity Detection (VAD)
- Configurable display settings (font, timestamps, fade duration)
- Settings apply without restart (with automatic model reloading)
- Auto-fade for web display transcriptions
- CPU/GPU support with automatic device detection
- Standalone executable builds (PyInstaller)
- CUDA build support (works on systems without CUDA hardware)

Components:
- Audio capture with sounddevice
- Noise reduction with noisereduce + webrtcvad
- Transcription with faster-whisper
- GUI with PySide6
- Web server with FastAPI + WebSocket
- Configuration system with YAML

Build System:
- Standard builds (CPU-only): build.sh / build.bat
- CUDA builds (universal): build-cuda.sh / build-cuda.bat
- Comprehensive BUILD.md documentation
- Cross-platform support (Linux, Windows)

Documentation:
- README.md with project overview and quick start
- BUILD.md with detailed build instructions
- NEXT_STEPS.md with future enhancement roadmap
- INSTALL.md with setup instructions

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
2025-12-25 18:48:23 -08:00
commit 472233aec4
31 changed files with 5116 additions and 0 deletions

124
test_components.py Normal file
View File

@@ -0,0 +1,124 @@
#!/usr/bin/env python3
"""
Test script to verify all components work without GUI.
This can run in headless environments.
"""
import sys
from pathlib import Path
# Add project root to Python path
project_root = Path(__file__).parent
sys.path.insert(0, str(project_root))
print("=" * 60)
print("Testing Local Transcription Components (No GUI)")
print("=" * 60)
# Test 1: Configuration
print("\n1. Testing Configuration System...")
try:
from client.config import Config
config = Config()
print(f" ✓ Config loaded: {config.config_path}")
print(f" ✓ User name: {config.get('user.name')}")
print(f" ✓ Model: {config.get('transcription.model')}")
except Exception as e:
print(f" ✗ Config failed: {e}")
sys.exit(1)
# Test 2: Device Detection
print("\n2. Testing Device Detection...")
try:
from client.device_utils import DeviceManager
device_mgr = DeviceManager()
print(f" ✓ Available devices: {device_mgr.available_devices}")
print(f" ✓ Current device: {device_mgr.current_device}")
print(f" ✓ GPU available: {device_mgr.is_gpu_available()}")
device_info = device_mgr.get_device_info()
for dev_id, dev_desc in device_info:
print(f" - {dev_id}: {dev_desc}")
except Exception as e:
print(f" ✗ Device detection failed: {e}")
sys.exit(1)
# Test 3: Audio Devices
print("\n3. Testing Audio Capture...")
try:
from client.audio_capture import AudioCapture
devices = AudioCapture.get_input_devices()
print(f" ✓ Found {len(devices)} audio input device(s)")
for idx, name in devices[:5]: # Show first 5
print(f" - [{idx}] {name}")
if len(devices) > 5:
print(f" ... and {len(devices) - 5} more")
except Exception as e:
print(f" ✗ Audio capture failed: {e}")
# Test 4: Noise Suppression
print("\n4. Testing Noise Suppression...")
try:
from client.noise_suppression import NoiseSuppressor
import numpy as np
suppressor = NoiseSuppressor(sample_rate=16000, method="noisereduce", strength=0.7)
print(f" ✓ Noise suppressor created: {suppressor}")
# Test with dummy audio
test_audio = np.random.randn(16000).astype(np.float32) * 0.1
processed = suppressor.process(test_audio, skip_silent=False)
print(f" ✓ Processed audio shape: {processed.shape}")
except Exception as e:
print(f" ✗ Noise suppression failed: {e}")
# Test 5: Transcription Engine
print("\n5. Testing Transcription Engine (Loading Model)...")
try:
from client.transcription_engine import TranscriptionEngine
device = device_mgr.get_device_for_whisper()
compute_type = device_mgr.get_compute_type()
print(f" → Using device: {device} with compute type: {compute_type}")
print(f" → Loading model (this may take 1-2 minutes on first run)...")
engine = TranscriptionEngine(
model_size="tiny", # Use tiny for faster testing
device=device,
compute_type=compute_type,
language="en"
)
success = engine.load_model()
if success:
print(f" ✓ Model loaded successfully!")
print(f" ✓ Engine: {engine}")
# Test transcription with dummy audio
print(f"\n Testing transcription with silent audio...")
test_audio = np.zeros(48000, dtype=np.float32) # 3 seconds of silence
result = engine.transcribe(test_audio, sample_rate=16000, user_name="Test")
if result:
print(f" ✓ Transcription result: '{result.text}'")
else:
print(f" No transcription (expected for silent audio)")
engine.unload_model()
else:
print(f" ✗ Model loading failed")
sys.exit(1)
except Exception as e:
print(f" ✗ Transcription engine failed: {e}")
import traceback
traceback.print_exc()
sys.exit(1)
print("\n" + "=" * 60)
print("✓ All Components Tested Successfully!")
print("=" * 60)
print("\nThe application is ready to use!")
print("Run 'uv run python main.py' on a system with a display.")
print("=" * 60)