Initial commit: Local Transcription App v1.0

Phase 1 Complete - Standalone Desktop Application

Features:
- Real-time speech-to-text with Whisper (faster-whisper)
- PySide6 desktop GUI with settings dialog
- Web server for OBS browser source integration
- Audio capture with automatic sample rate detection and resampling
- Noise suppression with Voice Activity Detection (VAD)
- Configurable display settings (font, timestamps, fade duration)
- Settings apply without restart (with automatic model reloading)
- Auto-fade for web display transcriptions
- CPU/GPU support with automatic device detection
- Standalone executable builds (PyInstaller)
- CUDA build support (works on systems without CUDA hardware)

Components:
- Audio capture with sounddevice
- Noise reduction with noisereduce + webrtcvad
- Transcription with faster-whisper
- GUI with PySide6
- Web server with FastAPI + WebSocket
- Configuration system with YAML

Build System:
- Standard builds (CPU-only): build.sh / build.bat
- CUDA builds (universal): build-cuda.sh / build-cuda.bat
- Comprehensive BUILD.md documentation
- Cross-platform support (Linux, Windows)

Documentation:
- README.md with project overview and quick start
- BUILD.md with detailed build instructions
- NEXT_STEPS.md with future enhancement roadmap
- INSTALL.md with setup instructions

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
2025-12-25 18:48:23 -08:00
commit 472233aec4
31 changed files with 5116 additions and 0 deletions
--- a/test_components.py
+++ b/test_components.py
@@ -0,0 +1,124 @@
+#!/usr/bin/env python3
+"""
+Test script to verify all components work without GUI.
+This can run in headless environments.
+"""
+
+import sys
+from pathlib import Path
+
+# Add project root to Python path
+project_root = Path(__file__).parent
+sys.path.insert(0, str(project_root))
+
+print("=" * 60)
+print("Testing Local Transcription Components (No GUI)")
+print("=" * 60)
+
+# Test 1: Configuration
+print("\n1. Testing Configuration System...")
+try:
+    from client.config import Config
+    config = Config()
+    print(f"   ✓ Config loaded: {config.config_path}")
+    print(f"   ✓ User name: {config.get('user.name')}")
+    print(f"   ✓ Model: {config.get('transcription.model')}")
+except Exception as e:
+    print(f"   ✗ Config failed: {e}")
+    sys.exit(1)
+
+# Test 2: Device Detection
+print("\n2. Testing Device Detection...")
+try:
+    from client.device_utils import DeviceManager
+    device_mgr = DeviceManager()
+    print(f"   ✓ Available devices: {device_mgr.available_devices}")
+    print(f"   ✓ Current device: {device_mgr.current_device}")
+    print(f"   ✓ GPU available: {device_mgr.is_gpu_available()}")
+
+    device_info = device_mgr.get_device_info()
+    for dev_id, dev_desc in device_info:
+        print(f"      - {dev_id}: {dev_desc}")
+except Exception as e:
+    print(f"   ✗ Device detection failed: {e}")
+    sys.exit(1)
+
+# Test 3: Audio Devices
+print("\n3. Testing Audio Capture...")
+try:
+    from client.audio_capture import AudioCapture
+    devices = AudioCapture.get_input_devices()
+    print(f"   ✓ Found {len(devices)} audio input device(s)")
+    for idx, name in devices[:5]:  # Show first 5
+        print(f"      - [{idx}] {name}")
+    if len(devices) > 5:
+        print(f"      ... and {len(devices) - 5} more")
+except Exception as e:
+    print(f"   ✗ Audio capture failed: {e}")
+
+# Test 4: Noise Suppression
+print("\n4. Testing Noise Suppression...")
+try:
+    from client.noise_suppression import NoiseSuppressor
+    import numpy as np
+
+    suppressor = NoiseSuppressor(sample_rate=16000, method="noisereduce", strength=0.7)
+    print(f"   ✓ Noise suppressor created: {suppressor}")
+
+    # Test with dummy audio
+    test_audio = np.random.randn(16000).astype(np.float32) * 0.1
+    processed = suppressor.process(test_audio, skip_silent=False)
+    print(f"   ✓ Processed audio shape: {processed.shape}")
+except Exception as e:
+    print(f"   ✗ Noise suppression failed: {e}")
+
+# Test 5: Transcription Engine
+print("\n5. Testing Transcription Engine (Loading Model)...")
+try:
+    from client.transcription_engine import TranscriptionEngine
+
+    device = device_mgr.get_device_for_whisper()
+    compute_type = device_mgr.get_compute_type()
+
+    print(f"   → Using device: {device} with compute type: {compute_type}")
+    print(f"   → Loading model (this may take 1-2 minutes on first run)...")
+
+    engine = TranscriptionEngine(
+        model_size="tiny",  # Use tiny for faster testing
+        device=device,
+        compute_type=compute_type,
+        language="en"
+    )
+
+    success = engine.load_model()
+    if success:
+        print(f"   ✓ Model loaded successfully!")
+        print(f"   ✓ Engine: {engine}")
+
+        # Test transcription with dummy audio
+        print(f"\n   Testing transcription with silent audio...")
+        test_audio = np.zeros(48000, dtype=np.float32)  # 3 seconds of silence
+        result = engine.transcribe(test_audio, sample_rate=16000, user_name="Test")
+
+        if result:
+            print(f"   ✓ Transcription result: '{result.text}'")
+        else:
+            print(f"   ℹ No transcription (expected for silent audio)")
+
+        engine.unload_model()
+    else:
+        print(f"   ✗ Model loading failed")
+        sys.exit(1)
+
+except Exception as e:
+    print(f"   ✗ Transcription engine failed: {e}")
+    import traceback
+    traceback.print_exc()
+    sys.exit(1)
+
+print("\n" + "=" * 60)
+print("✓ All Components Tested Successfully!")
+print("=" * 60)
+print("\nThe application is ready to use!")
+print("Run 'uv run python main.py' on a system with a display.")
+print("=" * 60)