Initial commit: Local Transcription App v1.0

Phase 1 Complete - Standalone Desktop Application

Features:
- Real-time speech-to-text with Whisper (faster-whisper)
- PySide6 desktop GUI with settings dialog
- Web server for OBS browser source integration
- Audio capture with automatic sample rate detection and resampling
- Noise suppression with Voice Activity Detection (VAD)
- Configurable display settings (font, timestamps, fade duration)
- Settings apply without restart (with automatic model reloading)
- Auto-fade for web display transcriptions
- CPU/GPU support with automatic device detection
- Standalone executable builds (PyInstaller)
- CUDA build support (works on systems without CUDA hardware)

Components:
- Audio capture with sounddevice
- Noise reduction with noisereduce + webrtcvad
- Transcription with faster-whisper
- GUI with PySide6
- Web server with FastAPI + WebSocket
- Configuration system with YAML

Build System:
- Standard builds (CPU-only): build.sh / build.bat
- CUDA builds (universal): build-cuda.sh / build-cuda.bat
- Comprehensive BUILD.md documentation
- Cross-platform support (Linux, Windows)

Documentation:
- README.md with project overview and quick start
- BUILD.md with detailed build instructions
- NEXT_STEPS.md with future enhancement roadmap
- INSTALL.md with setup instructions

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
2025-12-25 18:48:23 -08:00
commit 472233aec4
31 changed files with 5116 additions and 0 deletions
--- a/main_cli.py
+++ b/main_cli.py
@@ -0,0 +1,221 @@
+#!/usr/bin/env python3
+"""
+Local Transcription CLI
+
+Command-line version of the transcription application.
+Works without GUI - perfect for testing and headless operation.
+"""
+
+import sys
+import os
+from pathlib import Path
+import signal
+import argparse
+
+# Add project root to Python path
+project_root = Path(__file__).parent
+sys.path.insert(0, str(project_root))
+
+from client.config import Config
+from client.device_utils import DeviceManager
+from client.audio_capture import AudioCapture
+from client.noise_suppression import NoiseSuppressor
+from client.transcription_engine import TranscriptionEngine
+
+
+class TranscriptionCLI:
+    """Headless command-line front end for real-time transcription.
+
+    Wires together configuration, compute-device selection, audio capture,
+    noise suppression and the transcription engine, and prints every
+    transcription result to stdout until interrupted with Ctrl+C.
+    """
+
+    def __init__(self, args):
+        """Create the application and apply command-line overrides.
+
+        Args:
+            args: Parsed ``argparse`` namespace exposing ``model``,
+                ``device``, ``language`` and ``user``. Each one that is set
+                (truthy) replaces the corresponding configuration value.
+        """
+        self.args = args
+        self.config = Config()
+        self.device_manager = DeviceManager()
+        self.is_running = False
+
+        # Command-line options take precedence over the loaded configuration.
+        overrides = (
+            ('transcription.model', args.model),
+            ('transcription.device', args.device),
+            ('transcription.language', args.language),
+            ('user.name', args.user),
+        )
+        for key, value in overrides:
+            if value:
+                self.config.set(key, value)
+
+        # Components are created later by initialize().
+        self.audio_capture = None
+        self.noise_suppressor = None
+        self.transcription_engine = None
+
+    def _resolve_audio_device(self):
+        """Return the configured input device index, or None for the default.
+
+        A missing value, ``None`` or the string ``'default'`` selects the
+        default input device. Any value that cannot be converted to an
+        integer is reported and also falls back to the default device instead
+        of aborting start-up with a traceback.
+        """
+        raw_device = self.config.get('audio.input_device', 'default')
+        if raw_device is None or raw_device == 'default':
+            return None
+        try:
+            return int(raw_device)
+        except (TypeError, ValueError):
+            print(
+                f"⚠ Invalid audio.input_device {raw_device!r}; "
+                "using the default input device."
+            )
+            return None
+
+    def initialize(self):
+        """Load the Whisper model and build the audio pipeline.
+
+        Returns:
+            bool: True when every component is ready, False when the
+            transcription model failed to load.
+        """
+        print("=" * 60)
+        print("Local Transcription CLI")
+        print("=" * 60)
+
+        # Device setup
+        device_config = self.config.get('transcription.device', 'auto')
+        self.device_manager.set_device(device_config)
+
+        model_size = self.config.get('transcription.model', 'base')
+        language = self.config.get('transcription.language', 'en')
+
+        print(f"\nUser: {self.config.get('user.name', 'User')}")
+        print(f"Model: {model_size}")
+        print(f"Language: {language}")
+        print(f"Device: {self.device_manager.current_device}")
+
+        # Initialize transcription engine
+        print("\nLoading Whisper model...")
+        self.transcription_engine = TranscriptionEngine(
+            model_size=model_size,
+            device=self.device_manager.get_device_for_whisper(),
+            compute_type=self.device_manager.get_compute_type(),
+            language=language,
+            min_confidence=self.config.get('processing.min_confidence', 0.5)
+        )
+
+        if not self.transcription_engine.load_model():
+            print("❌ Failed to load model!")
+            return False
+
+        print("✓ Model loaded successfully!")
+
+        # Capture and noise suppression share the same sample rate.
+        sample_rate = self.config.get('audio.sample_rate', 16000)
+
+        # Initialize audio capture
+        self.audio_capture = AudioCapture(
+            sample_rate=sample_rate,
+            chunk_duration=self.config.get('audio.chunk_duration', 3.0),
+            device=self._resolve_audio_device()
+        )
+
+        # Initialize noise suppressor
+        suppression_enabled = self.config.get('noise_suppression.enabled', True)
+        self.noise_suppressor = NoiseSuppressor(
+            sample_rate=sample_rate,
+            method="noisereduce" if suppression_enabled else "none",
+            strength=self.config.get('noise_suppression.strength', 0.7),
+            use_vad=self.config.get('processing.use_vad', True)
+        )
+
+        print("\n✓ All components initialized!")
+        return True
+
+    def process_audio_chunk(self, audio_chunk):
+        """Denoise, transcribe and print a single chunk of captured audio.
+
+        This method is handed to ``AudioCapture.start_recording`` as its
+        callback. Chunks for which the noise suppressor returns ``None``
+        (treated as silent) are skipped. Any exception is caught and printed
+        so that one failing chunk does not propagate into the caller.
+
+        Args:
+            audio_chunk: Audio data as delivered by the capture callback.
+        """
+        try:
+            # Apply noise suppression
+            processed_audio = self.noise_suppressor.process(audio_chunk, skip_silent=True)
+
+            # Skip if silent
+            if processed_audio is None:
+                return
+
+            # Transcribe
+            result = self.transcription_engine.transcribe(
+                processed_audio,
+                sample_rate=self.config.get('audio.sample_rate', 16000),
+                user_name=self.config.get('user.name', 'User')
+            )
+
+            # Display result
+            if result:
+                print(result)
+
+        except Exception as e:
+            print(f"Error processing audio: {e}")
+
+    def run(self):
+        """Run the transcription loop until interrupted.
+
+        Installs a SIGINT handler that clears ``is_running``, starts
+        recording and then waits for the flag to drop. Recording is always
+        stopped and the model always unloaded, even if an exception escapes
+        while recording.
+
+        Returns:
+            int: Process exit code; 0 after a clean shutdown, 1 when
+            initialization failed.
+        """
+        import time
+
+        if not self.initialize():
+            return 1
+
+        # Setup signal handler for graceful shutdown
+        def signal_handler(sig, frame):
+            print("\n\nStopping transcription...")
+            self.is_running = False
+
+        signal.signal(signal.SIGINT, signal_handler)
+
+        print("\n" + "=" * 60)
+        print("🎤 Recording... (Press Ctrl+C to stop)")
+        print("=" * 60)
+        print()
+
+        self.is_running = True
+        try:
+            self.audio_capture.start_recording(callback=self.process_audio_chunk)
+            try:
+                # Poll the flag rather than calling signal.pause(): pause() is
+                # unavailable on Windows, and a signal delivered between the
+                # flag check and pause() would leave the loop blocked until a
+                # second signal arrived.
+                while self.is_running:
+                    time.sleep(0.1)
+            finally:
+                self.is_running = False
+                self.audio_capture.stop_recording()
+        finally:
+            self.transcription_engine.unload_model()
+
+        print("\n" + "=" * 60)
+        print("✓ Transcription stopped")
+        print("=" * 60)
+
+        return 0
+
+
+def main():
+    """Parse command-line options and launch the CLI transcriber.
+
+    Returns:
+        int: 0 after listing the audio input devices (``--list-devices``),
+        otherwise the exit code returned by ``TranscriptionCLI.run()``.
+    """
+    cli_parser = argparse.ArgumentParser(
+        description='Local Transcription CLI - Real-time speech-to-text'
+    )
+
+    # (flags, keyword arguments) for each option, in help-output order.
+    option_specs = (
+        (('-m', '--model'), {
+            'choices': ['tiny', 'base', 'small', 'medium', 'large'],
+            'help': 'Whisper model size',
+        }),
+        (('-d', '--device'), {
+            'choices': ['cpu', 'cuda', 'auto'],
+            'help': 'Compute device',
+        }),
+        (('-l', '--language'), {
+            'help': 'Language code (e.g., en, es, fr) or "auto"',
+        }),
+        (('-u', '--user'), {
+            'help': 'User/speaker name',
+        }),
+        (('--list-devices',), {
+            'action': 'store_true',
+            'help': 'List available audio input devices',
+        }),
+    )
+    for flags, options in option_specs:
+        cli_parser.add_argument(*flags, **options)
+
+    args = cli_parser.parse_args()
+
+    # Normal operation: hand over to the application.
+    if not args.list_devices:
+        return TranscriptionCLI(args).run()
+
+    # Device listing only: print every input device and exit.
+    print("Available audio input devices:")
+    for idx, name in AudioCapture.get_input_devices():
+        print(f"  [{idx}] {name}")
+    return 0
+
+
+if __name__ == "__main__":
+    # Propagate main()'s return value as the process exit status.
+    raise SystemExit(main())