Initial commit: Local Transcription App v1.0

Phase 1 Complete - Standalone Desktop Application

Features:
- Real-time speech-to-text with Whisper (faster-whisper)
- PySide6 desktop GUI with settings dialog
- Web server for OBS browser source integration
- Audio capture with automatic sample rate detection and resampling
- Noise suppression with Voice Activity Detection (VAD)
- Configurable display settings (font, timestamps, fade duration)
- Settings apply without restart (with automatic model reloading)
- Auto-fade for web display transcriptions
- CPU/GPU support with automatic device detection
- Standalone executable builds (PyInstaller)
- CUDA build support (works on systems without CUDA hardware)

Components:
- Audio capture with sounddevice
- Noise reduction with noisereduce + webrtcvad
- Transcription with faster-whisper
- GUI with PySide6
- Web server with FastAPI + WebSocket
- Configuration system with YAML

Build System:
- Standard builds (CPU-only): build.sh / build.bat
- CUDA builds (universal; also run on systems without CUDA hardware): build-cuda.sh / build-cuda.bat
- Comprehensive BUILD.md documentation
- Cross-platform support (Linux, Windows)

Documentation:
- README.md with project overview and quick start
- BUILD.md with detailed build instructions
- NEXT_STEPS.md with future enhancement roadmap
- INSTALL.md with setup instructions

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
2025-12-25 18:48:23 -08:00
commit 472233aec4
31 changed files with 5116 additions and 0 deletions

221
main_cli.py Executable file
View File

@@ -0,0 +1,221 @@
#!/usr/bin/env python3
"""
Local Transcription CLI
Command-line version of the transcription application.
Works without GUI - perfect for testing and headless operation.
"""
import sys
import os
from pathlib import Path
import signal
import argparse
# Put this script's own directory at the front of sys.path so the `client.*`
# imports that follow can be resolved relative to this file (presumably the
# directory that contains the `client` package - confirm against repo layout).
# This must run before those imports.
project_root = Path(__file__).parent
sys.path.insert(0, str(project_root))
from client.config import Config
from client.device_utils import DeviceManager
from client.audio_capture import AudioCapture
from client.noise_suppression import NoiseSuppressor
from client.transcription_engine import TranscriptionEngine
class TranscriptionCLI:
    """Command-line front end for live transcription.

    Wires together configuration, compute-device selection, audio capture,
    noise suppression and the transcription engine, and prints every
    non-empty transcription result to stdout until interrupted (Ctrl+C).
    """

    def __init__(self, args):
        """Store parsed arguments and apply command-line overrides to config.

        Args:
            args: Parsed ``argparse`` namespace. Its ``model``, ``device``,
                ``language`` and ``user`` attributes, when truthy, override
                the corresponding configuration keys.
        """
        self.args = args
        self.config = Config()
        self.device_manager = DeviceManager()
        self.is_running = False

        # Override config with command-line arguments (only those supplied).
        overrides = (
            ('transcription.model', args.model),
            ('transcription.device', args.device),
            ('transcription.language', args.language),
            ('user.name', args.user),
        )
        for key, value in overrides:
            if value:
                self.config.set(key, value)

        # Components are created later by initialize().
        self.audio_capture = None
        self.noise_suppressor = None
        self.transcription_engine = None

    @staticmethod
    def _parse_audio_device(value):
        """Translate the ``audio.input_device`` setting into a device index.

        Args:
            value: The configured value: ``'default'``, ``None``, an integer,
                or a string holding an integer.

        Returns:
            ``None`` for ``'default'`` or ``None`` (meaning: no explicit
            device), otherwise the value converted to ``int``.

        Raises:
            ValueError: If the value is neither ``'default'`` nor convertible
                to an integer.
        """
        if value is None:
            return None
        if isinstance(value, str):
            value = value.strip()
            if value == 'default':
                return None
        try:
            return int(value)
        except (TypeError, ValueError):
            raise ValueError(
                f"Invalid audio input device {value!r}: "
                "expected 'default' or a device index"
            ) from None

    def initialize(self):
        """Create and configure all components.

        Returns:
            ``True`` when every component is ready; ``False`` when the
            configured audio input device is invalid or the model fails to
            load (a message is printed in both cases).
        """
        print("=" * 60)
        print("Local Transcription CLI")
        print("=" * 60)

        # Device setup
        device_config = self.config.get('transcription.device', 'auto')
        self.device_manager.set_device(device_config)

        model_size = self.config.get('transcription.model', 'base')
        language = self.config.get('transcription.language', 'en')
        print(f"\nUser: {self.config.get('user.name', 'User')}")
        print(f"Model: {model_size}")
        print(f"Language: {language}")
        print(f"Device: {self.device_manager.current_device}")

        # Validate the audio device setting before loading the model so a bad
        # value fails early with a readable message instead of a traceback.
        try:
            audio_device = self._parse_audio_device(
                self.config.get('audio.input_device', 'default')
            )
        except ValueError as exc:
            print(f"❌ {exc}")
            return False

        # Initialize transcription engine
        print("\nLoading Whisper model...")
        self.transcription_engine = TranscriptionEngine(
            model_size=model_size,
            device=self.device_manager.get_device_for_whisper(),
            compute_type=self.device_manager.get_compute_type(),
            language=language,
            min_confidence=self.config.get('processing.min_confidence', 0.5),
        )
        if not self.transcription_engine.load_model():
            print("❌ Failed to load model!")
            return False
        print("✓ Model loaded successfully!")

        sample_rate = self.config.get('audio.sample_rate', 16000)

        # Initialize audio capture
        self.audio_capture = AudioCapture(
            sample_rate=sample_rate,
            chunk_duration=self.config.get('audio.chunk_duration', 3.0),
            device=audio_device,
        )

        # Initialize noise suppressor
        suppression_on = self.config.get('noise_suppression.enabled', True)
        self.noise_suppressor = NoiseSuppressor(
            sample_rate=sample_rate,
            method="noisereduce" if suppression_on else "none",
            strength=self.config.get('noise_suppression.strength', 0.7),
            use_vad=self.config.get('processing.use_vad', True),
        )

        print("\n✓ All components initialized!")
        return True

    def process_audio_chunk(self, audio_chunk):
        """Denoise, transcribe and print one captured audio chunk.

        Passed as the callback to ``audio_capture.start_recording``. Any
        exception is reported on stdout rather than raised, so one failing
        chunk does not propagate out of the callback.

        Args:
            audio_chunk: The audio data handed over by the capture component.
        """
        try:
            # Apply noise suppression; a None result means the chunk is skipped.
            processed_audio = self.noise_suppressor.process(
                audio_chunk, skip_silent=True
            )
            if processed_audio is None:
                return

            # Transcribe
            result = self.transcription_engine.transcribe(
                processed_audio,
                sample_rate=self.config.get('audio.sample_rate', 16000),
                user_name=self.config.get('user.name', 'User'),
            )

            # Display result
            if result:
                print(result)
        except Exception as e:
            print(f"Error processing audio: {e}")

    def _wait_until_stopped(self):
        """Block the calling thread until ``is_running`` becomes false."""
        if hasattr(signal, "pause"):
            # Sleep until a signal arrives; the SIGINT handler clears the flag.
            while self.is_running:
                signal.pause()
        else:
            # signal.pause() is not available on Windows: poll instead.
            import time
            while self.is_running:
                time.sleep(0.1)

    def run(self):
        """Run the transcription loop until interrupted.

        Returns:
            ``1`` if initialization failed, ``0`` after a normal stop.
            Exceptions raised while recording propagate to the caller, but
            only after the started components have been shut down.
        """
        if not self.initialize():
            return 1

        # Setup signal handler for graceful shutdown
        def signal_handler(sig, frame):
            print("\n\nStopping transcription...")
            self.is_running = False

        previous_handler = signal.signal(signal.SIGINT, signal_handler)
        try:
            print("\n" + "=" * 60)
            print("🎤 Recording... (Press Ctrl+C to stop)")
            print("=" * 60)
            print()

            # Start recording
            self.is_running = True
            self.audio_capture.start_recording(callback=self.process_audio_chunk)
            try:
                # Keep running until interrupted
                self._wait_until_stopped()
            finally:
                # Only reached once recording was started successfully.
                self.audio_capture.stop_recording()
        finally:
            # Cleanup runs on errors too, not only on the normal path.
            self.is_running = False
            self.transcription_engine.unload_model()
            # signal.signal() returns None when the previous handler was not
            # installed from Python; such a handler cannot be restored.
            if previous_handler is not None:
                signal.signal(signal.SIGINT, previous_handler)

        print("\n" + "=" * 60)
        print("✓ Transcription stopped")
        print("=" * 60)
        return 0
def main():
    """Parse command-line options and dispatch.

    With ``--list-devices`` the available audio input devices are printed and
    0 is returned; otherwise a ``TranscriptionCLI`` is built from the parsed
    options and the value of its ``run()`` is returned.
    """
    cli_parser = argparse.ArgumentParser(
        description='Local Transcription CLI - Real-time speech-to-text'
    )

    # (flags, keyword options) for every supported argument, in help order.
    option_table = (
        (
            ('-m', '--model'),
            {
                'choices': ['tiny', 'base', 'small', 'medium', 'large'],
                'help': 'Whisper model size',
            },
        ),
        (
            ('-d', '--device'),
            {
                'choices': ['cpu', 'cuda', 'auto'],
                'help': 'Compute device',
            },
        ),
        (
            ('-l', '--language'),
            {'help': 'Language code (e.g., en, es, fr) or "auto"'},
        ),
        (
            ('-u', '--user'),
            {'help': 'User/speaker name'},
        ),
        (
            ('--list-devices',),
            {
                'action': 'store_true',
                'help': 'List available audio input devices',
            },
        ),
    )
    for flags, options in option_table:
        cli_parser.add_argument(*flags, **options)

    parsed = cli_parser.parse_args()

    # Normal operation: hand the options to the application and run it.
    if not parsed.list_devices:
        return TranscriptionCLI(parsed).run()

    # Device listing was requested instead of a transcription run.
    print("Available audio input devices:")
    for device_index, device_name in AudioCapture.get_input_devices():
        print(f" [{device_index}] {device_name}")
    return 0
# Script entry point: exit the process with main()'s return value as status.
if __name__ == "__main__":
    raise SystemExit(main())