Initial commit: Local Transcription App v1.0
Phase 1 Complete - Standalone Desktop Application Features: - Real-time speech-to-text with Whisper (faster-whisper) - PySide6 desktop GUI with settings dialog - Web server for OBS browser source integration - Audio capture with automatic sample rate detection and resampling - Noise suppression with Voice Activity Detection (VAD) - Configurable display settings (font, timestamps, fade duration) - Settings apply without restart (with automatic model reloading) - Auto-fade for web display transcriptions - CPU/GPU support with automatic device detection - Standalone executable builds (PyInstaller) - CUDA build support (works on systems without CUDA hardware) Components: - Audio capture with sounddevice - Noise reduction with noisereduce + webrtcvad - Transcription with faster-whisper - GUI with PySide6 - Web server with FastAPI + WebSocket - Configuration system with YAML Build System: - Standard builds (CPU-only): build.sh / build.bat - CUDA builds (universal): build-cuda.sh / build-cuda.bat - Comprehensive BUILD.md documentation - Cross-platform support (Linux, Windows) Documentation: - README.md with project overview and quick start - BUILD.md with detailed build instructions - NEXT_STEPS.md with future enhancement roadmap - INSTALL.md with setup instructions 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
221
main_cli.py
Executable file
221
main_cli.py
Executable file
@@ -0,0 +1,221 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Local Transcription CLI
|
||||
|
||||
Command-line version of the transcription application.
|
||||
Works without GUI - perfect for testing and headless operation.
|
||||
"""
|
||||
|
||||
import sys
|
||||
import os
|
||||
from pathlib import Path
|
||||
import signal
|
||||
import argparse
|
||||
|
||||
# Add the project root to sys.path so the ``client`` package imported below
# can be found. resolve() makes the entry absolute and follows symlinks, so it
# stays valid if the working directory changes or the script is launched
# through a symlink.
project_root = Path(__file__).resolve().parent
sys.path.insert(0, str(project_root))
|
||||
|
||||
from client.config import Config
|
||||
from client.device_utils import DeviceManager
|
||||
from client.audio_capture import AudioCapture
|
||||
from client.noise_suppression import NoiseSuppressor
|
||||
from client.transcription_engine import TranscriptionEngine
|
||||
|
||||
|
||||
class TranscriptionCLI:
    """Command-line transcription application.

    Command-line arguments, when given, override the matching configuration
    values. ``initialize()`` builds the transcription engine, audio capture
    and noise suppressor; ``run()`` records until interrupted with Ctrl+C.
    """

    def __init__(self, args):
        """Store the parsed arguments and apply them on top of the config.

        Args:
            args: Parsed argparse namespace exposing ``model``, ``device``,
                ``language`` and ``user``; falsy values leave the config
                untouched.
        """
        self.args = args
        self.config = Config()
        self.device_manager = DeviceManager()
        self.is_running = False

        # Only options actually supplied on the command line override config.
        overrides = (
            ('transcription.model', args.model),
            ('transcription.device', args.device),
            ('transcription.language', args.language),
            ('user.name', args.user),
        )
        for key, value in overrides:
            if value:
                self.config.set(key, value)

        # Components are created in initialize().
        self.audio_capture = None
        self.noise_suppressor = None
        self.transcription_engine = None

    @staticmethod
    def _parse_audio_device(value):
        """Convert the ``audio.input_device`` setting into a device argument.

        Args:
            value: Configured value; ``'default'`` or ``None`` mean "no
                explicit device", anything else must be convertible to int.

        Returns:
            ``None`` for the default setting, otherwise the integer index.

        Raises:
            ValueError: If ``value`` is neither ``'default'`` nor an integer.
        """
        if value is None or value == 'default':
            return None
        try:
            return int(value)
        except (TypeError, ValueError):
            raise ValueError(
                f"Invalid audio.input_device {value!r}: expected 'default' "
                "or an integer device index (see --list-devices)"
            ) from None

    def initialize(self):
        """Initialize all components.

        Returns:
            bool: True when every component was created; False when the
            audio device setting is invalid or the model fails to load.
        """
        banner = "=" * 60
        print(banner)
        print("Local Transcription CLI")
        print(banner)

        # Device setup
        self.device_manager.set_device(
            self.config.get('transcription.device', 'auto')
        )

        model_size = self.config.get('transcription.model', 'base')
        language = self.config.get('transcription.language', 'en')
        sample_rate = self.config.get('audio.sample_rate', 16000)

        print(f"\nUser: {self.config.get('user.name', 'User')}")
        print(f"Model: {model_size}")
        print(f"Language: {language}")
        print(f"Device: {self.device_manager.current_device}")

        # Validate the audio device before the (slow) model load so that a
        # misconfiguration is reported immediately instead of as a traceback.
        try:
            audio_device = self._parse_audio_device(
                self.config.get('audio.input_device', 'default')
            )
        except ValueError as exc:
            print(f"❌ {exc}")
            return False

        # Initialize transcription engine
        print("\nLoading Whisper model...")
        self.transcription_engine = TranscriptionEngine(
            model_size=model_size,
            device=self.device_manager.get_device_for_whisper(),
            compute_type=self.device_manager.get_compute_type(),
            language=language,
            min_confidence=self.config.get('processing.min_confidence', 0.5),
        )

        if not self.transcription_engine.load_model():
            print("❌ Failed to load model!")
            return False

        print("✓ Model loaded successfully!")

        # Initialize audio capture
        self.audio_capture = AudioCapture(
            sample_rate=sample_rate,
            chunk_duration=self.config.get('audio.chunk_duration', 3.0),
            device=audio_device,
        )

        # Initialize noise suppressor
        suppression_enabled = self.config.get('noise_suppression.enabled', True)
        self.noise_suppressor = NoiseSuppressor(
            sample_rate=sample_rate,
            method="noisereduce" if suppression_enabled else "none",
            strength=self.config.get('noise_suppression.strength', 0.7),
            use_vad=self.config.get('processing.use_vad', True),
        )

        print("\n✓ All components initialized!")
        return True

    def process_audio_chunk(self, audio_chunk):
        """Denoise, transcribe and print one captured audio chunk.

        Registered as the recording callback in ``run()``. Chunks for which
        the noise suppressor returns ``None`` are skipped.

        Args:
            audio_chunk: Audio data handed over by the audio capture.
        """
        try:
            # Apply noise suppression
            processed_audio = self.noise_suppressor.process(
                audio_chunk, skip_silent=True
            )

            # Skip if silent
            if processed_audio is None:
                return

            # Transcribe
            result = self.transcription_engine.transcribe(
                processed_audio,
                sample_rate=self.config.get('audio.sample_rate', 16000),
                user_name=self.config.get('user.name', 'User'),
            )

            # Display result
            if result:
                print(f"{result}")

        except Exception as e:
            # Callback boundary: report the failure and keep recording rather
            # than letting one bad chunk end the session.
            print(f"Error processing audio: {e}")

    def run(self):
        """Run the transcription loop until interrupted.

        Returns:
            int: 1 if initialization failed, 0 after a normal stop.
        """
        import time

        if not self.initialize():
            return 1

        # Setup signal handler for graceful shutdown
        def signal_handler(sig, frame):
            print("\n\nStopping transcription...")
            self.is_running = False

        previous_handler = signal.signal(signal.SIGINT, signal_handler)

        try:
            print("\n" + "=" * 60)
            print("🎤 Recording... (Press Ctrl+C to stop)")
            print("=" * 60)
            print()

            # Start recording
            self.is_running = True
            self.audio_capture.start_recording(callback=self.process_audio_chunk)
            try:
                # Poll instead of signal.pause(): pause() is unavailable on
                # Windows and can miss a SIGINT delivered between the flag
                # check and the call, which would block until a second signal.
                while self.is_running:
                    time.sleep(0.1)
            finally:
                self.is_running = False
                self.audio_capture.stop_recording()
        finally:
            # Always release the model and restore the caller's SIGINT
            # handler, even if recording failed part-way through.
            self.transcription_engine.unload_model()
            if previous_handler is not None:
                signal.signal(signal.SIGINT, previous_handler)

        print("\n" + "=" * 60)
        print("✓ Transcription stopped")
        print("=" * 60)

        return 0
|
||||
|
||||
|
||||
def _build_parser():
    """Create the argument parser for the transcription CLI."""
    parser = argparse.ArgumentParser(
        description='Local Transcription CLI - Real-time speech-to-text'
    )
    parser.add_argument(
        '-m', '--model',
        choices=['tiny', 'base', 'small', 'medium', 'large'],
        help='Whisper model size'
    )
    parser.add_argument(
        '-d', '--device',
        choices=['cpu', 'cuda', 'auto'],
        help='Compute device'
    )
    parser.add_argument(
        '-l', '--language',
        help='Language code (e.g., en, es, fr) or "auto"'
    )
    parser.add_argument(
        '-u', '--user',
        help='User/speaker name'
    )
    parser.add_argument(
        '--list-devices',
        action='store_true',
        help='List available audio input devices'
    )
    return parser


def main(argv=None):
    """Main entry point.

    Args:
        argv: Optional list of argument strings. ``None`` (the default) makes
            argparse read ``sys.argv[1:]``, as before; passing a list allows
            the CLI to be invoked programmatically.

    Returns:
        int: Process exit code (0 after listing devices, otherwise whatever
        ``TranscriptionCLI.run()`` returns).
    """
    args = _build_parser().parse_args(argv)

    # List devices if requested
    if args.list_devices:
        print("Available audio input devices:")
        for idx, name in AudioCapture.get_input_devices():
            print(f" [{idx}] {name}")
        return 0

    # Run application
    return TranscriptionCLI(args).run()
|
||||
|
||||
|
||||
# Script entry point: propagate main()'s return value as the exit status.
if __name__ == "__main__":
    raise SystemExit(main())
|
||||
Reference in New Issue
Block a user