#!/usr/bin/env python3
"""
Local Transcription CLI

Command-line version of the transcription application.
Works without GUI - perfect for testing and headless operation.
"""

import sys
import os
from pathlib import Path
import signal
import argparse

# Add project root to Python path
project_root = Path(__file__).parent
sys.path.insert(0, str(project_root))

from client.config import Config
from client.device_utils import DeviceManager
from client.audio_capture import AudioCapture
from client.noise_suppression import NoiseSuppressor
from client.transcription_engine import TranscriptionEngine


class TranscriptionCLI:
    """CLI transcription application."""

    def __init__(self, args):
        """Initialize the CLI application."""
        self.args = args
        self.config = Config()
        self.device_manager = DeviceManager()
        self.is_running = False

        # Override config with command-line arguments
        if args.model:
            self.config.set('transcription.model', args.model)
        if args.device:
            self.config.set('transcription.device', args.device)
        if args.language:
            self.config.set('transcription.language', args.language)
        if args.user:
            self.config.set('user.name', args.user)

        # Components
        self.audio_capture = None
        self.noise_suppressor = None
        self.transcription_engine = None

    def initialize(self):
        """Initialize all components."""
        print("=" * 60)
        print("Local Transcription CLI")
        print("=" * 60)

        # Device setup
        device_config = self.config.get('transcription.device', 'auto')
        self.device_manager.set_device(device_config)

        print(f"\nUser: {self.config.get('user.name', 'User')}")
        print(f"Model: {self.config.get('transcription.model', 'base')}")
        print(f"Language: {self.config.get('transcription.language', 'en')}")
        print(f"Device: {self.device_manager.current_device}")

        # Initialize transcription engine
        print("\nLoading Whisper model...")
        model_size = self.config.get('transcription.model', 'base')
        language = self.config.get('transcription.language', 'en')
        device = self.device_manager.get_device_for_whisper()
        compute_type = self.device_manager.get_compute_type()

        self.transcription_engine = TranscriptionEngine(
            model_size=model_size,
            device=device,
            compute_type=compute_type,
            language=language,
            min_confidence=self.config.get('processing.min_confidence', 0.5)
        )

        success = self.transcription_engine.load_model()
        if not success:
            print("āŒ Failed to load model!")
            return False

        print("āœ“ Model loaded successfully!")

        # Initialize audio capture
        audio_device_str = self.config.get('audio.input_device', 'default')
        audio_device = None if audio_device_str == 'default' else int(audio_device_str)

        self.audio_capture = AudioCapture(
            sample_rate=self.config.get('audio.sample_rate', 16000),
            chunk_duration=self.config.get('audio.chunk_duration', 3.0),
            overlap_duration=self.config.get('audio.overlap_duration', 0.5),
            device=audio_device
        )

        # Initialize noise suppressor
        self.noise_suppressor = NoiseSuppressor(
            sample_rate=self.config.get('audio.sample_rate', 16000),
            method="noisereduce" if self.config.get('noise_suppression.enabled', True) else "none",
            strength=self.config.get('noise_suppression.strength', 0.7),
            use_vad=self.config.get('processing.use_vad', True)
        )

        print("\nāœ“ All components initialized!")
        return True

    def process_audio_chunk(self, audio_chunk):
        """Process an audio chunk."""
        try:
            # Apply noise suppression
            processed_audio = self.noise_suppressor.process(audio_chunk, skip_silent=True)

            # Skip if silent
            if processed_audio is None:
                return

            # Transcribe
            user_name = self.config.get('user.name', 'User')
            result = self.transcription_engine.transcribe(
                processed_audio,
                sample_rate=self.config.get('audio.sample_rate', 16000),
                user_name=user_name
            )

            # Display result
            if result:
                print(result)

        except Exception as e:
            print(f"Error processing audio: {e}")

    def run(self):
        """Run the transcription loop."""
        if not self.initialize():
            return 1

        # Setup signal handler for graceful shutdown
        def signal_handler(sig, frame):
            print("\n\nStopping transcription...")
            self.is_running = False

        signal.signal(signal.SIGINT, signal_handler)

        print("\n" + "=" * 60)
        print("šŸŽ¤ Recording... (Press Ctrl+C to stop)")
        print("=" * 60)
        print()

        # Start recording
        self.is_running = True
        self.audio_capture.start_recording(callback=self.process_audio_chunk)

        # Keep running until interrupted
        try:
            while self.is_running:
                signal.pause()
        except AttributeError:
            # signal.pause() not available on Windows
            import time
            while self.is_running:
                time.sleep(0.1)

        # Cleanup
        self.audio_capture.stop_recording()
        self.transcription_engine.unload_model()

        print("\n" + "=" * 60)
        print("āœ“ Transcription stopped")
        print("=" * 60)

        return 0


def main():
    """Main entry point."""
    parser = argparse.ArgumentParser(
        description='Local Transcription CLI - Real-time speech-to-text'
    )
    parser.add_argument(
        '-m', '--model',
        choices=['tiny', 'base', 'small', 'medium', 'large'],
        help='Whisper model size'
    )
    parser.add_argument(
        '-d', '--device',
        choices=['cpu', 'cuda', 'auto'],
        help='Compute device'
    )
    parser.add_argument(
        '-l', '--language',
        help='Language code (e.g., en, es, fr) or "auto"'
    )
    parser.add_argument(
        '-u', '--user',
        help='User/speaker name'
    )
    parser.add_argument(
        '--list-devices',
        action='store_true',
        help='List available audio input devices'
    )

    args = parser.parse_args()

    # List devices if requested
    if args.list_devices:
        print("Available audio input devices:")
        devices = AudioCapture.get_input_devices()
        for idx, name in devices:
            print(f"  [{idx}] {name}")
        return 0

    # Run application
    app = TranscriptionCLI(args)
    return app.run()


if __name__ == "__main__":
    sys.exit(main())
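
# Example invocations, based only on the arguments defined in main() above.
# The script filename used below is an assumption (it is not stated in this
# file), so adjust it to match where this module lives in your checkout:
#
#   python transcribe_cli.py --list-devices
#   python transcribe_cli.py -m small -d cuda -l en -u Alice
#   python transcribe_cli.py --model base --device cpu --language auto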