#!/usr/bin/env python3
"""
Local Transcription CLI

Command-line version of the transcription application.
Works without GUI - perfect for testing and headless operation.
"""

import sys
import os
from pathlib import Path
import signal
import argparse

# Add project root to Python path
project_root = Path(__file__).parent
sys.path.insert(0, str(project_root))

from client.config import Config
from client.device_utils import DeviceManager
from client.transcription_engine_realtime import RealtimeTranscriptionEngine, TranscriptionResult


class TranscriptionCLI:
    """CLI transcription application.

    Wires a `Config`, a `DeviceManager`, and a `RealtimeTranscriptionEngine`
    together and streams transcription results to stdout until Ctrl+C.
    """

    def __init__(self, args):
        """Initialize the CLI application.

        Args:
            args: Parsed argparse namespace. Any of model/device/language/user
                that are set override the persisted configuration.
        """
        self.args = args
        self.config = Config()
        self.device_manager = DeviceManager()
        self.is_running = False

        # Override config with command-line arguments (only when provided)
        if args.model:
            self.config.set('transcription.model', args.model)
        if args.device:
            self.config.set('transcription.device', args.device)
        if args.language:
            self.config.set('transcription.language', args.language)
        if args.user:
            self.config.set('user.name', args.user)

        # Components (created lazily in initialize())
        self.transcription_engine = None

    def initialize(self):
        """Initialize all components.

        Builds the RealtimeSTT engine from config values, registers the
        transcription callbacks, and starts recording.

        Returns:
            bool: True when the engine initialized and recording started,
            False on any failure (a message is printed for the user).
        """
        print("=" * 60)
        print("Local Transcription CLI (RealtimeSTT)")
        print("=" * 60)

        # Device setup
        device_config = self.config.get('transcription.device', 'auto')
        self.device_manager.set_device(device_config)

        user_name = self.config.get('user.name', 'User')
        model = self.config.get('transcription.model', 'base.en')
        language = self.config.get('transcription.language', 'en')

        print(f"\nUser: {user_name}")
        print(f"Model: {model}")
        print(f"Language: {language}")
        print(f"Device: {self.device_manager.current_device}")

        # Get audio device.
        # NOTE(review): a non-'default' value that is not an integer string
        # (e.g. a device name) would raise ValueError here — confirm the
        # config only ever stores indices.
        audio_device_str = self.config.get('audio.input_device', 'default')
        audio_device = None if audio_device_str == 'default' else int(audio_device_str)

        # Initialize transcription engine
        print("\nInitializing RealtimeSTT engine...")
        device = self.device_manager.get_device_for_whisper()
        compute_type = self.config.get('transcription.compute_type', 'default')

        self.transcription_engine = RealtimeTranscriptionEngine(
            model=model,
            device=device,
            language=language,
            compute_type=compute_type,
            enable_realtime_transcription=self.config.get('transcription.enable_realtime_transcription', False),
            realtime_model=self.config.get('transcription.realtime_model', 'tiny.en'),
            silero_sensitivity=self.config.get('transcription.silero_sensitivity', 0.4),
            silero_use_onnx=self.config.get('transcription.silero_use_onnx', True),
            webrtc_sensitivity=self.config.get('transcription.webrtc_sensitivity', 3),
            post_speech_silence_duration=self.config.get('transcription.post_speech_silence_duration', 0.3),
            min_length_of_recording=self.config.get('transcription.min_length_of_recording', 0.5),
            min_gap_between_recordings=self.config.get('transcription.min_gap_between_recordings', 0.0),
            pre_recording_buffer_duration=self.config.get('transcription.pre_recording_buffer_duration', 0.2),
            beam_size=self.config.get('transcription.beam_size', 5),
            initial_prompt=self.config.get('transcription.initial_prompt', ''),
            no_log_file=True,
            input_device_index=audio_device,
            user_name=user_name
        )

        # Set up callbacks
        self.transcription_engine.set_callbacks(
            realtime_callback=self._on_realtime_transcription,
            final_callback=self._on_final_transcription
        )

        # Initialize engine (loads models, sets up VAD)
        success = self.transcription_engine.initialize()
        if not success:
            print("❌ Failed to initialize engine!")
            return False

        print("✓ Engine initialized successfully!")

        # Start recording
        success = self.transcription_engine.start_recording()
        if not success:
            print("❌ Failed to start recording!")
            return False

        print("✓ Recording started!")
        print("\n✓ All components ready!")
        return True

    def _on_realtime_transcription(self, result: TranscriptionResult):
        """Handle realtime (partial/preview) transcription callback."""
        if self.is_running:
            print(f"[PREVIEW] {result}")

    def _on_final_transcription(self, result: TranscriptionResult):
        """Handle final transcription callback."""
        if self.is_running:
            print(f"{result}")

    def run(self):
        """Run the transcription loop.

        Blocks until SIGINT (Ctrl+C), then shuts the engine down.

        Returns:
            int: Process exit code (0 on success, 1 if initialization failed).
        """
        if not self.initialize():
            return 1

        # Setup signal handler for graceful shutdown
        def signal_handler(sig, frame):
            print("\n\nStopping transcription...")
            self.is_running = False

        signal.signal(signal.SIGINT, signal_handler)

        print("\n" + "=" * 60)
        print("🎤 Recording... (Press Ctrl+C to stop)")
        print("=" * 60)
        print()

        # Recording is already started by the engine
        self.is_running = True

        # Keep running until interrupted. Cleanup is in `finally` so the
        # engine is stopped even if the wait loop raises unexpectedly
        # (the original skipped cleanup on any error other than the
        # Windows signal.pause() fallback).
        try:
            try:
                while self.is_running:
                    signal.pause()
            except AttributeError:
                # signal.pause() not available on Windows
                import time
                while self.is_running:
                    time.sleep(0.1)
        finally:
            # Cleanup
            self.transcription_engine.stop_recording()
            self.transcription_engine.stop()

        print("\n" + "=" * 60)
        print("✓ Transcription stopped")
        print("=" * 60)
        return 0


def main():
    """Main entry point.

    Parses command-line arguments, optionally lists audio devices, and
    otherwise runs the transcription application.

    Returns:
        int: Process exit code.
    """
    parser = argparse.ArgumentParser(
        description='Local Transcription CLI - Real-time speech-to-text'
    )
    parser.add_argument(
        '-m', '--model',
        # Includes the English-only '.en' variants: the config defaults are
        # 'base.en' (main) and 'tiny.en' (realtime), so they must be
        # selectable from the CLI as well. Backward compatible — all
        # previously valid choices remain.
        choices=['tiny', 'tiny.en', 'base', 'base.en',
                 'small', 'small.en', 'medium', 'medium.en', 'large'],
        help='Whisper model size'
    )
    parser.add_argument(
        '-d', '--device',
        choices=['cpu', 'cuda', 'auto'],
        help='Compute device'
    )
    parser.add_argument(
        '-l', '--language',
        help='Language code (e.g., en, es, fr) or "auto"'
    )
    parser.add_argument(
        '-u', '--user',
        help='User/speaker name'
    )
    parser.add_argument(
        '--list-devices',
        action='store_true',
        help='List available audio input devices'
    )

    args = parser.parse_args()

    # List devices if requested
    if args.list_devices:
        # FIX: `AudioCapture` was referenced without any import (NameError
        # on --list-devices). Imported locally so the heavy audio stack is
        # only loaded when this option is used.
        # NOTE(review): assumed to live in client.audio_capture — confirm
        # the module path against the project layout.
        from client.audio_capture import AudioCapture
        print("Available audio input devices:")
        devices = AudioCapture.get_input_devices()
        for idx, name in devices:
            print(f"  [{idx}] {name}")
        return 0

    # Run application
    app = TranscriptionCLI(args)
    return app.run()


if __name__ == "__main__":
    sys.exit(main())