2025-12-25 18:48:23 -08:00
|
|
|
#!/usr/bin/env python3
|
|
|
|
|
"""
|
|
|
|
|
Local Transcription CLI
|
|
|
|
|
|
|
|
|
|
Command-line version of the transcription application.
|
|
|
|
|
Works without GUI - perfect for testing and headless operation.
|
|
|
|
|
"""
|
|
|
|
|
|
|
|
|
|
import sys
|
|
|
|
|
import os
|
|
|
|
|
from pathlib import Path
|
|
|
|
|
import signal
|
|
|
|
|
import argparse
|
|
|
|
|
|
|
|
|
|
# Make the project root importable so the `client` package resolves when this
# script is run directly (rather than installed as a package). Must run before
# the `client.*` imports below.
project_root = Path(__file__).parent
sys.path.insert(0, str(project_root))
|
|
|
|
|
|
|
|
|
|
from client.config import Config
|
|
|
|
|
from client.device_utils import DeviceManager
|
2025-12-28 18:48:29 -08:00
|
|
|
from client.transcription_engine_realtime import RealtimeTranscriptionEngine, TranscriptionResult
|
2025-12-25 18:48:23 -08:00
|
|
|
|
|
|
|
|
|
|
|
|
|
class TranscriptionCLI:
    """CLI transcription application.

    Wires Config, DeviceManager and RealtimeTranscriptionEngine together and
    streams transcription results to stdout until interrupted with Ctrl+C.
    """

    def __init__(self, args):
        """Initialize the CLI application.

        Args:
            args: Parsed argparse namespace. When given, model/device/
                language/user override the persisted configuration.
        """
        self.args = args
        self.config = Config()
        self.device_manager = DeviceManager()
        self.is_running = False

        # Command-line arguments take precedence over stored config values.
        if args.model:
            self.config.set('transcription.model', args.model)
        if args.device:
            self.config.set('transcription.device', args.device)
        if args.language:
            self.config.set('transcription.language', args.language)
        if args.user:
            self.config.set('user.name', args.user)

        # Created in initialize(); None until then.
        self.transcription_engine = None

    def initialize(self):
        """Initialize all components and start recording.

        Returns:
            True when the engine initialized and recording started,
            False otherwise (an error message has already been printed).
        """
        print("=" * 60)
        print("Local Transcription CLI (RealtimeSTT)")
        print("=" * 60)

        # Device setup ('auto' presumably lets DeviceManager pick CPU/CUDA —
        # semantics live in client.device_utils).
        device_config = self.config.get('transcription.device', 'auto')
        self.device_manager.set_device(device_config)

        user_name = self.config.get('user.name', 'User')
        model = self.config.get('transcription.model', 'base.en')
        language = self.config.get('transcription.language', 'en')

        print(f"\nUser: {user_name}")
        print(f"Model: {model}")
        print(f"Language: {language}")
        print(f"Device: {self.device_manager.current_device}")

        # Resolve the audio input device; 'default' means "let the backend
        # choose". Robustness fix: a malformed index in the config no longer
        # crashes startup with an unhandled ValueError.
        audio_device_str = self.config.get('audio.input_device', 'default')
        if audio_device_str == 'default':
            audio_device = None
        else:
            try:
                audio_device = int(audio_device_str)
            except (TypeError, ValueError):
                print(f"⚠ Invalid audio device index {audio_device_str!r}; "
                      "using the default input device instead.")
                audio_device = None

        # Initialize transcription engine.
        print("\nInitializing RealtimeSTT engine...")
        device = self.device_manager.get_device_for_whisper()
        compute_type = self.config.get('transcription.compute_type', 'default')

        self.transcription_engine = RealtimeTranscriptionEngine(
            model=model,
            device=device,
            language=language,
            compute_type=compute_type,
            enable_realtime_transcription=self.config.get('transcription.enable_realtime_transcription', False),
            realtime_model=self.config.get('transcription.realtime_model', 'tiny.en'),
            silero_sensitivity=self.config.get('transcription.silero_sensitivity', 0.4),
            silero_use_onnx=self.config.get('transcription.silero_use_onnx', True),
            webrtc_sensitivity=self.config.get('transcription.webrtc_sensitivity', 3),
            post_speech_silence_duration=self.config.get('transcription.post_speech_silence_duration', 0.3),
            min_length_of_recording=self.config.get('transcription.min_length_of_recording', 0.5),
            min_gap_between_recordings=self.config.get('transcription.min_gap_between_recordings', 0.0),
            pre_recording_buffer_duration=self.config.get('transcription.pre_recording_buffer_duration', 0.2),
            beam_size=self.config.get('transcription.beam_size', 5),
            initial_prompt=self.config.get('transcription.initial_prompt', ''),
            no_log_file=True,
            input_device_index=audio_device,
            user_name=user_name
        )

        # Route engine output to the console handlers below.
        self.transcription_engine.set_callbacks(
            realtime_callback=self._on_realtime_transcription,
            final_callback=self._on_final_transcription
        )

        # Initialize engine (loads models, sets up VAD).
        if not self.transcription_engine.initialize():
            print("❌ Failed to initialize engine!")
            return False
        print("✓ Engine initialized successfully!")

        # Start capturing audio immediately; run() only waits for Ctrl+C.
        if not self.transcription_engine.start_recording():
            print("❌ Failed to start recording!")
            return False
        print("✓ Recording started!")
        print("\n✓ All components ready!")
        return True

    def _on_realtime_transcription(self, result: "TranscriptionResult"):
        """Print a low-latency partial (preview) transcription result."""
        if self.is_running:
            print(f"[PREVIEW] {result}")

    def _on_final_transcription(self, result: "TranscriptionResult"):
        """Print a finalized transcription result."""
        if self.is_running:
            print(f"{result}")

    def run(self):
        """Run the transcription loop until interrupted.

        Returns:
            Process exit code: 0 on clean shutdown, 1 when initialization
            failed.
        """
        if not self.initialize():
            return 1

        # SIGINT (Ctrl+C) flips the flag; the wait loop below then exits.
        def signal_handler(sig, frame):
            print("\n\nStopping transcription...")
            self.is_running = False

        signal.signal(signal.SIGINT, signal_handler)

        print("\n" + "=" * 60)
        print("🎤 Recording... (Press Ctrl+C to stop)")
        print("=" * 60)
        print()

        # Recording was already started by initialize(); just wait.
        self.is_running = True

        # Cleanup now runs in a `finally` block so the engine is stopped even
        # if the wait loop raises (originally an exception here leaked the
        # recording session).
        try:
            try:
                while self.is_running:
                    signal.pause()
            except AttributeError:
                # signal.pause() is not available on Windows; poll instead.
                import time
                while self.is_running:
                    time.sleep(0.1)
        finally:
            self.transcription_engine.stop_recording()
            self.transcription_engine.stop()

        print("\n" + "=" * 60)
        print("✓ Transcription stopped")
        print("=" * 60)

        return 0
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def main():
    """Parse command-line arguments and run the CLI application.

    Returns:
        Process exit code (0 on success, non-zero on failure).
    """
    parser = argparse.ArgumentParser(
        description='Local Transcription CLI - Real-time speech-to-text'
    )
    parser.add_argument(
        '-m', '--model',
        # Include the English-only variants so the config default ('base.en')
        # can also be selected explicitly on the command line.
        choices=[
            'tiny', 'tiny.en', 'base', 'base.en',
            'small', 'small.en', 'medium', 'medium.en', 'large',
        ],
        help='Whisper model size'
    )
    parser.add_argument(
        '-d', '--device',
        choices=['cpu', 'cuda', 'auto'],
        help='Compute device'
    )
    parser.add_argument(
        '-l', '--language',
        help='Language code (e.g., en, es, fr) or "auto"'
    )
    parser.add_argument(
        '-u', '--user',
        help='User/speaker name'
    )
    parser.add_argument(
        '--list-devices',
        action='store_true',
        help='List available audio input devices'
    )

    args = parser.parse_args()

    # List devices if requested.
    if args.list_devices:
        # Bug fix: `AudioCapture` was referenced without ever being imported,
        # so --list-devices always raised NameError. Import it lazily and
        # fail with a readable message if the audio backend is unavailable.
        # NOTE(review): module path assumed from the other `client.*` imports
        # at the top of this file — confirm against the project layout.
        try:
            from client.audio_capture import AudioCapture
        except ImportError as exc:
            print(f"Could not load audio backend: {exc}")
            return 1
        print("Available audio input devices:")
        for idx, name in AudioCapture.get_input_devices():
            print(f"  [{idx}] {name}")
        return 0

    # Run application
    app = TranscriptionCLI(args)
    return app.run()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Script entry point: propagate main()'s return value as the process exit
# code (also makes the module importable without side effects).
if __name__ == "__main__":
    sys.exit(main())
|