Files
local-transcription/main_cli.py
Developer 1c8c6ad7e8
All checks were successful
Tests / Python Backend Tests (push) Successful in 5s
Tests / Frontend Tests (push) Successful in 7s
Tests / Rust Sidecar Tests (push) Successful in 3m12s
Fix display user not updating locally until app restart
Engines now read user.name from the config object at transcription time
instead of caching it at init, so name changes take effect immediately.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-12 10:40:46 -07:00

217 lines
7.0 KiB
Python
Executable File

#!/usr/bin/env python3
"""
Local Transcription CLI
Command-line version of the transcription application.
Works without GUI - perfect for testing and headless operation.
"""
import sys
import os
from pathlib import Path
import signal
import argparse
# Put the directory containing this script at the front of sys.path so the
# `client` package imports that follow are looked up there first.
project_root = Path(__file__).parent
sys.path.insert(0, str(project_root))
from client.config import Config
from client.device_utils import DeviceManager
from client.transcription_engine_realtime import RealtimeTranscriptionEngine, TranscriptionResult
class TranscriptionCLI:
    """CLI transcription application.

    Builds a ``RealtimeTranscriptionEngine`` from the application config
    (optionally overridden by command-line arguments), prints preview and
    final transcription results to stdout, and runs until SIGINT (Ctrl+C).
    """

    def __init__(self, args):
        """Initialize the CLI application.

        Args:
            args: Parsed command-line namespace. Truthy ``model``, ``device``,
                ``language`` and ``user`` attributes override the matching
                config keys.
        """
        self.args = args
        self.config = Config()
        self.device_manager = DeviceManager()
        self.is_running = False

        # Override config with command-line arguments
        if args.model:
            self.config.set('transcription.model', args.model)
        if args.device:
            self.config.set('transcription.device', args.device)
        if args.language:
            self.config.set('transcription.language', args.language)
        if args.user:
            self.config.set('user.name', args.user)

        # Components (created in initialize())
        self.transcription_engine = None

    @staticmethod
    def _parse_audio_device(value):
        """Convert the ``audio.input_device`` config value to a device index.

        Args:
            value: ``'default'``, ``None``, an int, or a numeric string.

        Returns:
            ``None`` for the default input device, otherwise the integer
            index. A value that cannot be converted is reported on stderr and
            treated as the default instead of aborting start-up.
        """
        if value is None or value == 'default':
            return None
        try:
            return int(value)
        except (TypeError, ValueError):
            print(
                f"⚠ Invalid audio.input_device {value!r}; using default input device",
                file=sys.stderr,
            )
            return None

    def initialize(self):
        """Initialize all components.

        Selects the compute device, prints the effective settings, creates the
        transcription engine, registers the callbacks, loads the engine and
        starts recording.

        Returns:
            ``True`` when the engine was initialized and recording started,
            ``False`` if either step reported failure.
        """
        print("=" * 60)
        print("Local Transcription CLI (RealtimeSTT)")
        print("=" * 60)

        # Device setup
        device_config = self.config.get('transcription.device', 'auto')
        self.device_manager.set_device(device_config)

        user_name = self.config.get('user.name', 'User')
        model = self.config.get('transcription.model', 'base.en')
        language = self.config.get('transcription.language', 'en')

        print(f"\nUser: {user_name}")
        print(f"Model: {model}")
        print(f"Language: {language}")
        print(f"Device: {self.device_manager.current_device}")

        # Get audio device ('default' -> None, otherwise an integer index)
        audio_device = self._parse_audio_device(
            self.config.get('audio.input_device', 'default')
        )

        # Initialize transcription engine
        print("\nInitializing RealtimeSTT engine...")
        device = self.device_manager.get_device_for_whisper()
        compute_type = self.config.get('transcription.compute_type', 'default')

        self.transcription_engine = RealtimeTranscriptionEngine(
            model=model,
            device=device,
            language=language,
            compute_type=compute_type,
            enable_realtime_transcription=self.config.get('transcription.enable_realtime_transcription', False),
            realtime_model=self.config.get('transcription.realtime_model', 'tiny.en'),
            silero_sensitivity=self.config.get('transcription.silero_sensitivity', 0.4),
            silero_use_onnx=self.config.get('transcription.silero_use_onnx', True),
            webrtc_sensitivity=self.config.get('transcription.webrtc_sensitivity', 3),
            post_speech_silence_duration=self.config.get('transcription.post_speech_silence_duration', 0.3),
            min_length_of_recording=self.config.get('transcription.min_length_of_recording', 0.5),
            min_gap_between_recordings=self.config.get('transcription.min_gap_between_recordings', 0.0),
            pre_recording_buffer_duration=self.config.get('transcription.pre_recording_buffer_duration', 0.2),
            beam_size=self.config.get('transcription.beam_size', 5),
            initial_prompt=self.config.get('transcription.initial_prompt', ''),
            no_log_file=True,
            input_device_index=audio_device,
            # The config object itself is handed to the engine (not a copied
            # user name), so the engine can read current values from it.
            app_config=self.config
        )

        # Set up callbacks
        self.transcription_engine.set_callbacks(
            realtime_callback=self._on_realtime_transcription,
            final_callback=self._on_final_transcription
        )

        # Initialize engine (loads models, sets up VAD)
        success = self.transcription_engine.initialize()
        if not success:
            print("❌ Failed to initialize engine!")
            return False
        print("✓ Engine initialized successfully!")

        # Start recording
        success = self.transcription_engine.start_recording()
        if not success:
            print("❌ Failed to start recording!")
            return False
        print("✓ Recording started!")

        print("\n✓ All components ready!")
        return True

    def _on_realtime_transcription(self, result: "TranscriptionResult"):
        """Handle realtime transcription callback.

        Prints the preview result, but only while the run loop is active.
        """
        if self.is_running:
            print(f"[PREVIEW] {result}")

    def _on_final_transcription(self, result: "TranscriptionResult"):
        """Handle final transcription callback.

        Prints the final result, but only while the run loop is active.
        """
        if self.is_running:
            print(f"{result}")

    def run(self):
        """Run the transcription loop.

        Returns:
            ``1`` if initialization failed, ``0`` after a normal shutdown
            triggered by SIGINT.
        """
        if not self.initialize():
            return 1

        # Setup signal handler for graceful shutdown
        def signal_handler(sig, frame):
            print("\n\nStopping transcription...")
            self.is_running = False

        signal.signal(signal.SIGINT, signal_handler)

        print("\n" + "=" * 60)
        print("🎤 Recording... (Press Ctrl+C to stop)")
        print("=" * 60)
        print()

        # Recording is already started by the engine
        self.is_running = True

        # Keep running until interrupted
        try:
            if hasattr(signal, "pause"):
                # pause() returns after a signal handler has run; re-check the
                # flag each time in case the signal was not SIGINT.
                while self.is_running:
                    signal.pause()
            else:
                # signal.pause() not available on Windows
                import time
                while self.is_running:
                    time.sleep(0.1)
        finally:
            # Cleanup runs even if the wait loop is left by an exception, so
            # the engine is never left recording.
            self.is_running = False
            self.transcription_engine.stop_recording()
            self.transcription_engine.stop()

        print("\n" + "=" * 60)
        print("✓ Transcription stopped")
        print("=" * 60)
        return 0
def main():
    """Main entry point.

    Parses the command line, then either lists the audio input devices
    (``--list-devices``) or runs the transcription application.

    Returns:
        Process exit status: ``0`` on success, ``1`` if the device list could
        not be produced, otherwise whatever ``TranscriptionCLI.run()`` returns.
    """
    parser = argparse.ArgumentParser(
        description='Local Transcription CLI - Real-time speech-to-text'
    )
    parser.add_argument(
        '-m', '--model',
        choices=['tiny', 'base', 'small', 'medium', 'large'],
        help='Whisper model size'
    )
    parser.add_argument(
        '-d', '--device',
        choices=['cpu', 'cuda', 'auto'],
        help='Compute device'
    )
    parser.add_argument(
        '-l', '--language',
        help='Language code (e.g., en, es, fr) or "auto"'
    )
    parser.add_argument(
        '-u', '--user',
        help='User/speaker name'
    )
    parser.add_argument(
        '--list-devices',
        action='store_true',
        help='List available audio input devices'
    )
    args = parser.parse_args()

    # List devices if requested
    if args.list_devices:
        # AudioCapture is not imported at module level, so referencing it
        # directly raised NameError. Import it lazily and fail with a clear
        # message instead.
        # NOTE(review): the module path client.audio_capture is assumed from
        # the project's `client.*` layout - confirm it still exists.
        try:
            from client.audio_capture import AudioCapture
        except ImportError as exc:
            print(f"❌ Cannot list audio input devices: {exc}", file=sys.stderr)
            return 1

        print("Available audio input devices:")
        for idx, name in AudioCapture.get_input_devices():
            print(f" [{idx}] {name}")
        return 0

    # Run application
    app = TranscriptionCLI(args)
    return app.run()
# Script entry point: use main()'s return value as the process exit status.
if __name__ == "__main__":
    raise SystemExit(main())