local-transcription/main_cli.py
Josh Knapp 0ba84e6ddd Improve transcription accuracy with overlapping audio chunks
Changes:
1. Changed UI text from "Recording" to "Transcribing" for clarity
2. Implemented overlapping audio chunks to prevent word cutoff

Audio Overlap Feature:
- Added overlap_duration parameter (default: 0.5 seconds)
- Audio chunks now overlap by 0.5s to capture words at boundaries
- Prevents missed words when chunks are processed separately
- Configurable via audio.overlap_duration in config.yaml (example below)
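
For reference, the relevant audio section of config/default_config.yaml would look
roughly like this (layout assumed; the values shown are the defaults that
main_cli.py falls back to):

  audio:
    input_device: default
    sample_rate: 16000
    chunk_duration: 3.0
    overlap_duration: 0.5   # seconds shared between consecutive chunks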

How it works:
- Each 3-second chunk starts with the last 0.5s of the previous chunk
- The buffer advances by (chunk_size - overlap_size) instead of a full chunk (see the code sketch after the example below)
- Ensures words at chunk boundaries are captured in at least one chunk
- Whisper's context handling prevents duplicate transcription of the overlapped audio

Example with 3s chunks and 0.5s overlap:
  Chunk 1: [0.0s - 3.0s]
  Chunk 2: [2.5s - 5.5s]  <- 0.5s overlap
  Chunk 3: [5.0s - 8.0s]  <- 0.5s overlap
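
A minimal sketch of the buffer advance described above (hypothetical helper, for
illustration only; the actual implementation lives in client/audio_capture.py):

  import numpy as np

  def iter_overlapping_chunks(samples, sample_rate=16000,
                              chunk_duration=3.0, overlap_duration=0.5):
      """Yield fixed-size chunks that overlap by overlap_duration seconds."""
      chunk_size = int(chunk_duration * sample_rate)           # 48000 samples at 16 kHz
      step = chunk_size - int(overlap_duration * sample_rate)  # advance by chunk - overlap
      start = 0
      while start + chunk_size <= len(samples):
          yield samples[start:start + chunk_size]
          start += step

  # 8 s of audio yields chunks covering 0.0-3.0s, 2.5-5.5s and 5.0-8.0s, as above
  chunks = list(iter_overlapping_chunks(np.zeros(8 * 16000, dtype=np.float32)))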

Files modified:
- client/audio_capture.py: Implemented overlapping buffer logic
- config/default_config.yaml: Added overlap_duration setting
- gui/main_window_qt.py: Updated UI text, passed overlap param
- main_cli.py: Passed overlap param

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
2025-12-26 08:47:19 -08:00

223 lines
6.8 KiB
Python
Executable File

#!/usr/bin/env python3
"""
Local Transcription CLI
Command-line version of the transcription application.
Works without GUI - perfect for testing and headless operation.
"""
import sys
import os
from pathlib import Path
import signal
import argparse

# Add project root to Python path
project_root = Path(__file__).parent
sys.path.insert(0, str(project_root))

from client.config import Config
from client.device_utils import DeviceManager
from client.audio_capture import AudioCapture
from client.noise_suppression import NoiseSuppressor
from client.transcription_engine import TranscriptionEngine


class TranscriptionCLI:
    """CLI transcription application."""

    def __init__(self, args):
        """Initialize the CLI application."""
        self.args = args
        self.config = Config()
        self.device_manager = DeviceManager()
        self.is_running = False

        # Override config with command-line arguments
        if args.model:
            self.config.set('transcription.model', args.model)
        if args.device:
            self.config.set('transcription.device', args.device)
        if args.language:
            self.config.set('transcription.language', args.language)
        if args.user:
            self.config.set('user.name', args.user)

        # Components
        self.audio_capture = None
        self.noise_suppressor = None
        self.transcription_engine = None

    def initialize(self):
        """Initialize all components."""
        print("=" * 60)
        print("Local Transcription CLI")
        print("=" * 60)

        # Device setup
        device_config = self.config.get('transcription.device', 'auto')
        self.device_manager.set_device(device_config)

        print(f"\nUser: {self.config.get('user.name', 'User')}")
        print(f"Model: {self.config.get('transcription.model', 'base')}")
        print(f"Language: {self.config.get('transcription.language', 'en')}")
        print(f"Device: {self.device_manager.current_device}")

        # Initialize transcription engine
        print("\nLoading Whisper model...")
        model_size = self.config.get('transcription.model', 'base')
        language = self.config.get('transcription.language', 'en')
        device = self.device_manager.get_device_for_whisper()
        compute_type = self.device_manager.get_compute_type()

        self.transcription_engine = TranscriptionEngine(
            model_size=model_size,
            device=device,
            compute_type=compute_type,
            language=language,
            min_confidence=self.config.get('processing.min_confidence', 0.5)
        )

        success = self.transcription_engine.load_model()
        if not success:
            print("❌ Failed to load model!")
            return False
        print("✓ Model loaded successfully!")

        # Initialize audio capture
        audio_device_str = self.config.get('audio.input_device', 'default')
        audio_device = None if audio_device_str == 'default' else int(audio_device_str)
        self.audio_capture = AudioCapture(
            sample_rate=self.config.get('audio.sample_rate', 16000),
            chunk_duration=self.config.get('audio.chunk_duration', 3.0),
            overlap_duration=self.config.get('audio.overlap_duration', 0.5),
            device=audio_device
        )

        # Initialize noise suppressor
        self.noise_suppressor = NoiseSuppressor(
            sample_rate=self.config.get('audio.sample_rate', 16000),
            method="noisereduce" if self.config.get('noise_suppression.enabled', True) else "none",
            strength=self.config.get('noise_suppression.strength', 0.7),
            use_vad=self.config.get('processing.use_vad', True)
        )

        print("\n✓ All components initialized!")
        return True

    def process_audio_chunk(self, audio_chunk):
        """Process an audio chunk."""
        try:
            # Apply noise suppression
            processed_audio = self.noise_suppressor.process(audio_chunk, skip_silent=True)

            # Skip if silent
            if processed_audio is None:
                return

            # Transcribe
            user_name = self.config.get('user.name', 'User')
            result = self.transcription_engine.transcribe(
                processed_audio,
                sample_rate=self.config.get('audio.sample_rate', 16000),
                user_name=user_name
            )

            # Display result
            if result:
                print(result)
        except Exception as e:
            print(f"Error processing audio: {e}")

    def run(self):
        """Run the transcription loop."""
        if not self.initialize():
            return 1

        # Setup signal handler for graceful shutdown
        def signal_handler(sig, frame):
            print("\n\nStopping transcription...")
            self.is_running = False

        signal.signal(signal.SIGINT, signal_handler)

        print("\n" + "=" * 60)
        print("🎤 Recording... (Press Ctrl+C to stop)")
        print("=" * 60)
        print()

        # Start recording
        self.is_running = True
        self.audio_capture.start_recording(callback=self.process_audio_chunk)

        # Keep running until interrupted
        try:
            while self.is_running:
                signal.pause()
        except AttributeError:
            # signal.pause() is not available on Windows
            import time
            while self.is_running:
                time.sleep(0.1)

        # Cleanup
        self.audio_capture.stop_recording()
        self.transcription_engine.unload_model()

        print("\n" + "=" * 60)
        print("✓ Transcription stopped")
        print("=" * 60)
        return 0


def main():
    """Main entry point."""
    parser = argparse.ArgumentParser(
        description='Local Transcription CLI - Real-time speech-to-text'
    )
    parser.add_argument(
        '-m', '--model',
        choices=['tiny', 'base', 'small', 'medium', 'large'],
        help='Whisper model size'
    )
    parser.add_argument(
        '-d', '--device',
        choices=['cpu', 'cuda', 'auto'],
        help='Compute device'
    )
    parser.add_argument(
        '-l', '--language',
        help='Language code (e.g., en, es, fr) or "auto"'
    )
    parser.add_argument(
        '-u', '--user',
        help='User/speaker name'
    )
    parser.add_argument(
        '--list-devices',
        action='store_true',
        help='List available audio input devices'
    )
    args = parser.parse_args()

    # List devices if requested
    if args.list_devices:
        print("Available audio input devices:")
        devices = AudioCapture.get_input_devices()
        for idx, name in devices:
            print(f" [{idx}] {name}")
        return 0

    # Run application
    app = TranscriptionCLI(args)
    return app.run()


if __name__ == "__main__":
    sys.exit(main())