Files
local-transcription/gui/main_window.py
Josh Knapp 472233aec4 Initial commit: Local Transcription App v1.0
Phase 1 Complete - Standalone Desktop Application

Features:
- Real-time speech-to-text with Whisper (faster-whisper)
- PySide6 desktop GUI with settings dialog
- Web server for OBS browser source integration
- Audio capture with automatic sample rate detection and resampling
- Noise suppression with Voice Activity Detection (VAD)
- Configurable display settings (font, timestamps, fade duration)
- Settings apply without restart (with automatic model reloading)
- Auto-fade for web display transcriptions
- CPU/GPU support with automatic device detection
- Standalone executable builds (PyInstaller)
- CUDA build support (works on systems without CUDA hardware)

Components:
- Audio capture with sounddevice
- Noise reduction with noisereduce + webrtcvad
- Transcription with faster-whisper
- GUI with PySide6
- Web server with FastAPI + WebSocket
- Configuration system with YAML

Build System:
- Standard builds (CPU-only): build.sh / build.bat
- CUDA builds (universal): build-cuda.sh / build-cuda.bat
- Comprehensive BUILD.md documentation
- Cross-platform support (Linux, Windows)

Documentation:
- README.md with project overview and quick start
- BUILD.md with detailed build instructions
- NEXT_STEPS.md with future enhancement roadmap
- INSTALL.md with setup instructions

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
2025-12-25 18:48:23 -08:00

365 lines
13 KiB
Python

"""Main application window for the local transcription app."""
import customtkinter as ctk
from tkinter import filedialog, messagebox
import threading
from pathlib import Path
import sys
# Add parent directory to path for imports
sys.path.append(str(Path(__file__).parent.parent))
from client.config import Config
from client.device_utils import DeviceManager
from client.audio_capture import AudioCapture
from client.noise_suppression import NoiseSuppressor
from client.transcription_engine import TranscriptionEngine
from gui.transcription_display import TranscriptionDisplay
from gui.settings_dialog import SettingsDialog
class MainWindow(ctk.CTk):
    """Main application window.

    Builds the header, status bar, transcription display and control buttons,
    and wires the pipeline: audio capture -> noise suppression ->
    transcription engine -> transcription display.
    """

    def __init__(self):
        """Initialize window state, build the UI and schedule component setup."""
        super().__init__()

        # Application state
        self.is_transcribing = False
        self.config = Config()
        self.device_manager = DeviceManager()

        # Components (initialized later): the engine in
        # _initialize_components(), the capture/suppressor pair in
        # _start_transcription(). They are None until then, hence the
        # optional annotations (annotations in function scope are never
        # evaluated, so the `|` form is safe on any supported Python).
        self.audio_capture: AudioCapture | None = None
        self.noise_suppressor: NoiseSuppressor | None = None
        self.transcription_engine: TranscriptionEngine | None = None

        # Configure window
        self.title("Local Transcription")
        self.geometry("900x700")

        # Set theme
        ctk.set_appearance_mode(self.config.get('display.theme', 'dark'))
        ctk.set_default_color_theme("blue")

        # Create UI
        self._create_widgets()

        # Handle window close
        self.protocol("WM_DELETE_WINDOW", self._on_closing)

        # Initialize components after GUI is ready (delay to avoid XCB threading issues)
        self.after(100, self._initialize_components)

    def _create_widgets(self):
        """Create all UI widgets, top to bottom in packing order."""
        self._create_header()
        self._create_status_bar()
        self._create_display()
        self._create_controls()

    def _create_header(self):
        """Build the header row: application title and settings button."""
        header_frame = ctk.CTkFrame(self, height=80)
        header_frame.pack(fill="x", padx=10, pady=(10, 0))
        # Keep the fixed height instead of shrinking to fit the children.
        header_frame.pack_propagate(False)

        title_label = ctk.CTkLabel(
            header_frame,
            text="Local Transcription",
            font=("", 24, "bold"),
        )
        title_label.pack(side="left", padx=20, pady=20)

        self.settings_button = ctk.CTkButton(
            header_frame,
            text="⚙ Settings",
            command=self._open_settings,
            width=120,
        )
        self.settings_button.pack(side="right", padx=20, pady=20)

    def _create_status_bar(self):
        """Build the status row: status text, compute device and user name."""
        status_frame = ctk.CTkFrame(self, height=60)
        status_frame.pack(fill="x", padx=10, pady=(10, 0))
        status_frame.pack_propagate(False)

        self.status_label = ctk.CTkLabel(
            status_frame,
            text="⚫ Ready",
            font=("", 14),
        )
        self.status_label.pack(side="left", padx=20)

        # Show the label (second element) of the first reported device.
        device_info = self.device_manager.get_device_info()
        device_text = device_info[0][1] if device_info else "No device"
        self.device_label = ctk.CTkLabel(
            status_frame,
            text=f"Device: {device_text}",
            font=("", 12),
        )
        self.device_label.pack(side="left", padx=20)

        user_name = self.config.get('user.name', 'User')
        self.user_label = ctk.CTkLabel(
            status_frame,
            text=f"User: {user_name}",
            font=("", 12),
        )
        self.user_label.pack(side="left", padx=20)

    def _create_display(self):
        """Build the expanding area that shows transcription lines."""
        display_frame = ctk.CTkFrame(self)
        display_frame.pack(fill="both", expand=True, padx=10, pady=10)

        self.transcription_display = TranscriptionDisplay(
            display_frame,
            max_lines=self.config.get('display.max_lines', 100),
            show_timestamps=self.config.get('display.show_timestamps', True),
            font=("Courier", self.config.get('display.font_size', 12)),
        )
        self.transcription_display.pack(fill="both", expand=True, padx=10, pady=10)

    def _create_controls(self):
        """Build the bottom row: start/stop, clear and save buttons."""
        control_frame = ctk.CTkFrame(self, height=80)
        control_frame.pack(fill="x", padx=10, pady=(0, 10))
        control_frame.pack_propagate(False)

        self.start_button = ctk.CTkButton(
            control_frame,
            text="▶ Start Transcription",
            command=self._toggle_transcription,
            width=200,
            height=50,
            font=("", 16, "bold"),
            fg_color="green",
        )
        self.start_button.pack(side="left", padx=20, pady=15)

        self.clear_button = ctk.CTkButton(
            control_frame,
            text="Clear",
            command=self._clear_transcriptions,
            width=120,
            height=50,
        )
        self.clear_button.pack(side="left", padx=10, pady=15)

        self.save_button = ctk.CTkButton(
            control_frame,
            text="💾 Save",
            command=self._save_transcriptions,
            width=120,
            height=50,
        )
        self.save_button.pack(side="left", padx=10, pady=15)

    def _initialize_components(self):
        """Select the compute device, create the engine and load its model.

        Any failure is reported in the status label and an error dialog;
        nothing is raised to the caller.
        """
        # Process pending events so the status text is drawn before the
        # synchronous model load below.
        self.status_label.configure(text="⚙ Initializing...")
        self.update()

        try:
            # Set device based on config
            device_config = self.config.get('transcription.device', 'auto')
            self.device_manager.set_device(device_config)

            # Initialize transcription engine
            model_size = self.config.get('transcription.model', 'base')
            language = self.config.get('transcription.language', 'en')
            device = self.device_manager.get_device_for_whisper()
            compute_type = self.device_manager.get_compute_type()

            self.transcription_engine = TranscriptionEngine(
                model_size=model_size,
                device=device,
                compute_type=compute_type,
                language=language,
                min_confidence=self.config.get('processing.min_confidence', 0.5),
            )

            # Load model (synchronously to avoid X11 threading issues)
            success = self.transcription_engine.load_model()

            if success:
                self.status_label.configure(text="✓ Ready")
            else:
                self.status_label.configure(text="❌ Model loading failed")
                messagebox.showerror("Error", "Failed to load transcription model")
        except Exception as e:
            # UI boundary: report the failure instead of letting it escape
            # into the Tk callback machinery.
            print(f"Error initializing components: {e}")
            self.status_label.configure(text="❌ Initialization failed")
            messagebox.showerror("Error", f"Failed to initialize:\n{e}")

    def _update_status(self, status: str):
        """Set the status label text via the Tk event queue.

        The update is scheduled with ``after(0, ...)`` rather than applied
        directly, so the widget is modified from the event loop.
        """
        self.after(0, lambda: self.status_label.configure(text=status))

    def _toggle_transcription(self):
        """Start transcription if idle, otherwise stop it."""
        if self.is_transcribing:
            self._stop_transcription()
        else:
            self._start_transcription()

    def _start_transcription(self):
        """Create the capture and noise-suppression components and start recording.

        Shows an error dialog (and leaves ``is_transcribing`` False) if the
        engine is not loaded or if anything fails while starting.
        """
        try:
            # Check if engine is ready
            if not self.transcription_engine or not self.transcription_engine.is_loaded:
                messagebox.showerror("Error", "Transcription engine not ready")
                return

            # Get audio device. A missing/empty value is treated like
            # 'default' instead of crashing in int().
            audio_device_str = self.config.get('audio.input_device', 'default')
            if audio_device_str in (None, '', 'default'):
                audio_device = None
            else:
                audio_device = int(audio_device_str)

            sample_rate = self.config.get('audio.sample_rate', 16000)

            # Initialize audio capture
            self.audio_capture = AudioCapture(
                sample_rate=sample_rate,
                chunk_duration=self.config.get('audio.chunk_duration', 3.0),
                device=audio_device,
            )

            # Initialize noise suppressor
            suppression_enabled = self.config.get('noise_suppression.enabled', True)
            self.noise_suppressor = NoiseSuppressor(
                sample_rate=sample_rate,
                method="noisereduce" if suppression_enabled else "none",
                strength=self.config.get('noise_suppression.strength', 0.7),
                use_vad=self.config.get('processing.use_vad', True),
            )

            # Start recording; each captured chunk is passed to
            # _process_audio_chunk.
            self.audio_capture.start_recording(callback=self._process_audio_chunk)

            # Update UI
            self.is_transcribing = True
            self.start_button.configure(text="⏸ Stop Transcription", fg_color="red")
            self.status_label.configure(text="🔴 Recording...")
        except Exception as e:
            messagebox.showerror("Error", f"Failed to start transcription:\n{e}")
            print(f"Error starting transcription: {e}")

    def _stop_transcription(self):
        """Stop recording and restore the idle UI state.

        Errors are shown in a dialog and printed; they are not raised.
        """
        try:
            # Stop recording
            if self.audio_capture:
                self.audio_capture.stop_recording()

            # Update UI
            self.is_transcribing = False
            self.start_button.configure(text="▶ Start Transcription", fg_color="green")
            self.status_label.configure(text="✓ Ready")
        except Exception as e:
            messagebox.showerror("Error", f"Failed to stop transcription:\n{e}")
            print(f"Error stopping transcription: {e}")

    def _process_audio_chunk(self, audio_chunk):
        """Process an audio chunk (noise suppression + transcription).

        The work runs on a new daemon thread per chunk so this callback
        returns immediately; a result, if any, is handed to the display via
        ``after(0, ...)``.

        Args:
            audio_chunk: Audio data as delivered by the capture callback.
        """
        def process():
            try:
                # Apply noise suppression
                processed_audio = self.noise_suppressor.process(audio_chunk, skip_silent=True)

                # Skip if silent (VAD filtered it out)
                if processed_audio is None:
                    return

                # Transcribe
                user_name = self.config.get('user.name', 'User')
                result = self.transcription_engine.transcribe(
                    processed_audio,
                    sample_rate=self.config.get('audio.sample_rate', 16000),
                    user_name=user_name,
                )

                # Display result
                if result:
                    self.after(0, lambda: self.transcription_display.add_transcription(
                        text=result.text,
                        user_name=result.user_name,
                        timestamp=result.timestamp,
                    ))
            except Exception as e:
                # Best-effort: a failed chunk is logged and dropped so
                # recording keeps going.
                print(f"Error processing audio: {e}")

        # Run in background thread
        threading.Thread(target=process, daemon=True).start()

    def _clear_transcriptions(self):
        """Clear all transcriptions after the user confirms."""
        if messagebox.askyesno("Clear Transcriptions", "Are you sure you want to clear all transcriptions?"):
            self.transcription_display.clear()

    def _save_transcriptions(self):
        """Ask for a destination file and save the transcriptions to it."""
        filepath = filedialog.asksaveasfilename(
            defaultextension=".txt",
            filetypes=[("Text files", "*.txt"), ("All files", "*.*")],
        )

        # An empty result means the dialog was cancelled.
        if not filepath:
            return

        if self.transcription_display.save_to_file(filepath):
            messagebox.showinfo("Saved", f"Transcriptions saved to:\n{filepath}")
        else:
            messagebox.showerror("Error", "Failed to save transcriptions")

    def _open_settings(self):
        """Open the settings dialog with the available audio and compute devices."""
        # Get audio devices
        audio_devices = AudioCapture.get_input_devices()
        if not audio_devices:
            audio_devices = [(0, "Default")]

        # Get compute devices. Build a new list rather than inserting into
        # the one returned by the device manager, so repeated opens cannot
        # accumulate "Auto-detect" entries in a list the manager may own.
        compute_devices = [
            ("auto", "Auto-detect"),
            *(self.device_manager.get_device_info() or []),
        ]

        # Open settings dialog
        SettingsDialog(
            self,
            self.config,
            audio_devices,
            compute_devices,
            on_save=self._on_settings_saved,
        )

    def _on_settings_saved(self):
        """Apply saved settings that can change live and tell the user about the rest."""
        # Update user label
        user_name = self.config.get('user.name', 'User')
        self.user_label.configure(text=f"User: {user_name}")

        # Update display settings
        self.transcription_display.set_max_lines(self.config.get('display.max_lines', 100))
        self.transcription_display.set_show_timestamps(self.config.get('display.show_timestamps', True))

        # Note: Model/device changes require restart
        messagebox.showinfo(
            "Settings Saved",
            "Some settings (model size, device) require restarting the application to take effect."
        )

    def _on_closing(self):
        """Handle window closing: stop recording, unload the model, destroy the window.

        The window is always destroyed, even if cleanup fails; otherwise a
        failing ``unload_model()`` would leave a window that cannot be closed.
        """
        try:
            # Stop transcription if running
            if self.is_transcribing:
                self._stop_transcription()

            # Unload model
            if self.transcription_engine:
                self.transcription_engine.unload_model()
        except Exception as e:
            print(f"Error during shutdown: {e}")
        finally:
            # Close window
            self.destroy()