Phase 1 Complete - Standalone Desktop Application

Features:
- Real-time speech-to-text with Whisper (faster-whisper)
- PySide6 desktop GUI with settings dialog
- Web server for OBS browser source integration
- Audio capture with automatic sample rate detection and resampling
- Noise suppression with Voice Activity Detection (VAD)
- Configurable display settings (font, timestamps, fade duration)
- Settings apply without restart (with automatic model reloading)
- Auto-fade for web display transcriptions
- CPU/GPU support with automatic device detection
- Standalone executable builds (PyInstaller)
- CUDA build support (works on systems without CUDA hardware)

Components:
- Audio capture with sounddevice
- Noise reduction with noisereduce + webrtcvad
- Transcription with faster-whisper
- GUI with PySide6
- Web server with FastAPI + WebSocket
- Configuration system with YAML

Build System:
- Standard builds (CPU-only): build.sh / build.bat
- CUDA builds (universal): build-cuda.sh / build-cuda.bat
- Comprehensive BUILD.md documentation
- Cross-platform support (Linux, Windows)

Documentation:
- README.md with project overview and quick start
- BUILD.md with detailed build instructions
- NEXT_STEPS.md with future enhancement roadmap
- INSTALL.md with setup instructions

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
365 lines
13 KiB
Python
365 lines
13 KiB
Python
"""Main application window for the local transcription app."""
|
|
|
|
import customtkinter as ctk
|
|
from tkinter import filedialog, messagebox
|
|
import threading
|
|
from pathlib import Path
|
|
import sys
|
|
|
|
# Add parent directory to path for imports
|
|
sys.path.append(str(Path(__file__).parent.parent))
|
|
|
|
from client.config import Config
|
|
from client.device_utils import DeviceManager
|
|
from client.audio_capture import AudioCapture
|
|
from client.noise_suppression import NoiseSuppressor
|
|
from client.transcription_engine import TranscriptionEngine
|
|
from gui.transcription_display import TranscriptionDisplay
|
|
from gui.settings_dialog import SettingsDialog
|
|
|
|
|
|
class MainWindow(ctk.CTk):
    """Main application window.

    Builds the header, status bar, transcription display and control
    buttons, and wires the audio capture -> noise suppression ->
    transcription pipeline to the display.
    """

    def __init__(self):
        """Initialize the main window and schedule component start-up."""
        super().__init__()

        # Application state
        self.is_transcribing = False
        # NOTE(review): tkinter widgets expose ``config()`` as an alias of
        # ``configure()``; this instance attribute shadows that alias on the
        # window. The name is kept because other modules may read
        # ``window.config`` -- consider renaming in a coordinated change.
        self.config = Config()
        self.device_manager = DeviceManager()

        # Components (initialized later): the engine is created in
        # _initialize_components(), the other two in _start_transcription().
        self.audio_capture: AudioCapture | None = None
        self.noise_suppressor: NoiseSuppressor | None = None
        self.transcription_engine: TranscriptionEngine | None = None

        # Configure window
        self.title("Local Transcription")
        self.geometry("900x700")

        # Set theme
        ctk.set_appearance_mode(self.config.get('display.theme', 'dark'))
        ctk.set_default_color_theme("blue")

        # Create UI
        self._create_widgets()

        # Handle window close
        self.protocol("WM_DELETE_WINDOW", self._on_closing)

        # Initialize components after GUI is ready (delay to avoid XCB
        # threading issues)
        self.after(100, self._initialize_components)

    def _create_widgets(self) -> None:
        """Create all UI widgets.

        Frames are packed top to bottom in the order header, status,
        transcription display, controls; the pack order defines the layout.
        """
        # Header frame (fixed height: propagation disabled so children do
        # not resize it)
        header_frame = ctk.CTkFrame(self, height=80)
        header_frame.pack(fill="x", padx=10, pady=(10, 0))
        header_frame.pack_propagate(False)

        # Title
        title_label = ctk.CTkLabel(
            header_frame,
            text="Local Transcription",
            font=("", 24, "bold"),
        )
        title_label.pack(side="left", padx=20, pady=20)

        # Settings button
        self.settings_button = ctk.CTkButton(
            header_frame,
            text="⚙ Settings",
            command=self._open_settings,
            width=120,
        )
        self.settings_button.pack(side="right", padx=20, pady=20)

        # Status frame
        status_frame = ctk.CTkFrame(self, height=60)
        status_frame.pack(fill="x", padx=10, pady=(10, 0))
        status_frame.pack_propagate(False)

        # Status label
        self.status_label = ctk.CTkLabel(
            status_frame,
            text="⚫ Ready",
            font=("", 14),
        )
        self.status_label.pack(side="left", padx=20)

        # Device info: show the second field of the first reported entry
        device_info = self.device_manager.get_device_info()
        device_text = device_info[0][1] if device_info else "No device"
        self.device_label = ctk.CTkLabel(
            status_frame,
            text=f"Device: {device_text}",
            font=("", 12),
        )
        self.device_label.pack(side="left", padx=20)

        # User name display
        user_name = self.config.get('user.name', 'User')
        self.user_label = ctk.CTkLabel(
            status_frame,
            text=f"User: {user_name}",
            font=("", 12),
        )
        self.user_label.pack(side="left", padx=20)

        # Transcription display frame (takes all remaining space)
        display_frame = ctk.CTkFrame(self)
        display_frame.pack(fill="both", expand=True, padx=10, pady=10)

        # Transcription display
        self.transcription_display = TranscriptionDisplay(
            display_frame,
            max_lines=self.config.get('display.max_lines', 100),
            show_timestamps=self.config.get('display.show_timestamps', True),
            font=("Courier", self.config.get('display.font_size', 12)),
        )
        self.transcription_display.pack(fill="both", expand=True, padx=10, pady=10)

        # Control frame
        control_frame = ctk.CTkFrame(self, height=80)
        control_frame.pack(fill="x", padx=10, pady=(0, 10))
        control_frame.pack_propagate(False)

        # Start/Stop button
        self.start_button = ctk.CTkButton(
            control_frame,
            text="▶ Start Transcription",
            command=self._toggle_transcription,
            width=200,
            height=50,
            font=("", 16, "bold"),
            fg_color="green",
        )
        self.start_button.pack(side="left", padx=20, pady=15)

        # Clear button
        self.clear_button = ctk.CTkButton(
            control_frame,
            text="Clear",
            command=self._clear_transcriptions,
            width=120,
            height=50,
        )
        self.clear_button.pack(side="left", padx=10, pady=15)

        # Save button
        self.save_button = ctk.CTkButton(
            control_frame,
            text="💾 Save",
            command=self._save_transcriptions,
            width=120,
            height=50,
        )
        self.save_button.pack(side="left", padx=10, pady=15)

    def _initialize_components(self) -> None:
        """Create the transcription engine and load its model.

        Runs on the UI thread shortly after start-up. Any failure is printed,
        reflected in the status label and reported in an error dialog.
        """
        # Update status and force a redraw before the blocking model load
        self.status_label.configure(text="⚙ Initializing...")
        self.update()

        try:
            # Set device based on config
            device_config = self.config.get('transcription.device', 'auto')
            self.device_manager.set_device(device_config)

            # Initialize transcription engine
            model_size = self.config.get('transcription.model', 'base')
            language = self.config.get('transcription.language', 'en')
            device = self.device_manager.get_device_for_whisper()
            compute_type = self.device_manager.get_compute_type()

            self.transcription_engine = TranscriptionEngine(
                model_size=model_size,
                device=device,
                compute_type=compute_type,
                language=language,
                min_confidence=self.config.get('processing.min_confidence', 0.5),
            )

            # Load model (synchronously to avoid X11 threading issues)
            success = self.transcription_engine.load_model()

            if success:
                self.status_label.configure(text="✓ Ready")
            else:
                self.status_label.configure(text="❌ Model loading failed")
                messagebox.showerror("Error", "Failed to load transcription model")

        except Exception as e:
            print(f"Error initializing components: {e}")
            self.status_label.configure(text="❌ Initialization failed")
            messagebox.showerror("Error", f"Failed to initialize:\n{e}")

    def _update_status(self, status: str) -> None:
        """Update the status label by scheduling the change via ``after``.

        Args:
            status: Text to show in the status label.
        """
        self.after(0, lambda: self.status_label.configure(text=status))

    def _toggle_transcription(self) -> None:
        """Start transcription if idle, otherwise stop it."""
        if not self.is_transcribing:
            self._start_transcription()
        else:
            self._stop_transcription()

    def _start_transcription(self) -> None:
        """Create the audio pipeline and start recording.

        Shows an error dialog and leaves ``is_transcribing`` False when the
        engine is not loaded or any step raises.
        """
        try:
            # Check if engine is ready
            if not self.transcription_engine or not self.transcription_engine.is_loaded:
                messagebox.showerror("Error", "Transcription engine not ready")
                return

            # Get audio device: 'default' maps to None, anything else is
            # converted to an integer device index
            audio_device_str = self.config.get('audio.input_device', 'default')
            audio_device = None if audio_device_str == 'default' else int(audio_device_str)

            # Initialize audio capture
            self.audio_capture = AudioCapture(
                sample_rate=self.config.get('audio.sample_rate', 16000),
                chunk_duration=self.config.get('audio.chunk_duration', 3.0),
                device=audio_device,
            )

            # Initialize noise suppressor
            self.noise_suppressor = NoiseSuppressor(
                sample_rate=self.config.get('audio.sample_rate', 16000),
                method="noisereduce" if self.config.get('noise_suppression.enabled', True) else "none",
                strength=self.config.get('noise_suppression.strength', 0.7),
                use_vad=self.config.get('processing.use_vad', True),
            )

            # Start recording
            self.audio_capture.start_recording(callback=self._process_audio_chunk)

            # Update UI only after recording actually started
            self.is_transcribing = True
            self.start_button.configure(text="⏸ Stop Transcription", fg_color="red")
            self.status_label.configure(text="🔴 Recording...")

        except Exception as e:
            messagebox.showerror("Error", f"Failed to start transcription:\n{e}")
            print(f"Error starting transcription: {e}")

    def _stop_transcription(self) -> None:
        """Stop recording and reset the UI to the idle state."""
        try:
            # Stop recording
            if self.audio_capture:
                self.audio_capture.stop_recording()

            # Update UI
            self.is_transcribing = False
            self.start_button.configure(text="▶ Start Transcription", fg_color="green")
            self.status_label.configure(text="✓ Ready")

        except Exception as e:
            messagebox.showerror("Error", f"Failed to stop transcription:\n{e}")
            print(f"Error stopping transcription: {e}")

    def _process_audio_chunk(self, audio_chunk) -> None:
        """Process an audio chunk (noise suppression + transcription).

        Registered as the recording callback. The work is handed to a new
        daemon thread; a resulting transcription is pushed to the display
        through ``after`` so the widget update is scheduled on the Tk loop.

        Args:
            audio_chunk: Audio data as delivered by the AudioCapture callback.
        """
        def process():
            try:
                # Apply noise suppression
                processed_audio = self.noise_suppressor.process(audio_chunk, skip_silent=True)

                # Skip if silent (VAD filtered it out)
                if processed_audio is None:
                    return

                # Transcribe
                user_name = self.config.get('user.name', 'User')
                result = self.transcription_engine.transcribe(
                    processed_audio,
                    sample_rate=self.config.get('audio.sample_rate', 16000),
                    user_name=user_name,
                )

                # Display result
                if result:
                    self.after(0, lambda: self.transcription_display.add_transcription(
                        text=result.text,
                        user_name=result.user_name,
                        timestamp=result.timestamp,
                    ))

            except Exception as e:
                print(f"Error processing audio: {e}")

        # Run in background thread
        threading.Thread(target=process, daemon=True).start()

    def _clear_transcriptions(self) -> None:
        """Clear all transcriptions after the user confirms."""
        if messagebox.askyesno("Clear Transcriptions", "Are you sure you want to clear all transcriptions?"):
            self.transcription_display.clear()

    def _save_transcriptions(self) -> None:
        """Ask for a target path and save the transcriptions to that file."""
        filepath = filedialog.asksaveasfilename(
            defaultextension=".txt",
            filetypes=[("Text files", "*.txt"), ("All files", "*.*")],
        )

        # An empty result means the dialog was cancelled
        if filepath:
            if self.transcription_display.save_to_file(filepath):
                messagebox.showinfo("Saved", f"Transcriptions saved to:\n{filepath}")
            else:
                messagebox.showerror("Error", "Failed to save transcriptions")

    def _open_settings(self) -> None:
        """Open the settings dialog with the available audio/compute devices."""
        # Get audio devices
        audio_devices = AudioCapture.get_input_devices()
        if not audio_devices:
            audio_devices = [(0, "Default")]

        # Get compute devices. Build a fresh list rather than inserting into
        # the object returned by the device manager, so repeated openings
        # cannot accumulate "Auto-detect" entries in a list the manager may
        # be holding on to.
        compute_devices = [
            ("auto", "Auto-detect"),
            *(self.device_manager.get_device_info() or []),
        ]

        # Open settings dialog
        SettingsDialog(
            self,
            self.config,
            audio_devices,
            compute_devices,
            on_save=self._on_settings_saved,
        )

    def _on_settings_saved(self) -> None:
        """Apply settings that can change live and tell the user about the rest."""
        # Update user label
        user_name = self.config.get('user.name', 'User')
        self.user_label.configure(text=f"User: {user_name}")

        # Update display settings
        self.transcription_display.set_max_lines(self.config.get('display.max_lines', 100))
        self.transcription_display.set_show_timestamps(self.config.get('display.show_timestamps', True))

        # Note: Model/device changes require restart
        messagebox.showinfo(
            "Settings Saved",
            "Some settings (model size, device) require restarting the application to take effect.",
        )

    def _on_closing(self) -> None:
        """Handle window closing: stop recording, unload the model, destroy.

        The window is destroyed even when cleanup raises, so a failing
        ``unload_model()`` cannot leave the user with an unclosable window.
        """
        try:
            # Stop transcription if running
            if self.is_transcribing:
                self._stop_transcription()

            # Unload model
            if self.transcription_engine:
                self.transcription_engine.unload_model()
        except Exception as e:
            print(f"Error during shutdown: {e}")
        finally:
            # Close window
            self.destroy()
|