Initial commit: Local Transcription App v1.0
Phase 1 Complete - Standalone Desktop Application

Features:
- Real-time speech-to-text with Whisper (faster-whisper)
- PySide6 desktop GUI with settings dialog
- Web server for OBS browser source integration
- Audio capture with automatic sample rate detection and resampling
- Noise suppression with Voice Activity Detection (VAD)
- Configurable display settings (font, timestamps, fade duration)
- Settings apply without restart (with automatic model reloading)
- Auto-fade for web display transcriptions
- CPU/GPU support with automatic device detection
- Standalone executable builds (PyInstaller)
- CUDA build support (works on systems without CUDA hardware)

Components:
- Audio capture with sounddevice
- Noise reduction with noisereduce + webrtcvad
- Transcription with faster-whisper
- GUI with PySide6
- Web server with FastAPI + WebSocket
- Configuration system with YAML

Build System:
- Standard builds (CPU-only): build.sh / build.bat
- CUDA builds (universal): build-cuda.sh / build-cuda.bat
- Comprehensive BUILD.md documentation
- Cross-platform support (Linux, Windows)

Documentation:
- README.md with project overview and quick start
- BUILD.md with detailed build instructions
- NEXT_STEPS.md with future enhancement roadmap
- INSTALL.md with setup instructions

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
364
gui/main_window.py
Normal file
364
gui/main_window.py
Normal file
@@ -0,0 +1,364 @@
|
||||
"""Main application window for the local transcription app."""
|
||||
|
||||
import customtkinter as ctk
|
||||
from tkinter import filedialog, messagebox
|
||||
import threading
|
||||
from pathlib import Path
|
||||
import sys
|
||||
|
||||
# Add parent directory to path for imports
|
||||
sys.path.append(str(Path(__file__).parent.parent))
|
||||
|
||||
from client.config import Config
|
||||
from client.device_utils import DeviceManager
|
||||
from client.audio_capture import AudioCapture
|
||||
from client.noise_suppression import NoiseSuppressor
|
||||
from client.transcription_engine import TranscriptionEngine
|
||||
from gui.transcription_display import TranscriptionDisplay
|
||||
from gui.settings_dialog import SettingsDialog
|
||||
|
||||
|
||||
class MainWindow(ctk.CTk):
    """Main application window.

    Builds the header, status bar, transcription view and control buttons,
    and connects them to the audio capture, noise suppression and
    transcription components.
    """

    def __init__(self):
        """Initialize the main window."""
        super().__init__()

        # Application state
        self.is_transcribing = False
        self.config = Config()
        self.device_manager = DeviceManager()

        # Components (initialized later). The engine is created in
        # _initialize_components(); capture and suppressor are created each
        # time transcription is started.
        self.audio_capture: AudioCapture | None = None
        self.noise_suppressor: NoiseSuppressor | None = None
        self.transcription_engine: TranscriptionEngine | None = None

        # Configure window
        self.title("Local Transcription")
        self.geometry("900x700")

        # Set theme
        ctk.set_appearance_mode(self.config.get('display.theme', 'dark'))
        ctk.set_default_color_theme("blue")

        # Create UI
        self._create_widgets()

        # Handle window close
        self.protocol("WM_DELETE_WINDOW", self._on_closing)

        # Initialize components after GUI is ready (delay to avoid XCB threading issues)
        self.after(100, self._initialize_components)

    def _create_widgets(self):
        """Create all UI widgets.

        Layout, top to bottom: header (title + settings button), status bar
        (status, compute device, user name), transcription display, and the
        control bar (start/stop, clear, save).
        """
        # Header frame
        header_frame = ctk.CTkFrame(self, height=80)
        header_frame.pack(fill="x", padx=10, pady=(10, 0))
        # Keep the fixed height instead of shrinking to fit the children.
        header_frame.pack_propagate(False)

        # Title
        title_label = ctk.CTkLabel(
            header_frame,
            text="Local Transcription",
            font=("", 24, "bold")
        )
        title_label.pack(side="left", padx=20, pady=20)

        # Settings button
        self.settings_button = ctk.CTkButton(
            header_frame,
            text="⚙ Settings",
            command=self._open_settings,
            width=120
        )
        self.settings_button.pack(side="right", padx=20, pady=20)

        # Status frame
        status_frame = ctk.CTkFrame(self, height=60)
        status_frame.pack(fill="x", padx=10, pady=(10, 0))
        status_frame.pack_propagate(False)

        # Status label
        self.status_label = ctk.CTkLabel(
            status_frame,
            text="⚫ Ready",
            font=("", 14)
        )
        self.status_label.pack(side="left", padx=20)

        # Device info: show the second field of the first reported entry.
        device_info = self.device_manager.get_device_info()
        device_text = device_info[0][1] if device_info else "No device"
        self.device_label = ctk.CTkLabel(
            status_frame,
            text=f"Device: {device_text}",
            font=("", 12)
        )
        self.device_label.pack(side="left", padx=20)

        # User name display
        user_name = self.config.get('user.name', 'User')
        self.user_label = ctk.CTkLabel(
            status_frame,
            text=f"User: {user_name}",
            font=("", 12)
        )
        self.user_label.pack(side="left", padx=20)

        # Transcription display frame
        display_frame = ctk.CTkFrame(self)
        display_frame.pack(fill="both", expand=True, padx=10, pady=10)

        # Transcription display
        self.transcription_display = TranscriptionDisplay(
            display_frame,
            max_lines=self.config.get('display.max_lines', 100),
            show_timestamps=self.config.get('display.show_timestamps', True),
            font=("Courier", self.config.get('display.font_size', 12))
        )
        self.transcription_display.pack(fill="both", expand=True, padx=10, pady=10)

        # Control frame
        control_frame = ctk.CTkFrame(self, height=80)
        control_frame.pack(fill="x", padx=10, pady=(0, 10))
        control_frame.pack_propagate(False)

        # Start/Stop button
        self.start_button = ctk.CTkButton(
            control_frame,
            text="▶ Start Transcription",
            command=self._toggle_transcription,
            width=200,
            height=50,
            font=("", 16, "bold"),
            fg_color="green"
        )
        self.start_button.pack(side="left", padx=20, pady=15)

        # Clear button
        self.clear_button = ctk.CTkButton(
            control_frame,
            text="Clear",
            command=self._clear_transcriptions,
            width=120,
            height=50
        )
        self.clear_button.pack(side="left", padx=10, pady=15)

        # Save button
        self.save_button = ctk.CTkButton(
            control_frame,
            text="💾 Save",
            command=self._save_transcriptions,
            width=120,
            height=50
        )
        self.save_button.pack(side="left", padx=10, pady=15)

    def _initialize_components(self):
        """Create the transcription engine and load its model.

        Scheduled from ``__init__`` via ``after``. Any failure is printed,
        shown in the status label and reported in an error dialog; it is not
        re-raised.
        """
        # Update status and force a redraw before the blocking model load.
        self.status_label.configure(text="⚙ Initializing...")
        self.update()

        try:
            # Set device based on config
            device_config = self.config.get('transcription.device', 'auto')
            self.device_manager.set_device(device_config)

            # Initialize transcription engine
            model_size = self.config.get('transcription.model', 'base')
            language = self.config.get('transcription.language', 'en')
            device = self.device_manager.get_device_for_whisper()
            compute_type = self.device_manager.get_compute_type()

            self.transcription_engine = TranscriptionEngine(
                model_size=model_size,
                device=device,
                compute_type=compute_type,
                language=language,
                min_confidence=self.config.get('processing.min_confidence', 0.5)
            )

            # Load model (synchronously to avoid X11 threading issues)
            success = self.transcription_engine.load_model()

            if success:
                self.status_label.configure(text="✓ Ready")
            else:
                self.status_label.configure(text="❌ Model loading failed")
                messagebox.showerror("Error", "Failed to load transcription model")

        except Exception as e:
            print(f"Error initializing components: {e}")
            self.status_label.configure(text="❌ Initialization failed")
            messagebox.showerror("Error", f"Failed to initialize:\n{e}")

    def _update_status(self, status: str):
        """Update the status label.

        The change is queued with ``after(0, ...)`` rather than applied
        directly, so the widget is modified from the Tk event loop; intended
        for callers running outside the GUI thread.
        """
        self.after(0, lambda: self.status_label.configure(text=status))

    def _toggle_transcription(self):
        """Start transcription if idle, otherwise stop it."""
        if not self.is_transcribing:
            self._start_transcription()
        else:
            self._stop_transcription()

    @staticmethod
    def _parse_audio_device(value):
        """Translate the configured input device into a device index.

        ``None``, an empty string and ``'default'`` all mean "use the default
        input device" and yield ``None``. Anything else is converted with
        ``int`` (so index ``0`` stays a valid explicit choice).

        Raises:
            ValueError: if the value is not an integer-like string.
        """
        if value is None or value == '' or value == 'default':
            return None
        return int(value)

    def _start_transcription(self):
        """Create the capture pipeline and start recording.

        Shows an error dialog and leaves the UI in the idle state if the
        engine is not loaded or any step raises.
        """
        try:
            # Check if engine is ready
            if not self.transcription_engine or not self.transcription_engine.is_loaded:
                messagebox.showerror("Error", "Transcription engine not ready")
                return

            # Get audio device
            audio_device = self._parse_audio_device(
                self.config.get('audio.input_device', 'default')
            )
            sample_rate = self.config.get('audio.sample_rate', 16000)

            # Initialize audio capture
            self.audio_capture = AudioCapture(
                sample_rate=sample_rate,
                chunk_duration=self.config.get('audio.chunk_duration', 3.0),
                device=audio_device
            )

            # Initialize noise suppressor
            self.noise_suppressor = NoiseSuppressor(
                sample_rate=sample_rate,
                method="noisereduce" if self.config.get('noise_suppression.enabled', True) else "none",
                strength=self.config.get('noise_suppression.strength', 0.7),
                use_vad=self.config.get('processing.use_vad', True)
            )

            # Start recording
            self.audio_capture.start_recording(callback=self._process_audio_chunk)

            # Update UI only once recording has actually started.
            self.is_transcribing = True
            self.start_button.configure(text="⏸ Stop Transcription", fg_color="red")
            self.status_label.configure(text="🔴 Recording...")

        except Exception as e:
            messagebox.showerror("Error", f"Failed to start transcription:\n{e}")
            print(f"Error starting transcription: {e}")

    def _stop_transcription(self):
        """Stop recording and return the UI to the idle state.

        If stopping raises, the error is shown and printed and the UI state
        is left unchanged so the user can retry.
        """
        try:
            # Stop recording
            if self.audio_capture:
                self.audio_capture.stop_recording()

            # Update UI
            self.is_transcribing = False
            self.start_button.configure(text="▶ Start Transcription", fg_color="green")
            self.status_label.configure(text="✓ Ready")

        except Exception as e:
            messagebox.showerror("Error", f"Failed to stop transcription:\n{e}")
            print(f"Error stopping transcription: {e}")

    def _process_audio_chunk(self, audio_chunk):
        """Process an audio chunk (noise suppression + transcription).

        Registered as the recording callback in ``_start_transcription``.
        The work runs in a new daemon thread per chunk so the caller is not
        blocked; a successful result is appended to the display via
        ``after(0, ...)``. Errors are printed and otherwise ignored.
        """
        def process():
            try:
                # Apply noise suppression
                processed_audio = self.noise_suppressor.process(audio_chunk, skip_silent=True)

                # Skip if silent (VAD filtered it out)
                if processed_audio is None:
                    return

                # Transcribe
                user_name = self.config.get('user.name', 'User')
                result = self.transcription_engine.transcribe(
                    processed_audio,
                    sample_rate=self.config.get('audio.sample_rate', 16000),
                    user_name=user_name
                )

                # Display result
                if result:
                    self.after(0, lambda: self.transcription_display.add_transcription(
                        text=result.text,
                        user_name=result.user_name,
                        timestamp=result.timestamp
                    ))

            except Exception as e:
                print(f"Error processing audio: {e}")

        # Run in background thread
        threading.Thread(target=process, daemon=True).start()

    def _clear_transcriptions(self):
        """Clear all transcriptions after the user confirms."""
        if messagebox.askyesno("Clear Transcriptions", "Are you sure you want to clear all transcriptions?"):
            self.transcription_display.clear()

    def _save_transcriptions(self):
        """Ask for a destination file and save the transcriptions to it.

        Does nothing if the dialog is cancelled; otherwise reports success
        or failure in a message box.
        """
        filepath = filedialog.asksaveasfilename(
            defaultextension=".txt",
            filetypes=[("Text files", "*.txt"), ("All files", "*.*")]
        )

        if filepath:
            if self.transcription_display.save_to_file(filepath):
                messagebox.showinfo("Saved", f"Transcriptions saved to:\n{filepath}")
            else:
                messagebox.showerror("Error", "Failed to save transcriptions")

    def _open_settings(self):
        """Open the settings dialog with the available audio and compute devices."""
        # Get audio devices
        audio_devices = AudioCapture.get_input_devices()
        if not audio_devices:
            audio_devices = [(0, "Default")]

        # Get compute devices. Build a new list instead of inserting into the
        # one returned by the device manager, so repeated opens cannot
        # accumulate "Auto-detect" entries in a list the manager may reuse.
        compute_devices = [
            ("auto", "Auto-detect"),
            *(self.device_manager.get_device_info() or []),
        ]

        # Open settings dialog
        SettingsDialog(
            self,
            self.config,
            audio_devices,
            compute_devices,
            on_save=self._on_settings_saved
        )

    def _on_settings_saved(self):
        """Apply saved settings that can change without a restart.

        Refreshes the user label and the display's line limit and timestamp
        flag, then reminds the user that model/device changes need a restart.
        """
        # Update user label
        user_name = self.config.get('user.name', 'User')
        self.user_label.configure(text=f"User: {user_name}")

        # Update display settings
        self.transcription_display.set_max_lines(self.config.get('display.max_lines', 100))
        self.transcription_display.set_show_timestamps(self.config.get('display.show_timestamps', True))

        # Note: Model/device changes require restart
        messagebox.showinfo(
            "Settings Saved",
            "Some settings (model size, device) require restarting the application to take effect."
        )

    def _on_closing(self):
        """Handle window closing.

        Stops any running transcription and unloads the model, then destroys
        the window. The window is destroyed even if cleanup raises, so a
        failing shutdown step cannot leave the application impossible to
        close.
        """
        try:
            # Stop transcription if running
            if self.is_transcribing:
                self._stop_transcription()

            # Unload model
            if self.transcription_engine:
                self.transcription_engine.unload_model()
        except Exception as e:
            print(f"Error during shutdown: {e}")
        finally:
            # Close window
            self.destroy()
|
||||
Reference in New Issue
Block a user