Initial commit: Local Transcription App v1.0

Phase 1 Complete - Standalone Desktop Application Features: - Real-time speech-to-text with Whisper (faster-whisper) - PySide6 desktop GUI with settings dialog - Web server for OBS browser source integration - Audio capture with automatic sample rate detection and resampling - Noise suppression with Voice Activity Detection (VAD) - Configurable display settings (font, timestamps, fade duration) - Settings apply without restart (with automatic model reloading) - Auto-fade for web display transcriptions - CPU/GPU support with automatic device detection - Standalone executable builds (PyInstaller) - CUDA build support (works on systems without CUDA hardware) Components: - Audio capture with sounddevice - Noise reduction with noisereduce + webrtcvad - Transcription with faster-whisper - GUI with PySide6 - Web server with FastAPI + WebSocket - Configuration system with YAML Build System: - Standard builds (CPU-only): build.sh / build.bat - CUDA builds (universal): build-cuda.sh / build-cuda.bat - Comprehensive BUILD.md documentation - Cross-platform support (Linux, Windows) Documentation: - README.md with project overview and quick start - BUILD.md with detailed build instructions - NEXT_STEPS.md with future enhancement roadmap - INSTALL.md with setup instructions 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude <noreply@anthropic.com>
2025-12-25 18:48:23 -08:00
commit 472233aec4
31 changed files with 5116 additions and 0 deletions
--- a/gui/main_window.py
+++ b/gui/main_window.py
@@ -0,0 +1,364 @@
+"""Main application window for the local transcription app."""
+
+import customtkinter as ctk
+from tkinter import filedialog, messagebox
+import threading
+from pathlib import Path
+import sys
+
+# Add parent directory to path for imports
+sys.path.append(str(Path(__file__).parent.parent))
+
+from client.config import Config
+from client.device_utils import DeviceManager
+from client.audio_capture import AudioCapture
+from client.noise_suppression import NoiseSuppressor
+from client.transcription_engine import TranscriptionEngine
+from gui.transcription_display import TranscriptionDisplay
+from gui.settings_dialog import SettingsDialog
+
+
+class MainWindow(ctk.CTk):
+    """Main application window."""
+
+    def __init__(self):
+        """Initialize the main window."""
+        super().__init__()
+
+        # Application state
+        self.is_transcribing = False
+        self.config = Config()
+        self.device_manager = DeviceManager()
+
+        # Components (initialized later)
+        self.audio_capture: AudioCapture = None
+        self.noise_suppressor: NoiseSuppressor = None
+        self.transcription_engine: TranscriptionEngine = None
+
+        # Configure window
+        self.title("Local Transcription")
+        self.geometry("900x700")
+
+        # Set theme
+        ctk.set_appearance_mode(self.config.get('display.theme', 'dark'))
+        ctk.set_default_color_theme("blue")
+
+        # Create UI
+        self._create_widgets()
+
+        # Handle window close
+        self.protocol("WM_DELETE_WINDOW", self._on_closing)
+
+        # Initialize components after GUI is ready (delay to avoid XCB threading issues)
+        self.after(100, self._initialize_components)
+
+    def _create_widgets(self):
+        """Create all UI widgets."""
+        # Header frame
+        header_frame = ctk.CTkFrame(self, height=80)
+        header_frame.pack(fill="x", padx=10, pady=(10, 0))
+        header_frame.pack_propagate(False)
+
+        # Title
+        title_label = ctk.CTkLabel(
+            header_frame,
+            text="Local Transcription",
+            font=("", 24, "bold")
+        )
+        title_label.pack(side="left", padx=20, pady=20)
+
+        # Settings button
+        self.settings_button = ctk.CTkButton(
+            header_frame,
+            text="⚙ Settings",
+            command=self._open_settings,
+            width=120
+        )
+        self.settings_button.pack(side="right", padx=20, pady=20)
+
+        # Status frame
+        status_frame = ctk.CTkFrame(self, height=60)
+        status_frame.pack(fill="x", padx=10, pady=(10, 0))
+        status_frame.pack_propagate(False)
+
+        # Status label
+        self.status_label = ctk.CTkLabel(
+            status_frame,
+            text="⚫ Ready",
+            font=("", 14)
+        )
+        self.status_label.pack(side="left", padx=20)
+
+        # Device info
+        device_info = self.device_manager.get_device_info()
+        device_text = device_info[0][1] if device_info else "No device"
+        self.device_label = ctk.CTkLabel(
+            status_frame,
+            text=f"Device: {device_text}",
+            font=("", 12)
+        )
+        self.device_label.pack(side="left", padx=20)
+
+        # User name display
+        user_name = self.config.get('user.name', 'User')
+        self.user_label = ctk.CTkLabel(
+            status_frame,
+            text=f"User: {user_name}",
+            font=("", 12)
+        )
+        self.user_label.pack(side="left", padx=20)
+
+        # Transcription display frame
+        display_frame = ctk.CTkFrame(self)
+        display_frame.pack(fill="both", expand=True, padx=10, pady=10)
+
+        # Transcription display
+        self.transcription_display = TranscriptionDisplay(
+            display_frame,
+            max_lines=self.config.get('display.max_lines', 100),
+            show_timestamps=self.config.get('display.show_timestamps', True),
+            font=("Courier", self.config.get('display.font_size', 12))
+        )
+        self.transcription_display.pack(fill="both", expand=True, padx=10, pady=10)
+
+        # Control frame
+        control_frame = ctk.CTkFrame(self, height=80)
+        control_frame.pack(fill="x", padx=10, pady=(0, 10))
+        control_frame.pack_propagate(False)
+
+        # Start/Stop button
+        self.start_button = ctk.CTkButton(
+            control_frame,
+            text="▶ Start Transcription",
+            command=self._toggle_transcription,
+            width=200,
+            height=50,
+            font=("", 16, "bold"),
+            fg_color="green"
+        )
+        self.start_button.pack(side="left", padx=20, pady=15)
+
+        # Clear button
+        self.clear_button = ctk.CTkButton(
+            control_frame,
+            text="Clear",
+            command=self._clear_transcriptions,
+            width=120,
+            height=50
+        )
+        self.clear_button.pack(side="left", padx=10, pady=15)
+
+        # Save button
+        self.save_button = ctk.CTkButton(
+            control_frame,
+            text="💾 Save",
+            command=self._save_transcriptions,
+            width=120,
+            height=50
+        )
+        self.save_button.pack(side="left", padx=10, pady=15)
+
+    def _initialize_components(self):
+        """Initialize audio, noise suppression, and transcription components."""
+        # Update status
+        self.status_label.configure(text="⚙ Initializing...")
+        self.update()
+
+        try:
+            # Set device based on config
+            device_config = self.config.get('transcription.device', 'auto')
+            self.device_manager.set_device(device_config)
+
+            # Initialize transcription engine
+            model_size = self.config.get('transcription.model', 'base')
+            language = self.config.get('transcription.language', 'en')
+            device = self.device_manager.get_device_for_whisper()
+            compute_type = self.device_manager.get_compute_type()
+
+            self.transcription_engine = TranscriptionEngine(
+                model_size=model_size,
+                device=device,
+                compute_type=compute_type,
+                language=language,
+                min_confidence=self.config.get('processing.min_confidence', 0.5)
+            )
+
+            # Load model (synchronously to avoid X11 threading issues)
+            success = self.transcription_engine.load_model()
+
+            if success:
+                self.status_label.configure(text="✓ Ready")
+            else:
+                self.status_label.configure(text="❌ Model loading failed")
+                messagebox.showerror("Error", "Failed to load transcription model")
+
+        except Exception as e:
+            print(f"Error initializing components: {e}")
+            self.status_label.configure(text="❌ Initialization failed")
+            messagebox.showerror("Error", f"Failed to initialize:\n{e}")
+
+    def _update_status(self, status: str):
+        """Update status label (thread-safe)."""
+        self.after(0, lambda: self.status_label.configure(text=status))
+
+    def _toggle_transcription(self):
+        """Start or stop transcription."""
+        if not self.is_transcribing:
+            self._start_transcription()
+        else:
+            self._stop_transcription()
+
+    def _start_transcription(self):
+        """Start transcription."""
+        try:
+            # Check if engine is ready
+            if not self.transcription_engine or not self.transcription_engine.is_loaded:
+                messagebox.showerror("Error", "Transcription engine not ready")
+                return
+
+            # Get audio device
+            audio_device_str = self.config.get('audio.input_device', 'default')
+            audio_device = None if audio_device_str == 'default' else int(audio_device_str)
+
+            # Initialize audio capture
+            self.audio_capture = AudioCapture(
+                sample_rate=self.config.get('audio.sample_rate', 16000),
+                chunk_duration=self.config.get('audio.chunk_duration', 3.0),
+                device=audio_device
+            )
+
+            # Initialize noise suppressor
+            self.noise_suppressor = NoiseSuppressor(
+                sample_rate=self.config.get('audio.sample_rate', 16000),
+                method="noisereduce" if self.config.get('noise_suppression.enabled', True) else "none",
+                strength=self.config.get('noise_suppression.strength', 0.7),
+                use_vad=self.config.get('processing.use_vad', True)
+            )
+
+            # Start recording
+            self.audio_capture.start_recording(callback=self._process_audio_chunk)
+
+            # Update UI
+            self.is_transcribing = True
+            self.start_button.configure(text="⏸ Stop Transcription", fg_color="red")
+            self.status_label.configure(text="🔴 Recording...")
+
+        except Exception as e:
+            messagebox.showerror("Error", f"Failed to start transcription:\n{e}")
+            print(f"Error starting transcription: {e}")
+
+    def _stop_transcription(self):
+        """Stop transcription."""
+        try:
+            # Stop recording
+            if self.audio_capture:
+                self.audio_capture.stop_recording()
+
+            # Update UI
+            self.is_transcribing = False
+            self.start_button.configure(text="▶ Start Transcription", fg_color="green")
+            self.status_label.configure(text="✓ Ready")
+
+        except Exception as e:
+            messagebox.showerror("Error", f"Failed to stop transcription:\n{e}")
+            print(f"Error stopping transcription: {e}")
+
+    def _process_audio_chunk(self, audio_chunk):
+        """Process an audio chunk (noise suppression + transcription)."""
+        def process():
+            try:
+                # Apply noise suppression
+                processed_audio = self.noise_suppressor.process(audio_chunk, skip_silent=True)
+
+                # Skip if silent (VAD filtered it out)
+                if processed_audio is None:
+                    return
+
+                # Transcribe
+                user_name = self.config.get('user.name', 'User')
+                result = self.transcription_engine.transcribe(
+                    processed_audio,
+                    sample_rate=self.config.get('audio.sample_rate', 16000),
+                    user_name=user_name
+                )
+
+                # Display result
+                if result:
+                    self.after(0, lambda: self.transcription_display.add_transcription(
+                        text=result.text,
+                        user_name=result.user_name,
+                        timestamp=result.timestamp
+                    ))
+
+            except Exception as e:
+                print(f"Error processing audio: {e}")
+
+        # Run in background thread
+        threading.Thread(target=process, daemon=True).start()
+
+    def _clear_transcriptions(self):
+        """Clear all transcriptions."""
+        if messagebox.askyesno("Clear Transcriptions", "Are you sure you want to clear all transcriptions?"):
+            self.transcription_display.clear()
+
+    def _save_transcriptions(self):
+        """Save transcriptions to file."""
+        filepath = filedialog.asksaveasfilename(
+            defaultextension=".txt",
+            filetypes=[("Text files", "*.txt"), ("All files", "*.*")]
+        )
+
+        if filepath:
+            if self.transcription_display.save_to_file(filepath):
+                messagebox.showinfo("Saved", f"Transcriptions saved to:\n{filepath}")
+            else:
+                messagebox.showerror("Error", "Failed to save transcriptions")
+
+    def _open_settings(self):
+        """Open settings dialog."""
+        # Get audio devices
+        audio_devices = AudioCapture.get_input_devices()
+        if not audio_devices:
+            audio_devices = [(0, "Default")]
+
+        # Get compute devices
+        compute_devices = self.device_manager.get_device_info()
+        compute_devices.insert(0, ("auto", "Auto-detect"))
+
+        # Open settings dialog
+        SettingsDialog(
+            self,
+            self.config,
+            audio_devices,
+            compute_devices,
+            on_save=self._on_settings_saved
+        )
+
+    def _on_settings_saved(self):
+        """Handle settings being saved."""
+        # Update user label
+        user_name = self.config.get('user.name', 'User')
+        self.user_label.configure(text=f"User: {user_name}")
+
+        # Update display settings
+        self.transcription_display.set_max_lines(self.config.get('display.max_lines', 100))
+        self.transcription_display.set_show_timestamps(self.config.get('display.show_timestamps', True))
+
+        # Note: Model/device changes require restart
+        messagebox.showinfo(
+            "Settings Saved",
+            "Some settings (model size, device) require restarting the application to take effect."
+        )
+
+    def _on_closing(self):
+        """Handle window closing."""
+        # Stop transcription if running
+        if self.is_transcribing:
+            self._stop_transcription()
+
+        # Unload model
+        if self.transcription_engine:
+            self.transcription_engine.unload_model()
+
+        # Close window
+        self.destroy()