Initial commit: Local Transcription App v1.0

Phase 1 Complete - Standalone Desktop Application Features: - Real-time speech-to-text with Whisper (faster-whisper) - PySide6 desktop GUI with settings dialog - Web server for OBS browser source integration - Audio capture with automatic sample rate detection and resampling - Noise suppression with Voice Activity Detection (VAD) - Configurable display settings (font, timestamps, fade duration) - Settings apply without restart (with automatic model reloading) - Auto-fade for web display transcriptions - CPU/GPU support with automatic device detection - Standalone executable builds (PyInstaller) - CUDA build support (works on systems without CUDA hardware) Components: - Audio capture with sounddevice - Noise reduction with noisereduce + webrtcvad - Transcription with faster-whisper - GUI with PySide6 - Web server with FastAPI + WebSocket - Configuration system with YAML Build System: - Standard builds (CPU-only): build.sh / build.bat - CUDA builds (universal): build-cuda.sh / build-cuda.bat - Comprehensive BUILD.md documentation - Cross-platform support (Linux, Windows) Documentation: - README.md with project overview and quick start - BUILD.md with detailed build instructions - NEXT_STEPS.md with future enhancement roadmap - INSTALL.md with setup instructions 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude <noreply@anthropic.com>
2025-12-25 18:48:23 -08:00
commit 472233aec4
31 changed files with 5116 additions and 0 deletions
--- a/gui/settings_dialog.py
+++ b/gui/settings_dialog.py
@@ -0,0 +1,310 @@
+"""Settings dialog for configuring the application."""
+
+from tkinter import messagebox
+from typing import Callable, Dict, List, Optional, Tuple
+
+import customtkinter as ctk
+
+
+class SettingsDialog(ctk.CTkToplevel):
+    """Dialog window for application settings."""
+
+    def __init__(
+        self,
+        parent,
+        config,
+        audio_devices: List[Tuple[int, str]],
+        compute_devices: List[Tuple[str, str]],
+        on_save: Callable = None
+    ):
+        """
+        Initialize settings dialog.
+
+        Args:
+            parent: Parent window
+            config: Configuration object
+            audio_devices: List of (device_index, device_name) tuples
+            compute_devices: List of (device_id, device_description) tuples
+            on_save: Callback function when settings are saved
+        """
+        super().__init__(parent)
+
+        self.config = config
+        self.audio_devices = audio_devices
+        self.compute_devices = compute_devices
+        self.on_save = on_save
+
+        # Window configuration
+        self.title("Settings")
+        self.geometry("600x700")
+        self.resizable(False, False)
+
+        # Make dialog modal
+        self.transient(parent)
+        self.grab_set()
+
+        self._create_widgets()
+        self._load_current_settings()
+
+    def _create_widgets(self):
+        """Create all settings widgets."""
+        # Main container with padding
+        main_frame = ctk.CTkFrame(self)
+        main_frame.pack(fill="both", expand=True, padx=20, pady=20)
+
+        # User Settings Section
+        user_frame = ctk.CTkFrame(main_frame)
+        user_frame.pack(fill="x", pady=(0, 15))
+
+        ctk.CTkLabel(user_frame, text="User Settings", font=("", 16, "bold")).pack(
+            anchor="w", padx=10, pady=(10, 5)
+        )
+
+        # User name
+        name_frame = ctk.CTkFrame(user_frame)
+        name_frame.pack(fill="x", padx=10, pady=5)
+        ctk.CTkLabel(name_frame, text="Display Name:", width=150).pack(side="left", padx=5)
+        self.name_entry = ctk.CTkEntry(name_frame, width=300)
+        self.name_entry.pack(side="left", padx=5)
+
+        # Audio Settings Section
+        audio_frame = ctk.CTkFrame(main_frame)
+        audio_frame.pack(fill="x", pady=(0, 15))
+
+        ctk.CTkLabel(audio_frame, text="Audio Settings", font=("", 16, "bold")).pack(
+            anchor="w", padx=10, pady=(10, 5)
+        )
+
+        # Audio device
+        device_frame = ctk.CTkFrame(audio_frame)
+        device_frame.pack(fill="x", padx=10, pady=5)
+        ctk.CTkLabel(device_frame, text="Input Device:", width=150).pack(side="left", padx=5)
+        device_names = [name for _, name in self.audio_devices]
+        self.audio_device_menu = ctk.CTkOptionMenu(device_frame, values=device_names, width=300)
+        self.audio_device_menu.pack(side="left", padx=5)
+
+        # Chunk duration
+        chunk_frame = ctk.CTkFrame(audio_frame)
+        chunk_frame.pack(fill="x", padx=10, pady=5)
+        ctk.CTkLabel(chunk_frame, text="Chunk Duration (s):", width=150).pack(side="left", padx=5)
+        self.chunk_entry = ctk.CTkEntry(chunk_frame, width=100)
+        self.chunk_entry.pack(side="left", padx=5)
+
+        # Transcription Settings Section
+        transcription_frame = ctk.CTkFrame(main_frame)
+        transcription_frame.pack(fill="x", pady=(0, 15))
+
+        ctk.CTkLabel(transcription_frame, text="Transcription Settings", font=("", 16, "bold")).pack(
+            anchor="w", padx=10, pady=(10, 5)
+        )
+
+        # Model size
+        model_frame = ctk.CTkFrame(transcription_frame)
+        model_frame.pack(fill="x", padx=10, pady=5)
+        ctk.CTkLabel(model_frame, text="Model Size:", width=150).pack(side="left", padx=5)
+        self.model_menu = ctk.CTkOptionMenu(
+            model_frame,
+            values=["tiny", "base", "small", "medium", "large"],
+            width=200
+        )
+        self.model_menu.pack(side="left", padx=5)
+
+        # Compute device
+        compute_frame = ctk.CTkFrame(transcription_frame)
+        compute_frame.pack(fill="x", padx=10, pady=5)
+        ctk.CTkLabel(compute_frame, text="Compute Device:", width=150).pack(side="left", padx=5)
+        device_descs = [desc for _, desc in self.compute_devices]
+        self.compute_device_menu = ctk.CTkOptionMenu(compute_frame, values=device_descs, width=300)
+        self.compute_device_menu.pack(side="left", padx=5)
+
+        # Language
+        lang_frame = ctk.CTkFrame(transcription_frame)
+        lang_frame.pack(fill="x", padx=10, pady=5)
+        ctk.CTkLabel(lang_frame, text="Language:", width=150).pack(side="left", padx=5)
+        self.lang_menu = ctk.CTkOptionMenu(
+            lang_frame,
+            values=["auto", "en", "es", "fr", "de", "it", "pt", "ru", "zh", "ja", "ko"],
+            width=200
+        )
+        self.lang_menu.pack(side="left", padx=5)
+
+        # Noise Suppression Section
+        noise_frame = ctk.CTkFrame(main_frame)
+        noise_frame.pack(fill="x", pady=(0, 15))
+
+        ctk.CTkLabel(noise_frame, text="Noise Suppression", font=("", 16, "bold")).pack(
+            anchor="w", padx=10, pady=(10, 5)
+        )
+
+        # Enable noise suppression
+        ns_enable_frame = ctk.CTkFrame(noise_frame)
+        ns_enable_frame.pack(fill="x", padx=10, pady=5)
+        self.noise_enabled_var = ctk.BooleanVar()
+        self.noise_enabled_check = ctk.CTkCheckBox(
+            ns_enable_frame,
+            text="Enable Noise Suppression",
+            variable=self.noise_enabled_var
+        )
+        self.noise_enabled_check.pack(side="left", padx=5)
+
+        # Noise suppression strength
+        strength_frame = ctk.CTkFrame(noise_frame)
+        strength_frame.pack(fill="x", padx=10, pady=5)
+        ctk.CTkLabel(strength_frame, text="Strength:", width=150).pack(side="left", padx=5)
+        self.noise_strength_slider = ctk.CTkSlider(
+            strength_frame,
+            from_=0.0,
+            to=1.0,
+            number_of_steps=20,
+            width=300
+        )
+        self.noise_strength_slider.pack(side="left", padx=5)
+        self.noise_strength_label = ctk.CTkLabel(strength_frame, text="0.7", width=40)
+        self.noise_strength_label.pack(side="left", padx=5)
+        self.noise_strength_slider.configure(command=self._update_strength_label)
+
+        # VAD
+        vad_frame = ctk.CTkFrame(noise_frame)
+        vad_frame.pack(fill="x", padx=10, pady=5)
+        self.vad_enabled_var = ctk.BooleanVar()
+        self.vad_enabled_check = ctk.CTkCheckBox(
+            vad_frame,
+            text="Enable Voice Activity Detection",
+            variable=self.vad_enabled_var
+        )
+        self.vad_enabled_check.pack(side="left", padx=5)
+
+        # Display Settings Section
+        display_frame = ctk.CTkFrame(main_frame)
+        display_frame.pack(fill="x", pady=(0, 15))
+
+        ctk.CTkLabel(display_frame, text="Display Settings", font=("", 16, "bold")).pack(
+            anchor="w", padx=10, pady=(10, 5)
+        )
+
+        # Show timestamps
+        ts_frame = ctk.CTkFrame(display_frame)
+        ts_frame.pack(fill="x", padx=10, pady=5)
+        self.timestamps_var = ctk.BooleanVar()
+        self.timestamps_check = ctk.CTkCheckBox(
+            ts_frame,
+            text="Show Timestamps",
+            variable=self.timestamps_var
+        )
+        self.timestamps_check.pack(side="left", padx=5)
+
+        # Max lines
+        maxlines_frame = ctk.CTkFrame(display_frame)
+        maxlines_frame.pack(fill="x", padx=10, pady=5)
+        ctk.CTkLabel(maxlines_frame, text="Max Lines:", width=150).pack(side="left", padx=5)
+        self.maxlines_entry = ctk.CTkEntry(maxlines_frame, width=100)
+        self.maxlines_entry.pack(side="left", padx=5)
+
+        # Buttons
+        button_frame = ctk.CTkFrame(main_frame)
+        button_frame.pack(fill="x", pady=(10, 0))
+
+        self.save_button = ctk.CTkButton(
+            button_frame,
+            text="Save",
+            command=self._save_settings,
+            width=120
+        )
+        self.save_button.pack(side="right", padx=5)
+
+        self.cancel_button = ctk.CTkButton(
+            button_frame,
+            text="Cancel",
+            command=self.destroy,
+            width=120,
+            fg_color="gray"
+        )
+        self.cancel_button.pack(side="right", padx=5)
+
+    def _update_strength_label(self, value):
+        """Update the noise strength label."""
+        self.noise_strength_label.configure(text=f"{value:.1f}")
+
+    def _load_current_settings(self):
+        """Load current settings from config."""
+        # User settings
+        self.name_entry.insert(0, self.config.get('user.name', 'User'))
+
+        # Audio settings
+        current_device = self.config.get('audio.input_device', 'default')
+        for idx, (dev_idx, dev_name) in enumerate(self.audio_devices):
+            if str(dev_idx) == current_device or current_device == 'default' and idx == 0:
+                self.audio_device_menu.set(dev_name)
+                break
+
+        self.chunk_entry.insert(0, str(self.config.get('audio.chunk_duration', 3.0)))
+
+        # Transcription settings
+        self.model_menu.set(self.config.get('transcription.model', 'base'))
+
+        current_compute = self.config.get('transcription.device', 'auto')
+        for dev_id, dev_desc in self.compute_devices:
+            if dev_id == current_compute or (current_compute == 'auto' and dev_id == self.compute_devices[0][0]):
+                self.compute_device_menu.set(dev_desc)
+                break
+
+        self.lang_menu.set(self.config.get('transcription.language', 'en'))
+
+        # Noise suppression
+        self.noise_enabled_var.set(self.config.get('noise_suppression.enabled', True))
+        strength = self.config.get('noise_suppression.strength', 0.7)
+        self.noise_strength_slider.set(strength)
+        self._update_strength_label(strength)
+        self.vad_enabled_var.set(self.config.get('processing.use_vad', True))
+
+        # Display settings
+        self.timestamps_var.set(self.config.get('display.show_timestamps', True))
+        self.maxlines_entry.insert(0, str(self.config.get('display.max_lines', 100)))
+
+    def _save_settings(self):
+        """Save settings to config."""
+        try:
+            # User settings
+            self.config.set('user.name', self.name_entry.get())
+
+            # Audio settings
+            selected_audio = self.audio_device_menu.get()
+            for dev_idx, dev_name in self.audio_devices:
+                if dev_name == selected_audio:
+                    self.config.set('audio.input_device', str(dev_idx))
+                    break
+
+            chunk_duration = float(self.chunk_entry.get())
+            self.config.set('audio.chunk_duration', chunk_duration)
+
+            # Transcription settings
+            self.config.set('transcription.model', self.model_menu.get())
+
+            selected_compute = self.compute_device_menu.get()
+            for dev_id, dev_desc in self.compute_devices:
+                if dev_desc == selected_compute:
+                    self.config.set('transcription.device', dev_id)
+                    break
+
+            self.config.set('transcription.language', self.lang_menu.get())
+
+            # Noise suppression
+            self.config.set('noise_suppression.enabled', self.noise_enabled_var.get())
+            self.config.set('noise_suppression.strength', self.noise_strength_slider.get())
+            self.config.set('processing.use_vad', self.vad_enabled_var.get())
+
+            # Display settings
+            self.config.set('display.show_timestamps', self.timestamps_var.get())
+            max_lines = int(self.maxlines_entry.get())
+            self.config.set('display.max_lines', max_lines)
+
+            # Call save callback
+            if self.on_save:
+                self.on_save()
+
+            messagebox.showinfo("Settings Saved", "Settings have been saved successfully!")
+            self.destroy()
+
+        except ValueError as e:
+            messagebox.showerror("Invalid Input", f"Please check your input values:\n{e}")
+        except Exception as e:
+            messagebox.showerror("Error", f"Failed to save settings:\n{e}")