Initial commit: Local Transcription App v1.0

Phase 1 Complete - Standalone Desktop Application

Features:
- Real-time speech-to-text with Whisper (faster-whisper)
- PySide6 desktop GUI with settings dialog
- Web server for OBS browser source integration
- Audio capture with automatic sample rate detection and resampling
- Noise suppression with Voice Activity Detection (VAD)
- Configurable display settings (font, timestamps, fade duration)
- Settings apply without restart (with automatic model reloading)
- Auto-fade for web display transcriptions
- CPU/GPU support with automatic device detection
- Standalone executable builds (PyInstaller)
- CUDA build support (works on systems without CUDA hardware)

Components:
- Audio capture with sounddevice
- Noise reduction with noisereduce + webrtcvad
- Transcription with faster-whisper
- GUI with PySide6
- Web server with FastAPI + WebSocket
- Configuration system with YAML

Build System:
- Standard builds (CPU-only): build.sh / build.bat
- CUDA builds (universal): build-cuda.sh / build-cuda.bat
- Comprehensive BUILD.md documentation
- Cross-platform support (Linux, Windows)

Documentation:
- README.md with project overview and quick start
- BUILD.md with detailed build instructions
- NEXT_STEPS.md with future enhancement roadmap
- INSTALL.md with setup instructions

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
2025-12-25 18:48:23 -08:00
commit 472233aec4
31 changed files with 5116 additions and 0 deletions
--- a/gui/settings_dialog_qt.py
+++ b/gui/settings_dialog_qt.py
@@ -0,0 +1,261 @@
+"""PySide6 settings dialog for configuring the application."""
+
+from PySide6.QtWidgets import (
+    QDialog, QVBoxLayout, QHBoxLayout, QFormLayout,
+    QLabel, QLineEdit, QComboBox, QCheckBox, QSlider,
+    QPushButton, QMessageBox, QGroupBox
+)
+from PySide6.QtCore import Qt
+from typing import Callable, List, Tuple
+
+
+class SettingsDialog(QDialog):
+    """Dialog window for application settings using PySide6."""
+
+    def __init__(
+        self,
+        parent,
+        config,
+        audio_devices: List[Tuple[int, str]],
+        compute_devices: List[Tuple[str, str]],
+        on_save: Callable = None
+    ):
+        """
+        Initialize settings dialog.
+
+        Args:
+            parent: Parent window
+            config: Configuration object
+            audio_devices: List of (device_index, device_name) tuples
+            compute_devices: List of (device_id, device_description) tuples
+            on_save: Callback function when settings are saved
+        """
+        super().__init__(parent)
+
+        self.config = config
+        self.audio_devices = audio_devices
+        self.compute_devices = compute_devices
+        self.on_save = on_save
+
+        # Window configuration
+        self.setWindowTitle("Settings")
+        self.setMinimumSize(600, 700)
+        self.setModal(True)
+
+        self._create_widgets()
+        self._load_current_settings()
+
+    def _create_widgets(self):
+        """Create all settings widgets."""
+        main_layout = QVBoxLayout()
+        self.setLayout(main_layout)
+
+        # User Settings Group
+        user_group = QGroupBox("User Settings")
+        user_layout = QFormLayout()
+
+        self.name_input = QLineEdit()
+        user_layout.addRow("Display Name:", self.name_input)
+
+        user_group.setLayout(user_layout)
+        main_layout.addWidget(user_group)
+
+        # Audio Settings Group
+        audio_group = QGroupBox("Audio Settings")
+        audio_layout = QFormLayout()
+
+        self.audio_device_combo = QComboBox()
+        device_names = [name for _, name in self.audio_devices]
+        self.audio_device_combo.addItems(device_names)
+        audio_layout.addRow("Input Device:", self.audio_device_combo)
+
+        self.chunk_input = QLineEdit()
+        audio_layout.addRow("Chunk Duration (s):", self.chunk_input)
+
+        audio_group.setLayout(audio_layout)
+        main_layout.addWidget(audio_group)
+
+        # Transcription Settings Group
+        transcription_group = QGroupBox("Transcription Settings")
+        transcription_layout = QFormLayout()
+
+        self.model_combo = QComboBox()
+        self.model_combo.addItems(["tiny", "base", "small", "medium", "large"])
+        transcription_layout.addRow("Model Size:", self.model_combo)
+
+        self.compute_device_combo = QComboBox()
+        device_descs = [desc for _, desc in self.compute_devices]
+        self.compute_device_combo.addItems(device_descs)
+        transcription_layout.addRow("Compute Device:", self.compute_device_combo)
+
+        self.lang_combo = QComboBox()
+        self.lang_combo.addItems(["auto", "en", "es", "fr", "de", "it", "pt", "ru", "zh", "ja", "ko"])
+        transcription_layout.addRow("Language:", self.lang_combo)
+
+        transcription_group.setLayout(transcription_layout)
+        main_layout.addWidget(transcription_group)
+
+        # Noise Suppression Group
+        noise_group = QGroupBox("Noise Suppression")
+        noise_layout = QVBoxLayout()
+
+        self.noise_enabled_check = QCheckBox("Enable Noise Suppression")
+        noise_layout.addWidget(self.noise_enabled_check)
+
+        # Strength slider
+        strength_layout = QHBoxLayout()
+        strength_layout.addWidget(QLabel("Strength:"))
+
+        self.noise_strength_slider = QSlider(Qt.Horizontal)
+        self.noise_strength_slider.setMinimum(0)
+        self.noise_strength_slider.setMaximum(100)
+        self.noise_strength_slider.setValue(70)
+        self.noise_strength_slider.valueChanged.connect(self._update_strength_label)
+        strength_layout.addWidget(self.noise_strength_slider)
+
+        self.noise_strength_label = QLabel("0.7")
+        strength_layout.addWidget(self.noise_strength_label)
+
+        noise_layout.addLayout(strength_layout)
+
+        self.vad_enabled_check = QCheckBox("Enable Voice Activity Detection")
+        noise_layout.addWidget(self.vad_enabled_check)
+
+        noise_group.setLayout(noise_layout)
+        main_layout.addWidget(noise_group)
+
+        # Display Settings Group
+        display_group = QGroupBox("Display Settings")
+        display_layout = QFormLayout()
+
+        self.timestamps_check = QCheckBox()
+        display_layout.addRow("Show Timestamps:", self.timestamps_check)
+
+        self.maxlines_input = QLineEdit()
+        display_layout.addRow("Max Lines:", self.maxlines_input)
+
+        self.font_family_combo = QComboBox()
+        self.font_family_combo.addItems(["Courier", "Arial", "Times New Roman", "Consolas", "Monaco", "Monospace"])
+        display_layout.addRow("Font Family:", self.font_family_combo)
+
+        self.font_size_input = QLineEdit()
+        display_layout.addRow("Font Size:", self.font_size_input)
+
+        self.fade_seconds_input = QLineEdit()
+        display_layout.addRow("Fade After (seconds):", self.fade_seconds_input)
+
+        display_group.setLayout(display_layout)
+        main_layout.addWidget(display_group)
+
+        # Buttons
+        button_layout = QHBoxLayout()
+        button_layout.addStretch()
+
+        self.cancel_button = QPushButton("Cancel")
+        self.cancel_button.clicked.connect(self.reject)
+        button_layout.addWidget(self.cancel_button)
+
+        self.save_button = QPushButton("Save")
+        self.save_button.clicked.connect(self._save_settings)
+        self.save_button.setDefault(True)
+        button_layout.addWidget(self.save_button)
+
+        main_layout.addLayout(button_layout)
+
+    def _update_strength_label(self, value):
+        """Update the noise strength label."""
+        self.noise_strength_label.setText(f"{value / 100:.1f}")
+
+    def _load_current_settings(self):
+        """Load current settings from config."""
+        # User settings
+        self.name_input.setText(self.config.get('user.name', 'User'))
+
+        # Audio settings
+        current_device = self.config.get('audio.input_device', 'default')
+        for idx, (dev_idx, dev_name) in enumerate(self.audio_devices):
+            if str(dev_idx) == current_device or (current_device == 'default' and idx == 0):
+                self.audio_device_combo.setCurrentIndex(idx)
+                break
+
+        self.chunk_input.setText(str(self.config.get('audio.chunk_duration', 3.0)))
+
+        # Transcription settings
+        model = self.config.get('transcription.model', 'base')
+        self.model_combo.setCurrentText(model)
+
+        current_compute = self.config.get('transcription.device', 'auto')
+        for idx, (dev_id, dev_desc) in enumerate(self.compute_devices):
+            if dev_id == current_compute or (current_compute == 'auto' and idx == 0):
+                self.compute_device_combo.setCurrentIndex(idx)
+                break
+
+        lang = self.config.get('transcription.language', 'en')
+        self.lang_combo.setCurrentText(lang)
+
+        # Noise suppression
+        self.noise_enabled_check.setChecked(self.config.get('noise_suppression.enabled', True))
+        strength = self.config.get('noise_suppression.strength', 0.7)
+        self.noise_strength_slider.setValue(int(strength * 100))
+        self._update_strength_label(int(strength * 100))
+        self.vad_enabled_check.setChecked(self.config.get('processing.use_vad', True))
+
+        # Display settings
+        self.timestamps_check.setChecked(self.config.get('display.show_timestamps', True))
+        self.maxlines_input.setText(str(self.config.get('display.max_lines', 100)))
+
+        font_family = self.config.get('display.font_family', 'Courier')
+        self.font_family_combo.setCurrentText(font_family)
+
+        self.font_size_input.setText(str(self.config.get('display.font_size', 12)))
+        self.fade_seconds_input.setText(str(self.config.get('display.fade_after_seconds', 10)))
+
+    def _save_settings(self):
+        """Save settings to config."""
+        try:
+            # User settings
+            self.config.set('user.name', self.name_input.text())
+
+            # Audio settings
+            selected_audio_idx = self.audio_device_combo.currentIndex()
+            dev_idx, _ = self.audio_devices[selected_audio_idx]
+            self.config.set('audio.input_device', str(dev_idx))
+
+            chunk_duration = float(self.chunk_input.text())
+            self.config.set('audio.chunk_duration', chunk_duration)
+
+            # Transcription settings
+            self.config.set('transcription.model', self.model_combo.currentText())
+
+            selected_compute_idx = self.compute_device_combo.currentIndex()
+            dev_id, _ = self.compute_devices[selected_compute_idx]
+            self.config.set('transcription.device', dev_id)
+
+            self.config.set('transcription.language', self.lang_combo.currentText())
+
+            # Noise suppression
+            self.config.set('noise_suppression.enabled', self.noise_enabled_check.isChecked())
+            self.config.set('noise_suppression.strength', self.noise_strength_slider.value() / 100.0)
+            self.config.set('processing.use_vad', self.vad_enabled_check.isChecked())
+
+            # Display settings
+            self.config.set('display.show_timestamps', self.timestamps_check.isChecked())
+            max_lines = int(self.maxlines_input.text())
+            self.config.set('display.max_lines', max_lines)
+            self.config.set('display.font_family', self.font_family_combo.currentText())
+            font_size = int(self.font_size_input.text())
+            self.config.set('display.font_size', font_size)
+            fade_seconds = int(self.fade_seconds_input.text())
+            self.config.set('display.fade_after_seconds', fade_seconds)
+
+            # Call save callback
+            if self.on_save:
+                self.on_save()
+
+            QMessageBox.information(self, "Settings Saved", "Settings have been saved successfully!")
+            self.accept()
+
+        except ValueError as e:
+            QMessageBox.critical(self, "Invalid Input", f"Please check your input values:\n{e}")
+        except Exception as e:
+            QMessageBox.critical(self, "Error", f"Failed to save settings:\n{e}")