Initial commit: Local Transcription App v1.0

Phase 1 Complete - Standalone Desktop Application

Features:
- Real-time speech-to-text with Whisper (faster-whisper)
- PySide6 desktop GUI with settings dialog
- Web server for OBS browser source integration
- Audio capture with automatic sample rate detection and resampling
- Noise suppression with Voice Activity Detection (VAD)
- Configurable display settings (font, timestamps, fade duration)
- Settings apply without restart (with automatic model reloading)
- Auto-fade for web display transcriptions
- CPU/GPU support with automatic device detection
- Standalone executable builds (PyInstaller)
- CUDA build support (works on systems without CUDA hardware)

Components:
- Audio capture with sounddevice
- Noise reduction with noisereduce + webrtcvad
- Transcription with faster-whisper
- GUI with PySide6
- Web server with FastAPI + WebSocket
- Configuration system with YAML

Build System:
- Standard builds (CPU-only): build.sh / build.bat
- CUDA builds (universal): build-cuda.sh / build-cuda.bat
- Comprehensive BUILD.md documentation
- Cross-platform support (Linux, Windows)

Documentation:
- README.md with project overview and quick start
- BUILD.md with detailed build instructions
- NEXT_STEPS.md with future enhancement roadmap
- INSTALL.md with setup instructions

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
2025-12-25 18:48:23 -08:00
commit 472233aec4
31 changed files with 5116 additions and 0 deletions
--- a/gui/main_window_qt.py
+++ b/gui/main_window_qt.py
@@ -0,0 +1,524 @@
+"""PySide6 main application window for the local transcription app."""
+
+from PySide6.QtWidgets import (
+    QMainWindow, QWidget, QVBoxLayout, QHBoxLayout,
+    QPushButton, QLabel, QFileDialog, QMessageBox
+)
+from PySide6.QtCore import Qt, QThread, Signal
+from PySide6.QtGui import QFont
+from pathlib import Path
+import sys
+
+# Add parent directory to path for imports
+sys.path.append(str(Path(__file__).parent.parent))
+
+from client.config import Config
+from client.device_utils import DeviceManager
+from client.audio_capture import AudioCapture
+from client.noise_suppression import NoiseSuppressor
+from client.transcription_engine import TranscriptionEngine
+from gui.transcription_display_qt import TranscriptionDisplay
+from gui.settings_dialog_qt import SettingsDialog
+from server.web_display import TranscriptionWebServer
+import asyncio
+from threading import Thread
+
+
+class WebServerThread(Thread):
+    """Thread for running the web server."""
+
+    def __init__(self, web_server):
+        super().__init__(daemon=True)
+        self.web_server = web_server
+        self.loop = None
+
+    def run(self):
+        """Run the web server in async event loop."""
+        self.loop = asyncio.new_event_loop()
+        asyncio.set_event_loop(self.loop)
+        self.loop.run_until_complete(self.web_server.start())
+
+
+class ModelLoaderThread(QThread):
+    """Thread for loading the Whisper model without blocking the GUI."""
+
+    finished = Signal(bool, str)  # success, message
+
+    def __init__(self, transcription_engine):
+        super().__init__()
+        self.transcription_engine = transcription_engine
+
+    def run(self):
+        """Load the model in background thread."""
+        try:
+            success = self.transcription_engine.load_model()
+            if success:
+                self.finished.emit(True, "Model loaded successfully")
+            else:
+                self.finished.emit(False, "Failed to load model")
+        except Exception as e:
+            self.finished.emit(False, f"Error loading model: {e}")
+
+
+class MainWindow(QMainWindow):
+    """Main application window using PySide6."""
+
+    def __init__(self):
+        """Initialize the main window."""
+        super().__init__()
+
+        # Application state
+        self.is_transcribing = False
+        self.config = Config()
+        self.device_manager = DeviceManager()
+
+        # Components (initialized later)
+        self.audio_capture: AudioCapture = None
+        self.noise_suppressor: NoiseSuppressor = None
+        self.transcription_engine: TranscriptionEngine = None
+        self.model_loader_thread: ModelLoaderThread = None
+
+        # Track current model settings
+        self.current_model_size: str = None
+        self.current_device_config: str = None
+
+        # Web server components
+        self.web_server: TranscriptionWebServer = None
+        self.web_server_thread: WebServerThread = None
+
+        # Configure window
+        self.setWindowTitle("Local Transcription")
+        self.resize(900, 700)
+
+        # Create UI
+        self._create_widgets()
+
+        # Initialize components (in background)
+        self._initialize_components()
+
+        # Start web server if enabled
+        self._start_web_server_if_enabled()
+
+    def _create_widgets(self):
+        """Create all UI widgets."""
+        # Central widget
+        central_widget = QWidget()
+        self.setCentralWidget(central_widget)
+
+        main_layout = QVBoxLayout()
+        central_widget.setLayout(main_layout)
+
+        # Header
+        header_widget = QWidget()
+        header_widget.setFixedHeight(80)
+        header_layout = QHBoxLayout()
+        header_widget.setLayout(header_layout)
+
+        title_label = QLabel("Local Transcription")
+        title_font = QFont()
+        title_font.setPointSize(24)
+        title_font.setBold(True)
+        title_label.setFont(title_font)
+        header_layout.addWidget(title_label)
+
+        header_layout.addStretch()
+
+        self.settings_button = QPushButton("⚙ Settings")
+        self.settings_button.setFixedSize(120, 40)
+        self.settings_button.clicked.connect(self._open_settings)
+        header_layout.addWidget(self.settings_button)
+
+        main_layout.addWidget(header_widget)
+
+        # Status bar
+        status_widget = QWidget()
+        status_widget.setFixedHeight(60)
+        status_layout = QHBoxLayout()
+        status_widget.setLayout(status_layout)
+
+        self.status_label = QLabel("⚫ Initializing...")
+        status_font = QFont()
+        status_font.setPointSize(14)
+        self.status_label.setFont(status_font)
+        status_layout.addWidget(self.status_label)
+
+        device_info = self.device_manager.get_device_info()
+        device_text = device_info[0][1] if device_info else "No device"
+        self.device_label = QLabel(f"Device: {device_text}")
+        status_layout.addWidget(self.device_label)
+
+        user_name = self.config.get('user.name', 'User')
+        self.user_label = QLabel(f"User: {user_name}")
+        status_layout.addWidget(self.user_label)
+
+        status_layout.addStretch()
+
+        main_layout.addWidget(status_widget)
+
+        # Transcription display
+        self.transcription_display = TranscriptionDisplay(
+            max_lines=self.config.get('display.max_lines', 100),
+            show_timestamps=self.config.get('display.show_timestamps', True),
+            font_family=self.config.get('display.font_family', 'Courier'),
+            font_size=self.config.get('display.font_size', 12)
+        )
+        main_layout.addWidget(self.transcription_display)
+
+        # Control buttons
+        control_widget = QWidget()
+        control_widget.setFixedHeight(80)
+        control_layout = QHBoxLayout()
+        control_widget.setLayout(control_layout)
+
+        self.start_button = QPushButton("▶ Start Transcription")
+        self.start_button.setFixedSize(240, 50)
+        button_font = QFont()
+        button_font.setPointSize(14)
+        button_font.setBold(True)
+        self.start_button.setFont(button_font)
+        self.start_button.clicked.connect(self._toggle_transcription)
+        self.start_button.setStyleSheet("background-color: #2ecc71; color: white;")
+        control_layout.addWidget(self.start_button)
+
+        self.clear_button = QPushButton("Clear")
+        self.clear_button.setFixedSize(120, 50)
+        self.clear_button.clicked.connect(self._clear_transcriptions)
+        control_layout.addWidget(self.clear_button)
+
+        self.save_button = QPushButton("💾 Save")
+        self.save_button.setFixedSize(120, 50)
+        self.save_button.clicked.connect(self._save_transcriptions)
+        control_layout.addWidget(self.save_button)
+
+        control_layout.addStretch()
+
+        main_layout.addWidget(control_widget)
+
+    def _initialize_components(self):
+        """Initialize audio, noise suppression, and transcription components."""
+        # Update status
+        self.status_label.setText("⚙ Initializing...")
+
+        # Set device based on config
+        device_config = self.config.get('transcription.device', 'auto')
+        self.device_manager.set_device(device_config)
+
+        # Initialize transcription engine
+        model_size = self.config.get('transcription.model', 'base')
+        language = self.config.get('transcription.language', 'en')
+        device = self.device_manager.get_device_for_whisper()
+        compute_type = self.device_manager.get_compute_type()
+
+        # Track current settings
+        self.current_model_size = model_size
+        self.current_device_config = device_config
+
+        self.transcription_engine = TranscriptionEngine(
+            model_size=model_size,
+            device=device,
+            compute_type=compute_type,
+            language=language,
+            min_confidence=self.config.get('processing.min_confidence', 0.5)
+        )
+
+        # Load model in background thread
+        self.model_loader_thread = ModelLoaderThread(self.transcription_engine)
+        self.model_loader_thread.finished.connect(self._on_model_loaded)
+        self.model_loader_thread.start()
+
+    def _on_model_loaded(self, success: bool, message: str):
+        """Handle model loading completion."""
+        if success:
+            host = self.config.get('web_server.host', '127.0.0.1')
+            port = self.config.get('web_server.port', 8080)
+            self.status_label.setText(f"✓ Ready | Web: http://{host}:{port}")
+            self.start_button.setEnabled(True)
+        else:
+            self.status_label.setText("❌ Model loading failed")
+            QMessageBox.critical(self, "Error", message)
+            self.start_button.setEnabled(False)
+
+    def _start_web_server_if_enabled(self):
+        """Start web server."""
+        host = self.config.get('web_server.host', '127.0.0.1')
+        port = self.config.get('web_server.port', 8080)
+        show_timestamps = self.config.get('display.show_timestamps', True)
+        fade_after_seconds = self.config.get('display.fade_after_seconds', 10)
+
+        print(f"Starting web server at http://{host}:{port}")
+        self.web_server = TranscriptionWebServer(
+            host=host,
+            port=port,
+            show_timestamps=show_timestamps,
+            fade_after_seconds=fade_after_seconds
+        )
+        self.web_server_thread = WebServerThread(self.web_server)
+        self.web_server_thread.start()
+
+    def _toggle_transcription(self):
+        """Start or stop transcription."""
+        if not self.is_transcribing:
+            self._start_transcription()
+        else:
+            self._stop_transcription()
+
+    def _start_transcription(self):
+        """Start transcription."""
+        try:
+            # Check if engine is ready
+            if not self.transcription_engine or not self.transcription_engine.is_loaded:
+                QMessageBox.critical(self, "Error", "Transcription engine not ready")
+                return
+
+            # Get audio device
+            audio_device_str = self.config.get('audio.input_device', 'default')
+            audio_device = None if audio_device_str == 'default' else int(audio_device_str)
+
+            # Initialize audio capture
+            self.audio_capture = AudioCapture(
+                sample_rate=self.config.get('audio.sample_rate', 16000),
+                chunk_duration=self.config.get('audio.chunk_duration', 3.0),
+                device=audio_device
+            )
+
+            # Initialize noise suppressor
+            self.noise_suppressor = NoiseSuppressor(
+                sample_rate=self.config.get('audio.sample_rate', 16000),
+                method="noisereduce" if self.config.get('noise_suppression.enabled', True) else "none",
+                strength=self.config.get('noise_suppression.strength', 0.7),
+                use_vad=self.config.get('processing.use_vad', True)
+            )
+
+            # Start recording
+            self.audio_capture.start_recording(callback=self._process_audio_chunk)
+
+            # Update UI
+            self.is_transcribing = True
+            self.start_button.setText("⏸ Stop Transcription")
+            self.start_button.setStyleSheet("background-color: #e74c3c; color: white;")
+            self.status_label.setText("🔴 Recording...")
+
+        except Exception as e:
+            QMessageBox.critical(self, "Error", f"Failed to start transcription:\n{e}")
+            print(f"Error starting transcription: {e}")
+
+    def _stop_transcription(self):
+        """Stop transcription."""
+        try:
+            # Stop recording
+            if self.audio_capture:
+                self.audio_capture.stop_recording()
+
+            # Update UI
+            self.is_transcribing = False
+            self.start_button.setText("▶ Start Transcription")
+            self.start_button.setStyleSheet("background-color: #2ecc71; color: white;")
+            self.status_label.setText("✓ Ready")
+
+        except Exception as e:
+            QMessageBox.critical(self, "Error", f"Failed to stop transcription:\n{e}")
+            print(f"Error stopping transcription: {e}")
+
+    def _process_audio_chunk(self, audio_chunk):
+        """Process an audio chunk (noise suppression + transcription)."""
+        def process():
+            try:
+                # Apply noise suppression
+                processed_audio = self.noise_suppressor.process(audio_chunk, skip_silent=True)
+
+                # Skip if silent (VAD filtered it out)
+                if processed_audio is None:
+                    return
+
+                # Transcribe
+                user_name = self.config.get('user.name', 'User')
+                result = self.transcription_engine.transcribe(
+                    processed_audio,
+                    sample_rate=self.config.get('audio.sample_rate', 16000),
+                    user_name=user_name
+                )
+
+                # Display result (use Qt signal for thread safety)
+                if result:
+                    # We need to update UI from main thread
+                    # Note: We don't pass timestamp - let the display widget create it
+                    from PySide6.QtCore import QMetaObject, Q_ARG
+                    QMetaObject.invokeMethod(
+                        self.transcription_display,
+                        "add_transcription",
+                        Qt.QueuedConnection,
+                        Q_ARG(str, result.text),
+                        Q_ARG(str, result.user_name)
+                    )
+
+                    # Broadcast to web server if enabled
+                    if self.web_server and self.web_server_thread:
+                        asyncio.run_coroutine_threadsafe(
+                            self.web_server.broadcast_transcription(
+                                result.text,
+                                result.user_name,
+                                result.timestamp
+                            ),
+                            self.web_server_thread.loop
+                        )
+
+            except Exception as e:
+                print(f"Error processing audio: {e}")
+                import traceback
+                traceback.print_exc()
+
+        # Run in background thread
+        from threading import Thread
+        Thread(target=process, daemon=True).start()
+
+    def _clear_transcriptions(self):
+        """Clear all transcriptions."""
+        reply = QMessageBox.question(
+            self,
+            "Clear Transcriptions",
+            "Are you sure you want to clear all transcriptions?",
+            QMessageBox.Yes | QMessageBox.No
+        )
+
+        if reply == QMessageBox.Yes:
+            self.transcription_display.clear_all()
+
+    def _save_transcriptions(self):
+        """Save transcriptions to file."""
+        filepath, _ = QFileDialog.getSaveFileName(
+            self,
+            "Save Transcriptions",
+            "",
+            "Text files (*.txt);;All files (*.*)"
+        )
+
+        if filepath:
+            if self.transcription_display.save_to_file(filepath):
+                QMessageBox.information(self, "Saved", f"Transcriptions saved to:\n{filepath}")
+            else:
+                QMessageBox.critical(self, "Error", "Failed to save transcriptions")
+
+    def _open_settings(self):
+        """Open settings dialog."""
+        # Get audio devices
+        audio_devices = AudioCapture.get_input_devices()
+        if not audio_devices:
+            audio_devices = [(0, "Default")]
+
+        # Get compute devices
+        compute_devices = self.device_manager.get_device_info()
+        compute_devices.insert(0, ("auto", "Auto-detect"))
+
+        # Open settings dialog
+        dialog = SettingsDialog(
+            self,
+            self.config,
+            audio_devices,
+            compute_devices,
+            on_save=self._on_settings_saved
+        )
+        dialog.exec()
+
+    def _on_settings_saved(self):
+        """Handle settings being saved."""
+        # Update user label
+        user_name = self.config.get('user.name', 'User')
+        self.user_label.setText(f"User: {user_name}")
+
+        # Update display settings
+        show_timestamps = self.config.get('display.show_timestamps', True)
+        self.transcription_display.set_max_lines(self.config.get('display.max_lines', 100))
+        self.transcription_display.set_show_timestamps(show_timestamps)
+        self.transcription_display.set_font(
+            self.config.get('display.font_family', 'Courier'),
+            self.config.get('display.font_size', 12)
+        )
+
+        # Update web server settings
+        if self.web_server:
+            self.web_server.show_timestamps = show_timestamps
+            self.web_server.fade_after_seconds = self.config.get('display.fade_after_seconds', 10)
+
+        # Check if model/device settings changed - reload model if needed
+        new_model = self.config.get('transcription.model', 'base')
+        new_device_config = self.config.get('transcription.device', 'auto')
+
+        # Only reload if model size or device changed
+        if self.current_model_size != new_model or self.current_device_config != new_device_config:
+            self._reload_model()
+        else:
+            QMessageBox.information(self, "Settings Saved", "Settings have been applied successfully!")
+
+    def _reload_model(self):
+        """Reload the transcription model with new settings."""
+        # Stop transcription if running
+        was_transcribing = self.is_transcribing
+        if was_transcribing:
+            self._stop_transcription()
+
+        # Update status
+        self.status_label.setText("⚙ Reloading model...")
+        self.start_button.setEnabled(False)
+
+        # Unload current model
+        if self.transcription_engine:
+            self.transcription_engine.unload_model()
+
+        # Set device based on config
+        device_config = self.config.get('transcription.device', 'auto')
+        self.device_manager.set_device(device_config)
+
+        # Re-initialize transcription engine
+        model_size = self.config.get('transcription.model', 'base')
+        language = self.config.get('transcription.language', 'en')
+        device = self.device_manager.get_device_for_whisper()
+        compute_type = self.device_manager.get_compute_type()
+
+        # Update tracked settings
+        self.current_model_size = model_size
+        self.current_device_config = device_config
+
+        self.transcription_engine = TranscriptionEngine(
+            model_size=model_size,
+            device=device,
+            compute_type=compute_type,
+            language=language,
+            min_confidence=self.config.get('processing.min_confidence', 0.5)
+        )
+
+        # Load model in background thread
+        if self.model_loader_thread and self.model_loader_thread.isRunning():
+            self.model_loader_thread.wait()
+
+        self.model_loader_thread = ModelLoaderThread(self.transcription_engine)
+        self.model_loader_thread.finished.connect(self._on_model_reloaded)
+        self.model_loader_thread.start()
+
+    def _on_model_reloaded(self, success: bool, message: str):
+        """Handle model reloading completion."""
+        if success:
+            host = self.config.get('web_server.host', '127.0.0.1')
+            port = self.config.get('web_server.port', 8080)
+            self.status_label.setText(f"✓ Ready | Web: http://{host}:{port}")
+            self.start_button.setEnabled(True)
+            QMessageBox.information(self, "Settings Saved", "Model reloaded successfully with new settings!")
+        else:
+            self.status_label.setText("❌ Model loading failed")
+            QMessageBox.critical(self, "Error", f"Failed to reload model:\n{message}")
+            self.start_button.setEnabled(False)
+
+    def closeEvent(self, event):
+        """Handle window closing."""
+        # Stop transcription if running
+        if self.is_transcribing:
+            self._stop_transcription()
+
+        # Unload model
+        if self.transcription_engine:
+            self.transcription_engine.unload_model()
+
+        # Wait for model loader thread
+        if self.model_loader_thread and self.model_loader_thread.isRunning():
+            self.model_loader_thread.wait()
+
+        event.accept()