From 472233aec49d8f39d3e1dde2fb03830242dd53e3 Mon Sep 17 00:00:00 2001 From: Josh Knapp Date: Thu, 25 Dec 2025 18:48:23 -0800 Subject: [PATCH] Initial commit: Local Transcription App v1.0 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Phase 1 Complete - Standalone Desktop Application Features: - Real-time speech-to-text with Whisper (faster-whisper) - PySide6 desktop GUI with settings dialog - Web server for OBS browser source integration - Audio capture with automatic sample rate detection and resampling - Noise suppression with Voice Activity Detection (VAD) - Configurable display settings (font, timestamps, fade duration) - Settings apply without restart (with automatic model reloading) - Auto-fade for web display transcriptions - CPU/GPU support with automatic device detection - Standalone executable builds (PyInstaller) - CUDA build support (works on systems without CUDA hardware) Components: - Audio capture with sounddevice - Noise reduction with noisereduce + webrtcvad - Transcription with faster-whisper - GUI with PySide6 - Web server with FastAPI + WebSocket - Configuration system with YAML Build System: - Standard builds (CPU-only): build.sh / build.bat - CUDA builds (universal): build-cuda.sh / build-cuda.bat - Comprehensive BUILD.md documentation - Cross-platform support (Linux, Windows) Documentation: - README.md with project overview and quick start - BUILD.md with detailed build instructions - NEXT_STEPS.md with future enhancement roadmap - INSTALL.md with setup instructions šŸ¤– Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude --- .gitignore | 56 ++++ BUILD.md | 259 ++++++++++++++++ INSTALL.md | 194 ++++++++++++ NEXT_STEPS.md | 440 +++++++++++++++++++++++++++ README.md | 494 ++++++++++++++++++++++++++++++ build-cuda.bat | 56 ++++ build-cuda.sh | 57 ++++ build.bat | 34 +++ build.sh | 32 ++ client/__init__.py | 0 client/audio_capture.py | 246 +++++++++++++++ client/config.py | 141 +++++++++ client/device_utils.py | 128 ++++++++ client/noise_suppression.py | 164 ++++++++++ client/transcription_engine.py | 232 ++++++++++++++ config/default_config.yaml | 40 +++ gui/__init__.py | 0 gui/main_window.py | 364 ++++++++++++++++++++++ gui/main_window_qt.py | 524 ++++++++++++++++++++++++++++++++ gui/settings_dialog.py | 310 +++++++++++++++++++ gui/settings_dialog_qt.py | 261 ++++++++++++++++ gui/transcription_display.py | 127 ++++++++ gui/transcription_display_qt.py | 159 ++++++++++ local-transcription.spec | 86 ++++++ main.py | 52 ++++ main_cli.py | 221 ++++++++++++++ pyproject.toml | 59 ++++ requirements.txt | 23 ++ server/__init__.py | 0 server/web_display.py | 233 ++++++++++++++ test_components.py | 124 ++++++++ 31 files changed, 5116 insertions(+) create mode 100644 .gitignore create mode 100644 BUILD.md create mode 100644 INSTALL.md create mode 100644 NEXT_STEPS.md create mode 100644 README.md create mode 100644 build-cuda.bat create mode 100755 build-cuda.sh create mode 100644 build.bat create mode 100755 build.sh create mode 100644 client/__init__.py create mode 100644 client/audio_capture.py create mode 100644 client/config.py create mode 100644 client/device_utils.py create mode 100644 client/noise_suppression.py create mode 100644 client/transcription_engine.py create mode 100644 config/default_config.yaml create mode 100644 gui/__init__.py create mode 100644 gui/main_window.py create mode 100644 gui/main_window_qt.py create mode 100644 gui/settings_dialog.py create mode 100644 
gui/settings_dialog_qt.py create mode 100644 gui/transcription_display.py create mode 100644 gui/transcription_display_qt.py create mode 100644 local-transcription.spec create mode 100644 main.py create mode 100755 main_cli.py create mode 100644 pyproject.toml create mode 100644 requirements.txt create mode 100644 server/__init__.py create mode 100644 server/web_display.py create mode 100644 test_components.py diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..54216fa --- /dev/null +++ b/.gitignore @@ -0,0 +1,56 @@ +# Python +__pycache__/ +*.py[cod] +*$py.class +*.so +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +*.egg-info/ +.installed.cfg +*.egg + +# Virtual environments +venv/ +env/ +ENV/ +.venv/ +.venv + +# uv +uv.lock +.python-version + +# IDE +.vscode/ +.idea/ +*.swp +*.swo +*~ + +# OS +.DS_Store +Thumbs.db + +# Application specific +*.log +config/*.yaml +!config/default_config.yaml +.local-transcription/ + +# Model cache +models/ +.cache/ + +# PyInstaller +*.spec.lock diff --git a/BUILD.md b/BUILD.md new file mode 100644 index 0000000..099dce3 --- /dev/null +++ b/BUILD.md @@ -0,0 +1,259 @@ +# Building Local Transcription + +This guide explains how to build standalone executables for Linux and Windows. + +## Prerequisites + +1. **Python 3.8+** installed on your system +2. **uv** package manager (install from https://docs.astral.sh/uv/) +3. All project dependencies installed (`uv sync`) + +## Building for Linux + +### Standard Build (CPU-only): + +```bash +# Make the build script executable (first time only) +chmod +x build.sh + +# Run the build script +./build.sh +``` + +### CUDA Build (GPU Support): + +Build with CUDA support even without NVIDIA hardware: + +```bash +# Make the build script executable (first time only) +chmod +x build-cuda.sh + +# Run the CUDA build script +./build-cuda.sh +``` + +This will: +- Install PyTorch with CUDA 12.1 support +- Bundle CUDA runtime libraries (~600MB extra) +- Create an executable that works on both GPU and CPU systems +- Automatically fall back to CPU if no CUDA GPU is available + +The executable will be created in `dist/LocalTranscription/LocalTranscription` + +### Manual build: +```bash +# Clean previous builds +rm -rf build dist + +# Build with PyInstaller +uv run pyinstaller local-transcription.spec +``` + +### Distribution: +```bash +cd dist +tar -czf LocalTranscription-Linux.tar.gz LocalTranscription/ +``` + +## Building for Windows + +### Standard Build (CPU-only): + +```cmd +# Run the build script +build.bat +``` + +### CUDA Build (GPU Support): + +Build with CUDA support even without NVIDIA hardware: + +```cmd +# Run the CUDA build script +build-cuda.bat +``` + +This will: +- Install PyTorch with CUDA 12.1 support +- Bundle CUDA runtime libraries (~600MB extra) +- Create an executable that works on both GPU and CPU systems +- Automatically fall back to CPU if no CUDA GPU is available + +The executable will be created in `dist\LocalTranscription\LocalTranscription.exe` + +### Manual build: +```cmd +# Clean previous builds +rmdir /s /q build +rmdir /s /q dist + +# Build with PyInstaller +uv run pyinstaller local-transcription.spec +``` + +### Distribution: +- Compress the `dist\LocalTranscription` folder to a ZIP file +- Or use an installer creator like NSIS or Inno Setup + +## Important Notes + +### Cross-Platform Building + +**You cannot cross-compile!** +- Linux executables must be built on Linux +- Windows executables must be built on Windows +- 
Mac executables must be built on macOS + +### First Run + +On the first run, the application will: +1. Create a config directory at `~/.local-transcription/` (Linux) or `%USERPROFILE%\.local-transcription\` (Windows) +2. Download the Whisper model (if not already present) +3. The model will be cached in `~/.cache/huggingface/` by default + +### Executable Size + +The built executable will be large (300MB - 2GB+) because it includes: +- Python runtime +- PySide6 (Qt framework) +- PyTorch/faster-whisper +- NumPy, SciPy, and other dependencies + +### Console Window + +By default, the console window is visible (for debugging). To hide it: + +1. Edit `local-transcription.spec` +2. Change `console=True` to `console=False` in the `EXE` section +3. Rebuild + +### GPU Support + +#### Building with CUDA (Recommended for Distribution) + +**Yes, you CAN build with CUDA support on systems without NVIDIA GPUs!** + +PyTorch provides CUDA-enabled builds that bundle the CUDA runtime libraries. This means: + +1. **You don't need NVIDIA hardware** to create CUDA-enabled builds +2. **The executable will work everywhere** - on systems with or without NVIDIA GPUs +3. **Automatic fallback** - the app detects available hardware and uses GPU if available, CPU otherwise +4. **Larger file size** - adds ~600MB-1GB to the executable size + +**How it works:** +```bash +# Linux +./build-cuda.sh + +# Windows +build-cuda.bat +``` + +The build script will: +- Install PyTorch with bundled CUDA 12.1 runtime +- Package all CUDA libraries into the executable +- Create a universal build that runs on any system + +**When users run the executable:** +- If they have an NVIDIA GPU with drivers: Uses GPU acceleration +- If they don't have NVIDIA GPU: Automatically uses CPU +- No configuration needed - it just works! + +#### Alternative: CPU-Only Builds + +If you only want CPU support (smaller file size): +```bash +# Linux +./build.sh + +# Windows +build.bat +``` + +#### AMD GPU Support + +- **ROCm**: Requires special PyTorch builds from AMD +- Not recommended for general distribution +- Better to use CUDA build (works on all systems) or CPU build + +### Optimizations + +To reduce size: + +1. **Remove unused model sizes**: The app downloads models on-demand, so you don't need to bundle them +2. **Use UPX compression**: Already enabled in the spec file +3. **Exclude dev dependencies**: Only build dependencies are needed + +## Testing the Build + +After building, test the executable: + +### Linux: +```bash +cd dist/LocalTranscription +./LocalTranscription +``` + +### Windows: +```cmd +cd dist\LocalTranscription +LocalTranscription.exe +``` + +## Troubleshooting + +### Missing modules error +If you get "No module named X" errors, add the module to the `hiddenimports` list in `local-transcription.spec` + +### DLL errors (Windows) +Make sure Visual C++ Redistributable is installed on the target system: +https://aka.ms/vs/17/release/vc_redist.x64.exe + +### Audio device errors +The application needs access to audio devices. Ensure: +- Microphone permissions are granted +- Audio drivers are installed +- PulseAudio (Linux) or Windows Audio is running + +### Model download fails +Ensure internet connection on first run. Models are downloaded from: +https://huggingface.co/guillaumekln/faster-whisper-base + +## Advanced: Adding an Icon + +1. Create or obtain an `.ico` file (Windows) or `.png` file (Linux) +2. Edit `local-transcription.spec` +3. Change `icon=None` to `icon='path/to/your/icon.ico'` +4. 
Rebuild + +## Advanced: Creating an Installer + +### Windows (using Inno Setup): + +1. Install Inno Setup: https://jrsoftware.org/isinfo.php +2. Create an `.iss` script file +3. Build the installer + +### Linux (using AppImage): + +```bash +# Install appimagetool +wget https://github.com/AppImage/AppImageKit/releases/download/continuous/appimagetool-x86_64.AppImage +chmod +x appimagetool-x86_64.AppImage + +# Create AppDir structure +mkdir -p LocalTranscription.AppDir/usr/bin +cp -r dist/LocalTranscription/* LocalTranscription.AppDir/usr/bin/ + +# Create desktop file and icon +# (Create .desktop file and icon as needed) + +# Build AppImage +./appimagetool-x86_64.AppImage LocalTranscription.AppDir +``` + +## Support + +For build issues, check: +1. PyInstaller documentation: https://pyinstaller.org/ +2. Project issues: https://github.com/anthropics/claude-code/issues diff --git a/INSTALL.md b/INSTALL.md new file mode 100644 index 0000000..efbfd38 --- /dev/null +++ b/INSTALL.md @@ -0,0 +1,194 @@ +# Installation Guide + +## Prerequisites + +- **Python 3.9 or higher** +- **uv** (Python package installer) +- **FFmpeg** (required by faster-whisper) +- **CUDA-capable GPU** (optional, for GPU acceleration) + +### Installing uv + +If you don't have `uv` installed: + +```bash +# On macOS and Linux +curl -LsSf https://astral.sh/uv/install.sh | sh + +# On Windows +powershell -c "irm https://astral.sh/uv/install.ps1 | iex" + +# Or with pip +pip install uv +``` + +### Installing FFmpeg + +#### On Ubuntu/Debian: +```bash +sudo apt update +sudo apt install ffmpeg +``` + +#### On macOS (with Homebrew): +```bash +brew install ffmpeg +``` + +#### On Windows: +Download from [ffmpeg.org](https://ffmpeg.org/download.html) and add to PATH. + +## Installation Steps + +### 1. Navigate to Project Directory + +```bash +cd /home/jknapp/code/local-transcription +``` + +### 2. Install Dependencies with uv + +```bash +# uv will automatically create a virtual environment and install dependencies +uv sync +``` + +This single command will: +- Create a virtual environment (`.venv/`) +- Install all dependencies from `pyproject.toml` +- Lock dependencies for reproducibility + +**Note for CUDA users:** If you have an NVIDIA GPU, install PyTorch with CUDA support: + +```bash +# For CUDA 11.8 +uv pip install torch --index-url https://download.pytorch.org/whl/cu118 + +# For CUDA 12.1 +uv pip install torch --index-url https://download.pytorch.org/whl/cu121 +``` + +### 3. 
Run the Application + +```bash +# Option 1: Using uv run (automatically uses the venv) +uv run python main.py + +# Option 2: Activate venv manually +source .venv/bin/activate # On Windows: .venv\Scripts\activate +python main.py +``` + +On first run, the application will: +- Download the Whisper model (this may take a few minutes) +- Create a configuration file at `~/.local-transcription/config.yaml` + +## Quick Start Commands + +```bash +# Install everything +uv sync + +# Run the application +uv run python main.py + +# Install with server dependencies (for Phase 2+) +uv sync --extra server + +# Update dependencies +uv sync --upgrade +``` + +## Configuration + +Settings can be changed through the GUI (Settings button) or by editing: +``` +~/.local-transcription/config.yaml +``` + +## Troubleshooting + +### Audio Device Issues + +If no audio devices are detected: +```bash +uv run python -c "import sounddevice as sd; print(sd.query_devices())" +``` + +### GPU Not Detected + +Check if CUDA is available: +```bash +uv run python -c "import torch; print(f'CUDA available: {torch.cuda.is_available()}')" +``` + +### Model Download Fails + +Models are downloaded to `~/.cache/huggingface/`. If download fails: +- Check internet connection +- Ensure sufficient disk space (~1-3 GB depending on model size) + +### uv Command Not Found + +Make sure uv is in your PATH: +```bash +# Add to ~/.bashrc or ~/.zshrc +export PATH="$HOME/.cargo/bin:$PATH" +``` + +## Performance Tips + +For best real-time performance: + +1. **Use GPU if available** - 5-10x faster than CPU +2. **Start with smaller models**: + - `tiny`: Fastest, ~39M parameters, 1-2s latency + - `base`: Good balance, ~74M parameters, 2-3s latency + - `small`: Better accuracy, ~244M parameters, 3-5s latency +3. **Enable VAD** (Voice Activity Detection) to skip silent audio +4. **Adjust chunk duration**: Smaller = lower latency, larger = better accuracy + +## System Requirements + +### Minimum: +- CPU: Dual-core 2GHz+ +- RAM: 4GB +- Model: tiny or base + +### Recommended: +- CPU: Quad-core 3GHz+ or GPU (NVIDIA GTX 1060+) +- RAM: 8GB +- Model: base or small + +### For Best Performance: +- GPU: NVIDIA RTX 2060 or better +- RAM: 16GB +- Model: small or medium + +## Development + +### Install development dependencies: +```bash +uv sync --extra dev +``` + +### Run tests: +```bash +uv run pytest +``` + +### Format code: +```bash +uv run black . +uv run ruff check . +``` + +## Why uv? + +`uv` is significantly faster than pip: +- **10-100x faster** dependency resolution +- **Automatic virtual environment** management +- **Reproducible builds** with lockfile +- **Drop-in replacement** for pip commands + +Learn more at [astral.sh/uv](https://astral.sh/uv) diff --git a/NEXT_STEPS.md b/NEXT_STEPS.md new file mode 100644 index 0000000..bd766d5 --- /dev/null +++ b/NEXT_STEPS.md @@ -0,0 +1,440 @@ +# Next Steps for Local Transcription + +This document outlines potential future enhancements and features for the Local Transcription application. 
+
+## Current Status: Phase 1 Complete āœ…
+
+The application currently has:
+- āœ… Desktop GUI with PySide6
+- āœ… Real-time transcription with Whisper (faster-whisper)
+- āœ… Audio capture with automatic sample rate detection and resampling
+- āœ… Noise suppression with Voice Activity Detection (VAD)
+- āœ… Web server for OBS browser source integration
+- āœ… Configurable display settings (font, timestamps, fade duration)
+- āœ… Settings apply without restart
+- āœ… Auto-fade for web display
+- āœ… Standalone executable builds for Linux and Windows
+- āœ… CUDA support (with automatic CPU fallback)
+
+## Phase 2: Multi-User Server Architecture (Optional)
+
+If you want to enable multiple users to sync their transcriptions to a shared display:
+
+### Server Components
+
+1. **WebSocket Server**
+   - Accept connections from multiple clients
+   - Aggregate transcriptions from all connected users
+   - Broadcast to web display clients
+   - Handle user authentication/authorization
+   - Rate limiting and abuse prevention
+
+2. **Database/Storage** (Optional)
+   - Store transcription history
+   - User management
+   - Session logs for later review
+   - Consider: SQLite, PostgreSQL, or Redis
+
+3. **Web Admin Interface**
+   - Monitor connected clients
+   - View active sessions
+   - Manage users and permissions
+   - Export transcription logs
+
+### Client Updates
+
+1. **Server Sync Toggle**
+   - Enable/disable server sync in Settings
+   - Server URL configuration
+   - API key/authentication setup
+   - Connection status indicator
+
+2. **Network Handling**
+   - Auto-reconnect on connection loss
+   - Queue transcriptions when offline
+   - Sync when connection restored
+
+### Implementation Technologies
+
+- **Server Framework**: FastAPI (already used for web display)
+- **WebSocket**: Already integrated
+- **Database**: SQLAlchemy + SQLite/PostgreSQL
+- **Deployment**: Docker container for easy deployment
+
+**Estimated Effort**: 2-3 weeks for full implementation
+
+---
+
+## Phase 3: Enhanced Features
+
+### Transcription Improvements
+
+1. **Multi-Language Support**
+   - Automatic language detection
+   - Real-time language switching
+   - Translation between languages
+   - Per-user language settings
+
+2. **Speaker Diarization**
+   - Detect and label different speakers
+   - Use pyannote.audio or similar
+   - Automatically assign speaker IDs
+
+3. **Custom Vocabulary**
+   - Add gaming terms, streamer names
+   - Technical jargon support
+   - Proper noun correction
+
+4. **Punctuation & Formatting**
+   - Automatic punctuation insertion
+   - Sentence capitalization
+   - Better text formatting
+
+### Display Enhancements
+
+1. **Theme System**
+   - Light/dark themes
+   - Custom color schemes
+   - User-created themes (JSON/YAML)
+   - Per-element styling
+
+2. **Animation Options**
+   - Different fade effects
+   - Slide in/out animations
+   - Configurable transition speeds
+   - Particle effects (optional)
+
+3. **Layout Modes**
+   - Karaoke-style (word highlighting)
+   - Ticker tape (scrolling bottom)
+   - Multi-column for multiple users
+   - Picture-in-picture mode
+
+4. **Web Display Customization**
+   - CSS customization interface
+   - Live preview in settings
+   - Save/load custom styles
+   - Community theme sharing
+
+### Audio Processing
+
+1. **Advanced Noise Reduction**
+   - RNNoise integration
+   - Custom noise profiles
+   - Adaptive filtering
+   - Echo cancellation
+
+2. **Audio Effects**
+   - Equalization presets
+   - Compression/normalization
+   - Voice enhancement filters
+
+3. 
**Multi-Input Support** + - Multiple microphones simultaneously + - Virtual audio cable integration + - Audio routing/mixing + +--- + +## Phase 4: Integration & Automation + +### OBS Integration + +1. **OBS Plugin** (Advanced) + - Native OBS plugin instead of browser source + - Lower resource usage + - Better performance + - Tighter integration + +2. **Scene Integration** + - Auto-show/hide based on speech + - Integrate with OBS scene switcher + - Hotkey support + +### Streaming Platform Integration + +1. **Twitch Integration** + - Send captions to Twitch chat + - Twitch API integration + - Custom Twitch bot + +2. **YouTube Integration** + - Live caption upload + - YouTube API integration + +3. **Discord Integration** + - Send transcriptions to Discord webhook + - Discord bot for voice chat transcription + +### Automation + +1. **Hotkey Support** + - Global hotkeys for start/stop + - Toggle display visibility + - Quick settings access + +2. **Voice Commands** + - "Hey Transcription, start/stop" + - Command detection in audio stream + - Configurable wake words + +3. **Auto-Start Options** + - Start with OBS + - Start on system boot + - Auto-detect streaming software + +--- + +## Phase 5: Advanced Features + +### AI Enhancements + +1. **Summarization** + - Real-time conversation summaries + - Key point extraction + - Topic detection + +2. **Sentiment Analysis** + - Detect tone/emotion + - Highlight important moments + - Filter profanity (optional) + +3. **Context Awareness** + - Remember conversation context + - Better transcription accuracy + - Adaptive vocabulary + +### Analytics & Insights + +1. **Usage Statistics** + - Words per minute + - Speaking time per user + - Most common words/phrases + - Accuracy metrics + +2. **Export Options** + - Export to SRT/VTT for video captions + - PDF/Word document export + - CSV for data analysis + - JSON API for custom tools + +3. **Search & Filter** + - Search transcription history + - Filter by user, date, keyword + - Highlight search results + +### Accessibility + +1. **Screen Reader Support** + - Full NVDA/JAWS compatibility + - Keyboard navigation + - Voice feedback + +2. **High Contrast Modes** + - Enhanced visibility options + - Color blind friendly palettes + +3. **Text-to-Speech** + - Read back transcriptions + - Multiple voice options + - Speed control + +--- + +## Performance Optimizations + +### Current Considerations + +1. **Model Optimization** + - Quantization (int8, int4) + - Smaller model variants + - TensorRT optimization (NVIDIA) + - ONNX Runtime support + +2. **Caching** + - Cache common phrases + - Model warm-up on startup + - Preload frequently used resources + +3. **Resource Management** + - Dynamic batch sizing + - Memory pooling + - Thread pool optimization + +### Future Optimizations + +1. **Distributed Processing** + - Offload to cloud GPU + - Share processing across multiple machines + - Load balancing + +2. **Edge Computing** + - Run on edge devices (Raspberry Pi) + - Mobile app support + - Embedded systems + +--- + +## Community Features + +### Sharing & Collaboration + +1. **Theme Marketplace** + - Share custom themes + - Download community themes + - Rating system + +2. **Plugin System** + - Allow community plugins + - Custom audio filters + - Display widgets + - Integration modules + +3. **Documentation** + - Video tutorials + - Wiki/knowledge base + - API documentation + - Developer guides + +### User Support + +1. **In-App Help** + - Contextual help tooltips + - Getting started wizard + - Troubleshooting guide + +2. 
**Community Forum** + - GitHub Discussions + - Discord server + - Reddit community + +--- + +## Technical Debt & Maintenance + +### Code Quality + +1. **Testing** + - Unit tests for core modules + - Integration tests + - End-to-end tests + - Performance benchmarks + +2. **Documentation** + - API documentation + - Code comments + - Architecture diagrams + - Developer setup guide + +3. **CI/CD** + - Automated builds + - Automated testing + - Release automation + - Cross-platform testing + +### Security + +1. **Security Audits** + - Dependency scanning + - Vulnerability assessment + - Code security review + +2. **Data Privacy** + - Local-first by default + - Optional cloud features + - GDPR compliance (if applicable) + - Clear privacy policy + +--- + +## Immediate Quick Wins + +These are small enhancements that could be implemented quickly: + +### Easy (< 1 day) + +- [ ] Add application icon +- [ ] Add "About" dialog with version info +- [ ] Add keyboard shortcuts (Ctrl+S for settings, etc.) +- [ ] Add system tray icon +- [ ] Save window position/size +- [ ] Add "Check for Updates" feature +- [ ] Export transcriptions to text file + +### Medium (1-3 days) + +- [ ] Add profanity filter (optional) +- [ ] Add confidence score display +- [ ] Add audio level meter +- [ ] Multiple language support in UI +- [ ] Dark/light theme toggle +- [ ] Backup/restore settings +- [ ] Recent transcriptions history + +### Larger (1+ weeks) + +- [ ] Cloud sync for settings +- [ ] Mobile companion app +- [ ] Browser extension +- [ ] API server mode +- [ ] Plugin architecture +- [ ] Advanced audio visualization + +--- + +## Resources & References + +### Documentation +- [Faster-Whisper](https://github.com/guillaumekln/faster-whisper) +- [PySide6 Documentation](https://doc.qt.io/qtforpython/) +- [FastAPI Documentation](https://fastapi.tiangolo.com/) +- [PyInstaller Manual](https://pyinstaller.org/en/stable/) + +### Similar Projects +- [whisper.cpp](https://github.com/ggerganov/whisper.cpp) - C++ implementation +- [Buzz](https://github.com/chidiwilliams/buzz) - Desktop transcription tool +- [OpenAI Whisper](https://github.com/openai/whisper) - Original implementation + +### Community +- Create GitHub Discussions for feature requests +- Set up issue templates +- Contributing guidelines +- Code of conduct + +--- + +## Decision Log + +Track major architectural decisions here: + +### 2025-12-25: PyInstaller for Distribution +- **Decision**: Use PyInstaller for creating standalone executables +- **Rationale**: Good PySide6 support, active development, cross-platform +- **Alternatives Considered**: cx_Freeze, Nuitka, py2exe +- **Impact**: Users can run without Python installation + +### 2025-12-25: CUDA Build Strategy +- **Decision**: Provide CUDA-enabled builds that bundle CUDA runtime +- **Rationale**: Universal builds work everywhere, automatic GPU detection +- **Trade-off**: Larger file size (~600MB extra) for better UX +- **Impact**: Single build for both GPU and CPU users + +### 2025-12-25: Web Server Always Running +- **Decision**: Remove enable/disable toggle, always run web server +- **Rationale**: Simplifies UX, no configuration needed for OBS +- **Impact**: Uses one local port (8080 by default), minimal overhead + +--- + +## Contact & Contribution + +When this project is public: +- **Issues**: Report bugs and request features on GitHub Issues +- **Pull Requests**: Contributions welcome! 
See CONTRIBUTING.md +- **Discussions**: Join GitHub Discussions for questions and ideas +- **License**: [To be determined - consider MIT or Apache 2.0] + +--- + +*Last Updated: 2025-12-25* +*Version: 1.0.0 (Phase 1 Complete)* diff --git a/README.md b/README.md new file mode 100644 index 0000000..c045df5 --- /dev/null +++ b/README.md @@ -0,0 +1,494 @@ +# Local Transcription for Streamers + +A local speech-to-text application designed for streamers that provides real-time transcription using Whisper or similar models. Multiple users can run the application locally and sync their transcriptions to a centralized web stream that can be easily captured in OBS or other streaming software. + +## Features + +- **Standalone Desktop Application**: Use locally with built-in GUI display - no server required +- **Local Transcription**: Run Whisper (or compatible models) locally on your machine +- **CPU/GPU Support**: Choose between CPU or GPU processing based on your hardware +- **Real-time Processing**: Live audio transcription with minimal latency +- **Noise Suppression**: Built-in audio preprocessing to reduce background noise +- **User Configuration**: Set your display name and preferences through the GUI +- **Optional Multi-user Sync**: Connect to a server to sync transcriptions with other users +- **OBS Integration**: Web-based output designed for easy browser source capture +- **Privacy-First**: All processing happens locally; only transcription text is shared +- **Customizable**: Configure model size, language, and streaming settings + +## Quick Start + +### Running from Source + +```bash +# Install dependencies +uv sync + +# Run the application +uv run python main.py +``` + +### Building Standalone Executables + +To create standalone executables for distribution: + +**Linux:** +```bash +./build.sh +``` + +**Windows:** +```cmd +build.bat +``` + +For detailed build instructions, see [BUILD.md](BUILD.md). + +## Architecture Overview + +The application can run in two modes: + +### Standalone Mode (No Server Required): +1. **Desktop Application**: Captures audio, performs speech-to-text, and displays transcriptions locally in a GUI window + +### Multi-user Sync Mode (Optional): +1. **Local Transcription Client**: Captures audio, performs speech-to-text, and sends results to the web server +2. **Centralized Web Server**: Aggregates transcriptions from multiple clients and serves a web stream +3. **Web Stream Interface**: Browser-accessible page displaying synchronized transcriptions (for OBS capture) + +## Use Cases + +- **Multi-language Streams**: Multiple translators transcribing in different languages +- **Accessibility**: Provide real-time captions for viewers +- **Collaborative Podcasts**: Multiple hosts with separate transcriptions +- **Gaming Commentary**: Track who said what in multiplayer sessions + +--- + +## Implementation Plan + +### Phase 1: Standalone Desktop Application + +**Objective**: Build a fully functional standalone transcription app with GUI that works without any server + +#### Components: +1. **Audio Capture Module** + - Capture system audio or microphone input + - Support multiple audio sources (virtual audio cables, physical devices) + - Real-time audio buffering with configurable chunk sizes + - **Noise Suppression**: Preprocess audio to reduce background noise + - Libraries: `pyaudio`, `sounddevice`, `noisereduce`, `webrtcvad` + +2. 
**Noise Suppression Engine** + - Real-time noise reduction using RNNoise or noisereduce + - Adjustable noise reduction strength + - Optional VAD (Voice Activity Detection) to skip silent segments + - Libraries: `noisereduce`, `rnnoise-python`, `webrtcvad` + +3. **Transcription Engine** + - Integrate OpenAI Whisper (or alternatives: faster-whisper, whisper.cpp) + - Support multiple model sizes (tiny, base, small, medium, large) + - CPU and GPU inference options + - Model management and automatic downloading + - Libraries: `openai-whisper`, `faster-whisper`, `torch` + +4. **Device Selection** + - Auto-detect available compute devices (CPU, CUDA, MPS for Mac) + - Allow user to specify preferred device via GUI + - Graceful fallback if GPU unavailable + - Display device status and performance metrics + +5. **Desktop GUI Application** + - Cross-platform GUI using PyQt6, Tkinter, or CustomTkinter + - Main transcription display window (scrolling text area) + - Settings panel for configuration + - User name input field + - Audio input device selector + - Model size selector + - CPU/GPU toggle + - Start/Stop transcription button + - Optional: System tray integration + - Libraries: `PyQt6`, `customtkinter`, or `tkinter` + +6. **Local Display** + - Real-time transcription display in GUI window + - Scrolling text with timestamps + - User name/label shown with transcriptions + - Copy transcription to clipboard + - Optional: Save transcription to file (TXT, SRT, VTT) + +#### Tasks: +- [ ] Set up project structure and dependencies +- [ ] Implement audio capture with device selection +- [ ] Add noise suppression and VAD preprocessing +- [ ] Integrate Whisper model loading and inference +- [ ] Add CPU/GPU device detection and selection logic +- [ ] Create real-time audio buffer processing pipeline +- [ ] Design and implement GUI layout (main window) +- [ ] Add settings panel with user name configuration +- [ ] Implement local transcription display area +- [ ] Add start/stop controls and status indicators +- [ ] Test transcription accuracy and latency +- [ ] Test noise suppression effectiveness + +--- + +### Phase 2: Web Server and Sync System + +**Objective**: Create a centralized server to aggregate and serve transcriptions + +#### Components: +1. **Web Server** + - FastAPI or Flask-based REST API + - WebSocket support for real-time updates + - User/client registration and management + - Libraries: `fastapi`, `uvicorn`, `websockets` + +2. **Transcription Aggregator** + - Receive transcription chunks from multiple clients + - Associate transcriptions with user IDs/names + - Timestamp management and synchronization + - Buffer management for smooth streaming + +3. 
**Database/Storage** (Optional) + - Store transcription history (SQLite for simplicity) + - Session management + - Export functionality (SRT, VTT, TXT formats) + +#### API Endpoints: +- `POST /api/register` - Register a new client +- `POST /api/transcription` - Submit transcription chunk +- `WS /api/stream` - WebSocket for real-time transcription stream +- `GET /stream` - Web page for OBS browser source + +#### Tasks: +- [ ] Set up FastAPI server with CORS support +- [ ] Implement WebSocket handler for real-time streaming +- [ ] Create client registration system +- [ ] Build transcription aggregation logic +- [ ] Add timestamp synchronization +- [ ] Create data models for clients and transcriptions + +--- + +### Phase 3: Client-Server Communication (Optional Multi-user Mode) + +**Objective**: Add optional server connectivity to enable multi-user transcription sync + +#### Components: +1. **HTTP/WebSocket Client** + - Register client with server on startup + - Send transcription chunks as they're generated + - Handle connection drops and reconnection + - Libraries: `requests`, `websockets` + +2. **Configuration System** + - Config file for server URL, API keys, user settings + - Model preferences (size, language) + - Audio input settings + - Format: YAML or JSON + +3. **Status Monitoring** + - Connection status indicator + - Transcription queue health + - Error handling and logging + +#### Tasks: +- [ ] Add "Enable Server Sync" toggle to GUI +- [ ] Add server URL configuration field in settings +- [ ] Implement WebSocket client for sending transcriptions +- [ ] Add configuration file support (YAML/JSON) +- [ ] Create connection management with auto-reconnect +- [ ] Add local logging and error handling +- [ ] Add server connection status indicator to GUI +- [ ] Allow app to function normally if server is unavailable + +--- + +### Phase 4: Web Stream Interface (OBS Integration) + +**Objective**: Create a web page that displays synchronized transcriptions for OBS + +#### Components: +1. **Web Frontend** + - HTML/CSS/JavaScript page for displaying transcriptions + - Responsive design with customizable styling + - Auto-scroll with configurable retention window + - Libraries: Vanilla JS or lightweight framework (Alpine.js, htmx) + +2. **Styling Options** + - Customizable fonts, colors, sizes + - Background transparency for OBS chroma key + - User name/ID display options + - Timestamp display (optional) + +3. **Display Modes** + - Scrolling captions (like live TV captions) + - Multi-user panel view (separate sections per user) + - Overlay mode (minimal UI for transparency) + +#### Tasks: +- [ ] Create HTML template for transcription display +- [ ] Implement WebSocket client in JavaScript +- [ ] Add CSS styling with OBS-friendly transparency +- [ ] Create customization controls (URL parameters or UI) +- [ ] Test with OBS browser source +- [ ] Add configurable retention/scroll behavior + +--- + +### Phase 5: Advanced Features + +**Objective**: Enhance functionality and user experience + +#### Features: +1. **Language Detection** + - Auto-detect spoken language + - Multi-language support in single stream + - Language selector in GUI + +2. **Speaker Diarization** (Optional) + - Identify different speakers + - Label transcriptions by speaker + - Useful for multi-host streams + +3. **Profanity Filtering** + - Optional word filtering/replacement + - Customizable filter lists + - Toggle in GUI settings + +4. 
**Advanced Noise Profiles** + - Save and load custom noise profiles + - Adaptive noise suppression + - Different profiles for different environments + +5. **Export Functionality** + - Save transcriptions in multiple formats (TXT, SRT, VTT, JSON) + - Export button in GUI + - Automatic session saving + +6. **Hotkey Support** + - Global hotkeys to start/stop transcription + - Mute/unmute hotkey + - Quick save hotkey + +7. **Docker Support** + - Containerized server deployment + - Docker Compose for easy multi-component setup + - Pre-built images for easy deployment + +8. **Themes and Customization** + - Dark/light theme toggle + - Customizable font sizes and colors for display + - OBS-friendly transparent overlay mode + +#### Tasks: +- [ ] Add language detection and multi-language support +- [ ] Implement speaker diarization +- [ ] Create optional profanity filter +- [ ] Add export functionality (SRT, VTT, plain text, JSON) +- [ ] Implement global hotkey support +- [ ] Create Docker containers for server component +- [ ] Add theme customization options +- [ ] Create advanced noise profile management + +--- + +## Technology Stack + +### Local Client: +- **Python 3.9+** +- **GUI**: PyQt6 / CustomTkinter / tkinter +- **Audio**: PyAudio / sounddevice +- **Noise Suppression**: noisereduce / rnnoise-python +- **VAD**: webrtcvad +- **ML Framework**: PyTorch (for Whisper) +- **Transcription**: openai-whisper / faster-whisper +- **Networking**: websockets, requests (optional for server sync) +- **Config**: PyYAML / json + +### Server: +- **Backend**: FastAPI / Flask +- **WebSocket**: python-websockets / FastAPI WebSockets +- **Server**: Uvicorn / Gunicorn +- **Database** (optional): SQLite / PostgreSQL +- **CORS**: fastapi-cors + +### Web Interface: +- **Frontend**: HTML5, CSS3, JavaScript (ES6+) +- **Real-time**: WebSocket API +- **Styling**: CSS Grid/Flexbox for layout + +--- + +## Project Structure + +``` +local-transcription/ + client/ # Local transcription client +  __init__.py +  audio_capture.py # Audio input handling +  transcription_engine.py # Whisper integration +  network_client.py # Server communication +  config.py # Configuration management +  main.py # Client entry point + server/ # Centralized web server +  __init__.py +  api.py # FastAPI routes +  websocket_handler.py # WebSocket management +  models.py # Data models +  database.py # Optional DB layer +  main.py # Server entry point + web/ # Web stream interface +  index.html # OBS browser source page +  styles.css # Customizable styling +  app.js # WebSocket client & UI logic + config/ +  client_config.example.yaml +  server_config.example.yaml + tests/ +  test_audio.py +  test_transcription.py +  test_server.py + requirements.txt # Python dependencies + README.md + main.py # Combined launcher (optional) +``` + +--- + +## Installation (Planned) + +### Prerequisites: +- Python 3.9 or higher +- CUDA-capable GPU (optional, for GPU acceleration) +- FFmpeg (required by Whisper) + +### Steps: + +1. **Clone the repository** + ```bash + git clone + cd local-transcription + ``` + +2. **Install dependencies** + ```bash + pip install -r requirements.txt + ``` + +3. **Download Whisper models** + ```bash + # Models will be auto-downloaded on first run + # Or manually download: + python -c "import whisper; whisper.load_model('base')" + ``` + +4. **Configure client** + ```bash + cp config/client_config.example.yaml config/client_config.yaml + # Edit config/client_config.yaml with your settings + ``` + +5. 
**Run the server** (one instance) + ```bash + python server/main.py + ``` + +6. **Run the client** (on each user's machine) + ```bash + python client/main.py + ``` + +7. **Add to OBS** + - Add a Browser Source + - URL: `http://:8000/stream` + - Set width/height as needed + - Check "Shutdown source when not visible" for performance + +--- + +## Configuration (Planned) + +### Client Configuration: +```yaml +user: + name: "Streamer1" # Display name for transcriptions + id: "unique-user-id" # Optional unique identifier + +audio: + input_device: "default" # or specific device index + sample_rate: 16000 + chunk_duration: 2.0 # seconds + +noise_suppression: + enabled: true # Enable/disable noise reduction + strength: 0.7 # 0.0 to 1.0 - reduction strength + method: "noisereduce" # "noisereduce" or "rnnoise" + +transcription: + model: "base" # tiny, base, small, medium, large + device: "cuda" # cpu, cuda, mps + language: "en" # or "auto" for detection + task: "transcribe" # or "translate" + +processing: + use_vad: true # Voice Activity Detection + min_confidence: 0.5 # Minimum transcription confidence + +server_sync: + enabled: false # Enable multi-user server sync + url: "ws://localhost:8000" # Server URL (when enabled) + api_key: "" # Optional API key + +display: + show_timestamps: true # Show timestamps in local display + max_lines: 100 # Maximum lines to keep in display + font_size: 12 # GUI font size +``` + +### Server Configuration: +```yaml +server: + host: "0.0.0.0" + port: 8000 + api_key_required: false + +stream: + max_clients: 10 + buffer_size: 100 # messages to buffer + retention_time: 300 # seconds + +database: + enabled: false + path: "transcriptions.db" +``` + +--- + +## Roadmap + +- [x] Project planning and architecture design +- [ ] Phase 1: Standalone desktop application with GUI +- [ ] Phase 2: Web server and sync system (optional multi-user mode) +- [ ] Phase 3: Client-server communication (optional) +- [ ] Phase 4: Web stream interface for OBS (optional) +- [ ] Phase 5: Advanced features (hotkeys, themes, Docker, etc.) + +--- + +## Contributing + +Contributions are welcome! Please feel free to submit issues or pull requests. + +--- + +## License + +[Choose appropriate license - MIT, Apache 2.0, etc.] + +--- + +## Acknowledgments + +- OpenAI Whisper for the excellent speech recognition model +- The streaming community for inspiration and use cases diff --git a/build-cuda.bat b/build-cuda.bat new file mode 100644 index 0000000..bfbf6f5 --- /dev/null +++ b/build-cuda.bat @@ -0,0 +1,56 @@ +@echo off +REM Build script for Windows with CUDA support + +echo Building Local Transcription with CUDA support... +echo ================================================== +echo. +echo This will create a build that supports both CPU and CUDA GPUs. +echo The executable will be larger (~2-3GB) but will work on any system. +echo. + +set /p INSTALL_CUDA="Install PyTorch with CUDA support? (y/n) " +if /i "%INSTALL_CUDA%"=="y" ( + echo Installing PyTorch with CUDA 12.1 support... + + REM Uninstall CPU-only version if present + uv pip uninstall -y torch + + REM Install CUDA-enabled PyTorch + REM This installs PyTorch with bundled CUDA runtime + uv pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu121 + + echo CUDA-enabled PyTorch installed + echo. +) + +REM Clean previous builds +echo Cleaning previous builds... +if exist build rmdir /s /q build +if exist dist rmdir /s /q dist + +REM Build with PyInstaller +echo Running PyInstaller... 
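+REM The spec file controls what gets bundled; if the built app reports a missing
+REM module at runtime, add it to the hiddenimports list in local-transcription.spec (see BUILD.md).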
+uv run pyinstaller local-transcription.spec + +REM Check if build succeeded +if exist "dist\LocalTranscription" ( + echo. + echo Build successful! + echo Executable location: dist\LocalTranscription\LocalTranscription.exe + echo. + echo CUDA Support: YES (falls back to CPU if CUDA not available^) + echo. + echo To run the application: + echo cd dist\LocalTranscription + echo LocalTranscription.exe + echo. + echo To create a distributable package: + echo - Compress the dist\LocalTranscription folder to a ZIP file + echo - Name it: LocalTranscription-Windows-CUDA.zip + echo. + echo Note: This build will work on systems with or without NVIDIA GPUs. +) else ( + echo. + echo Build failed! + exit /b 1 +) diff --git a/build-cuda.sh b/build-cuda.sh new file mode 100755 index 0000000..5d2719f --- /dev/null +++ b/build-cuda.sh @@ -0,0 +1,57 @@ +#!/bin/bash +# Build script for Linux with CUDA support + +echo "Building Local Transcription with CUDA support..." +echo "==================================================" +echo "" +echo "This will create a build that supports both CPU and CUDA GPUs." +echo "The executable will be larger (~2-3GB) but will work on any system." +echo "" + +# Check if we should install CUDA-enabled PyTorch +read -p "Install PyTorch with CUDA support? (y/n) " -n 1 -r +echo +if [[ $REPLY =~ ^[Yy]$ ]] +then + echo "Installing PyTorch with CUDA 12.1 support..." + # Uninstall CPU-only version if present + uv pip uninstall -y torch + + # Install CUDA-enabled PyTorch + # This installs PyTorch with bundled CUDA runtime + uv pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu121 + + echo "āœ“ CUDA-enabled PyTorch installed" + echo "" +fi + +# Clean previous builds +echo "Cleaning previous builds..." +rm -rf build dist + +# Build with PyInstaller +echo "Running PyInstaller..." +uv run pyinstaller local-transcription.spec + +# Check if build succeeded +if [ -d "dist/LocalTranscription" ]; then + echo "" + echo "āœ“ Build successful!" + echo "Executable location: dist/LocalTranscription/LocalTranscription" + echo "" + echo "CUDA Support: YES (falls back to CPU if CUDA not available)" + echo "" + echo "To run the application:" + echo " cd dist/LocalTranscription" + echo " ./LocalTranscription" + echo "" + echo "To create a distributable package:" + echo " cd dist" + echo " tar -czf LocalTranscription-Linux-CUDA.tar.gz LocalTranscription/" + echo "" + echo "Note: This build will work on systems with or without NVIDIA GPUs." +else + echo "" + echo "āœ— Build failed!" + exit 1 +fi diff --git a/build.bat b/build.bat new file mode 100644 index 0000000..e0f5c4c --- /dev/null +++ b/build.bat @@ -0,0 +1,34 @@ +@echo off +REM Build script for Windows + +echo Building Local Transcription for Windows... +echo ========================================== +echo. + +REM Clean previous builds +echo Cleaning previous builds... +if exist build rmdir /s /q build +if exist dist rmdir /s /q dist + +REM Build with PyInstaller +echo Running PyInstaller... +uv run pyinstaller local-transcription.spec + +REM Check if build succeeded +if exist "dist\LocalTranscription" ( + echo. + echo Build successful! + echo Executable location: dist\LocalTranscription\LocalTranscription.exe + echo. + echo To run the application: + echo cd dist\LocalTranscription + echo LocalTranscription.exe + echo. + echo To create a distributable package: + echo - Install 7-Zip or WinRAR + echo - Compress the dist\LocalTranscription folder to a ZIP file +) else ( + echo. + echo Build failed! 
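+    REM Return a non-zero exit code so scripts calling build.bat can detect the failed build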
+ exit /b 1 +) diff --git a/build.sh b/build.sh new file mode 100755 index 0000000..b94d25f --- /dev/null +++ b/build.sh @@ -0,0 +1,32 @@ +#!/bin/bash +# Build script for Linux + +echo "Building Local Transcription for Linux..." +echo "=========================================" + +# Clean previous builds +echo "Cleaning previous builds..." +rm -rf build dist + +# Build with PyInstaller +echo "Running PyInstaller..." +uv run pyinstaller local-transcription.spec + +# Check if build succeeded +if [ -d "dist/LocalTranscription" ]; then + echo "" + echo "āœ“ Build successful!" + echo "Executable location: dist/LocalTranscription/LocalTranscription" + echo "" + echo "To run the application:" + echo " cd dist/LocalTranscription" + echo " ./LocalTranscription" + echo "" + echo "To create a distributable package:" + echo " cd dist" + echo " tar -czf LocalTranscription-Linux.tar.gz LocalTranscription/" +else + echo "" + echo "āœ— Build failed!" + exit 1 +fi diff --git a/client/__init__.py b/client/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/client/audio_capture.py b/client/audio_capture.py new file mode 100644 index 0000000..5817b4a --- /dev/null +++ b/client/audio_capture.py @@ -0,0 +1,246 @@ +"""Audio capture module for recording microphone or system audio.""" + +import numpy as np +import sounddevice as sd +from scipy import signal +from typing import Callable, Optional, List, Tuple +from threading import Thread, Event +import queue + + +class AudioCapture: + """Captures audio from input devices and provides chunks for processing.""" + + def __init__( + self, + sample_rate: int = 16000, + chunk_duration: float = 3.0, + device: Optional[int] = None + ): + """ + Initialize audio capture. + + Args: + sample_rate: Target audio sample rate in Hz (16000 for Whisper) + chunk_duration: Duration of each audio chunk in seconds + device: Input device index, or None for default + """ + self.target_sample_rate = sample_rate + self.chunk_duration = chunk_duration + self.device = device + self.chunk_size = int(sample_rate * chunk_duration) + + # Hardware sample rate (will be auto-detected) + self.hardware_sample_rate = None + + self.audio_queue = queue.Queue() + self.is_recording = False + self.stop_event = Event() + self.recording_thread: Optional[Thread] = None + + def _detect_sample_rate(self) -> int: + """ + Detect a supported sample rate for the audio device. + + Returns: + Supported sample rate + """ + # Try common sample rates in order of preference + common_rates = [self.target_sample_rate, 48000, 44100, 22050, 32000, 8000] + + for rate in common_rates: + try: + # Try to create a test stream + with sd.InputStream( + device=self.device, + channels=1, + samplerate=rate, + blocksize=1024 + ): + print(f"Using hardware sample rate: {rate} Hz") + return rate + except sd.PortAudioError: + continue + + # If nothing works, default to 48000 + print(f"Warning: Could not detect sample rate, defaulting to 48000 Hz") + return 48000 + + def _resample(self, audio: np.ndarray, from_rate: int, to_rate: int) -> np.ndarray: + """ + Resample audio from one sample rate to another. 
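+        Uses scipy.signal.resample (FFT-based). For example, a 3-second chunk
+        captured at 48000 Hz (144000 samples) becomes 48000 samples at 16000 Hz.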
+ + Args: + audio: Input audio data + from_rate: Source sample rate + to_rate: Target sample rate + + Returns: + Resampled audio + """ + if from_rate == to_rate: + return audio + + # Calculate resampling ratio + num_samples = int(len(audio) * to_rate / from_rate) + + # Use scipy's resample for high-quality resampling + resampled = signal.resample(audio, num_samples) + + return resampled.astype(np.float32) + + @staticmethod + def get_input_devices() -> List[Tuple[int, str]]: + """ + Get list of available input audio devices. + + Returns: + List of (device_index, device_name) tuples + """ + devices = [] + device_list = sd.query_devices() + + for i, device in enumerate(device_list): + # Only include devices with input channels + if device['max_input_channels'] > 0: + devices.append((i, device['name'])) + + return devices + + @staticmethod + def get_default_device() -> Optional[Tuple[int, str]]: + """ + Get the default input device. + + Returns: + (device_index, device_name) tuple or None + """ + try: + default_device = sd.query_devices(kind='input') + device_list = sd.query_devices() + + for i, device in enumerate(device_list): + if device['name'] == default_device['name']: + return (i, device['name']) + except: + pass + + return None + + def _audio_callback(self, indata, frames, time_info, status): + """Callback function for sounddevice stream.""" + if status: + print(f"Audio status: {status}") + + # Copy audio data to queue + audio_data = indata.copy().flatten() + self.audio_queue.put(audio_data) + + def start_recording(self, callback: Optional[Callable[[np.ndarray], None]] = None): + """ + Start recording audio. + + Args: + callback: Optional callback function to receive audio chunks + """ + if self.is_recording: + return + + # Detect supported sample rate + self.hardware_sample_rate = self._detect_sample_rate() + + self.is_recording = True + self.stop_event.clear() + + def record_loop(): + """Recording loop that runs in a separate thread.""" + buffer = np.array([], dtype=np.float32) + + # Calculate hardware chunk size + hardware_chunk_size = int(self.hardware_sample_rate * self.chunk_duration) + + try: + with sd.InputStream( + device=self.device, + channels=1, + samplerate=self.hardware_sample_rate, + callback=self._audio_callback, + blocksize=int(self.hardware_sample_rate * 0.1) # 100ms blocks + ): + while not self.stop_event.is_set(): + try: + # Get audio data from queue (with timeout) + audio_chunk = self.audio_queue.get(timeout=0.1) + buffer = np.concatenate([buffer, audio_chunk]) + + # If we have enough data for a full chunk + if len(buffer) >= hardware_chunk_size: + # Extract chunk + chunk = buffer[:hardware_chunk_size] + buffer = buffer[hardware_chunk_size:] + + # Resample to target rate if needed + if self.hardware_sample_rate != self.target_sample_rate: + chunk = self._resample( + chunk, + self.hardware_sample_rate, + self.target_sample_rate + ) + + # Send to callback if provided + if callback: + callback(chunk) + + except queue.Empty: + continue + except Exception as e: + print(f"Error in recording loop: {e}") + except Exception as e: + print(f"Error opening audio stream: {e}") + self.is_recording = False + + self.recording_thread = Thread(target=record_loop, daemon=True) + self.recording_thread.start() + + def stop_recording(self): + """Stop recording audio.""" + if not self.is_recording: + return + + self.is_recording = False + self.stop_event.set() + + if self.recording_thread: + self.recording_thread.join(timeout=2.0) + self.recording_thread = None + + def 
get_audio_chunk(self, timeout: float = 1.0) -> Optional[np.ndarray]: + """ + Get the next audio chunk from the queue. + + Args: + timeout: Maximum time to wait for a chunk + + Returns: + Audio chunk as numpy array or None if timeout + """ + try: + return self.audio_queue.get(timeout=timeout) + except queue.Empty: + return None + + def is_recording_active(self) -> bool: + """Check if recording is currently active.""" + return self.is_recording + + def clear_queue(self): + """Clear any pending audio chunks from the queue.""" + while not self.audio_queue.empty(): + try: + self.audio_queue.get_nowait() + except queue.Empty: + break + + def __del__(self): + """Cleanup when object is destroyed.""" + self.stop_recording() diff --git a/client/config.py b/client/config.py new file mode 100644 index 0000000..c989f8e --- /dev/null +++ b/client/config.py @@ -0,0 +1,141 @@ +"""Configuration management for the local transcription application.""" + +import os +import yaml +from pathlib import Path +from typing import Any, Dict, Optional + + +class Config: + """Manages application configuration with YAML file storage.""" + + def __init__(self, config_path: Optional[str] = None): + """ + Initialize configuration. + + Args: + config_path: Path to configuration file. If None, uses default location. + """ + self.app_dir = Path.home() / ".local-transcription" + self.app_dir.mkdir(parents=True, exist_ok=True) + + if config_path is None: + self.config_path = self.app_dir / "config.yaml" + else: + self.config_path = Path(config_path) + + self.config: Dict[str, Any] = {} + self.load() + + def load(self) -> None: + """Load configuration from file or create default if not exists.""" + if self.config_path.exists(): + with open(self.config_path, 'r') as f: + self.config = yaml.safe_load(f) or {} + else: + # Load default configuration + default_config_path = Path(__file__).parent.parent / "config" / "default_config.yaml" + if default_config_path.exists(): + with open(default_config_path, 'r') as f: + self.config = yaml.safe_load(f) or {} + else: + self.config = self._get_default_config() + + # Save the default configuration + self.save() + + def save(self) -> None: + """Save current configuration to file.""" + with open(self.config_path, 'w') as f: + yaml.dump(self.config, f, default_flow_style=False, indent=2) + + def get(self, key_path: str, default: Any = None) -> Any: + """ + Get configuration value using dot notation. + + Args: + key_path: Dot-separated path to config value (e.g., "audio.sample_rate") + default: Default value if key not found + + Returns: + Configuration value or default + """ + keys = key_path.split('.') + value = self.config + + for key in keys: + if isinstance(value, dict) and key in value: + value = value[key] + else: + return default + + return value + + def set(self, key_path: str, value: Any) -> None: + """ + Set configuration value using dot notation. 
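+        Missing intermediate keys are created automatically and the updated
+        config is written to disk immediately, e.g. set("transcription.model", "small").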
+ + Args: + key_path: Dot-separated path to config value (e.g., "audio.sample_rate") + value: Value to set + """ + keys = key_path.split('.') + config = self.config + + # Navigate to the parent dict + for key in keys[:-1]: + if key not in config: + config[key] = {} + config = config[key] + + # Set the value + config[keys[-1]] = value + self.save() + + def _get_default_config(self) -> Dict[str, Any]: + """Get hardcoded default configuration.""" + return { + 'user': { + 'name': 'User', + 'id': '' + }, + 'audio': { + 'input_device': 'default', + 'sample_rate': 16000, + 'chunk_duration': 3.0 + }, + 'noise_suppression': { + 'enabled': True, + 'strength': 0.7, + 'method': 'noisereduce' + }, + 'transcription': { + 'model': 'base', + 'device': 'auto', + 'language': 'en', + 'task': 'transcribe' + }, + 'processing': { + 'use_vad': True, + 'min_confidence': 0.5 + }, + 'server_sync': { + 'enabled': False, + 'url': 'ws://localhost:8000', + 'api_key': '' + }, + 'display': { + 'show_timestamps': True, + 'max_lines': 100, + 'font_size': 12, + 'theme': 'dark' + } + } + + def reset_to_default(self) -> None: + """Reset configuration to default values.""" + self.config = self._get_default_config() + self.save() + + def __repr__(self) -> str: + return f"Config(path={self.config_path})" diff --git a/client/device_utils.py b/client/device_utils.py new file mode 100644 index 0000000..e2a170b --- /dev/null +++ b/client/device_utils.py @@ -0,0 +1,128 @@ +"""Utilities for detecting and managing compute devices (CPU/GPU).""" + +import torch +from typing import List, Tuple + + +class DeviceManager: + """Manages device detection and selection for transcription.""" + + def __init__(self): + """Initialize device manager and detect available devices.""" + self.available_devices = self._detect_devices() + self.current_device = self.available_devices[0] if self.available_devices else "cpu" + + def _detect_devices(self) -> List[str]: + """ + Detect available compute devices. + + Returns: + List of available device names + """ + devices = ["cpu"] + + # Check for CUDA (NVIDIA GPU) + if torch.cuda.is_available(): + devices.append("cuda") + + # Check for MPS (Apple Silicon GPU) + if hasattr(torch.backends, 'mps') and torch.backends.mps.is_available(): + devices.append("mps") + + return devices + + def get_device_info(self) -> List[Tuple[str, str]]: + """ + Get detailed information about available devices. + + Returns: + List of (device_name, device_description) tuples + """ + info = [] + + for device in self.available_devices: + if device == "cpu": + info.append(("cpu", "CPU")) + elif device == "cuda": + try: + gpu_name = torch.cuda.get_device_name(0) + info.append(("cuda", f"CUDA GPU: {gpu_name}")) + except: + info.append(("cuda", "CUDA GPU")) + elif device == "mps": + info.append(("mps", "Apple Silicon GPU (MPS)")) + + return info + + def set_device(self, device: str) -> bool: + """ + Set the current device for transcription. + + Args: + device: Device name ('cpu', 'cuda', 'mps', or 'auto') + + Returns: + True if device was set successfully, False otherwise + """ + if device == "auto": + # Auto-select best available device + if "cuda" in self.available_devices: + self.current_device = "cuda" + elif "mps" in self.available_devices: + self.current_device = "mps" + else: + self.current_device = "cpu" + return True + + if device in self.available_devices: + self.current_device = device + return True + + return False + + def get_device(self) -> str: + """ + Get the currently selected device. 
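+        Defaults to "cpu" until set_device() selects another available device.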
+ + Returns: + Current device name + """ + return self.current_device + + def is_gpu_available(self) -> bool: + """ + Check if any GPU is available. + + Returns: + True if CUDA or MPS is available + """ + return "cuda" in self.available_devices or "mps" in self.available_devices + + def get_device_for_whisper(self) -> str: + """ + Get device string formatted for faster-whisper. + + Returns: + Device string for faster-whisper ('cpu', 'cuda', etc.) + """ + if self.current_device == "mps": + # faster-whisper doesn't support MPS, fall back to CPU + return "cpu" + return self.current_device + + def get_compute_type(self) -> str: + """ + Get the appropriate compute type for the current device. + + Returns: + Compute type string for faster-whisper + """ + if self.current_device == "cuda": + # Use float16 for GPU for better performance + return "float16" + else: + # Use int8 for CPU for better performance + return "int8" + + def __repr__(self) -> str: + return f"DeviceManager(current={self.current_device}, available={self.available_devices})" diff --git a/client/noise_suppression.py b/client/noise_suppression.py new file mode 100644 index 0000000..caa31f8 --- /dev/null +++ b/client/noise_suppression.py @@ -0,0 +1,164 @@ +"""Noise suppression module for reducing background noise in audio.""" + +import warnings +# Suppress pkg_resources deprecation warning from webrtcvad +warnings.filterwarnings("ignore", message=".*pkg_resources.*", category=UserWarning) + +import numpy as np +import noisereduce as nr +import webrtcvad +from typing import Optional + + +class NoiseSuppressor: + """Handles noise reduction and voice activity detection.""" + + def __init__( + self, + sample_rate: int = 16000, + method: str = "noisereduce", + strength: float = 0.7, + use_vad: bool = True + ): + """ + Initialize noise suppressor. + + Args: + sample_rate: Audio sample rate in Hz + method: Noise reduction method ('noisereduce' or 'none') + strength: Noise reduction strength (0.0 to 1.0) + use_vad: Whether to use Voice Activity Detection + """ + self.sample_rate = sample_rate + self.method = method + self.strength = max(0.0, min(1.0, strength)) # Clamp to [0, 1] + self.use_vad = use_vad + + # Initialize VAD if requested + self.vad = None + if use_vad: + try: + # WebRTC VAD supports 16kHz, 32kHz, and 48kHz + if sample_rate in [8000, 16000, 32000, 48000]: + self.vad = webrtcvad.Vad(2) # Aggressiveness: 0-3 (2 is balanced) + else: + print(f"Warning: VAD not supported for sample rate {sample_rate}Hz") + self.use_vad = False + except Exception as e: + print(f"Warning: Failed to initialize VAD: {e}") + self.use_vad = False + + # Store noise profile for adaptive reduction + self.noise_profile: Optional[np.ndarray] = None + + def reduce_noise(self, audio: np.ndarray) -> np.ndarray: + """ + Apply noise reduction to audio. + + Args: + audio: Audio data as numpy array (float32, range [-1, 1]) + + Returns: + Noise-reduced audio + """ + if self.method == "none" or self.strength == 0.0: + return audio + + try: + # Ensure audio is float32 + audio = audio.astype(np.float32) + + if self.method == "noisereduce": + # Apply noisereduce noise reduction + reduced = nr.reduce_noise( + y=audio, + sr=self.sample_rate, + prop_decrease=self.strength, + stationary=True + ) + return reduced.astype(np.float32) + else: + return audio + + except Exception as e: + print(f"Error in noise reduction: {e}") + return audio + + def is_speech(self, audio: np.ndarray) -> bool: + """ + Detect if audio contains speech using VAD. 
+ + Args: + audio: Audio data as numpy array (float32, range [-1, 1]) + + Returns: + True if speech is detected, False otherwise + """ + if not self.use_vad or self.vad is None: + return True # Assume speech if VAD not available + + try: + # Convert float32 audio to int16 for VAD + audio_int16 = (audio * 32767).astype(np.int16) + + # VAD requires specific frame sizes (10, 20, or 30 ms) + frame_duration_ms = 30 + frame_size = int(self.sample_rate * frame_duration_ms / 1000) + + # Process audio in frames + num_frames = len(audio_int16) // frame_size + speech_frames = 0 + + for i in range(num_frames): + frame = audio_int16[i * frame_size:(i + 1) * frame_size] + if self.vad.is_speech(frame.tobytes(), self.sample_rate): + speech_frames += 1 + + # Consider it speech if more than 30% of frames contain speech + return speech_frames > (num_frames * 0.3) + + except Exception as e: + print(f"Error in VAD: {e}") + return True # Assume speech on error + + def process(self, audio: np.ndarray, skip_silent: bool = True) -> Optional[np.ndarray]: + """ + Process audio with noise reduction and optional VAD filtering. + + Args: + audio: Audio data as numpy array + skip_silent: If True, return None for non-speech audio + + Returns: + Processed audio or None if silent (when skip_silent=True) + """ + # Check for speech first (before noise reduction) + if skip_silent and self.use_vad: + if not self.is_speech(audio): + return None + + # Apply noise reduction + processed_audio = self.reduce_noise(audio) + + return processed_audio + + def set_strength(self, strength: float): + """ + Update noise reduction strength. + + Args: + strength: New strength value (0.0 to 1.0) + """ + self.strength = max(0.0, min(1.0, strength)) + + def set_vad_enabled(self, enabled: bool): + """ + Enable or disable Voice Activity Detection. + + Args: + enabled: True to enable VAD, False to disable + """ + self.use_vad = enabled and self.vad is not None + + def __repr__(self) -> str: + return f"NoiseSuppressor(method={self.method}, strength={self.strength}, vad={self.use_vad})" diff --git a/client/transcription_engine.py b/client/transcription_engine.py new file mode 100644 index 0000000..1e6df90 --- /dev/null +++ b/client/transcription_engine.py @@ -0,0 +1,232 @@ +"""Transcription engine using faster-whisper for speech-to-text.""" + +import numpy as np +from faster_whisper import WhisperModel +from typing import Optional, List, Tuple +from datetime import datetime +import threading + + +class TranscriptionResult: + """Represents a transcription result.""" + + def __init__(self, text: str, confidence: float, timestamp: datetime, user_name: str = ""): + """ + Initialize transcription result. 
+ + Args: + text: Transcribed text + confidence: Confidence score (0.0 to 1.0) + timestamp: Timestamp of transcription + user_name: Name of the user/speaker + """ + self.text = text.strip() + self.confidence = confidence + self.timestamp = timestamp + self.user_name = user_name + + def __repr__(self) -> str: + time_str = self.timestamp.strftime("%H:%M:%S") + if self.user_name: + return f"[{time_str}] {self.user_name}: {self.text}" + return f"[{time_str}] {self.text}" + + def to_dict(self) -> dict: + """Convert to dictionary.""" + return { + 'text': self.text, + 'confidence': self.confidence, + 'timestamp': self.timestamp.isoformat(), + 'user_name': self.user_name + } + + +class TranscriptionEngine: + """Handles speech-to-text transcription using faster-whisper.""" + + def __init__( + self, + model_size: str = "base", + device: str = "cpu", + compute_type: str = "int8", + language: str = "en", + min_confidence: float = 0.5 + ): + """ + Initialize transcription engine. + + Args: + model_size: Whisper model size ('tiny', 'base', 'small', 'medium', 'large') + device: Device to use ('cpu', 'cuda', 'auto') + compute_type: Compute type ('int8', 'float16', 'float32') + language: Language code for transcription + min_confidence: Minimum confidence threshold for transcriptions + """ + self.model_size = model_size + self.device = device + self.compute_type = compute_type + self.language = language + self.min_confidence = min_confidence + self.model: Optional[WhisperModel] = None + self.model_lock = threading.Lock() + self.is_loaded = False + + def load_model(self) -> bool: + """ + Load the Whisper model. + + Returns: + True if model loaded successfully, False otherwise + """ + try: + print(f"Loading Whisper {self.model_size} model on {self.device}...") + + with self.model_lock: + self.model = WhisperModel( + self.model_size, + device=self.device, + compute_type=self.compute_type + ) + self.is_loaded = True + + print(f"Model loaded successfully!") + return True + + except Exception as e: + print(f"Error loading model: {e}") + self.is_loaded = False + return False + + def transcribe( + self, + audio: np.ndarray, + sample_rate: int = 16000, + user_name: str = "" + ) -> Optional[TranscriptionResult]: + """ + Transcribe audio to text. 
+ + Args: + audio: Audio data as numpy array (float32) + sample_rate: Audio sample rate in Hz + user_name: Name of the user/speaker + + Returns: + TranscriptionResult or None if transcription failed or confidence too low + """ + if not self.is_loaded or self.model is None: + print("Model not loaded") + return None + + try: + # Ensure audio is float32 + audio = audio.astype(np.float32) + + # Transcribe using faster-whisper + with self.model_lock: + segments, info = self.model.transcribe( + audio, + language=self.language if self.language != "auto" else None, + vad_filter=True, # Use built-in VAD + vad_parameters=dict( + min_silence_duration_ms=500 + ) + ) + + # Collect all segments + full_text = "" + total_confidence = 0.0 + segment_count = 0 + + for segment in segments: + full_text += segment.text + " " + total_confidence += segment.avg_logprob + segment_count += 1 + + # Calculate average confidence + if segment_count == 0: + return None + + # Convert log probability to approximate confidence (0-1 range) + # avg_logprob is typically in range [-1, 0], so we transform it + avg_confidence = np.exp(total_confidence / segment_count) + + # Filter by minimum confidence + if avg_confidence < self.min_confidence: + return None + + # Clean up text + text = full_text.strip() + + if not text: + return None + + # Create result + result = TranscriptionResult( + text=text, + confidence=avg_confidence, + timestamp=datetime.now(), + user_name=user_name + ) + + return result + + except Exception as e: + print(f"Error during transcription: {e}") + return None + + def change_model(self, model_size: str) -> bool: + """ + Change to a different model size. + + Args: + model_size: New model size + + Returns: + True if model changed successfully + """ + self.model_size = model_size + self.is_loaded = False + self.model = None + return self.load_model() + + def change_device(self, device: str, compute_type: Optional[str] = None) -> bool: + """ + Change compute device. + + Args: + device: New device ('cpu', 'cuda', etc.) + compute_type: Optional new compute type + + Returns: + True if device changed successfully + """ + self.device = device + if compute_type: + self.compute_type = compute_type + + self.is_loaded = False + self.model = None + return self.load_model() + + def change_language(self, language: str): + """ + Change transcription language. 
+ + Args: + language: Language code or 'auto' + """ + self.language = language + + def unload_model(self): + """Unload the model from memory.""" + with self.model_lock: + self.model = None + self.is_loaded = False + + def __repr__(self) -> str: + return f"TranscriptionEngine(model={self.model_size}, device={self.device}, loaded={self.is_loaded})" + + def __del__(self): + """Cleanup when object is destroyed.""" + self.unload_model() diff --git a/config/default_config.yaml b/config/default_config.yaml new file mode 100644 index 0000000..31733e9 --- /dev/null +++ b/config/default_config.yaml @@ -0,0 +1,40 @@ +user: + name: "User" + id: "" + +audio: + input_device: "default" + sample_rate: 16000 + chunk_duration: 3.0 + +noise_suppression: + enabled: true + strength: 0.7 + method: "noisereduce" + +transcription: + model: "base" + device: "auto" + language: "en" + task: "transcribe" + +processing: + use_vad: true + min_confidence: 0.5 + +server_sync: + enabled: false + url: "ws://localhost:8000" + api_key: "" + +display: + show_timestamps: true + max_lines: 100 + font_family: "Courier" + font_size: 12 + theme: "dark" + fade_after_seconds: 10 # Time before transcriptions fade out (0 = never fade) + +web_server: + port: 8080 + host: "127.0.0.1" diff --git a/gui/__init__.py b/gui/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/gui/main_window.py b/gui/main_window.py new file mode 100644 index 0000000..02d7dbb --- /dev/null +++ b/gui/main_window.py @@ -0,0 +1,364 @@ +"""Main application window for the local transcription app.""" + +import customtkinter as ctk +from tkinter import filedialog, messagebox +import threading +from pathlib import Path +import sys + +# Add parent directory to path for imports +sys.path.append(str(Path(__file__).parent.parent)) + +from client.config import Config +from client.device_utils import DeviceManager +from client.audio_capture import AudioCapture +from client.noise_suppression import NoiseSuppressor +from client.transcription_engine import TranscriptionEngine +from gui.transcription_display import TranscriptionDisplay +from gui.settings_dialog import SettingsDialog + + +class MainWindow(ctk.CTk): + """Main application window.""" + + def __init__(self): + """Initialize the main window.""" + super().__init__() + + # Application state + self.is_transcribing = False + self.config = Config() + self.device_manager = DeviceManager() + + # Components (initialized later) + self.audio_capture: AudioCapture = None + self.noise_suppressor: NoiseSuppressor = None + self.transcription_engine: TranscriptionEngine = None + + # Configure window + self.title("Local Transcription") + self.geometry("900x700") + + # Set theme + ctk.set_appearance_mode(self.config.get('display.theme', 'dark')) + ctk.set_default_color_theme("blue") + + # Create UI + self._create_widgets() + + # Handle window close + self.protocol("WM_DELETE_WINDOW", self._on_closing) + + # Initialize components after GUI is ready (delay to avoid XCB threading issues) + self.after(100, self._initialize_components) + + def _create_widgets(self): + """Create all UI widgets.""" + # Header frame + header_frame = ctk.CTkFrame(self, height=80) + header_frame.pack(fill="x", padx=10, pady=(10, 0)) + header_frame.pack_propagate(False) + + # Title + title_label = ctk.CTkLabel( + header_frame, + text="Local Transcription", + font=("", 24, "bold") + ) + title_label.pack(side="left", padx=20, pady=20) + + # Settings button + self.settings_button = ctk.CTkButton( + header_frame, + text="āš™ Settings", + 
command=self._open_settings, + width=120 + ) + self.settings_button.pack(side="right", padx=20, pady=20) + + # Status frame + status_frame = ctk.CTkFrame(self, height=60) + status_frame.pack(fill="x", padx=10, pady=(10, 0)) + status_frame.pack_propagate(False) + + # Status label + self.status_label = ctk.CTkLabel( + status_frame, + text="⚫ Ready", + font=("", 14) + ) + self.status_label.pack(side="left", padx=20) + + # Device info + device_info = self.device_manager.get_device_info() + device_text = device_info[0][1] if device_info else "No device" + self.device_label = ctk.CTkLabel( + status_frame, + text=f"Device: {device_text}", + font=("", 12) + ) + self.device_label.pack(side="left", padx=20) + + # User name display + user_name = self.config.get('user.name', 'User') + self.user_label = ctk.CTkLabel( + status_frame, + text=f"User: {user_name}", + font=("", 12) + ) + self.user_label.pack(side="left", padx=20) + + # Transcription display frame + display_frame = ctk.CTkFrame(self) + display_frame.pack(fill="both", expand=True, padx=10, pady=10) + + # Transcription display + self.transcription_display = TranscriptionDisplay( + display_frame, + max_lines=self.config.get('display.max_lines', 100), + show_timestamps=self.config.get('display.show_timestamps', True), + font=("Courier", self.config.get('display.font_size', 12)) + ) + self.transcription_display.pack(fill="both", expand=True, padx=10, pady=10) + + # Control frame + control_frame = ctk.CTkFrame(self, height=80) + control_frame.pack(fill="x", padx=10, pady=(0, 10)) + control_frame.pack_propagate(False) + + # Start/Stop button + self.start_button = ctk.CTkButton( + control_frame, + text="ā–¶ Start Transcription", + command=self._toggle_transcription, + width=200, + height=50, + font=("", 16, "bold"), + fg_color="green" + ) + self.start_button.pack(side="left", padx=20, pady=15) + + # Clear button + self.clear_button = ctk.CTkButton( + control_frame, + text="Clear", + command=self._clear_transcriptions, + width=120, + height=50 + ) + self.clear_button.pack(side="left", padx=10, pady=15) + + # Save button + self.save_button = ctk.CTkButton( + control_frame, + text="šŸ’¾ Save", + command=self._save_transcriptions, + width=120, + height=50 + ) + self.save_button.pack(side="left", padx=10, pady=15) + + def _initialize_components(self): + """Initialize audio, noise suppression, and transcription components.""" + # Update status + self.status_label.configure(text="āš™ Initializing...") + self.update() + + try: + # Set device based on config + device_config = self.config.get('transcription.device', 'auto') + self.device_manager.set_device(device_config) + + # Initialize transcription engine + model_size = self.config.get('transcription.model', 'base') + language = self.config.get('transcription.language', 'en') + device = self.device_manager.get_device_for_whisper() + compute_type = self.device_manager.get_compute_type() + + self.transcription_engine = TranscriptionEngine( + model_size=model_size, + device=device, + compute_type=compute_type, + language=language, + min_confidence=self.config.get('processing.min_confidence', 0.5) + ) + + # Load model (synchronously to avoid X11 threading issues) + success = self.transcription_engine.load_model() + + if success: + self.status_label.configure(text="āœ“ Ready") + else: + self.status_label.configure(text="āŒ Model loading failed") + messagebox.showerror("Error", "Failed to load transcription model") + + except Exception as e: + print(f"Error initializing components: {e}") + 
self.status_label.configure(text="āŒ Initialization failed") + messagebox.showerror("Error", f"Failed to initialize:\n{e}") + + def _update_status(self, status: str): + """Update status label (thread-safe).""" + self.after(0, lambda: self.status_label.configure(text=status)) + + def _toggle_transcription(self): + """Start or stop transcription.""" + if not self.is_transcribing: + self._start_transcription() + else: + self._stop_transcription() + + def _start_transcription(self): + """Start transcription.""" + try: + # Check if engine is ready + if not self.transcription_engine or not self.transcription_engine.is_loaded: + messagebox.showerror("Error", "Transcription engine not ready") + return + + # Get audio device + audio_device_str = self.config.get('audio.input_device', 'default') + audio_device = None if audio_device_str == 'default' else int(audio_device_str) + + # Initialize audio capture + self.audio_capture = AudioCapture( + sample_rate=self.config.get('audio.sample_rate', 16000), + chunk_duration=self.config.get('audio.chunk_duration', 3.0), + device=audio_device + ) + + # Initialize noise suppressor + self.noise_suppressor = NoiseSuppressor( + sample_rate=self.config.get('audio.sample_rate', 16000), + method="noisereduce" if self.config.get('noise_suppression.enabled', True) else "none", + strength=self.config.get('noise_suppression.strength', 0.7), + use_vad=self.config.get('processing.use_vad', True) + ) + + # Start recording + self.audio_capture.start_recording(callback=self._process_audio_chunk) + + # Update UI + self.is_transcribing = True + self.start_button.configure(text="āø Stop Transcription", fg_color="red") + self.status_label.configure(text="šŸ”“ Recording...") + + except Exception as e: + messagebox.showerror("Error", f"Failed to start transcription:\n{e}") + print(f"Error starting transcription: {e}") + + def _stop_transcription(self): + """Stop transcription.""" + try: + # Stop recording + if self.audio_capture: + self.audio_capture.stop_recording() + + # Update UI + self.is_transcribing = False + self.start_button.configure(text="ā–¶ Start Transcription", fg_color="green") + self.status_label.configure(text="āœ“ Ready") + + except Exception as e: + messagebox.showerror("Error", f"Failed to stop transcription:\n{e}") + print(f"Error stopping transcription: {e}") + + def _process_audio_chunk(self, audio_chunk): + """Process an audio chunk (noise suppression + transcription).""" + def process(): + try: + # Apply noise suppression + processed_audio = self.noise_suppressor.process(audio_chunk, skip_silent=True) + + # Skip if silent (VAD filtered it out) + if processed_audio is None: + return + + # Transcribe + user_name = self.config.get('user.name', 'User') + result = self.transcription_engine.transcribe( + processed_audio, + sample_rate=self.config.get('audio.sample_rate', 16000), + user_name=user_name + ) + + # Display result + if result: + self.after(0, lambda: self.transcription_display.add_transcription( + text=result.text, + user_name=result.user_name, + timestamp=result.timestamp + )) + + except Exception as e: + print(f"Error processing audio: {e}") + + # Run in background thread + threading.Thread(target=process, daemon=True).start() + + def _clear_transcriptions(self): + """Clear all transcriptions.""" + if messagebox.askyesno("Clear Transcriptions", "Are you sure you want to clear all transcriptions?"): + self.transcription_display.clear() + + def _save_transcriptions(self): + """Save transcriptions to file.""" + filepath = filedialog.asksaveasfilename( 
+ defaultextension=".txt", + filetypes=[("Text files", "*.txt"), ("All files", "*.*")] + ) + + if filepath: + if self.transcription_display.save_to_file(filepath): + messagebox.showinfo("Saved", f"Transcriptions saved to:\n{filepath}") + else: + messagebox.showerror("Error", "Failed to save transcriptions") + + def _open_settings(self): + """Open settings dialog.""" + # Get audio devices + audio_devices = AudioCapture.get_input_devices() + if not audio_devices: + audio_devices = [(0, "Default")] + + # Get compute devices + compute_devices = self.device_manager.get_device_info() + compute_devices.insert(0, ("auto", "Auto-detect")) + + # Open settings dialog + SettingsDialog( + self, + self.config, + audio_devices, + compute_devices, + on_save=self._on_settings_saved + ) + + def _on_settings_saved(self): + """Handle settings being saved.""" + # Update user label + user_name = self.config.get('user.name', 'User') + self.user_label.configure(text=f"User: {user_name}") + + # Update display settings + self.transcription_display.set_max_lines(self.config.get('display.max_lines', 100)) + self.transcription_display.set_show_timestamps(self.config.get('display.show_timestamps', True)) + + # Note: Model/device changes require restart + messagebox.showinfo( + "Settings Saved", + "Some settings (model size, device) require restarting the application to take effect." + ) + + def _on_closing(self): + """Handle window closing.""" + # Stop transcription if running + if self.is_transcribing: + self._stop_transcription() + + # Unload model + if self.transcription_engine: + self.transcription_engine.unload_model() + + # Close window + self.destroy() diff --git a/gui/main_window_qt.py b/gui/main_window_qt.py new file mode 100644 index 0000000..06463c0 --- /dev/null +++ b/gui/main_window_qt.py @@ -0,0 +1,524 @@ +"""PySide6 main application window for the local transcription app.""" + +from PySide6.QtWidgets import ( + QMainWindow, QWidget, QVBoxLayout, QHBoxLayout, + QPushButton, QLabel, QFileDialog, QMessageBox +) +from PySide6.QtCore import Qt, QThread, Signal +from PySide6.QtGui import QFont +from pathlib import Path +import sys + +# Add parent directory to path for imports +sys.path.append(str(Path(__file__).parent.parent)) + +from client.config import Config +from client.device_utils import DeviceManager +from client.audio_capture import AudioCapture +from client.noise_suppression import NoiseSuppressor +from client.transcription_engine import TranscriptionEngine +from gui.transcription_display_qt import TranscriptionDisplay +from gui.settings_dialog_qt import SettingsDialog +from server.web_display import TranscriptionWebServer +import asyncio +from threading import Thread + + +class WebServerThread(Thread): + """Thread for running the web server.""" + + def __init__(self, web_server): + super().__init__(daemon=True) + self.web_server = web_server + self.loop = None + + def run(self): + """Run the web server in async event loop.""" + self.loop = asyncio.new_event_loop() + asyncio.set_event_loop(self.loop) + self.loop.run_until_complete(self.web_server.start()) + + +class ModelLoaderThread(QThread): + """Thread for loading the Whisper model without blocking the GUI.""" + + finished = Signal(bool, str) # success, message + + def __init__(self, transcription_engine): + super().__init__() + self.transcription_engine = transcription_engine + + def run(self): + """Load the model in background thread.""" + try: + success = self.transcription_engine.load_model() + if success: + self.finished.emit(True, "Model 
loaded successfully") + else: + self.finished.emit(False, "Failed to load model") + except Exception as e: + self.finished.emit(False, f"Error loading model: {e}") + + +class MainWindow(QMainWindow): + """Main application window using PySide6.""" + + def __init__(self): + """Initialize the main window.""" + super().__init__() + + # Application state + self.is_transcribing = False + self.config = Config() + self.device_manager = DeviceManager() + + # Components (initialized later) + self.audio_capture: AudioCapture = None + self.noise_suppressor: NoiseSuppressor = None + self.transcription_engine: TranscriptionEngine = None + self.model_loader_thread: ModelLoaderThread = None + + # Track current model settings + self.current_model_size: str = None + self.current_device_config: str = None + + # Web server components + self.web_server: TranscriptionWebServer = None + self.web_server_thread: WebServerThread = None + + # Configure window + self.setWindowTitle("Local Transcription") + self.resize(900, 700) + + # Create UI + self._create_widgets() + + # Initialize components (in background) + self._initialize_components() + + # Start web server if enabled + self._start_web_server_if_enabled() + + def _create_widgets(self): + """Create all UI widgets.""" + # Central widget + central_widget = QWidget() + self.setCentralWidget(central_widget) + + main_layout = QVBoxLayout() + central_widget.setLayout(main_layout) + + # Header + header_widget = QWidget() + header_widget.setFixedHeight(80) + header_layout = QHBoxLayout() + header_widget.setLayout(header_layout) + + title_label = QLabel("Local Transcription") + title_font = QFont() + title_font.setPointSize(24) + title_font.setBold(True) + title_label.setFont(title_font) + header_layout.addWidget(title_label) + + header_layout.addStretch() + + self.settings_button = QPushButton("āš™ Settings") + self.settings_button.setFixedSize(120, 40) + self.settings_button.clicked.connect(self._open_settings) + header_layout.addWidget(self.settings_button) + + main_layout.addWidget(header_widget) + + # Status bar + status_widget = QWidget() + status_widget.setFixedHeight(60) + status_layout = QHBoxLayout() + status_widget.setLayout(status_layout) + + self.status_label = QLabel("⚫ Initializing...") + status_font = QFont() + status_font.setPointSize(14) + self.status_label.setFont(status_font) + status_layout.addWidget(self.status_label) + + device_info = self.device_manager.get_device_info() + device_text = device_info[0][1] if device_info else "No device" + self.device_label = QLabel(f"Device: {device_text}") + status_layout.addWidget(self.device_label) + + user_name = self.config.get('user.name', 'User') + self.user_label = QLabel(f"User: {user_name}") + status_layout.addWidget(self.user_label) + + status_layout.addStretch() + + main_layout.addWidget(status_widget) + + # Transcription display + self.transcription_display = TranscriptionDisplay( + max_lines=self.config.get('display.max_lines', 100), + show_timestamps=self.config.get('display.show_timestamps', True), + font_family=self.config.get('display.font_family', 'Courier'), + font_size=self.config.get('display.font_size', 12) + ) + main_layout.addWidget(self.transcription_display) + + # Control buttons + control_widget = QWidget() + control_widget.setFixedHeight(80) + control_layout = QHBoxLayout() + control_widget.setLayout(control_layout) + + self.start_button = QPushButton("ā–¶ Start Transcription") + self.start_button.setFixedSize(240, 50) + button_font = QFont() + button_font.setPointSize(14) + 
button_font.setBold(True) + self.start_button.setFont(button_font) + self.start_button.clicked.connect(self._toggle_transcription) + self.start_button.setStyleSheet("background-color: #2ecc71; color: white;") + control_layout.addWidget(self.start_button) + + self.clear_button = QPushButton("Clear") + self.clear_button.setFixedSize(120, 50) + self.clear_button.clicked.connect(self._clear_transcriptions) + control_layout.addWidget(self.clear_button) + + self.save_button = QPushButton("šŸ’¾ Save") + self.save_button.setFixedSize(120, 50) + self.save_button.clicked.connect(self._save_transcriptions) + control_layout.addWidget(self.save_button) + + control_layout.addStretch() + + main_layout.addWidget(control_widget) + + def _initialize_components(self): + """Initialize audio, noise suppression, and transcription components.""" + # Update status + self.status_label.setText("āš™ Initializing...") + + # Set device based on config + device_config = self.config.get('transcription.device', 'auto') + self.device_manager.set_device(device_config) + + # Initialize transcription engine + model_size = self.config.get('transcription.model', 'base') + language = self.config.get('transcription.language', 'en') + device = self.device_manager.get_device_for_whisper() + compute_type = self.device_manager.get_compute_type() + + # Track current settings + self.current_model_size = model_size + self.current_device_config = device_config + + self.transcription_engine = TranscriptionEngine( + model_size=model_size, + device=device, + compute_type=compute_type, + language=language, + min_confidence=self.config.get('processing.min_confidence', 0.5) + ) + + # Load model in background thread + self.model_loader_thread = ModelLoaderThread(self.transcription_engine) + self.model_loader_thread.finished.connect(self._on_model_loaded) + self.model_loader_thread.start() + + def _on_model_loaded(self, success: bool, message: str): + """Handle model loading completion.""" + if success: + host = self.config.get('web_server.host', '127.0.0.1') + port = self.config.get('web_server.port', 8080) + self.status_label.setText(f"āœ“ Ready | Web: http://{host}:{port}") + self.start_button.setEnabled(True) + else: + self.status_label.setText("āŒ Model loading failed") + QMessageBox.critical(self, "Error", message) + self.start_button.setEnabled(False) + + def _start_web_server_if_enabled(self): + """Start web server.""" + host = self.config.get('web_server.host', '127.0.0.1') + port = self.config.get('web_server.port', 8080) + show_timestamps = self.config.get('display.show_timestamps', True) + fade_after_seconds = self.config.get('display.fade_after_seconds', 10) + + print(f"Starting web server at http://{host}:{port}") + self.web_server = TranscriptionWebServer( + host=host, + port=port, + show_timestamps=show_timestamps, + fade_after_seconds=fade_after_seconds + ) + self.web_server_thread = WebServerThread(self.web_server) + self.web_server_thread.start() + + def _toggle_transcription(self): + """Start or stop transcription.""" + if not self.is_transcribing: + self._start_transcription() + else: + self._stop_transcription() + + def _start_transcription(self): + """Start transcription.""" + try: + # Check if engine is ready + if not self.transcription_engine or not self.transcription_engine.is_loaded: + QMessageBox.critical(self, "Error", "Transcription engine not ready") + return + + # Get audio device + audio_device_str = self.config.get('audio.input_device', 'default') + audio_device = None if audio_device_str == 'default' else 
int(audio_device_str) + + # Initialize audio capture + self.audio_capture = AudioCapture( + sample_rate=self.config.get('audio.sample_rate', 16000), + chunk_duration=self.config.get('audio.chunk_duration', 3.0), + device=audio_device + ) + + # Initialize noise suppressor + self.noise_suppressor = NoiseSuppressor( + sample_rate=self.config.get('audio.sample_rate', 16000), + method="noisereduce" if self.config.get('noise_suppression.enabled', True) else "none", + strength=self.config.get('noise_suppression.strength', 0.7), + use_vad=self.config.get('processing.use_vad', True) + ) + + # Start recording + self.audio_capture.start_recording(callback=self._process_audio_chunk) + + # Update UI + self.is_transcribing = True + self.start_button.setText("āø Stop Transcription") + self.start_button.setStyleSheet("background-color: #e74c3c; color: white;") + self.status_label.setText("šŸ”“ Recording...") + + except Exception as e: + QMessageBox.critical(self, "Error", f"Failed to start transcription:\n{e}") + print(f"Error starting transcription: {e}") + + def _stop_transcription(self): + """Stop transcription.""" + try: + # Stop recording + if self.audio_capture: + self.audio_capture.stop_recording() + + # Update UI + self.is_transcribing = False + self.start_button.setText("ā–¶ Start Transcription") + self.start_button.setStyleSheet("background-color: #2ecc71; color: white;") + self.status_label.setText("āœ“ Ready") + + except Exception as e: + QMessageBox.critical(self, "Error", f"Failed to stop transcription:\n{e}") + print(f"Error stopping transcription: {e}") + + def _process_audio_chunk(self, audio_chunk): + """Process an audio chunk (noise suppression + transcription).""" + def process(): + try: + # Apply noise suppression + processed_audio = self.noise_suppressor.process(audio_chunk, skip_silent=True) + + # Skip if silent (VAD filtered it out) + if processed_audio is None: + return + + # Transcribe + user_name = self.config.get('user.name', 'User') + result = self.transcription_engine.transcribe( + processed_audio, + sample_rate=self.config.get('audio.sample_rate', 16000), + user_name=user_name + ) + + # Display result (use Qt signal for thread safety) + if result: + # We need to update UI from main thread + # Note: We don't pass timestamp - let the display widget create it + from PySide6.QtCore import QMetaObject, Q_ARG + QMetaObject.invokeMethod( + self.transcription_display, + "add_transcription", + Qt.QueuedConnection, + Q_ARG(str, result.text), + Q_ARG(str, result.user_name) + ) + + # Broadcast to web server if enabled + if self.web_server and self.web_server_thread: + asyncio.run_coroutine_threadsafe( + self.web_server.broadcast_transcription( + result.text, + result.user_name, + result.timestamp + ), + self.web_server_thread.loop + ) + + except Exception as e: + print(f"Error processing audio: {e}") + import traceback + traceback.print_exc() + + # Run in background thread + from threading import Thread + Thread(target=process, daemon=True).start() + + def _clear_transcriptions(self): + """Clear all transcriptions.""" + reply = QMessageBox.question( + self, + "Clear Transcriptions", + "Are you sure you want to clear all transcriptions?", + QMessageBox.Yes | QMessageBox.No + ) + + if reply == QMessageBox.Yes: + self.transcription_display.clear_all() + + def _save_transcriptions(self): + """Save transcriptions to file.""" + filepath, _ = QFileDialog.getSaveFileName( + self, + "Save Transcriptions", + "", + "Text files (*.txt);;All files (*.*)" + ) + + if filepath: + if 
self.transcription_display.save_to_file(filepath): + QMessageBox.information(self, "Saved", f"Transcriptions saved to:\n{filepath}") + else: + QMessageBox.critical(self, "Error", "Failed to save transcriptions") + + def _open_settings(self): + """Open settings dialog.""" + # Get audio devices + audio_devices = AudioCapture.get_input_devices() + if not audio_devices: + audio_devices = [(0, "Default")] + + # Get compute devices + compute_devices = self.device_manager.get_device_info() + compute_devices.insert(0, ("auto", "Auto-detect")) + + # Open settings dialog + dialog = SettingsDialog( + self, + self.config, + audio_devices, + compute_devices, + on_save=self._on_settings_saved + ) + dialog.exec() + + def _on_settings_saved(self): + """Handle settings being saved.""" + # Update user label + user_name = self.config.get('user.name', 'User') + self.user_label.setText(f"User: {user_name}") + + # Update display settings + show_timestamps = self.config.get('display.show_timestamps', True) + self.transcription_display.set_max_lines(self.config.get('display.max_lines', 100)) + self.transcription_display.set_show_timestamps(show_timestamps) + self.transcription_display.set_font( + self.config.get('display.font_family', 'Courier'), + self.config.get('display.font_size', 12) + ) + + # Update web server settings + if self.web_server: + self.web_server.show_timestamps = show_timestamps + self.web_server.fade_after_seconds = self.config.get('display.fade_after_seconds', 10) + + # Check if model/device settings changed - reload model if needed + new_model = self.config.get('transcription.model', 'base') + new_device_config = self.config.get('transcription.device', 'auto') + + # Only reload if model size or device changed + if self.current_model_size != new_model or self.current_device_config != new_device_config: + self._reload_model() + else: + QMessageBox.information(self, "Settings Saved", "Settings have been applied successfully!") + + def _reload_model(self): + """Reload the transcription model with new settings.""" + # Stop transcription if running + was_transcribing = self.is_transcribing + if was_transcribing: + self._stop_transcription() + + # Update status + self.status_label.setText("āš™ Reloading model...") + self.start_button.setEnabled(False) + + # Unload current model + if self.transcription_engine: + self.transcription_engine.unload_model() + + # Set device based on config + device_config = self.config.get('transcription.device', 'auto') + self.device_manager.set_device(device_config) + + # Re-initialize transcription engine + model_size = self.config.get('transcription.model', 'base') + language = self.config.get('transcription.language', 'en') + device = self.device_manager.get_device_for_whisper() + compute_type = self.device_manager.get_compute_type() + + # Update tracked settings + self.current_model_size = model_size + self.current_device_config = device_config + + self.transcription_engine = TranscriptionEngine( + model_size=model_size, + device=device, + compute_type=compute_type, + language=language, + min_confidence=self.config.get('processing.min_confidence', 0.5) + ) + + # Load model in background thread + if self.model_loader_thread and self.model_loader_thread.isRunning(): + self.model_loader_thread.wait() + + self.model_loader_thread = ModelLoaderThread(self.transcription_engine) + self.model_loader_thread.finished.connect(self._on_model_reloaded) + self.model_loader_thread.start() + + def _on_model_reloaded(self, success: bool, message: str): + """Handle model reloading 
completion.""" + if success: + host = self.config.get('web_server.host', '127.0.0.1') + port = self.config.get('web_server.port', 8080) + self.status_label.setText(f"āœ“ Ready | Web: http://{host}:{port}") + self.start_button.setEnabled(True) + QMessageBox.information(self, "Settings Saved", "Model reloaded successfully with new settings!") + else: + self.status_label.setText("āŒ Model loading failed") + QMessageBox.critical(self, "Error", f"Failed to reload model:\n{message}") + self.start_button.setEnabled(False) + + def closeEvent(self, event): + """Handle window closing.""" + # Stop transcription if running + if self.is_transcribing: + self._stop_transcription() + + # Unload model + if self.transcription_engine: + self.transcription_engine.unload_model() + + # Wait for model loader thread + if self.model_loader_thread and self.model_loader_thread.isRunning(): + self.model_loader_thread.wait() + + event.accept() diff --git a/gui/settings_dialog.py b/gui/settings_dialog.py new file mode 100644 index 0000000..06774c4 --- /dev/null +++ b/gui/settings_dialog.py @@ -0,0 +1,310 @@ +"""Settings dialog for configuring the application.""" + +import customtkinter as ctk +from tkinter import messagebox +from typing import Callable, List, Tuple + + +class SettingsDialog(ctk.CTkToplevel): + """Dialog window for application settings.""" + + def __init__( + self, + parent, + config, + audio_devices: List[Tuple[int, str]], + compute_devices: List[Tuple[str, str]], + on_save: Callable = None + ): + """ + Initialize settings dialog. + + Args: + parent: Parent window + config: Configuration object + audio_devices: List of (device_index, device_name) tuples + compute_devices: List of (device_id, device_description) tuples + on_save: Callback function when settings are saved + """ + super().__init__(parent) + + self.config = config + self.audio_devices = audio_devices + self.compute_devices = compute_devices + self.on_save = on_save + + # Window configuration + self.title("Settings") + self.geometry("600x700") + self.resizable(False, False) + + # Make dialog modal + self.transient(parent) + self.grab_set() + + self._create_widgets() + self._load_current_settings() + + def _create_widgets(self): + """Create all settings widgets.""" + # Main container with padding + main_frame = ctk.CTkFrame(self) + main_frame.pack(fill="both", expand=True, padx=20, pady=20) + + # User Settings Section + user_frame = ctk.CTkFrame(main_frame) + user_frame.pack(fill="x", pady=(0, 15)) + + ctk.CTkLabel(user_frame, text="User Settings", font=("", 16, "bold")).pack( + anchor="w", padx=10, pady=(10, 5) + ) + + # User name + name_frame = ctk.CTkFrame(user_frame) + name_frame.pack(fill="x", padx=10, pady=5) + ctk.CTkLabel(name_frame, text="Display Name:", width=150).pack(side="left", padx=5) + self.name_entry = ctk.CTkEntry(name_frame, width=300) + self.name_entry.pack(side="left", padx=5) + + # Audio Settings Section + audio_frame = ctk.CTkFrame(main_frame) + audio_frame.pack(fill="x", pady=(0, 15)) + + ctk.CTkLabel(audio_frame, text="Audio Settings", font=("", 16, "bold")).pack( + anchor="w", padx=10, pady=(10, 5) + ) + + # Audio device + device_frame = ctk.CTkFrame(audio_frame) + device_frame.pack(fill="x", padx=10, pady=5) + ctk.CTkLabel(device_frame, text="Input Device:", width=150).pack(side="left", padx=5) + device_names = [name for _, name in self.audio_devices] + self.audio_device_menu = ctk.CTkOptionMenu(device_frame, values=device_names, width=300) + self.audio_device_menu.pack(side="left", padx=5) + + # Chunk duration + 
chunk_frame = ctk.CTkFrame(audio_frame) + chunk_frame.pack(fill="x", padx=10, pady=5) + ctk.CTkLabel(chunk_frame, text="Chunk Duration (s):", width=150).pack(side="left", padx=5) + self.chunk_entry = ctk.CTkEntry(chunk_frame, width=100) + self.chunk_entry.pack(side="left", padx=5) + + # Transcription Settings Section + transcription_frame = ctk.CTkFrame(main_frame) + transcription_frame.pack(fill="x", pady=(0, 15)) + + ctk.CTkLabel(transcription_frame, text="Transcription Settings", font=("", 16, "bold")).pack( + anchor="w", padx=10, pady=(10, 5) + ) + + # Model size + model_frame = ctk.CTkFrame(transcription_frame) + model_frame.pack(fill="x", padx=10, pady=5) + ctk.CTkLabel(model_frame, text="Model Size:", width=150).pack(side="left", padx=5) + self.model_menu = ctk.CTkOptionMenu( + model_frame, + values=["tiny", "base", "small", "medium", "large"], + width=200 + ) + self.model_menu.pack(side="left", padx=5) + + # Compute device + compute_frame = ctk.CTkFrame(transcription_frame) + compute_frame.pack(fill="x", padx=10, pady=5) + ctk.CTkLabel(compute_frame, text="Compute Device:", width=150).pack(side="left", padx=5) + device_descs = [desc for _, desc in self.compute_devices] + self.compute_device_menu = ctk.CTkOptionMenu(compute_frame, values=device_descs, width=300) + self.compute_device_menu.pack(side="left", padx=5) + + # Language + lang_frame = ctk.CTkFrame(transcription_frame) + lang_frame.pack(fill="x", padx=10, pady=5) + ctk.CTkLabel(lang_frame, text="Language:", width=150).pack(side="left", padx=5) + self.lang_menu = ctk.CTkOptionMenu( + lang_frame, + values=["auto", "en", "es", "fr", "de", "it", "pt", "ru", "zh", "ja", "ko"], + width=200 + ) + self.lang_menu.pack(side="left", padx=5) + + # Noise Suppression Section + noise_frame = ctk.CTkFrame(main_frame) + noise_frame.pack(fill="x", pady=(0, 15)) + + ctk.CTkLabel(noise_frame, text="Noise Suppression", font=("", 16, "bold")).pack( + anchor="w", padx=10, pady=(10, 5) + ) + + # Enable noise suppression + ns_enable_frame = ctk.CTkFrame(noise_frame) + ns_enable_frame.pack(fill="x", padx=10, pady=5) + self.noise_enabled_var = ctk.BooleanVar() + self.noise_enabled_check = ctk.CTkCheckBox( + ns_enable_frame, + text="Enable Noise Suppression", + variable=self.noise_enabled_var + ) + self.noise_enabled_check.pack(side="left", padx=5) + + # Noise suppression strength + strength_frame = ctk.CTkFrame(noise_frame) + strength_frame.pack(fill="x", padx=10, pady=5) + ctk.CTkLabel(strength_frame, text="Strength:", width=150).pack(side="left", padx=5) + self.noise_strength_slider = ctk.CTkSlider( + strength_frame, + from_=0.0, + to=1.0, + number_of_steps=20, + width=300 + ) + self.noise_strength_slider.pack(side="left", padx=5) + self.noise_strength_label = ctk.CTkLabel(strength_frame, text="0.7", width=40) + self.noise_strength_label.pack(side="left", padx=5) + self.noise_strength_slider.configure(command=self._update_strength_label) + + # VAD + vad_frame = ctk.CTkFrame(noise_frame) + vad_frame.pack(fill="x", padx=10, pady=5) + self.vad_enabled_var = ctk.BooleanVar() + self.vad_enabled_check = ctk.CTkCheckBox( + vad_frame, + text="Enable Voice Activity Detection", + variable=self.vad_enabled_var + ) + self.vad_enabled_check.pack(side="left", padx=5) + + # Display Settings Section + display_frame = ctk.CTkFrame(main_frame) + display_frame.pack(fill="x", pady=(0, 15)) + + ctk.CTkLabel(display_frame, text="Display Settings", font=("", 16, "bold")).pack( + anchor="w", padx=10, pady=(10, 5) + ) + + # Show timestamps + ts_frame = 
ctk.CTkFrame(display_frame) + ts_frame.pack(fill="x", padx=10, pady=5) + self.timestamps_var = ctk.BooleanVar() + self.timestamps_check = ctk.CTkCheckBox( + ts_frame, + text="Show Timestamps", + variable=self.timestamps_var + ) + self.timestamps_check.pack(side="left", padx=5) + + # Max lines + maxlines_frame = ctk.CTkFrame(display_frame) + maxlines_frame.pack(fill="x", padx=10, pady=5) + ctk.CTkLabel(maxlines_frame, text="Max Lines:", width=150).pack(side="left", padx=5) + self.maxlines_entry = ctk.CTkEntry(maxlines_frame, width=100) + self.maxlines_entry.pack(side="left", padx=5) + + # Buttons + button_frame = ctk.CTkFrame(main_frame) + button_frame.pack(fill="x", pady=(10, 0)) + + self.save_button = ctk.CTkButton( + button_frame, + text="Save", + command=self._save_settings, + width=120 + ) + self.save_button.pack(side="right", padx=5) + + self.cancel_button = ctk.CTkButton( + button_frame, + text="Cancel", + command=self.destroy, + width=120, + fg_color="gray" + ) + self.cancel_button.pack(side="right", padx=5) + + def _update_strength_label(self, value): + """Update the noise strength label.""" + self.noise_strength_label.configure(text=f"{value:.1f}") + + def _load_current_settings(self): + """Load current settings from config.""" + # User settings + self.name_entry.insert(0, self.config.get('user.name', 'User')) + + # Audio settings + current_device = self.config.get('audio.input_device', 'default') + for idx, (dev_idx, dev_name) in enumerate(self.audio_devices): + if str(dev_idx) == current_device or current_device == 'default' and idx == 0: + self.audio_device_menu.set(dev_name) + break + + self.chunk_entry.insert(0, str(self.config.get('audio.chunk_duration', 3.0))) + + # Transcription settings + self.model_menu.set(self.config.get('transcription.model', 'base')) + + current_compute = self.config.get('transcription.device', 'auto') + for dev_id, dev_desc in self.compute_devices: + if dev_id == current_compute or (current_compute == 'auto' and dev_id == self.compute_devices[0][0]): + self.compute_device_menu.set(dev_desc) + break + + self.lang_menu.set(self.config.get('transcription.language', 'en')) + + # Noise suppression + self.noise_enabled_var.set(self.config.get('noise_suppression.enabled', True)) + strength = self.config.get('noise_suppression.strength', 0.7) + self.noise_strength_slider.set(strength) + self._update_strength_label(strength) + self.vad_enabled_var.set(self.config.get('processing.use_vad', True)) + + # Display settings + self.timestamps_var.set(self.config.get('display.show_timestamps', True)) + self.maxlines_entry.insert(0, str(self.config.get('display.max_lines', 100))) + + def _save_settings(self): + """Save settings to config.""" + try: + # User settings + self.config.set('user.name', self.name_entry.get()) + + # Audio settings + selected_audio = self.audio_device_menu.get() + for dev_idx, dev_name in self.audio_devices: + if dev_name == selected_audio: + self.config.set('audio.input_device', str(dev_idx)) + break + + chunk_duration = float(self.chunk_entry.get()) + self.config.set('audio.chunk_duration', chunk_duration) + + # Transcription settings + self.config.set('transcription.model', self.model_menu.get()) + + selected_compute = self.compute_device_menu.get() + for dev_id, dev_desc in self.compute_devices: + if dev_desc == selected_compute: + self.config.set('transcription.device', dev_id) + break + + self.config.set('transcription.language', self.lang_menu.get()) + + # Noise suppression + self.config.set('noise_suppression.enabled', 
self.noise_enabled_var.get()) + self.config.set('noise_suppression.strength', self.noise_strength_slider.get()) + self.config.set('processing.use_vad', self.vad_enabled_var.get()) + + # Display settings + self.config.set('display.show_timestamps', self.timestamps_var.get()) + max_lines = int(self.maxlines_entry.get()) + self.config.set('display.max_lines', max_lines) + + # Call save callback + if self.on_save: + self.on_save() + + messagebox.showinfo("Settings Saved", "Settings have been saved successfully!") + self.destroy() + + except ValueError as e: + messagebox.showerror("Invalid Input", f"Please check your input values:\n{e}") + except Exception as e: + messagebox.showerror("Error", f"Failed to save settings:\n{e}") diff --git a/gui/settings_dialog_qt.py b/gui/settings_dialog_qt.py new file mode 100644 index 0000000..32efdc2 --- /dev/null +++ b/gui/settings_dialog_qt.py @@ -0,0 +1,261 @@ +"""PySide6 settings dialog for configuring the application.""" + +from PySide6.QtWidgets import ( + QDialog, QVBoxLayout, QHBoxLayout, QFormLayout, + QLabel, QLineEdit, QComboBox, QCheckBox, QSlider, + QPushButton, QMessageBox, QGroupBox +) +from PySide6.QtCore import Qt +from typing import Callable, List, Tuple + + +class SettingsDialog(QDialog): + """Dialog window for application settings using PySide6.""" + + def __init__( + self, + parent, + config, + audio_devices: List[Tuple[int, str]], + compute_devices: List[Tuple[str, str]], + on_save: Callable = None + ): + """ + Initialize settings dialog. + + Args: + parent: Parent window + config: Configuration object + audio_devices: List of (device_index, device_name) tuples + compute_devices: List of (device_id, device_description) tuples + on_save: Callback function when settings are saved + """ + super().__init__(parent) + + self.config = config + self.audio_devices = audio_devices + self.compute_devices = compute_devices + self.on_save = on_save + + # Window configuration + self.setWindowTitle("Settings") + self.setMinimumSize(600, 700) + self.setModal(True) + + self._create_widgets() + self._load_current_settings() + + def _create_widgets(self): + """Create all settings widgets.""" + main_layout = QVBoxLayout() + self.setLayout(main_layout) + + # User Settings Group + user_group = QGroupBox("User Settings") + user_layout = QFormLayout() + + self.name_input = QLineEdit() + user_layout.addRow("Display Name:", self.name_input) + + user_group.setLayout(user_layout) + main_layout.addWidget(user_group) + + # Audio Settings Group + audio_group = QGroupBox("Audio Settings") + audio_layout = QFormLayout() + + self.audio_device_combo = QComboBox() + device_names = [name for _, name in self.audio_devices] + self.audio_device_combo.addItems(device_names) + audio_layout.addRow("Input Device:", self.audio_device_combo) + + self.chunk_input = QLineEdit() + audio_layout.addRow("Chunk Duration (s):", self.chunk_input) + + audio_group.setLayout(audio_layout) + main_layout.addWidget(audio_group) + + # Transcription Settings Group + transcription_group = QGroupBox("Transcription Settings") + transcription_layout = QFormLayout() + + self.model_combo = QComboBox() + self.model_combo.addItems(["tiny", "base", "small", "medium", "large"]) + transcription_layout.addRow("Model Size:", self.model_combo) + + self.compute_device_combo = QComboBox() + device_descs = [desc for _, desc in self.compute_devices] + self.compute_device_combo.addItems(device_descs) + transcription_layout.addRow("Compute Device:", self.compute_device_combo) + + self.lang_combo = QComboBox() + 
self.lang_combo.addItems(["auto", "en", "es", "fr", "de", "it", "pt", "ru", "zh", "ja", "ko"]) + transcription_layout.addRow("Language:", self.lang_combo) + + transcription_group.setLayout(transcription_layout) + main_layout.addWidget(transcription_group) + + # Noise Suppression Group + noise_group = QGroupBox("Noise Suppression") + noise_layout = QVBoxLayout() + + self.noise_enabled_check = QCheckBox("Enable Noise Suppression") + noise_layout.addWidget(self.noise_enabled_check) + + # Strength slider + strength_layout = QHBoxLayout() + strength_layout.addWidget(QLabel("Strength:")) + + self.noise_strength_slider = QSlider(Qt.Horizontal) + self.noise_strength_slider.setMinimum(0) + self.noise_strength_slider.setMaximum(100) + self.noise_strength_slider.setValue(70) + self.noise_strength_slider.valueChanged.connect(self._update_strength_label) + strength_layout.addWidget(self.noise_strength_slider) + + self.noise_strength_label = QLabel("0.7") + strength_layout.addWidget(self.noise_strength_label) + + noise_layout.addLayout(strength_layout) + + self.vad_enabled_check = QCheckBox("Enable Voice Activity Detection") + noise_layout.addWidget(self.vad_enabled_check) + + noise_group.setLayout(noise_layout) + main_layout.addWidget(noise_group) + + # Display Settings Group + display_group = QGroupBox("Display Settings") + display_layout = QFormLayout() + + self.timestamps_check = QCheckBox() + display_layout.addRow("Show Timestamps:", self.timestamps_check) + + self.maxlines_input = QLineEdit() + display_layout.addRow("Max Lines:", self.maxlines_input) + + self.font_family_combo = QComboBox() + self.font_family_combo.addItems(["Courier", "Arial", "Times New Roman", "Consolas", "Monaco", "Monospace"]) + display_layout.addRow("Font Family:", self.font_family_combo) + + self.font_size_input = QLineEdit() + display_layout.addRow("Font Size:", self.font_size_input) + + self.fade_seconds_input = QLineEdit() + display_layout.addRow("Fade After (seconds):", self.fade_seconds_input) + + display_group.setLayout(display_layout) + main_layout.addWidget(display_group) + + # Buttons + button_layout = QHBoxLayout() + button_layout.addStretch() + + self.cancel_button = QPushButton("Cancel") + self.cancel_button.clicked.connect(self.reject) + button_layout.addWidget(self.cancel_button) + + self.save_button = QPushButton("Save") + self.save_button.clicked.connect(self._save_settings) + self.save_button.setDefault(True) + button_layout.addWidget(self.save_button) + + main_layout.addLayout(button_layout) + + def _update_strength_label(self, value): + """Update the noise strength label.""" + self.noise_strength_label.setText(f"{value / 100:.1f}") + + def _load_current_settings(self): + """Load current settings from config.""" + # User settings + self.name_input.setText(self.config.get('user.name', 'User')) + + # Audio settings + current_device = self.config.get('audio.input_device', 'default') + for idx, (dev_idx, dev_name) in enumerate(self.audio_devices): + if str(dev_idx) == current_device or (current_device == 'default' and idx == 0): + self.audio_device_combo.setCurrentIndex(idx) + break + + self.chunk_input.setText(str(self.config.get('audio.chunk_duration', 3.0))) + + # Transcription settings + model = self.config.get('transcription.model', 'base') + self.model_combo.setCurrentText(model) + + current_compute = self.config.get('transcription.device', 'auto') + for idx, (dev_id, dev_desc) in enumerate(self.compute_devices): + if dev_id == current_compute or (current_compute == 'auto' and idx == 0): + 
self.compute_device_combo.setCurrentIndex(idx) + break + + lang = self.config.get('transcription.language', 'en') + self.lang_combo.setCurrentText(lang) + + # Noise suppression + self.noise_enabled_check.setChecked(self.config.get('noise_suppression.enabled', True)) + strength = self.config.get('noise_suppression.strength', 0.7) + self.noise_strength_slider.setValue(int(strength * 100)) + self._update_strength_label(int(strength * 100)) + self.vad_enabled_check.setChecked(self.config.get('processing.use_vad', True)) + + # Display settings + self.timestamps_check.setChecked(self.config.get('display.show_timestamps', True)) + self.maxlines_input.setText(str(self.config.get('display.max_lines', 100))) + + font_family = self.config.get('display.font_family', 'Courier') + self.font_family_combo.setCurrentText(font_family) + + self.font_size_input.setText(str(self.config.get('display.font_size', 12))) + self.fade_seconds_input.setText(str(self.config.get('display.fade_after_seconds', 10))) + + def _save_settings(self): + """Save settings to config.""" + try: + # User settings + self.config.set('user.name', self.name_input.text()) + + # Audio settings + selected_audio_idx = self.audio_device_combo.currentIndex() + dev_idx, _ = self.audio_devices[selected_audio_idx] + self.config.set('audio.input_device', str(dev_idx)) + + chunk_duration = float(self.chunk_input.text()) + self.config.set('audio.chunk_duration', chunk_duration) + + # Transcription settings + self.config.set('transcription.model', self.model_combo.currentText()) + + selected_compute_idx = self.compute_device_combo.currentIndex() + dev_id, _ = self.compute_devices[selected_compute_idx] + self.config.set('transcription.device', dev_id) + + self.config.set('transcription.language', self.lang_combo.currentText()) + + # Noise suppression + self.config.set('noise_suppression.enabled', self.noise_enabled_check.isChecked()) + self.config.set('noise_suppression.strength', self.noise_strength_slider.value() / 100.0) + self.config.set('processing.use_vad', self.vad_enabled_check.isChecked()) + + # Display settings + self.config.set('display.show_timestamps', self.timestamps_check.isChecked()) + max_lines = int(self.maxlines_input.text()) + self.config.set('display.max_lines', max_lines) + self.config.set('display.font_family', self.font_family_combo.currentText()) + font_size = int(self.font_size_input.text()) + self.config.set('display.font_size', font_size) + fade_seconds = int(self.fade_seconds_input.text()) + self.config.set('display.fade_after_seconds', fade_seconds) + + # Call save callback + if self.on_save: + self.on_save() + + QMessageBox.information(self, "Settings Saved", "Settings have been saved successfully!") + self.accept() + + except ValueError as e: + QMessageBox.critical(self, "Invalid Input", f"Please check your input values:\n{e}") + except Exception as e: + QMessageBox.critical(self, "Error", f"Failed to save settings:\n{e}") diff --git a/gui/transcription_display.py b/gui/transcription_display.py new file mode 100644 index 0000000..b2b993b --- /dev/null +++ b/gui/transcription_display.py @@ -0,0 +1,127 @@ +"""Transcription display widget for showing real-time transcriptions.""" + +import customtkinter as ctk +from typing import List +from datetime import datetime + + +class TranscriptionDisplay(ctk.CTkTextbox): + """Custom text widget for displaying transcriptions.""" + + def __init__(self, master, max_lines: int = 100, show_timestamps: bool = True, **kwargs): + """ + Initialize transcription display. 
+ + Args: + master: Parent widget + max_lines: Maximum number of lines to keep in display + show_timestamps: Whether to show timestamps + **kwargs: Additional arguments for CTkTextbox + """ + super().__init__(master, **kwargs) + + self.max_lines = max_lines + self.show_timestamps = show_timestamps + self.line_count = 0 + + # Configure text widget + self.configure(state="disabled") # Read-only by default + + def add_transcription(self, text: str, user_name: str = "", timestamp: datetime = None): + """ + Add a new transcription to the display. + + Args: + text: Transcription text + user_name: User/speaker name + timestamp: Timestamp of transcription + """ + if timestamp is None: + timestamp = datetime.now() + + # Build the display line + line_parts = [] + + if self.show_timestamps: + time_str = timestamp.strftime("%H:%M:%S") + line_parts.append(f"[{time_str}]") + + if user_name: + line_parts.append(f"{user_name}:") + + line_parts.append(text) + + line = " ".join(line_parts) + "\n" + + # Add to display + self.configure(state="normal") + self.insert("end", line) + self.configure(state="disabled") + + # Auto-scroll to bottom + self.see("end") + + # Track line count + self.line_count += 1 + + # Remove old lines if exceeding max + if self.line_count > self.max_lines: + self._remove_oldest_lines(self.line_count - self.max_lines) + + def _remove_oldest_lines(self, num_lines: int): + """ + Remove oldest lines from the display. + + Args: + num_lines: Number of lines to remove + """ + self.configure(state="normal") + self.delete("1.0", f"{num_lines + 1}.0") + self.configure(state="disabled") + self.line_count -= num_lines + + def clear(self): + """Clear all transcriptions.""" + self.configure(state="normal") + self.delete("1.0", "end") + self.configure(state="disabled") + self.line_count = 0 + + def get_all_text(self) -> str: + """ + Get all transcription text. + + Returns: + All text in the display + """ + return self.get("1.0", "end") + + def set_max_lines(self, max_lines: int): + """Update maximum number of lines to keep.""" + self.max_lines = max_lines + + # Trim if necessary + if self.line_count > self.max_lines: + self._remove_oldest_lines(self.line_count - self.max_lines) + + def set_show_timestamps(self, show: bool): + """Update whether to show timestamps.""" + self.show_timestamps = show + + def save_to_file(self, filepath: str) -> bool: + """ + Save transcriptions to a file. + + Args: + filepath: Path to save file + + Returns: + True if saved successfully + """ + try: + with open(filepath, 'w') as f: + f.write(self.get_all_text()) + return True + except Exception as e: + print(f"Error saving transcriptions: {e}") + return False diff --git a/gui/transcription_display_qt.py b/gui/transcription_display_qt.py new file mode 100644 index 0000000..e7ca667 --- /dev/null +++ b/gui/transcription_display_qt.py @@ -0,0 +1,159 @@ +"""PySide6 transcription display widget for showing real-time transcriptions.""" + +from PySide6.QtWidgets import QTextEdit +from PySide6.QtGui import QFont, QTextCursor +from PySide6.QtCore import Qt, Slot +from datetime import datetime + + +class TranscriptionDisplay(QTextEdit): + """Custom text widget for displaying transcriptions using PySide6.""" + + def __init__(self, parent=None, max_lines=100, show_timestamps=True, font_family="Courier", font_size=12): + """ + Initialize transcription display. 
+ + Args: + parent: Parent widget + max_lines: Maximum number of lines to keep in display + show_timestamps: Whether to show timestamps + font_family: Font family name + font_size: Font size in points + """ + super().__init__(parent) + + self.max_lines = max_lines + self.show_timestamps = show_timestamps + self.line_count = 0 + self.font_family = font_family + self.font_size = font_size + + # Configure text widget + self.setReadOnly(True) + self.setFont(QFont(font_family, font_size)) + + # Set dark theme styling + self.setStyleSheet(""" + QTextEdit { + background-color: #2b2b2b; + color: #ffffff; + border: 1px solid #3d3d3d; + border-radius: 5px; + padding: 10px; + } + """) + + @Slot(str, str) + def add_transcription(self, text: str, user_name: str = "", timestamp: datetime = None): + """ + Add a new transcription to the display. + + Args: + text: Transcription text + user_name: User/speaker name + timestamp: Timestamp of transcription + """ + if timestamp is None: + timestamp = datetime.now() + + # Build the display line + line_parts = [] + + if self.show_timestamps: + time_str = timestamp.strftime("%H:%M:%S") + line_parts.append(f"[{time_str}]") + + if user_name: + line_parts.append(f"{user_name}:") + + line_parts.append(text) + + line = " ".join(line_parts) + + # Add to display + self.append(line) + + # Auto-scroll to bottom + cursor = self.textCursor() + cursor.movePosition(QTextCursor.End) + self.setTextCursor(cursor) + + # Track line count + self.line_count += 1 + + # Remove old lines if exceeding max + if self.line_count > self.max_lines: + self._remove_oldest_lines(self.line_count - self.max_lines) + + def _remove_oldest_lines(self, num_lines: int): + """ + Remove oldest lines from the display. + + Args: + num_lines: Number of lines to remove + """ + cursor = self.textCursor() + cursor.movePosition(QTextCursor.Start) + + for _ in range(num_lines): + cursor.select(QTextCursor.BlockUnderCursor) + cursor.removeSelectedText() + cursor.deleteChar() # Remove the newline + + self.line_count -= num_lines + + def clear_all(self): + """Clear all transcriptions.""" + self.clear() + self.line_count = 0 + + def get_all_text(self) -> str: + """ + Get all transcription text. + + Returns: + All text in the display + """ + return self.toPlainText() + + def set_max_lines(self, max_lines: int): + """Update maximum number of lines to keep.""" + self.max_lines = max_lines + + # Trim if necessary + if self.line_count > self.max_lines: + self._remove_oldest_lines(self.line_count - self.max_lines) + + def set_show_timestamps(self, show: bool): + """Update whether to show timestamps.""" + self.show_timestamps = show + + def set_font(self, font_family: str, font_size: int): + """ + Update font settings. + + Args: + font_family: Font family name + font_size: Font size in points + """ + self.font_family = font_family + self.font_size = font_size + super().setFont(QFont(font_family, font_size)) + + def save_to_file(self, filepath: str) -> bool: + """ + Save transcriptions to a file. 
+ + Args: + filepath: Path to save file + + Returns: + True if saved successfully + """ + try: + with open(filepath, 'w') as f: + f.write(self.toPlainText()) + return True + except Exception as e: + print(f"Error saving transcriptions: {e}") + return False diff --git a/local-transcription.spec b/local-transcription.spec new file mode 100644 index 0000000..91e7091 --- /dev/null +++ b/local-transcription.spec @@ -0,0 +1,86 @@ +# -*- mode: python ; coding: utf-8 -*- +"""PyInstaller spec file for Local Transcription app.""" + +import sys +from pathlib import Path + +block_cipher = None + +# Determine if we're on Windows +is_windows = sys.platform == 'win32' + +a = Analysis( + ['main.py'], + pathex=[], + binaries=[], + datas=[ + ('config/default_config.yaml', 'config'), + ], + hiddenimports=[ + 'PySide6.QtCore', + 'PySide6.QtWidgets', + 'PySide6.QtGui', + 'faster_whisper', + 'faster_whisper.transcribe', + 'faster_whisper.vad', + 'ctranslate2', + 'sounddevice', + 'noisereduce', + 'webrtcvad', + 'scipy', + 'scipy.signal', + 'numpy', + 'fastapi', + 'uvicorn', + 'uvicorn.logging', + 'uvicorn.loops', + 'uvicorn.loops.auto', + 'uvicorn.protocols', + 'uvicorn.protocols.http', + 'uvicorn.protocols.http.auto', + 'uvicorn.protocols.websockets', + 'uvicorn.protocols.websockets.auto', + 'uvicorn.lifespan', + 'uvicorn.lifespan.on', + ], + hookspath=[], + hooksconfig={}, + runtime_hooks=[], + excludes=[], + win_no_prefer_redirects=False, + win_private_assemblies=False, + cipher=block_cipher, + noarchive=False, +) + +pyz = PYZ(a.pure, a.zipped_data, cipher=block_cipher) + +exe = EXE( + pyz, + a.scripts, + [], + exclude_binaries=True, + name='LocalTranscription', + debug=False, + bootloader_ignore_signals=False, + strip=False, + upx=True, + console=True, # Set to False to hide console window + disable_windowed_traceback=False, + argv_emulation=False, + target_arch=None, + codesign_identity=None, + entitlements_file=None, + icon=None, # Add icon file path here if you have one +) + +coll = COLLECT( + exe, + a.binaries, + a.zipfiles, + a.datas, + strip=False, + upx=True, + upx_exclude=[], + name='LocalTranscription', +) diff --git a/main.py b/main.py new file mode 100644 index 0000000..49ce89b --- /dev/null +++ b/main.py @@ -0,0 +1,52 @@ +#!/usr/bin/env python3 +""" +Local Transcription Application + +A standalone desktop application for real-time speech-to-text transcription +using Whisper models. Supports CPU/GPU processing, noise suppression, and +optional multi-user server synchronization. 
+""" + +import sys +from pathlib import Path + +# Add project root to Python path +project_root = Path(__file__).parent +sys.path.insert(0, str(project_root)) + +from PySide6.QtWidgets import QApplication +from gui.main_window_qt import MainWindow + + +def main(): + """Main application entry point.""" + try: + print("Starting Local Transcription Application...") + print("=" * 50) + + # Create Qt application + app = QApplication(sys.argv) + + # Set application info + app.setApplicationName("Local Transcription") + app.setOrganizationName("LocalTranscription") + + # Create and show main window + window = MainWindow() + window.show() + + # Run application + sys.exit(app.exec()) + + except KeyboardInterrupt: + print("\nApplication interrupted by user") + sys.exit(0) + except Exception as e: + print(f"Fatal error: {e}") + import traceback + traceback.print_exc() + sys.exit(1) + + +if __name__ == "__main__": + main() diff --git a/main_cli.py b/main_cli.py new file mode 100755 index 0000000..299d067 --- /dev/null +++ b/main_cli.py @@ -0,0 +1,221 @@ +#!/usr/bin/env python3 +""" +Local Transcription CLI + +Command-line version of the transcription application. +Works without GUI - perfect for testing and headless operation. +""" + +import sys +import os +from pathlib import Path +import signal +import argparse + +# Add project root to Python path +project_root = Path(__file__).parent +sys.path.insert(0, str(project_root)) + +from client.config import Config +from client.device_utils import DeviceManager +from client.audio_capture import AudioCapture +from client.noise_suppression import NoiseSuppressor +from client.transcription_engine import TranscriptionEngine + + +class TranscriptionCLI: + """CLI transcription application.""" + + def __init__(self, args): + """Initialize the CLI application.""" + self.args = args + self.config = Config() + self.device_manager = DeviceManager() + self.is_running = False + + # Override config with command-line arguments + if args.model: + self.config.set('transcription.model', args.model) + if args.device: + self.config.set('transcription.device', args.device) + if args.language: + self.config.set('transcription.language', args.language) + if args.user: + self.config.set('user.name', args.user) + + # Components + self.audio_capture = None + self.noise_suppressor = None + self.transcription_engine = None + + def initialize(self): + """Initialize all components.""" + print("=" * 60) + print("Local Transcription CLI") + print("=" * 60) + + # Device setup + device_config = self.config.get('transcription.device', 'auto') + self.device_manager.set_device(device_config) + + print(f"\nUser: {self.config.get('user.name', 'User')}") + print(f"Model: {self.config.get('transcription.model', 'base')}") + print(f"Language: {self.config.get('transcription.language', 'en')}") + print(f"Device: {self.device_manager.current_device}") + + # Initialize transcription engine + print(f"\nLoading Whisper model...") + model_size = self.config.get('transcription.model', 'base') + language = self.config.get('transcription.language', 'en') + device = self.device_manager.get_device_for_whisper() + compute_type = self.device_manager.get_compute_type() + + self.transcription_engine = TranscriptionEngine( + model_size=model_size, + device=device, + compute_type=compute_type, + language=language, + min_confidence=self.config.get('processing.min_confidence', 0.5) + ) + + success = self.transcription_engine.load_model() + if not success: + print("āŒ Failed to load model!") + return False + + 
print("āœ“ Model loaded successfully!") + + # Initialize audio capture + audio_device_str = self.config.get('audio.input_device', 'default') + audio_device = None if audio_device_str == 'default' else int(audio_device_str) + + self.audio_capture = AudioCapture( + sample_rate=self.config.get('audio.sample_rate', 16000), + chunk_duration=self.config.get('audio.chunk_duration', 3.0), + device=audio_device + ) + + # Initialize noise suppressor + self.noise_suppressor = NoiseSuppressor( + sample_rate=self.config.get('audio.sample_rate', 16000), + method="noisereduce" if self.config.get('noise_suppression.enabled', True) else "none", + strength=self.config.get('noise_suppression.strength', 0.7), + use_vad=self.config.get('processing.use_vad', True) + ) + + print("\nāœ“ All components initialized!") + return True + + def process_audio_chunk(self, audio_chunk): + """Process an audio chunk.""" + try: + # Apply noise suppression + processed_audio = self.noise_suppressor.process(audio_chunk, skip_silent=True) + + # Skip if silent + if processed_audio is None: + return + + # Transcribe + user_name = self.config.get('user.name', 'User') + result = self.transcription_engine.transcribe( + processed_audio, + sample_rate=self.config.get('audio.sample_rate', 16000), + user_name=user_name + ) + + # Display result + if result: + print(f"{result}") + + except Exception as e: + print(f"Error processing audio: {e}") + + def run(self): + """Run the transcription loop.""" + if not self.initialize(): + return 1 + + # Setup signal handler for graceful shutdown + def signal_handler(sig, frame): + print("\n\nStopping transcription...") + self.is_running = False + + signal.signal(signal.SIGINT, signal_handler) + + print("\n" + "=" * 60) + print("šŸŽ¤ Recording... (Press Ctrl+C to stop)") + print("=" * 60) + print() + + # Start recording + self.is_running = True + self.audio_capture.start_recording(callback=self.process_audio_chunk) + + # Keep running until interrupted + try: + while self.is_running: + signal.pause() + except AttributeError: + # signal.pause() not available on Windows + import time + while self.is_running: + time.sleep(0.1) + + # Cleanup + self.audio_capture.stop_recording() + self.transcription_engine.unload_model() + + print("\n" + "=" * 60) + print("āœ“ Transcription stopped") + print("=" * 60) + + return 0 + + +def main(): + """Main entry point.""" + parser = argparse.ArgumentParser( + description='Local Transcription CLI - Real-time speech-to-text' + ) + parser.add_argument( + '-m', '--model', + choices=['tiny', 'base', 'small', 'medium', 'large'], + help='Whisper model size' + ) + parser.add_argument( + '-d', '--device', + choices=['cpu', 'cuda', 'auto'], + help='Compute device' + ) + parser.add_argument( + '-l', '--language', + help='Language code (e.g., en, es, fr) or "auto"' + ) + parser.add_argument( + '-u', '--user', + help='User/speaker name' + ) + parser.add_argument( + '--list-devices', + action='store_true', + help='List available audio input devices' + ) + + args = parser.parse_args() + + # List devices if requested + if args.list_devices: + print("Available audio input devices:") + devices = AudioCapture.get_input_devices() + for idx, name in devices: + print(f" [{idx}] {name}") + return 0 + + # Run application + app = TranscriptionCLI(args) + return app.run() + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000..807ca59 --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,59 @@ +[project] +name = 
"local-transcription" +version = "0.1.0" +description = "A standalone desktop application for real-time speech-to-text transcription using Whisper models" +readme = "README.md" +requires-python = ">=3.9" +license = {text = "MIT"} +authors = [ + {name = "Your Name", email = "your.email@example.com"} +] +keywords = ["transcription", "speech-to-text", "whisper", "streaming", "obs"] + +dependencies = [ + "numpy>=1.24.0", + "pyyaml>=6.0", + "sounddevice>=0.4.6", + "scipy>=1.10.0", + "noisereduce>=3.0.0", + "webrtcvad>=2.0.10", + "faster-whisper>=0.10.0", + "torch>=2.0.0", + "PySide6>=6.6.0", +] + +[project.optional-dependencies] +server = [ + "fastapi>=0.104.0", + "uvicorn>=0.24.0", + "websockets>=12.0", + "requests>=2.31.0", +] +dev = [ + "pytest>=7.4.0", + "black>=23.0.0", + "ruff>=0.1.0", +] + +[project.scripts] +local-transcription = "main:main" + +[build-system] +requires = ["hatchling"] +build-backend = "hatchling.build" + +[tool.hatch.build.targets.wheel] +packages = ["client", "gui"] + +[tool.uv] +dev-dependencies = [ + "pyinstaller>=6.17.0", +] + +[tool.ruff] +line-length = 100 +target-version = "py39" + +[tool.black] +line-length = 100 +target-version = ["py39"] diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..35ba3d5 --- /dev/null +++ b/requirements.txt @@ -0,0 +1,23 @@ +# Core Dependencies +numpy>=1.24.0 +pyyaml>=6.0 + +# Audio Processing +sounddevice>=0.4.6 +scipy>=1.10.0 + +# Noise Suppression +noisereduce>=3.0.0 +webrtcvad>=2.0.10 + +# Transcription - Using faster-whisper for better real-time performance +faster-whisper>=0.10.0 +torch>=2.0.0 + +# GUI - Using CustomTkinter for modern look +customtkinter>=5.2.0 +pillow>=10.0.0 + +# Optional: Server sync dependencies (will move to requirements-server.txt later) +# websockets>=12.0 +# requests>=2.31.0 diff --git a/server/__init__.py b/server/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/server/web_display.py b/server/web_display.py new file mode 100644 index 0000000..a9b3742 --- /dev/null +++ b/server/web_display.py @@ -0,0 +1,233 @@ +"""Web server for displaying transcriptions in a browser (for OBS browser source).""" + +import asyncio +from fastapi import FastAPI, WebSocket +from fastapi.responses import HTMLResponse +from typing import List, Optional +import json +from datetime import datetime + + +class TranscriptionWebServer: + """Web server for displaying transcriptions.""" + + def __init__(self, host: str = "127.0.0.1", port: int = 8080, show_timestamps: bool = True, fade_after_seconds: int = 10): + """ + Initialize web server. 
+ + Args: + host: Server host address + port: Server port + show_timestamps: Whether to show timestamps in transcriptions + fade_after_seconds: Time in seconds before transcriptions fade out (0 = never fade) + """ + self.host = host + self.port = port + self.show_timestamps = show_timestamps + self.fade_after_seconds = fade_after_seconds + self.app = FastAPI() + self.active_connections: List[WebSocket] = [] + self.transcriptions = [] # Store recent transcriptions + + # Setup routes + self._setup_routes() + + def _setup_routes(self): + """Setup FastAPI routes.""" + + @self.app.get("/", response_class=HTMLResponse) + async def get_display(): + """Serve the transcription display page.""" + return self._get_html() + + @self.app.websocket("/ws") + async def websocket_endpoint(websocket: WebSocket): + """WebSocket endpoint for real-time updates.""" + await websocket.accept() + self.active_connections.append(websocket) + + try: + # Send recent transcriptions + for trans in self.transcriptions[-20:]: # Last 20 + await websocket.send_json(trans) + + # Keep connection alive + while True: + # Wait for ping/pong to keep connection alive + await websocket.receive_text() + except: + self.active_connections.remove(websocket) + + def _get_html(self) -> str: + """Generate HTML for transcription display.""" + return f""" + + + + Transcription Display + + + +
+ + + + + """ + + async def broadcast_transcription(self, text: str, user_name: str = "", timestamp: Optional[datetime] = None): + """ + Broadcast a transcription to all connected clients. + + Args: + text: Transcription text + user_name: User/speaker name + timestamp: Timestamp of transcription + """ + if timestamp is None: + timestamp = datetime.now() + + trans_data = { + "text": text, + "user_name": user_name, + } + + # Only include timestamp if enabled + if self.show_timestamps: + trans_data["timestamp"] = timestamp.strftime("%H:%M:%S") + + # Store transcription + self.transcriptions.append(trans_data) + if len(self.transcriptions) > 100: + self.transcriptions.pop(0) + + # Broadcast to all connected clients + disconnected = [] + for connection in self.active_connections: + try: + await connection.send_json(trans_data) + except: + disconnected.append(connection) + + # Remove disconnected clients + for conn in disconnected: + self.active_connections.remove(conn) + + async def start(self): + """Start the web server.""" + import uvicorn + config = uvicorn.Config( + self.app, + host=self.host, + port=self.port, + log_level="warning" + ) + server = uvicorn.Server(config) + await server.serve() diff --git a/test_components.py b/test_components.py new file mode 100644 index 0000000..0e844e3 --- /dev/null +++ b/test_components.py @@ -0,0 +1,124 @@ +#!/usr/bin/env python3 +""" +Test script to verify all components work without GUI. +This can run in headless environments. +""" + +import sys +from pathlib import Path + +# Add project root to Python path +project_root = Path(__file__).parent +sys.path.insert(0, str(project_root)) + +print("=" * 60) +print("Testing Local Transcription Components (No GUI)") +print("=" * 60) + +# Test 1: Configuration +print("\n1. Testing Configuration System...") +try: + from client.config import Config + config = Config() + print(f" āœ“ Config loaded: {config.config_path}") + print(f" āœ“ User name: {config.get('user.name')}") + print(f" āœ“ Model: {config.get('transcription.model')}") +except Exception as e: + print(f" āœ— Config failed: {e}") + sys.exit(1) + +# Test 2: Device Detection +print("\n2. Testing Device Detection...") +try: + from client.device_utils import DeviceManager + device_mgr = DeviceManager() + print(f" āœ“ Available devices: {device_mgr.available_devices}") + print(f" āœ“ Current device: {device_mgr.current_device}") + print(f" āœ“ GPU available: {device_mgr.is_gpu_available()}") + + device_info = device_mgr.get_device_info() + for dev_id, dev_desc in device_info: + print(f" - {dev_id}: {dev_desc}") +except Exception as e: + print(f" āœ— Device detection failed: {e}") + sys.exit(1) + +# Test 3: Audio Devices +print("\n3. Testing Audio Capture...") +try: + from client.audio_capture import AudioCapture + devices = AudioCapture.get_input_devices() + print(f" āœ“ Found {len(devices)} audio input device(s)") + for idx, name in devices[:5]: # Show first 5 + print(f" - [{idx}] {name}") + if len(devices) > 5: + print(f" ... and {len(devices) - 5} more") +except Exception as e: + print(f" āœ— Audio capture failed: {e}") + +# Test 4: Noise Suppression +print("\n4. 
Testing Noise Suppression...") +try: + from client.noise_suppression import NoiseSuppressor + import numpy as np + + suppressor = NoiseSuppressor(sample_rate=16000, method="noisereduce", strength=0.7) + print(f" āœ“ Noise suppressor created: {suppressor}") + + # Test with dummy audio + test_audio = np.random.randn(16000).astype(np.float32) * 0.1 + processed = suppressor.process(test_audio, skip_silent=False) + print(f" āœ“ Processed audio shape: {processed.shape}") +except Exception as e: + print(f" āœ— Noise suppression failed: {e}") + +# Test 5: Transcription Engine +print("\n5. Testing Transcription Engine (Loading Model)...") +try: + from client.transcription_engine import TranscriptionEngine + + device = device_mgr.get_device_for_whisper() + compute_type = device_mgr.get_compute_type() + + print(f" → Using device: {device} with compute type: {compute_type}") + print(f" → Loading model (this may take 1-2 minutes on first run)...") + + engine = TranscriptionEngine( + model_size="tiny", # Use tiny for faster testing + device=device, + compute_type=compute_type, + language="en" + ) + + success = engine.load_model() + if success: + print(f" āœ“ Model loaded successfully!") + print(f" āœ“ Engine: {engine}") + + # Test transcription with dummy audio + print(f"\n Testing transcription with silent audio...") + test_audio = np.zeros(48000, dtype=np.float32) # 3 seconds of silence + result = engine.transcribe(test_audio, sample_rate=16000, user_name="Test") + + if result: + print(f" āœ“ Transcription result: '{result.text}'") + else: + print(f" ℹ No transcription (expected for silent audio)") + + engine.unload_model() + else: + print(f" āœ— Model loading failed") + sys.exit(1) + +except Exception as e: + print(f" āœ— Transcription engine failed: {e}") + import traceback + traceback.print_exc() + sys.exit(1) + +print("\n" + "=" * 60) +print("āœ“ All Components Tested Successfully!") +print("=" * 60) +print("\nThe application is ready to use!") +print("Run 'uv run python main.py' on a system with a display.") +print("=" * 60)
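
For reference, below is a minimal headless sketch (not part of the patch) showing how the components added above could be wired together end to end: audio capture -> noise suppression -> transcription -> OBS web display. It only uses constructors and methods that appear in this diff; the glue itself is illustrative. In particular, the function name run_headless, the hardcoded port 8080 (the web server's default), and the asyncio/threading bridge via run_coroutine_threadsafe are assumptions, since the patch does not show how the GUI thread hands results to the web server.

#!/usr/bin/env python3
"""Illustrative glue script (assumption, not part of this patch):
capture -> suppress -> transcribe -> broadcast to the OBS browser source."""

import asyncio

from client.config import Config
from client.device_utils import DeviceManager
from client.audio_capture import AudioCapture
from client.noise_suppression import NoiseSuppressor
from client.transcription_engine import TranscriptionEngine
from server.web_display import TranscriptionWebServer


def run_headless():
    config = Config()
    devices = DeviceManager()
    sample_rate = config.get('audio.sample_rate', 16000)
    user_name = config.get('user.name', 'User')

    # Load the Whisper model on the detected compute device.
    engine = TranscriptionEngine(
        model_size=config.get('transcription.model', 'base'),
        device=devices.get_device_for_whisper(),
        compute_type=devices.get_compute_type(),
        language=config.get('transcription.language', 'en'),
    )
    if not engine.load_model():
        raise RuntimeError("Failed to load Whisper model")

    suppressor = NoiseSuppressor(
        sample_rate=sample_rate,
        method="noisereduce" if config.get('noise_suppression.enabled', True) else "none",
        strength=config.get('noise_suppression.strength', 0.7),
        use_vad=config.get('processing.use_vad', True),
    )

    # Port 8080 matches the TranscriptionWebServer default in this patch.
    web = TranscriptionWebServer(
        host="127.0.0.1",
        port=8080,
        show_timestamps=config.get('display.show_timestamps', True),
        fade_after_seconds=config.get('display.fade_after_seconds', 10),
    )

    loop = asyncio.new_event_loop()

    def on_chunk(chunk):
        # Runs on the audio thread: suppress noise, transcribe, then hand the
        # result to the asyncio loop that owns the web server (assumed bridge).
        audio = suppressor.process(chunk, skip_silent=True)
        if audio is None:
            return
        result = engine.transcribe(audio, sample_rate=sample_rate, user_name=user_name)
        if result:
            asyncio.run_coroutine_threadsafe(
                web.broadcast_transcription(result.text, user_name=user_name),
                loop,
            )

    capture = AudioCapture(
        sample_rate=sample_rate,
        chunk_duration=config.get('audio.chunk_duration', 3.0),
    )
    capture.start_recording(callback=on_chunk)

    try:
        # Serve the browser-source page until interrupted (Ctrl+C).
        loop.run_until_complete(web.start())
    finally:
        capture.stop_recording()
        engine.unload_model()


if __name__ == "__main__":
    run_headless()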