From 58faa83cb3bca99b8d1e29efe1f2a1cb8eac25a2 Mon Sep 17 00:00:00 2001 From: Claude Date: Fri, 20 Mar 2026 21:33:43 -0700 Subject: [PATCH] Cross-platform distribution, UI improvements, and performance optimizations - PyInstaller frozen sidecar: spec file, build script, and ffmpeg path resolver for self-contained distribution without Python prerequisites - Dual-mode sidecar launcher: frozen binary (production) with dev mode fallback - Parallel transcription + diarization pipeline (~30-40% faster) - GPU auto-detection for diarization (CUDA when available) - Async run_pipeline command for real-time progress event delivery - Web Audio API backend for instant playback and seeking - OpenAI-compatible provider replacing LiteLLM client-side routing - Cross-platform RAM detection (Linux/macOS/Windows) - Settings: speaker count hint, token reveal toggles, dark dropdown styling - Loading splash screen, flexbox layout fix for viewport overflow - Gitea Actions CI/CD pipeline (Linux, Windows, macOS ARM) - Updated README and CLAUDE.md documentation Co-Authored-By: Claude Opus 4.6 (1M context) --- .gitea/workflows/build.yml | 136 +++++++++++ .github/workflows/build.yml | 141 ++++++++++++ .gitignore | 6 + CLAUDE.md | 14 +- README.md | 94 ++++++-- package.json | 4 +- python/build_sidecar.py | 215 ++++++++++++++++++ python/pyproject.toml | 3 + python/voice_to_notes.spec | 67 ++++++ python/voice_to_notes/hardware/detect.py | 86 ++++++- python/voice_to_notes/ipc/handlers.py | 6 +- .../providers/litellm_provider.py | 42 ++-- python/voice_to_notes/services/ai_provider.py | 4 +- python/voice_to_notes/services/diarize.py | 11 +- python/voice_to_notes/services/pipeline.py | 110 +++++---- python/voice_to_notes/services/transcribe.py | 5 +- python/voice_to_notes/utils/ffmpeg.py | 43 ++++ src-tauri/binaries/.gitkeep | 0 src-tauri/src/commands/transcribe.rs | 49 ++-- src-tauri/src/sidecar/mod.rs | 214 ++++++++++------- src-tauri/tauri.conf.json | 2 +- src/lib/components/AIChatPanel.svelte | 15 
++ src/lib/components/ProgressOverlay.svelte | 60 +++-- src/lib/components/SettingsModal.svelte | 71 +++++- src/lib/components/WaveformPlayer.svelte | 12 +- src/lib/stores/settings.ts | 22 ++ src/routes/+page.svelte | 152 +++++++++---- 27 files changed, 1301 insertions(+), 283 deletions(-) create mode 100644 .gitea/workflows/build.yml create mode 100644 .github/workflows/build.yml create mode 100644 python/build_sidecar.py create mode 100644 python/voice_to_notes.spec create mode 100644 python/voice_to_notes/utils/ffmpeg.py create mode 100644 src-tauri/binaries/.gitkeep diff --git a/.gitea/workflows/build.yml b/.gitea/workflows/build.yml new file mode 100644 index 0000000..92000a5 --- /dev/null +++ b/.gitea/workflows/build.yml @@ -0,0 +1,136 @@ +name: Build & Release + +on: + push: + branches: [main] + tags: ["v*"] + pull_request: + branches: [main] + +env: + PYTHON_VERSION: "3.11" + NODE_VERSION: "20" + +jobs: + build-sidecar: + name: Build sidecar (${{ matrix.target }}) + runs-on: ${{ matrix.runner }} + strategy: + fail-fast: false + matrix: + include: + - runner: ubuntu-latest + target: x86_64-unknown-linux-gnu + platform: linux + - runner: windows-latest + target: x86_64-pc-windows-msvc + platform: windows + - runner: macos-latest + target: aarch64-apple-darwin + platform: macos + + steps: + - uses: actions/checkout@v4 + + - name: Set up Python + uses: actions/setup-python@v5 + with: + python-version: ${{ env.PYTHON_VERSION }} + + - name: Build sidecar + working-directory: python + run: python build_sidecar.py --cpu-only + + - name: Upload sidecar artifact + uses: actions/upload-artifact@v4 + with: + name: sidecar-${{ matrix.target }} + path: python/dist/voice-to-notes-sidecar/ + retention-days: 7 + + build-tauri: + name: Build app (${{ matrix.target }}) + needs: build-sidecar + runs-on: ${{ matrix.runner }} + strategy: + fail-fast: false + matrix: + include: + - runner: ubuntu-latest + target: x86_64-unknown-linux-gnu + platform: linux + - runner: 
windows-latest + target: x86_64-pc-windows-msvc + platform: windows + - runner: macos-latest + target: aarch64-apple-darwin + platform: macos + + steps: + - uses: actions/checkout@v4 + + - name: Set up Node.js + uses: actions/setup-node@v4 + with: + node-version: ${{ env.NODE_VERSION }} + # Note: 'cache: npm' requires the Gitea instance to have + # Actions cache configured. Remove this if caching is unavailable. + cache: npm + + - name: Install Rust stable + uses: dtolnay/rust-toolchain@stable + + - name: Install system dependencies (Linux) + if: matrix.platform == 'linux' + run: | + sudo apt-get update + sudo apt-get install -y libgtk-3-dev libwebkit2gtk-4.1-dev libappindicator3-dev librsvg2-dev patchelf + + - name: Download sidecar artifact + uses: actions/download-artifact@v4 + with: + name: sidecar-${{ matrix.target }} + path: src-tauri/binaries/ + + - name: Make sidecar executable (Unix) + if: matrix.platform != 'windows' + run: chmod +x src-tauri/binaries/voice-to-notes-sidecar-${{ matrix.target }} + + - name: Install npm dependencies + run: npm ci + + - name: Build Tauri app + run: npm run tauri build + env: + TAURI_SIGNING_PRIVATE_KEY: ${{ secrets.TAURI_SIGNING_PRIVATE_KEY }} + TAURI_CONFIG: '{"bundle":{"externalBin":["binaries/voice-to-notes-sidecar"]}}' + + - name: Upload app artifacts (Linux) + if: matrix.platform == 'linux' + uses: actions/upload-artifact@v4 + with: + name: app-${{ matrix.target }} + path: | + src-tauri/target/release/bundle/deb/*.deb + src-tauri/target/release/bundle/appimage/*.AppImage + retention-days: 30 + + - name: Upload app artifacts (Windows) + if: matrix.platform == 'windows' + uses: actions/upload-artifact@v4 + with: + name: app-${{ matrix.target }} + path: | + src-tauri/target/release/bundle/msi/*.msi + src-tauri/target/release/bundle/nsis/*.exe + retention-days: 30 + + - name: Upload app artifacts (macOS) + if: matrix.platform == 'macos' + uses: actions/upload-artifact@v4 + with: + name: app-${{ matrix.target }} + path: | + 
src-tauri/target/release/bundle/dmg/*.dmg + src-tauri/target/release/bundle/macos/*.app + retention-days: 30 diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml new file mode 100644 index 0000000..7290873 --- /dev/null +++ b/.github/workflows/build.yml @@ -0,0 +1,141 @@ +name: Build & Release + +on: + push: + branches: [main] + tags: ["v*"] + pull_request: + branches: [main] + workflow_dispatch: + +env: + PYTHON_VERSION: "3.11" + NODE_VERSION: "20" + +jobs: + build-sidecar: + name: Build sidecar (${{ matrix.target }}) + runs-on: ${{ matrix.runner }} + strategy: + fail-fast: false + matrix: + include: + - runner: ubuntu-22.04 # keep in step with build-tauri; the 20.04 hosted image is retired + target: x86_64-unknown-linux-gnu + platform: linux + - runner: windows-latest + target: x86_64-pc-windows-msvc + platform: windows + - runner: macos-13 + target: x86_64-apple-darwin + platform: macos-intel + - runner: macos-14 + target: aarch64-apple-darwin + platform: macos-arm + + steps: + - uses: actions/checkout@v4 + + - name: Set up Python + uses: actions/setup-python@v5 + with: + python-version: ${{ env.PYTHON_VERSION }} + + - name: Build sidecar + working-directory: python + run: python build_sidecar.py --cpu-only + + - name: Upload sidecar artifact + uses: actions/upload-artifact@v4 + with: + name: sidecar-${{ matrix.target }} + path: python/dist/voice-to-notes-sidecar/ + retention-days: 7 + + build-tauri: + name: Build app (${{ matrix.target }}) + needs: build-sidecar + runs-on: ${{ matrix.runner }} + strategy: + fail-fast: false + matrix: + include: + - runner: ubuntu-22.04 # libwebkit2gtk-4.1-dev (Tauri v2) is only packaged for Ubuntu 22.04+ + target: x86_64-unknown-linux-gnu + platform: linux + - runner: windows-latest + target: x86_64-pc-windows-msvc + platform: windows + - runner: macos-13 + target: x86_64-apple-darwin + platform: macos-intel + - runner: macos-14 + target: aarch64-apple-darwin + platform: macos-arm + + steps: + - uses: actions/checkout@v4 + + - name: Set up Node.js + uses: actions/setup-node@v4 + with: + node-version: ${{ env.NODE_VERSION }} + cache: npm + + - 
name: Install Rust stable + uses: dtolnay/rust-toolchain@stable + + - name: Install system dependencies (Linux) + if: matrix.platform == 'linux' + run: | + sudo apt-get update + sudo apt-get install -y libgtk-3-dev libwebkit2gtk-4.1-dev libappindicator3-dev librsvg2-dev patchelf + + - name: Download sidecar artifact + uses: actions/download-artifact@v4 + with: + name: sidecar-${{ matrix.target }} + path: src-tauri/binaries/ + + - name: Make sidecar executable (Unix) + if: matrix.platform != 'windows' + run: chmod +x src-tauri/binaries/voice-to-notes-sidecar-${{ matrix.target }} + + - name: Install npm dependencies + run: npm ci + + - name: Build Tauri app + run: npm run tauri build + env: + TAURI_SIGNING_PRIVATE_KEY: ${{ secrets.TAURI_SIGNING_PRIVATE_KEY }} + TAURI_CONFIG: '{"bundle":{"externalBin":["binaries/voice-to-notes-sidecar"]}}' + + - name: Upload app artifacts (Linux) + if: matrix.platform == 'linux' + uses: actions/upload-artifact@v4 + with: + name: app-${{ matrix.target }} + path: | + src-tauri/target/release/bundle/deb/*.deb + src-tauri/target/release/bundle/appimage/*.AppImage + retention-days: 30 + + - name: Upload app artifacts (Windows) + if: matrix.platform == 'windows' + uses: actions/upload-artifact@v4 + with: + name: app-${{ matrix.target }} + path: | + src-tauri/target/release/bundle/msi/*.msi + src-tauri/target/release/bundle/nsis/*.exe + retention-days: 30 + + - name: Upload app artifacts (macOS) + if: startsWith(matrix.platform, 'macos') + uses: actions/upload-artifact@v4 + with: + name: app-${{ matrix.target }} + path: | + src-tauri/target/release/bundle/dmg/*.dmg + src-tauri/target/release/bundle/macos/*.app + retention-days: 30 diff --git a/.gitignore b/.gitignore index 50cad14..59a5935 100644 --- a/.gitignore +++ b/.gitignore @@ -46,3 +46,9 @@ Thumbs.db *.ogg *.flac !test/fixtures/* + +# Sidecar build artifacts +src-tauri/binaries/* +!src-tauri/binaries/.gitkeep +python/dist/ +python/build/ diff --git a/CLAUDE.md b/CLAUDE.md index 
6e55905..f12ceab 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -8,7 +8,7 @@ Desktop app for transcribing audio/video with speaker identification. Runs local - **ML pipeline:** Python sidecar process (faster-whisper, pyannote.audio, wav2vec2) - **Database:** SQLite (via rusqlite in Rust) - **Local AI:** Bundled llama-server (llama.cpp) — default, no install needed -- **Cloud AI providers:** LiteLLM, OpenAI, Anthropic (optional, user-configured) +- **Cloud AI providers:** OpenAI, Anthropic, OpenAI-compatible endpoints (optional, user-configured) - **Caption export:** pysubs2 (Python) - **Audio UI:** wavesurfer.js - **Transcript editor:** TipTap (ProseMirror) @@ -40,7 +40,13 @@ docs/ # Architecture and design documents - Database: UUIDs as primary keys (TEXT type in SQLite) - All timestamps in milliseconds (integer) relative to media file start +## Distribution +- Python sidecar is frozen via PyInstaller into a standalone binary for distribution +- Tauri bundles the sidecar via `externalBin` — no Python required for end users +- CI/CD builds on Gitea Actions (Linux, Windows, macOS ARM) +- Dev mode uses system Python (`VOICE_TO_NOTES_DEV=1` or debug builds) + ## Platform Targets -- Linux (primary development target) -- Windows (must work, tested before release) -- macOS (future, not yet targeted) +- Linux x86_64 (primary development target) +- Windows x86_64 +- macOS aarch64 (Apple Silicon) diff --git a/README.md b/README.md index 740f612..350a55d 100644 --- a/README.md +++ b/README.md @@ -2,28 +2,90 @@ A desktop application that transcribes audio/video recordings with speaker identification, producing editable transcriptions with synchronized audio playback. 
-## Goals +## Features -- **Speech-to-Text Transcription** — Accurately convert spoken audio from recordings into text -- **Speaker Identification (Diarization)** — Detect and distinguish between different speakers in a conversation -- **Speaker Naming** — Assign and persist speaker names/IDs across the transcription -- **Synchronized Playback** — Click any transcribed text segment to play back the corresponding audio for review and correction -- **Export Formats** - - Closed captioning files (SRT, VTT) for video - - Plain text documents with speaker labels -- **AI Integration** — Connect to AI providers to ask questions about the conversation and generate condensed notes/summaries +- **Speech-to-Text Transcription** — Accurate transcription via faster-whisper (Whisper models) with word-level timestamps +- **Speaker Identification (Diarization)** — Detect and distinguish between speakers using pyannote.audio +- **Synchronized Playback** — Click any word to seek to that point in the audio (Web Audio API for instant playback) +- **AI Integration** — Ask questions about your transcript via OpenAI, Anthropic, or any OpenAI-compatible API (LiteLLM proxies, Ollama, vLLM) +- **Export Formats** — SRT, WebVTT, ASS captions, plain text, and Markdown with speaker labels +- **Cross-Platform** — Builds for Linux, Windows, and macOS (Apple Silicon) ## Platform Support -| Platform | Status | -|----------|--------| -| Linux | Planned (initial target) | -| Windows | Planned (initial target) | -| macOS | Future (pending hardware) | +| Platform | Architecture | Status | +|----------|-------------|--------| +| Linux | x86_64 | Supported | +| Windows | x86_64 | Supported | +| macOS | ARM (Apple Silicon) | Supported | -## Project Status +## Tech Stack -**Early planning phase** — Architecture and technology decisions in progress. 
+- **Desktop shell:** Tauri v2 (Rust backend + Svelte 5 / TypeScript frontend) +- **ML pipeline:** Python sidecar (faster-whisper, pyannote.audio) — frozen via PyInstaller for distribution +- **Audio playback:** wavesurfer.js with Web Audio API backend +- **AI providers:** OpenAI, Anthropic, OpenAI-compatible endpoints (local or remote) +- **Local AI:** Bundled llama-server (llama.cpp) +- **Caption export:** pysubs2 + +## Development + +### Prerequisites + +- Node.js 20+ +- Rust (stable) +- Python 3.11+ with ML dependencies +- System: `libgtk-3-dev`, `libwebkit2gtk-4.1-dev` (Linux) + +### Getting Started + +```bash +# Install frontend dependencies +npm install + +# Install Python sidecar dependencies +cd python && pip install -e . && cd .. + +# Run in dev mode (uses system Python for the sidecar) +npm run tauri:dev +``` + +### Building for Distribution + +```bash +# Build the frozen Python sidecar +npm run sidecar:build + +# Build the Tauri app (requires sidecar in src-tauri/binaries/) +npm run tauri build +``` + +### CI/CD + +Gitea Actions workflows are in `.gitea/workflows/`. The build pipeline: + +1. **Build sidecar** — PyInstaller-frozen Python binary per platform (CPU-only PyTorch) +2. **Build Tauri app** — Bundles the sidecar via `externalBin`, produces .deb/.AppImage (Linux), .msi (Windows), .dmg (macOS) + +#### Required Secrets + +| Secret | Purpose | Required? | +|--------|---------|-----------| +| `TAURI_SIGNING_PRIVATE_KEY` | Signs Tauri update bundles | Optional (for auto-updates) | + +No other secrets are needed for building. AI provider API keys and HuggingFace tokens are configured by end users in the app's Settings. 
+ +### Project Structure + +``` +src/ # Svelte 5 frontend +src-tauri/ # Rust backend (Tauri commands, sidecar manager, SQLite) +python/ # Python sidecar (transcription, diarization, AI) + voice_to_notes/ # Python package + build_sidecar.py # PyInstaller build script + voice_to_notes.spec # PyInstaller spec +.gitea/workflows/ # Gitea Actions CI/CD +``` ## License diff --git a/package.json b/package.json index 543d205..3a5ca97 100644 --- a/package.json +++ b/package.json @@ -11,7 +11,9 @@ "check:watch": "svelte-kit sync && svelte-check --tsconfig ./tsconfig.json --watch", "lint": "eslint .", "test": "vitest", - "tauri": "tauri" + "tauri": "tauri", + "tauri:dev": "VOICE_TO_NOTES_DEV=1 tauri dev", + "sidecar:build": "cd python && python3 build_sidecar.py" }, "license": "MIT", "dependencies": { diff --git a/python/build_sidecar.py b/python/build_sidecar.py new file mode 100644 index 0000000..3b855ad --- /dev/null +++ b/python/build_sidecar.py @@ -0,0 +1,215 @@ +#!/usr/bin/env python3 +"""Build the Voice to Notes sidecar as a standalone binary using PyInstaller. + +Usage: + python build_sidecar.py [--cpu-only] + +Produces a directory `dist/voice-to-notes-sidecar/` containing the frozen +sidecar binary and all dependencies. The main binary is renamed to include +the Tauri target triple for externalBin resolution. 
+""" + +from __future__ import annotations + +import argparse +import os +import platform +import shutil +import stat +import subprocess +import sys +import urllib.request +import zipfile +from pathlib import Path + +SCRIPT_DIR = Path(__file__).resolve().parent +DIST_DIR = SCRIPT_DIR / "dist" +BUILD_DIR = SCRIPT_DIR / "build" +SPEC_FILE = SCRIPT_DIR / "voice_to_notes.spec" + +# Static ffmpeg download URLs (GPL-licensed builds) +FFMPEG_URLS: dict[str, str] = { + "linux-x86_64": "https://johnvansickle.com/ffmpeg/releases/ffmpeg-release-amd64-static.tar.xz", + "darwin-x86_64": "https://evermeet.cx/ffmpeg/getrelease/zip", + "darwin-arm64": "https://evermeet.cx/ffmpeg/getrelease/zip", + "win32-x86_64": "https://www.gyan.dev/ffmpeg/builds/ffmpeg-release-essentials.zip", +} + + +def get_target_triple() -> str: + """Determine the Tauri-compatible target triple for the current platform.""" + machine = platform.machine().lower() + system = platform.system().lower() + + arch_map = { + "x86_64": "x86_64", + "amd64": "x86_64", + "aarch64": "aarch64", + "arm64": "aarch64", + } + arch = arch_map.get(machine, machine) + + if system == "linux": + return f"{arch}-unknown-linux-gnu" + elif system == "darwin": + return f"{arch}-apple-darwin" + elif system == "windows": + return f"{arch}-pc-windows-msvc" + else: + return f"{arch}-unknown-{system}" + + +def create_venv_and_install(cpu_only: bool) -> Path: + """Create a fresh venv and install dependencies.""" + venv_dir = BUILD_DIR / "sidecar-venv" + if venv_dir.exists(): + shutil.rmtree(venv_dir) + + print(f"[build] Creating venv at {venv_dir}") + subprocess.run([sys.executable, "-m", "venv", str(venv_dir)], check=True) + + # Determine pip and python paths inside venv + if sys.platform == "win32": + pip = str(venv_dir / "Scripts" / "pip") + python = str(venv_dir / "Scripts" / "python") + else: + pip = str(venv_dir / "bin" / "pip") + python = str(venv_dir / "bin" / "python") + + # Upgrade pip + subprocess.run([pip, "install", 
"--upgrade", "pip"], check=True) + + # Install torch (CPU-only to avoid bundling ~2GB of CUDA libs) + if cpu_only: + print("[build] Installing PyTorch (CPU-only)") + subprocess.run( + [pip, "install", "torch", "torchaudio", + "--index-url", "https://download.pytorch.org/whl/cpu"], + check=True, + ) + else: + print("[build] Installing PyTorch (default, may include CUDA)") + subprocess.run([pip, "install", "torch", "torchaudio"], check=True) + + # Install project and dev deps (includes pyinstaller) + print("[build] Installing project dependencies") + subprocess.run([pip, "install", "-e", f"{SCRIPT_DIR}[dev]"], check=True) + + return Path(python) + + +def run_pyinstaller(python: Path) -> Path: + """Run PyInstaller using the spec file.""" + print("[build] Running PyInstaller") + subprocess.run( + [str(python), "-m", "PyInstaller", "--clean", "--noconfirm", str(SPEC_FILE)], + cwd=str(SCRIPT_DIR), + check=True, + ) + output_dir = DIST_DIR / "voice-to-notes-sidecar" + if not output_dir.exists(): + raise RuntimeError(f"PyInstaller output not found at {output_dir}") + return output_dir + + +def download_ffmpeg(output_dir: Path) -> None: + """Download a static ffmpeg/ffprobe binary for the current platform.""" + system = sys.platform + machine = platform.machine().lower() + if machine in ("amd64", "x86_64"): + machine = "x86_64" + elif machine in ("aarch64", "arm64"): + machine = "arm64" + + key = f"{system}-{machine}" + if system == "win32": + key = f"win32-{machine}" + elif system == "linux": + key = f"linux-{machine}" + + url = FFMPEG_URLS.get(key) + if not url: + print(f"[build] Warning: No ffmpeg download URL for platform {key}, skipping") + return + + print(f"[build] Downloading ffmpeg for {key}") + tmp_path = output_dir / "ffmpeg_download" + try: + urllib.request.urlretrieve(url, str(tmp_path)) + + if url.endswith(".tar.xz"): + # Linux static build + import tarfile + with tarfile.open(str(tmp_path), "r:xz") as tar: + for member in tar.getmembers(): + basename = 
os.path.basename(member.name) + if basename in ("ffmpeg", "ffprobe"): + member.name = basename + tar.extract(member, path=str(output_dir)) + dest = output_dir / basename + dest.chmod(dest.stat().st_mode | stat.S_IEXEC) + elif zipfile.is_zipfile(str(tmp_path)): # sniff the payload: the evermeet.cx URLs end in "/zip", not ".zip" + with zipfile.ZipFile(str(tmp_path), "r") as zf: + for name in zf.namelist(): + basename = os.path.basename(name) + if basename in ("ffmpeg", "ffprobe", "ffmpeg.exe", "ffprobe.exe"): + data = zf.read(name) + dest = output_dir / basename + dest.write_bytes(data) + if sys.platform != "win32": + dest.chmod(dest.stat().st_mode | stat.S_IEXEC) + print("[build] ffmpeg downloaded successfully") + except Exception as e: + print(f"[build] Warning: Failed to download ffmpeg: {e}") + finally: + if tmp_path.exists(): + tmp_path.unlink() + + +def rename_binary(output_dir: Path, target_triple: str) -> None: + """Rename the main binary to include the target triple for Tauri.""" + if sys.platform == "win32": + src = output_dir / "voice-to-notes-sidecar.exe" + dst = output_dir / f"voice-to-notes-sidecar-{target_triple}.exe" + else: + src = output_dir / "voice-to-notes-sidecar" + dst = output_dir / f"voice-to-notes-sidecar-{target_triple}" + + if src.exists(): + print(f"[build] Renaming {src.name} -> {dst.name}") + src.rename(dst) + else: + print(f"[build] Warning: Expected binary not found at {src}") + + +def main() -> None: + parser = argparse.ArgumentParser(description="Build the Voice to Notes sidecar binary") + parser.add_argument( + "--cpu-only", + action="store_true", + default=True, + help="Install CPU-only PyTorch (default: True, avoids bundling CUDA)", + ) + parser.add_argument( + "--with-cuda", + action="store_true", + help="Install PyTorch with CUDA support", + ) + args = parser.parse_args() + cpu_only = not args.with_cuda + + target_triple = get_target_triple() + print(f"[build] Target triple: {target_triple}") + print(f"[build] CPU-only: {cpu_only}") + + python = create_venv_and_install(cpu_only) + output_dir = 
run_pyinstaller(python) + download_ffmpeg(output_dir) + rename_binary(output_dir, target_triple) + + print(f"\n[build] Done! Sidecar built at: {output_dir}") + print(f"[build] Copy contents to src-tauri/binaries/ for Tauri bundling") + + +if __name__ == "__main__": + main() diff --git a/python/pyproject.toml b/python/pyproject.toml index 62f118a..d7f3f8f 100644 --- a/python/pyproject.toml +++ b/python/pyproject.toml @@ -13,6 +13,8 @@ dependencies = [ "faster-whisper>=1.1.0", "pyannote.audio>=3.1.0", "pysubs2>=1.7.0", + "openai>=1.0.0", + "anthropic>=0.20.0", ] [project.optional-dependencies] @@ -20,6 +22,7 @@ dev = [ "ruff>=0.8.0", "pytest>=8.0.0", "pytest-asyncio>=0.24.0", + "pyinstaller>=6.0", ] [tool.ruff] diff --git a/python/voice_to_notes.spec b/python/voice_to_notes.spec new file mode 100644 index 0000000..0687bb1 --- /dev/null +++ b/python/voice_to_notes.spec @@ -0,0 +1,67 @@ +# -*- mode: python ; coding: utf-8 -*- +"""PyInstaller spec for the Voice to Notes sidecar binary.""" + +from PyInstaller.utils.hooks import collect_all + +block_cipher = None + +# Collect all files for packages that have shared libraries / data files +# PyInstaller often misses these for ML packages +ctranslate2_datas, ctranslate2_binaries, ctranslate2_hiddenimports = collect_all("ctranslate2") +faster_whisper_datas, faster_whisper_binaries, faster_whisper_hiddenimports = collect_all( + "faster_whisper" +) +pyannote_datas, pyannote_binaries, pyannote_hiddenimports = collect_all("pyannote") + +a = Analysis( + ["voice_to_notes/main.py"], + pathex=[], + binaries=ctranslate2_binaries + faster_whisper_binaries + pyannote_binaries, + datas=ctranslate2_datas + faster_whisper_datas + pyannote_datas, + hiddenimports=[ + "torch", + "torchaudio", + "huggingface_hub", + "pysubs2", + "openai", + "anthropic", + "litellm", + ] + + ctranslate2_hiddenimports + + faster_whisper_hiddenimports + + pyannote_hiddenimports, + hookspath=[], + hooksconfig={}, + runtime_hooks=[], + excludes=["tkinter", "test", 
"unittest", "pip", "setuptools"], + win_no_prefer_redirects=False, + win_private_assemblies=False, + cipher=block_cipher, + noarchive=False, +) + +pyz = PYZ(a.pure, a.zipped_data, cipher=block_cipher) + +exe = EXE( + pyz, + a.scripts, + [], + exclude_binaries=True, + name="voice-to-notes-sidecar", + debug=False, + bootloader_ignore_signals=False, + strip=False, + upx=True, + console=True, +) + +coll = COLLECT( + exe, + a.binaries, + a.zipfiles, + a.datas, + strip=False, + upx=True, + upx_exclude=[], + name="voice-to-notes-sidecar", +) diff --git a/python/voice_to_notes/hardware/detect.py b/python/voice_to_notes/hardware/detect.py index f69c401..408d26a 100644 --- a/python/voice_to_notes/hardware/detect.py +++ b/python/voice_to_notes/hardware/detect.py @@ -2,7 +2,10 @@ from __future__ import annotations +import ctypes import os +import platform +import subprocess import sys from dataclasses import dataclass @@ -21,6 +24,77 @@ class HardwareInfo: recommended_compute_type: str = "int8" +def _detect_ram_mb() -> int: + """Detect total system RAM in MB (cross-platform). + + Tries platform-specific methods in order: + 1. Linux: read /proc/meminfo + 2. macOS: sysctl hw.memsize + 3. Windows: GlobalMemoryStatusEx via ctypes + 4. Fallback: os.sysconf (most Unix systems) + + Returns 0 if all methods fail. 
+ """ + # Linux: read /proc/meminfo + if sys.platform == "linux": + try: + with open("/proc/meminfo") as f: + for line in f: + if line.startswith("MemTotal:"): + # Value is in kB + return int(line.split()[1]) // 1024 + except (FileNotFoundError, ValueError, OSError): + pass + + # macOS: sysctl hw.memsize (returns bytes) + if sys.platform == "darwin": + try: + result = subprocess.run( + ["sysctl", "-n", "hw.memsize"], + capture_output=True, + text=True, + check=True, + ) + return int(result.stdout.strip()) // (1024 * 1024) + except (subprocess.SubprocessError, ValueError, OSError): + pass + + # Windows: GlobalMemoryStatusEx via ctypes + if sys.platform == "win32": + try: + + class MEMORYSTATUSEX(ctypes.Structure): + _fields_ = [ + ("dwLength", ctypes.c_ulong), + ("dwMemoryLoad", ctypes.c_ulong), + ("ullTotalPhys", ctypes.c_ulonglong), + ("ullAvailPhys", ctypes.c_ulonglong), + ("ullTotalPageFile", ctypes.c_ulonglong), + ("ullAvailPageFile", ctypes.c_ulonglong), + ("ullTotalVirtual", ctypes.c_ulonglong), + ("ullAvailVirtual", ctypes.c_ulonglong), + ("ullAvailExtendedVirtual", ctypes.c_ulonglong), + ] + + mem_status = MEMORYSTATUSEX() + mem_status.dwLength = ctypes.sizeof(MEMORYSTATUSEX) + if ctypes.windll.kernel32.GlobalMemoryStatusEx(ctypes.byref(mem_status)): + return int(mem_status.ullTotalPhys) // (1024 * 1024) + except (AttributeError, OSError): + pass + + # Fallback: os.sysconf (works on most Unix systems) + try: + page_size = os.sysconf("SC_PAGE_SIZE") + phys_pages = os.sysconf("SC_PHYS_PAGES") + if page_size > 0 and phys_pages > 0: + return (page_size * phys_pages) // (1024 * 1024) + except (ValueError, OSError, AttributeError): + pass + + return 0 + + def detect_hardware() -> HardwareInfo: """Detect available hardware and recommend model configuration.""" info = HardwareInfo() @@ -28,16 +102,8 @@ def detect_hardware() -> HardwareInfo: # CPU info info.cpu_cores = os.cpu_count() or 1 - # RAM info - try: - with open("/proc/meminfo") as f: - for line in f: - if 
line.startswith("MemTotal:"): - # Value is in kB - info.ram_mb = int(line.split()[1]) // 1024 - break - except (FileNotFoundError, ValueError): - pass + # RAM info (cross-platform) + info.ram_mb = _detect_ram_mb() # CUDA detection try: diff --git a/python/voice_to_notes/ipc/handlers.py b/python/voice_to_notes/ipc/handlers.py index c87f665..dcc20ef 100644 --- a/python/voice_to_notes/ipc/handlers.py +++ b/python/voice_to_notes/ipc/handlers.py @@ -260,10 +260,12 @@ def make_ai_chat_handler() -> HandlerFunc: model=config.get("model", "claude-sonnet-4-6"), )) elif provider_name == "litellm": - from voice_to_notes.providers.litellm_provider import LiteLLMProvider + from voice_to_notes.providers.litellm_provider import OpenAICompatibleProvider - service.register_provider("litellm", LiteLLMProvider( + service.register_provider("litellm", OpenAICompatibleProvider( model=config.get("model", "gpt-4o-mini"), + api_key=config.get("api_key"), + api_base=config.get("api_base"), )) return IPCMessage( id=msg.id, diff --git a/python/voice_to_notes/providers/litellm_provider.py b/python/voice_to_notes/providers/litellm_provider.py index fd48a0a..faa91fe 100644 --- a/python/voice_to_notes/providers/litellm_provider.py +++ b/python/voice_to_notes/providers/litellm_provider.py @@ -1,4 +1,4 @@ -"""LiteLLM provider — multi-provider gateway.""" +"""OpenAI-compatible provider — works with any OpenAI-compatible API endpoint.""" from __future__ import annotations @@ -7,36 +7,44 @@ from typing import Any from voice_to_notes.providers.base import AIProvider -class LiteLLMProvider(AIProvider): - """Routes through LiteLLM for access to 100+ LLM providers.""" +class OpenAICompatibleProvider(AIProvider): + """Connects to any OpenAI-compatible API (LiteLLM proxy, Ollama, vLLM, etc.).""" - def __init__(self, model: str = "gpt-4o-mini", **kwargs: Any) -> None: + def __init__( + self, + api_key: str | None = None, + api_base: str | None = None, + model: str = "gpt-4o-mini", + **kwargs: Any, + ) -> 
None: + self._api_key = api_key or "sk-no-key" + self._api_base = api_base self._model = model self._extra_kwargs = kwargs def chat(self, messages: list[dict[str, str]], **kwargs: Any) -> str: - try: - import litellm - except ImportError: - raise RuntimeError("litellm package is required. Install with: pip install litellm") + from openai import OpenAI - merged_kwargs = {**self._extra_kwargs, **kwargs} - response = litellm.completion( - model=merged_kwargs.get("model", self._model), + client_kwargs: dict[str, Any] = {"api_key": self._api_key} + if self._api_base: + client_kwargs["base_url"] = self._api_base + + client = OpenAI(**client_kwargs) + response = client.chat.completions.create( + model=kwargs.get("model", self._model), messages=messages, - temperature=merged_kwargs.get("temperature", 0.7), - max_tokens=merged_kwargs.get("max_tokens", 2048), + temperature=kwargs.get("temperature", 0.7), + max_tokens=kwargs.get("max_tokens", 2048), ) return response.choices[0].message.content or "" def is_available(self) -> bool: try: - import litellm # noqa: F401 - - return True + import openai # noqa: F401 + return bool(self._api_key and self._api_base) except ImportError: return False @property def name(self) -> str: - return "LiteLLM" + return "OpenAI Compatible" diff --git a/python/voice_to_notes/services/ai_provider.py b/python/voice_to_notes/services/ai_provider.py index 6fa4f51..5e8f14f 100644 --- a/python/voice_to_notes/services/ai_provider.py +++ b/python/voice_to_notes/services/ai_provider.py @@ -92,7 +92,7 @@ class AIProviderService: def create_default_service() -> AIProviderService: """Create an AIProviderService with all supported providers registered.""" from voice_to_notes.providers.anthropic_provider import AnthropicProvider - from voice_to_notes.providers.litellm_provider import LiteLLMProvider + from voice_to_notes.providers.litellm_provider import OpenAICompatibleProvider from voice_to_notes.providers.local_provider import LocalProvider from 
voice_to_notes.providers.openai_provider import OpenAIProvider @@ -100,5 +100,5 @@ def create_default_service() -> AIProviderService: service.register_provider("local", LocalProvider()) service.register_provider("openai", OpenAIProvider()) service.register_provider("anthropic", AnthropicProvider()) - service.register_provider("litellm", LiteLLMProvider()) + service.register_provider("litellm", OpenAICompatibleProvider()) return service diff --git a/python/voice_to_notes/services/diarize.py b/python/voice_to_notes/services/diarize.py index b55ff65..6ac5f51 100644 --- a/python/voice_to_notes/services/diarize.py +++ b/python/voice_to_notes/services/diarize.py @@ -16,6 +16,7 @@ from typing import Any # np.isfinite(None) crashes when max_speakers is not set. os.environ.setdefault("PYANNOTE_METRICS_ENABLED", "false") +from voice_to_notes.utils.ffmpeg import get_ffmpeg_path from voice_to_notes.ipc.messages import progress_message from voice_to_notes.ipc.protocol import write_message @@ -40,7 +41,7 @@ def _ensure_wav(file_path: str) -> tuple[str, str | None]: try: subprocess.run( [ - "ffmpeg", "-y", "-i", file_path, + get_ffmpeg_path(), "-y", "-i", file_path, "-ar", "16000", "-ac", "1", "-c:a", "pcm_s16le", tmp.name, ], @@ -118,6 +119,14 @@ class DiarizeService: self._pipeline = Pipeline.from_pretrained(model_name, token=hf_token) print(f"[sidecar] Loaded diarization model: {model_name}", file=sys.stderr, flush=True) + # Move pipeline to GPU if available + try: + import torch + if torch.cuda.is_available(): + self._pipeline = self._pipeline.to(torch.device("cuda")) + print(f"[sidecar] Diarization pipeline moved to GPU", file=sys.stderr, flush=True) + except Exception as e: + print(f"[sidecar] GPU not available for diarization: {e}", file=sys.stderr, flush=True) return self._pipeline except Exception as e: last_error = e diff --git a/python/voice_to_notes/services/pipeline.py b/python/voice_to_notes/services/pipeline.py index b781ad2..903fb4c 100644 --- 
a/python/voice_to_notes/services/pipeline.py +++ b/python/voice_to_notes/services/pipeline.py @@ -2,6 +2,7 @@ from __future__ import annotations +import concurrent.futures import sys import time from dataclasses import dataclass, field @@ -13,6 +14,7 @@ from voice_to_notes.ipc.messages import ( speaker_update_message, ) from voice_to_notes.ipc.protocol import write_message +from voice_to_notes.utils.ffmpeg import get_ffprobe_path from voice_to_notes.services.diarize import DiarizeService, SpeakerSegment from voice_to_notes.services.transcribe import ( SegmentResult, @@ -82,7 +84,7 @@ class PipelineService: """ start_time = time.time() - # Step 1: Transcribe + # Step 0: Probe audio duration for conditional chunked transcription write_message( progress_message(request_id, 0, "pipeline", "Starting transcription pipeline...") ) @@ -96,12 +98,11 @@ class PipelineService: "words": [{"word": w.word, "start_ms": w.start_ms, "end_ms": w.end_ms, "confidence": w.confidence} for w in seg.words], })) - # Probe audio duration for conditional chunked transcription audio_duration_sec = None try: import subprocess probe_result = subprocess.run( - ["ffprobe", "-v", "quiet", "-show_entries", "format=duration", + [get_ffprobe_path(), "-v", "quiet", "-show_entries", "format=duration", "-of", "default=noprint_wrappers=1:nokey=1", file_path], capture_output=True, text=True, check=True, ) @@ -109,30 +110,33 @@ class PipelineService: except (subprocess.CalledProcessError, FileNotFoundError, ValueError): pass - from voice_to_notes.services.transcribe import LARGE_FILE_THRESHOLD_SEC - if audio_duration_sec and audio_duration_sec > LARGE_FILE_THRESHOLD_SEC: - transcription = self._transcribe_service.transcribe_chunked( - request_id=request_id, - file_path=file_path, - model_name=model_name, - device=device, - compute_type=compute_type, - language=language, - on_segment=_emit_segment, - ) - else: - transcription = self._transcribe_service.transcribe( - request_id=request_id, - 
file_path=file_path, - model_name=model_name, - device=device, - compute_type=compute_type, - language=language, - on_segment=_emit_segment, - ) + def _run_transcription() -> TranscriptionResult: + """Run transcription (chunked or standard based on duration).""" + from voice_to_notes.services.transcribe import LARGE_FILE_THRESHOLD_SEC + if audio_duration_sec and audio_duration_sec > LARGE_FILE_THRESHOLD_SEC: + return self._transcribe_service.transcribe_chunked( + request_id=request_id, + file_path=file_path, + model_name=model_name, + device=device, + compute_type=compute_type, + language=language, + on_segment=_emit_segment, + ) + else: + return self._transcribe_service.transcribe( + request_id=request_id, + file_path=file_path, + model_name=model_name, + device=device, + compute_type=compute_type, + language=language, + on_segment=_emit_segment, + ) if skip_diarization: - # Convert transcription directly without speaker labels + # Sequential: transcribe only, no diarization needed + transcription = _run_transcription() result = PipelineResult( language=transcription.language, language_probability=transcription.language_probability, @@ -150,37 +154,59 @@ class PipelineService: ) return result - # Step 2: Diarize (with graceful fallback) + # Parallel execution: run transcription (0-45%) and diarization (45-90%) + # concurrently, then merge (90-100%). write_message( - progress_message(request_id, 50, "pipeline", "Starting speaker diarization...") + progress_message( + request_id, 0, "pipeline", + "Starting transcription and diarization in parallel..." 
+ ) ) diarization = None - try: - diarization = self._diarize_service.diarize( + diarization_error = None + + with concurrent.futures.ThreadPoolExecutor(max_workers=2) as executor: + transcription_future = executor.submit(_run_transcription) + + # Use probed audio_duration_sec for diarization progress estimation + # (transcription hasn't finished yet, so we can't use transcription.duration_ms) + diarization_future = executor.submit( + self._diarize_service.diarize, request_id=request_id, file_path=file_path, num_speakers=num_speakers, min_speakers=min_speakers, max_speakers=max_speakers, hf_token=hf_token, - audio_duration_sec=transcription.duration_ms / 1000.0, + audio_duration_sec=audio_duration_sec, ) - except Exception as e: - import traceback - print( - f"[sidecar] Diarization failed, falling back to transcription-only: {e}", - file=sys.stderr, - flush=True, - ) - traceback.print_exc(file=sys.stderr) + + # Wait for both futures. We need the transcription result regardless, + # but diarization may fail gracefully. + transcription = transcription_future.result() write_message( - progress_message( - request_id, 80, "pipeline", - f"Diarization failed ({e}), using transcription only..." - ) + progress_message(request_id, 45, "pipeline", "Transcription complete") ) + try: + diarization = diarization_future.result() + except Exception as e: + import traceback + diarization_error = e + print( + f"[sidecar] Diarization failed, falling back to transcription-only: {e}", + file=sys.stderr, + flush=True, + ) + traceback.print_exc(file=sys.stderr) + write_message( + progress_message( + request_id, 80, "pipeline", + f"Diarization failed ({e}), using transcription only..." 
+ ) + ) + # Step 3: Merge (or skip if diarization failed) if diarization is not None: write_message( diff --git a/python/voice_to_notes/services/transcribe.py b/python/voice_to_notes/services/transcribe.py index e89c0ac..87bb01d 100644 --- a/python/voice_to_notes/services/transcribe.py +++ b/python/voice_to_notes/services/transcribe.py @@ -12,6 +12,7 @@ from faster_whisper import WhisperModel from voice_to_notes.ipc.messages import progress_message from voice_to_notes.ipc.protocol import write_message +from voice_to_notes.utils.ffmpeg import get_ffmpeg_path, get_ffprobe_path CHUNK_REPORT_SIZE = 10 LARGE_FILE_THRESHOLD_SEC = 3600 # 1 hour @@ -202,7 +203,7 @@ class TranscribeService: # Get total duration via ffprobe try: probe_result = subprocess.run( - ["ffprobe", "-v", "quiet", "-show_entries", "format=duration", + [get_ffprobe_path(), "-v", "quiet", "-show_entries", "format=duration", "-of", "default=noprint_wrappers=1:nokey=1", file_path], capture_output=True, text=True, check=True, ) @@ -235,7 +236,7 @@ class TranscribeService: tmp.close() try: subprocess.run( - ["ffmpeg", "-y", "-ss", str(chunk_start), + [get_ffmpeg_path(), "-y", "-ss", str(chunk_start), "-t", str(chunk_duration_sec), "-i", file_path, "-ar", "16000", "-ac", "1", "-c:a", "pcm_s16le", diff --git a/python/voice_to_notes/utils/ffmpeg.py b/python/voice_to_notes/utils/ffmpeg.py new file mode 100644 index 0000000..6232449 --- /dev/null +++ b/python/voice_to_notes/utils/ffmpeg.py @@ -0,0 +1,43 @@ +"""Resolve ffmpeg/ffprobe paths for both frozen and development builds.""" + +from __future__ import annotations + +import os +import sys + + +def get_ffmpeg_path() -> str: + """Return the path to the ffmpeg binary. + + When running as a frozen PyInstaller bundle, looks next to sys.executable. + Otherwise falls back to the system PATH. 
+ """ + if getattr(sys, "frozen", False): + # Frozen PyInstaller bundle — ffmpeg is next to the sidecar binary + bundle_dir = os.path.dirname(sys.executable) + candidates = [ + os.path.join(bundle_dir, "ffmpeg.exe" if sys.platform == "win32" else "ffmpeg"), + os.path.join(bundle_dir, "ffmpeg"), + ] + for path in candidates: + if os.path.isfile(path): + return path + return "ffmpeg" + + +def get_ffprobe_path() -> str: + """Return the path to the ffprobe binary. + + When running as a frozen PyInstaller bundle, looks next to sys.executable. + Otherwise falls back to the system PATH. + """ + if getattr(sys, "frozen", False): + bundle_dir = os.path.dirname(sys.executable) + candidates = [ + os.path.join(bundle_dir, "ffprobe.exe" if sys.platform == "win32" else "ffprobe"), + os.path.join(bundle_dir, "ffprobe"), + ] + for path in candidates: + if os.path.isfile(path): + return path + return "ffprobe" diff --git a/src-tauri/binaries/.gitkeep b/src-tauri/binaries/.gitkeep new file mode 100644 index 0000000..e69de29 diff --git a/src-tauri/src/commands/transcribe.rs b/src-tauri/src/commands/transcribe.rs index 7135987..f1521ad 100644 --- a/src-tauri/src/commands/transcribe.rs +++ b/src-tauri/src/commands/transcribe.rs @@ -73,7 +73,7 @@ pub fn download_diarize_model(hf_token: String) -> Result { /// Run the full transcription + diarization pipeline via the Python sidecar. 
#[tauri::command] -pub fn run_pipeline( +pub async fn run_pipeline( app: AppHandle, file_path: String, model: Option, @@ -106,25 +106,34 @@ pub fn run_pipeline( }), ); - let response = manager.send_and_receive_with_progress(&msg, |msg| { - let event_name = match msg.msg_type.as_str() { - "pipeline.segment" => "pipeline-segment", - "pipeline.speaker_update" => "pipeline-speaker-update", - _ => "pipeline-progress", - }; - let _ = app.emit(event_name, &msg.payload); - })?; + // Run the blocking sidecar I/O on a separate thread so the async runtime + // can deliver emitted events to the webview while processing is ongoing. + let app_handle = app.clone(); + tauri::async_runtime::spawn_blocking(move || { + let response = manager.send_and_receive_with_progress(&msg, |msg| { + let event_name = match msg.msg_type.as_str() { + "pipeline.segment" => "pipeline-segment", + "pipeline.speaker_update" => "pipeline-speaker-update", + _ => "pipeline-progress", + }; + if let Err(e) = app_handle.emit(event_name, &msg.payload) { + eprintln!("[sidecar-rs] Failed to emit {event_name}: {e}"); + } + })?; - if response.msg_type == "error" { - return Err(format!( - "Pipeline error: {}", - response - .payload - .get("message") - .and_then(|v| v.as_str()) - .unwrap_or("unknown") - )); - } + if response.msg_type == "error" { + return Err(format!( + "Pipeline error: {}", + response + .payload + .get("message") + .and_then(|v| v.as_str()) + .unwrap_or("unknown") + )); + } - Ok(response.payload) + Ok(response.payload) + }) + .await + .map_err(|e| format!("Pipeline task failed: {e}"))? } diff --git a/src-tauri/src/sidecar/mod.rs b/src-tauri/src/sidecar/mod.rs index 8444f56..56140a4 100644 --- a/src-tauri/src/sidecar/mod.rs +++ b/src-tauri/src/sidecar/mod.rs @@ -13,8 +13,13 @@ pub fn sidecar() -> &'static SidecarManager { INSTANCE.get_or_init(SidecarManager::new) } -/// Manages the Python sidecar process lifecycle. -/// Uses separated stdin/stdout ownership to avoid BufReader conflicts. 
+/// Manages the sidecar process lifecycle. +/// +/// Supports two modes: +/// - **Production**: spawns a frozen PyInstaller binary (no Python required) +/// - **Dev mode**: spawns system Python with `-m voice_to_notes.main` +/// +/// Dev mode is active when compiled in debug mode or when `VOICE_TO_NOTES_DEV=1`. pub struct SidecarManager { process: Mutex>, stdin: Mutex>, @@ -30,38 +35,141 @@ impl SidecarManager { } } + /// Check if we should use dev mode (system Python). + fn is_dev_mode() -> bool { + cfg!(debug_assertions) || std::env::var("VOICE_TO_NOTES_DEV").is_ok() + } + + /// Resolve the frozen sidecar binary path (production mode). + fn resolve_sidecar_path() -> Result { + let exe = std::env::current_exe().map_err(|e| format!("Cannot get current exe: {e}"))?; + let exe_dir = exe + .parent() + .ok_or_else(|| "Cannot get exe parent directory".to_string())?; + + let binary_name = if cfg!(target_os = "windows") { + "voice-to-notes-sidecar.exe" + } else { + "voice-to-notes-sidecar" + }; + + // Tauri places externalBin next to the app binary + let path = exe_dir.join(binary_name); + if path.exists() { + return Ok(path); + } + + // Also check inside a subdirectory (onedir PyInstaller output) + let subdir_path = exe_dir.join("voice-to-notes-sidecar").join(binary_name); + if subdir_path.exists() { + return Ok(subdir_path); + } + + Err(format!( + "Sidecar binary not found. Looked for:\n {}\n {}", + path.display(), + subdir_path.display(), + )) + } + + /// Find a working Python command for the current platform. + fn find_python_command() -> &'static str { + if cfg!(target_os = "windows") { + "python" + } else { + "python3" + } + } + + /// Resolve the Python sidecar directory for dev mode. 
+ fn resolve_python_dir() -> Result { + let manifest_dir = env!("CARGO_MANIFEST_DIR"); + let python_dir = std::path::Path::new(manifest_dir) + .join("../python") + .canonicalize() + .map_err(|e| format!("Cannot find python directory: {e}"))?; + + if python_dir.exists() { + return Ok(python_dir); + } + + // Fallback: relative to current exe + let exe = std::env::current_exe().map_err(|e| e.to_string())?; + let alt = exe + .parent() + .ok_or_else(|| "No parent dir".to_string())? + .join("../python") + .canonicalize() + .map_err(|e| format!("Cannot find python directory: {e}"))?; + + Ok(alt) + } + /// Ensure the sidecar is running, starting it if needed. pub fn ensure_running(&self) -> Result<(), String> { if self.is_running() { return Ok(()); } - let python_path = std::env::current_dir() - .map_err(|e| e.to_string())? - .join("../python") - .canonicalize() - .map_err(|e| format!("Cannot find python directory: {e}"))?; - - self.start(&python_path.to_string_lossy()) + if Self::is_dev_mode() { + self.start_python_dev() + } else { + match Self::resolve_sidecar_path() { + Ok(path) => self.start_binary(&path), + Err(e) => { + eprintln!( + "[sidecar-rs] Frozen binary not found ({e}), falling back to dev mode" + ); + self.start_python_dev() + } + } + } } - /// Spawn the Python sidecar process. - pub fn start(&self, python_path: &str) -> Result<(), String> { - // Stop existing process if any + /// Spawn the frozen sidecar binary (production mode). 
+ fn start_binary(&self, path: &std::path::Path) -> Result<(), String> { self.stop().ok(); + eprintln!("[sidecar-rs] Starting frozen sidecar: {}", path.display()); - let mut child = Command::new("python3") - .arg("-m") - .arg("voice_to_notes.main") - .current_dir(python_path) - .env("PYTHONPATH", python_path) + let child = Command::new(path) .stdin(Stdio::piped()) .stdout(Stdio::piped()) .stderr(Stdio::inherit()) .spawn() - .map_err(|e| format!("Failed to start sidecar: {e}"))?; + .map_err(|e| format!("Failed to start sidecar binary: {e}"))?; - // Take ownership of stdin and stdout separately + self.attach(child)?; + self.wait_for_ready() + } + + /// Spawn the Python sidecar in dev mode (system Python). + fn start_python_dev(&self) -> Result<(), String> { + self.stop().ok(); + let python_dir = Self::resolve_python_dir()?; + let python_cmd = Self::find_python_command(); + eprintln!( + "[sidecar-rs] Starting dev sidecar: {} -m voice_to_notes.main ({})", + python_cmd, + python_dir.display() + ); + + let child = Command::new(python_cmd) + .arg("-m") + .arg("voice_to_notes.main") + .current_dir(&python_dir) + .env("PYTHONPATH", &python_dir) + .stdin(Stdio::piped()) + .stdout(Stdio::piped()) + .stderr(Stdio::inherit()) + .spawn() + .map_err(|e| format!("Failed to start Python sidecar: {e}"))?; + + self.attach(child)?; + self.wait_for_ready() + } + + /// Take ownership of a spawned child's stdin/stdout and store the process handle. 
+ fn attach(&self, mut child: Child) -> Result<(), String> { let stdin = child.stdin.take().ok_or("Failed to get sidecar stdin")?; let stdout = child.stdout.take().ok_or("Failed to get sidecar stdout")?; let buf_reader = BufReader::new(stdout); @@ -78,10 +186,6 @@ impl SidecarManager { let mut r = self.reader.lock().map_err(|e| e.to_string())?; *r = Some(buf_reader); } - - // Wait for the "ready" message - self.wait_for_ready()?; - Ok(()) } @@ -124,70 +228,6 @@ impl SidecarManager { self.send_and_receive_with_progress(msg, |_| {}) } - /// Send a message and read the response, calling on_progress for each progress message. - pub fn send_and_receive_with_progress( - &self, - msg: &IPCMessage, - on_progress: impl Fn(&IPCMessage), - ) -> Result { - // Write to stdin - { - let mut stdin_guard = self.stdin.lock().map_err(|e| e.to_string())?; - if let Some(ref mut stdin) = *stdin_guard { - let json = serde_json::to_string(msg).map_err(|e| e.to_string())?; - stdin - .write_all(json.as_bytes()) - .map_err(|e| format!("Write error: {e}"))?; - stdin - .write_all(b"\n") - .map_err(|e| format!("Write error: {e}"))?; - stdin.flush().map_err(|e| format!("Flush error: {e}"))?; - } else { - return Err("Sidecar stdin not available".to_string()); - } - } - - // Read from stdout - { - let mut reader_guard = self.reader.lock().map_err(|e| e.to_string())?; - if let Some(ref mut reader) = *reader_guard { - let mut line = String::new(); - loop { - line.clear(); - let bytes_read = reader - .read_line(&mut line) - .map_err(|e| format!("Read error: {e}"))?; - if bytes_read == 0 { - return Err("Sidecar closed stdout".to_string()); - } - let trimmed = line.trim(); - if trimmed.is_empty() { - continue; - } - // Skip non-JSON lines (library output that leaked to stdout) - let response: IPCMessage = match serde_json::from_str(trimmed) { - Ok(msg) => msg, - Err(_) => { - eprintln!( - "[sidecar-rs] Skipping non-JSON line: {}", - &trimmed[..trimmed.len().min(200)] - ); - continue; - } - }; - - if 
response.msg_type == "progress" { - on_progress(&response); - continue; - } - return Ok(response); - } - } else { - Err("Sidecar stdout not available".to_string()) - } - } - } - /// Send a message and receive the response, calling a callback for intermediate messages. /// Intermediate messages include progress, pipeline.segment, and pipeline.speaker_update. pub fn send_and_receive_with_progress( diff --git a/src-tauri/tauri.conf.json b/src-tauri/tauri.conf.json index 52f8401..be4cb48 100644 --- a/src-tauri/tauri.conf.json +++ b/src-tauri/tauri.conf.json @@ -46,7 +46,7 @@ "license": "MIT", "linux": { "deb": { - "depends": ["python3", "python3-pip"] + "depends": [] }, "appimage": { "bundleMediaFramework": true diff --git a/src/lib/components/AIChatPanel.svelte b/src/lib/components/AIChatPanel.svelte index 81a328f..1a53ae0 100644 --- a/src/lib/components/AIChatPanel.svelte +++ b/src/lib/components/AIChatPanel.svelte @@ -1,6 +1,7 @@ @@ -50,8 +70,8 @@
- {#each pipelineSteps as step} - {@const status = getStepStatus(step.key, stage)} + {#each pipelineSteps as step, idx} + {@const status = getStepStatus(idx)}
{#if status === 'done'} diff --git a/src/lib/components/SettingsModal.svelte b/src/lib/components/SettingsModal.svelte index 0b95d09..7f8de81 100644 --- a/src/lib/components/SettingsModal.svelte +++ b/src/lib/components/SettingsModal.svelte @@ -14,6 +14,7 @@ let activeTab = $state<'transcription' | 'speakers' | 'ai' | 'local'>('transcription'); let modelStatus = $state<'idle' | 'downloading' | 'success' | 'error'>('idle'); let modelError = $state(''); + let revealedFields = $state>(new Set()); async function testAndDownloadModel() { if (!localSettings.hf_token) { @@ -111,7 +112,10 @@ {:else if activeTab === 'speakers'}
- +
+ + +

Setup (one-time)

@@ -150,6 +154,23 @@ {#if modelStatus === 'error'}

{modelError}

{/if} +
+ + +

Hint the expected number of speakers to speed up diarization clustering.

+
{#if localSettings.ai_provider === 'openai'}
- +
+ + +
@@ -179,13 +203,27 @@ {:else if localSettings.ai_provider === 'anthropic'}
- +
+ + +
{:else if localSettings.ai_provider === 'litellm'} +
+ + +
+
+ +
+ + +
+
@@ -293,11 +331,36 @@ color: #aaa; margin-bottom: 0.3rem; } + .input-reveal { + display: flex; + gap: 0; + } + .input-reveal input { + flex: 1; + border-top-right-radius: 0; + border-bottom-right-radius: 0; + } + .reveal-btn { + background: #0f3460; + border: 1px solid #4a5568; + border-left: none; + color: #aaa; + padding: 0.5rem 0.6rem; + border-radius: 0 4px 4px 0; + cursor: pointer; + font-size: 0.75rem; + white-space: nowrap; + } + .reveal-btn:hover { + color: #e0e0e0; + background: #1a4a7a; + } .field input, .field select { width: 100%; background: #1a1a2e; color: #e0e0e0; + color-scheme: dark; border: 1px solid #4a5568; border-radius: 4px; padding: 0.5rem; diff --git a/src/lib/components/WaveformPlayer.svelte b/src/lib/components/WaveformPlayer.svelte index ae45220..4ffeda9 100644 --- a/src/lib/components/WaveformPlayer.svelte +++ b/src/lib/components/WaveformPlayer.svelte @@ -13,6 +13,7 @@ let container: HTMLDivElement; let wavesurfer: WaveSurfer | null = $state(null); let isReady = $state(false); + let isLoading = $state(false); let currentTime = $state('0:00'); let totalTime = $state('0:00'); @@ -32,6 +33,7 @@ barWidth: 2, barGap: 1, barRadius: 2, + backend: 'WebAudio', }); wavesurfer.on('timeupdate', (time: number) => { @@ -41,6 +43,7 @@ wavesurfer.on('ready', () => { isReady = true; + isLoading = false; const dur = wavesurfer!.getDuration(); durationMs.set(Math.round(dur * 1000)); totalTime = formatTime(dur); @@ -55,7 +58,7 @@ }); if (audioUrl) { - wavesurfer.load(audioUrl); + loadAudio(audioUrl); } }); @@ -89,16 +92,13 @@ console.warn('[voice-to-notes] seekTo ignored — audio not ready yet'); return; } - const timeSec = timeMs / 1000; - wavesurfer.setTime(timeSec); - if (!wavesurfer.isPlaying()) { - wavesurfer.play(); - } + wavesurfer.setTime(timeMs / 1000); } /** Load a new audio file. 
*/ export function loadAudio(url: string) { isReady = false; + isLoading = true; wavesurfer?.load(url); } diff --git a/src/lib/stores/settings.ts b/src/lib/stores/settings.ts index 9eab830..86262c9 100644 --- a/src/lib/stores/settings.ts +++ b/src/lib/stores/settings.ts @@ -8,6 +8,8 @@ export interface AppSettings { openai_model: string; anthropic_model: string; litellm_model: string; + litellm_api_key: string; + litellm_api_base: string; local_model_path: string; local_binary_path: string; transcription_model: string; @@ -15,6 +17,7 @@ export interface AppSettings { transcription_language: string; skip_diarization: boolean; hf_token: string; + num_speakers: number | null; } const defaults: AppSettings = { @@ -24,6 +27,8 @@ const defaults: AppSettings = { openai_model: 'gpt-4o-mini', anthropic_model: 'claude-sonnet-4-6', litellm_model: 'gpt-4o-mini', + litellm_api_key: '', + litellm_api_base: '', local_model_path: '', local_binary_path: 'llama-server', transcription_model: 'base', @@ -31,6 +36,7 @@ const defaults: AppSettings = { transcription_language: '', skip_diarization: false, hf_token: '', + num_speakers: null, }; export const settings = writable({ ...defaults }); @@ -47,4 +53,20 @@ export async function loadSettings(): Promise { export async function saveSettings(s: AppSettings): Promise { settings.set(s); await invoke('save_settings', { settings: s }); + + // Configure the AI provider in the Python sidecar + const configMap: Record> = { + openai: { api_key: s.openai_api_key, model: s.openai_model }, + anthropic: { api_key: s.anthropic_api_key, model: s.anthropic_model }, + litellm: { api_key: s.litellm_api_key, api_base: s.litellm_api_base, model: s.litellm_model }, + local: { model: s.local_model_path, base_url: 'http://localhost:8080' }, + }; + const config = configMap[s.ai_provider]; + if (config) { + try { + await invoke('ai_configure', { provider: s.ai_provider, config }); + } catch { + // Sidecar may not be running yet — provider will be configured on 
first use + } + } } diff --git a/src/routes/+page.svelte b/src/routes/+page.svelte index 02efd76..5e0f382 100644 --- a/src/routes/+page.svelte +++ b/src/routes/+page.svelte @@ -13,6 +13,7 @@ import type { Segment, Speaker } from '$lib/types/transcript'; import { onMount, tick } from 'svelte'; + let appReady = $state(false); let waveformPlayer: WaveformPlayer; let audioUrl = $state(''); let showSettings = $state(false); @@ -54,6 +55,8 @@ document.addEventListener('keydown', handleKeyDown); document.addEventListener('click', handleClickOutside); + appReady = true; + return () => { document.removeEventListener('keydown', handleKeyDown); document.removeEventListener('click', handleClickOutside); @@ -200,6 +203,7 @@ language: $settings.transcription_language || undefined, skipDiarization: $settings.skip_diarization || undefined, hfToken: $settings.hf_token || undefined, + numSpeakers: $settings.num_speakers && $settings.num_speakers > 0 ? $settings.num_speakers : undefined, }); // Create speaker entries from pipeline result @@ -303,60 +307,70 @@ } -
-

Voice to Notes

-
- - - {#if $segments.length > 0} -
- - {#if showExportMenu} -
- {#each exportFormats as fmt} - - {/each} -
+{#if !appReady} +
+

Voice to Notes

+

Loading...

+
+
+{:else} +
+
+

Voice to Notes

+
+
- {/if} + + + {#if $segments.length > 0} +
+ + {#if showExportMenu} +
+ {#each exportFormats as fmt} + + {/each} +
+ {/if} +
+ {/if} +
-
-
-
- - +
+
+ + +
+
- -
- + - showSettings = false} -/> + showSettings = false} + /> +{/if}