diff --git a/.claude/settings.local.json b/.claude/settings.local.json new file mode 100644 index 0000000..0d775f8 --- /dev/null +++ b/.claude/settings.local.json @@ -0,0 +1,21 @@ +{ + "permissions": { + "allow": [ + "Bash(git init:*)", + "Bash(git:*)", + "WebSearch", + "Bash(npm create:*)", + "Bash(cp:*)", + "Bash(npm install:*)", + "Bash(/home/jknapp/.cargo/bin/cargo test:*)", + "Bash(ruff:*)", + "Bash(npm run:*)", + "Bash(npx svelte-check:*)", + "Bash(pip install:*)", + "Bash(python3:*)", + "Bash(/home/jknapp/.cargo/bin/cargo check:*)", + "Bash(cargo check:*)", + "Bash(npm ls:*)" + ] + } +} diff --git a/.claude/worktrees/agent-a0bd87d1 b/.claude/worktrees/agent-a0bd87d1 new file mode 160000 index 0000000..67ed69d --- /dev/null +++ b/.claude/worktrees/agent-a0bd87d1 @@ -0,0 +1 @@ +Subproject commit 67ed69df00019859232c9f8b9978d50c65a0dcb3 diff --git a/.claude/worktrees/agent-a198b5f8 b/.claude/worktrees/agent-a198b5f8 new file mode 160000 index 0000000..6eb13bc --- /dev/null +++ b/.claude/worktrees/agent-a198b5f8 @@ -0,0 +1 @@ +Subproject commit 6eb13bce63b0b6ffe325b52734f54851b4090fb7 diff --git a/.claude/worktrees/agent-ad3d6fca b/.claude/worktrees/agent-ad3d6fca new file mode 160000 index 0000000..03af5a1 --- /dev/null +++ b/.claude/worktrees/agent-ad3d6fca @@ -0,0 +1 @@ +Subproject commit 03af5a189cbeb1ab876031fa399159beb29f59f7 diff --git a/.claude/worktrees/agent-aefe2597 b/.claude/worktrees/agent-aefe2597 new file mode 160000 index 0000000..16f4b57 --- /dev/null +++ b/.claude/worktrees/agent-aefe2597 @@ -0,0 +1 @@ +Subproject commit 16f4b5777126807e9aeca4e53b8f17f300c8ddeb diff --git a/.gitea/workflows/build.yml b/.gitea/workflows/build.yml new file mode 100644 index 0000000..bc9e21a --- /dev/null +++ b/.gitea/workflows/build.yml @@ -0,0 +1,179 @@ +name: Build & Release + +on: + push: + branches: [main] + tags: ["v*"] + pull_request: + branches: [main] + +env: + PYTHON_VERSION: "3.11" + NODE_VERSION: "20" + +jobs: + build-sidecar: + name: Build sidecar 
(${{ matrix.target }}) + runs-on: ${{ matrix.runner }} + strategy: + fail-fast: false + matrix: + include: + - runner: ubuntu-latest + target: x86_64-unknown-linux-gnu + platform: linux + - runner: windows-latest + target: x86_64-pc-windows-msvc + platform: windows + - runner: macos-latest + target: aarch64-apple-darwin + platform: macos + + steps: + - uses: actions/checkout@v4 + + - name: Set up Python + uses: actions/setup-python@v5 + with: + python-version: ${{ env.PYTHON_VERSION }} + + - name: Build sidecar + working-directory: python + run: python build_sidecar.py --cpu-only + + - name: Upload sidecar artifact + uses: actions/upload-artifact@v4 + with: + name: sidecar-${{ matrix.target }} + path: python/dist/voice-to-notes-sidecar/ + retention-days: 7 + + build-tauri: + name: Build app (${{ matrix.target }}) + needs: build-sidecar + runs-on: ${{ matrix.runner }} + strategy: + fail-fast: false + matrix: + include: + - runner: ubuntu-latest + target: x86_64-unknown-linux-gnu + platform: linux + - runner: windows-latest + target: x86_64-pc-windows-msvc + platform: windows + - runner: macos-latest + target: aarch64-apple-darwin + platform: macos + + steps: + - uses: actions/checkout@v4 + + - name: Set up Node.js + uses: actions/setup-node@v4 + with: + node-version: ${{ env.NODE_VERSION }} + # Note: 'cache: npm' requires the Gitea instance to have + # Actions cache configured. Remove this if caching is unavailable. 
+ cache: npm + + - name: Install Rust stable + uses: dtolnay/rust-toolchain@stable + + - name: Install system dependencies (Linux) + if: matrix.platform == 'linux' + run: | + sudo apt-get update + sudo apt-get install -y libgtk-3-dev libwebkit2gtk-4.1-dev libappindicator3-dev librsvg2-dev patchelf + + - name: Download sidecar artifact + uses: actions/download-artifact@v4 + with: + name: sidecar-${{ matrix.target }} + path: src-tauri/binaries/ + + - name: Make sidecar executable (Unix) + if: matrix.platform != 'windows' + run: chmod +x src-tauri/binaries/voice-to-notes-sidecar-${{ matrix.target }} + + - name: Install npm dependencies + run: npm ci + + - name: Build Tauri app + run: npm run tauri build + env: + TAURI_SIGNING_PRIVATE_KEY: ${{ secrets.TAURI_SIGNING_PRIVATE_KEY }} + TAURI_CONFIG: '{"bundle":{"externalBin":["binaries/voice-to-notes-sidecar"]}}' + + - name: Upload app artifacts (Linux) + if: matrix.platform == 'linux' + uses: actions/upload-artifact@v4 + with: + name: app-${{ matrix.target }} + path: | + src-tauri/target/release/bundle/deb/*.deb + src-tauri/target/release/bundle/appimage/*.AppImage + retention-days: 30 + + - name: Upload app artifacts (Windows) + if: matrix.platform == 'windows' + uses: actions/upload-artifact@v4 + with: + name: app-${{ matrix.target }} + path: | + src-tauri/target/release/bundle/msi/*.msi + src-tauri/target/release/bundle/nsis/*.exe + retention-days: 30 + + - name: Upload app artifacts (macOS) + if: matrix.platform == 'macos' + uses: actions/upload-artifact@v4 + with: + name: app-${{ matrix.target }} + path: | + src-tauri/target/release/bundle/dmg/*.dmg + src-tauri/target/release/bundle/macos/*.app + retention-days: 30 + + release: + name: Create Release + needs: build-tauri + if: github.ref == 'refs/heads/main' + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + + - name: Download all app artifacts + uses: actions/download-artifact@v4 + with: + path: artifacts/ + pattern: app-* + + - name: Generate release 
# Build pipeline for GitHub Actions.
#
# Two matrix jobs run per target triple:
#   1. build-sidecar — freezes the Python sidecar via python/build_sidecar.py
#      and uploads the resulting directory as the artifact `sidecar-<target>`.
#   2. build-tauri   — downloads that artifact into src-tauri/binaries/ and
#      builds the Tauri bundles, uploading them as `app-<target>`.
name: Build & Release

on:
  push:
    branches: [main]
    tags: ["v*"]
  pull_request:
    branches: [main]
  workflow_dispatch:

env:
  PYTHON_VERSION: "3.11"
  NODE_VERSION: "20"

jobs:
  build-sidecar:
    name: Build sidecar (${{ matrix.target }})
    runs-on: ${{ matrix.runner }}
    strategy:
      fail-fast: false
      matrix:
        include:
          # Keep the Linux runner identical to the one used by build-tauri so
          # the frozen sidecar and the app are linked against the same distro.
          - runner: ubuntu-22.04
            target: x86_64-unknown-linux-gnu
            platform: linux
          - runner: windows-latest
            target: x86_64-pc-windows-msvc
            platform: windows
          - runner: macos-13
            target: x86_64-apple-darwin
            platform: macos-intel
          - runner: macos-14
            target: aarch64-apple-darwin
            platform: macos-arm

    steps:
      - uses: actions/checkout@v4

      - name: Set up Python
        uses: actions/setup-python@v5
        with:
          python-version: ${{ env.PYTHON_VERSION }}

      - name: Build sidecar
        working-directory: python
        run: python build_sidecar.py --cpu-only

      - name: Upload sidecar artifact
        uses: actions/upload-artifact@v4
        with:
          name: sidecar-${{ matrix.target }}
          path: python/dist/voice-to-notes-sidecar/
          retention-days: 7

  build-tauri:
    name: Build app (${{ matrix.target }})
    needs: build-sidecar
    runs-on: ${{ matrix.runner }}
    strategy:
      fail-fast: false
      matrix:
        include:
          # ubuntu-22.04 (not 20.04): the apt step below installs
          # libwebkit2gtk-4.1-dev, which is not packaged for Ubuntu 20.04.
          - runner: ubuntu-22.04
            target: x86_64-unknown-linux-gnu
            platform: linux
          - runner: windows-latest
            target: x86_64-pc-windows-msvc
            platform: windows
          - runner: macos-13
            target: x86_64-apple-darwin
            platform: macos-intel
          - runner: macos-14
            target: aarch64-apple-darwin
            platform: macos-arm

    steps:
      - uses: actions/checkout@v4

      - name: Set up Node.js
        uses: actions/setup-node@v4
        with:
          node-version: ${{ env.NODE_VERSION }}
          cache: npm

      - name: Install Rust stable
        uses: dtolnay/rust-toolchain@stable

      - name: Install system dependencies (Linux)
        if: matrix.platform == 'linux'
        run: |
          sudo apt-get update
          sudo apt-get install -y libgtk-3-dev libwebkit2gtk-4.1-dev libappindicator3-dev librsvg2-dev patchelf

      - name: Download sidecar artifact
        uses: actions/download-artifact@v4
        with:
          name: sidecar-${{ matrix.target }}
          path: src-tauri/binaries/

      # Restore the executable bit on the sidecar after the artifact download.
      - name: Make sidecar executable (Unix)
        if: matrix.platform != 'windows'
        run: chmod +x src-tauri/binaries/voice-to-notes-sidecar-${{ matrix.target }}

      - name: Install npm dependencies
        run: npm ci

      - name: Build Tauri app
        run: npm run tauri build
        env:
          TAURI_SIGNING_PRIVATE_KEY: ${{ secrets.TAURI_SIGNING_PRIVATE_KEY }}
          # Injects the sidecar as an external binary at build time.
          TAURI_CONFIG: '{"bundle":{"externalBin":["binaries/voice-to-notes-sidecar"]}}'

      - name: Upload app artifacts (Linux)
        if: matrix.platform == 'linux'
        uses: actions/upload-artifact@v4
        with:
          name: app-${{ matrix.target }}
          path: |
            src-tauri/target/release/bundle/deb/*.deb
            src-tauri/target/release/bundle/appimage/*.AppImage
          retention-days: 30

      - name: Upload app artifacts (Windows)
        if: matrix.platform == 'windows'
        uses: actions/upload-artifact@v4
        with:
          name: app-${{ matrix.target }}
          path: |
            src-tauri/target/release/bundle/msi/*.msi
            src-tauri/target/release/bundle/nsis/*.exe
          retention-days: 30

      # Matches both macOS matrix legs (macos-intel and macos-arm).
      - name: Upload app artifacts (macOS)
        if: startsWith(matrix.platform, 'macos')
        uses: actions/upload-artifact@v4
        with:
          name: app-${{ matrix.target }}
          path: |
            src-tauri/target/release/bundle/dmg/*.dmg
            src-tauri/target/release/bundle/macos/*.app
          retention-days: 30
Runs local - **ML pipeline:** Python sidecar process (faster-whisper, pyannote.audio, wav2vec2) - **Database:** SQLite (via rusqlite in Rust) - **Local AI:** Bundled llama-server (llama.cpp) — default, no install needed -- **Cloud AI providers:** LiteLLM, OpenAI, Anthropic (optional, user-configured) +- **Cloud AI providers:** OpenAI, Anthropic, OpenAI-compatible endpoints (optional, user-configured) - **Caption export:** pysubs2 (Python) - **Audio UI:** wavesurfer.js - **Transcript editor:** TipTap (ProseMirror) @@ -40,7 +40,13 @@ docs/ # Architecture and design documents - Database: UUIDs as primary keys (TEXT type in SQLite) - All timestamps in milliseconds (integer) relative to media file start +## Distribution +- Python sidecar is frozen via PyInstaller into a standalone binary for distribution +- Tauri bundles the sidecar via `externalBin` — no Python required for end users +- CI/CD builds on Gitea Actions (Linux, Windows, macOS ARM) +- Dev mode uses system Python (`VOICE_TO_NOTES_DEV=1` or debug builds) + ## Platform Targets -- Linux (primary development target) -- Windows (must work, tested before release) -- macOS (future, not yet targeted) +- Linux x86_64 (primary development target) +- Windows x86_64 +- macOS aarch64 (Apple Silicon) diff --git a/README.md b/README.md index 740f612..350a55d 100644 --- a/README.md +++ b/README.md @@ -2,28 +2,90 @@ A desktop application that transcribes audio/video recordings with speaker identification, producing editable transcriptions with synchronized audio playback. 
-## Goals +## Features -- **Speech-to-Text Transcription** — Accurately convert spoken audio from recordings into text -- **Speaker Identification (Diarization)** — Detect and distinguish between different speakers in a conversation -- **Speaker Naming** — Assign and persist speaker names/IDs across the transcription -- **Synchronized Playback** — Click any transcribed text segment to play back the corresponding audio for review and correction -- **Export Formats** - - Closed captioning files (SRT, VTT) for video - - Plain text documents with speaker labels -- **AI Integration** — Connect to AI providers to ask questions about the conversation and generate condensed notes/summaries +- **Speech-to-Text Transcription** — Accurate transcription via faster-whisper (Whisper models) with word-level timestamps +- **Speaker Identification (Diarization)** — Detect and distinguish between speakers using pyannote.audio +- **Synchronized Playback** — Click any word to seek to that point in the audio (Web Audio API for instant playback) +- **AI Integration** — Ask questions about your transcript via OpenAI, Anthropic, or any OpenAI-compatible API (LiteLLM proxies, Ollama, vLLM) +- **Export Formats** — SRT, WebVTT, ASS captions, plain text, and Markdown with speaker labels +- **Cross-Platform** — Builds for Linux, Windows, and macOS (Apple Silicon) ## Platform Support -| Platform | Status | -|----------|--------| -| Linux | Planned (initial target) | -| Windows | Planned (initial target) | -| macOS | Future (pending hardware) | +| Platform | Architecture | Status | +|----------|-------------|--------| +| Linux | x86_64 | Supported | +| Windows | x86_64 | Supported | +| macOS | ARM (Apple Silicon) | Supported | -## Project Status +## Tech Stack -**Early planning phase** — Architecture and technology decisions in progress. 
+- **Desktop shell:** Tauri v2 (Rust backend + Svelte 5 / TypeScript frontend) +- **ML pipeline:** Python sidecar (faster-whisper, pyannote.audio) — frozen via PyInstaller for distribution +- **Audio playback:** wavesurfer.js with Web Audio API backend +- **AI providers:** OpenAI, Anthropic, OpenAI-compatible endpoints (local or remote) +- **Local AI:** Bundled llama-server (llama.cpp) +- **Caption export:** pysubs2 + +## Development + +### Prerequisites + +- Node.js 20+ +- Rust (stable) +- Python 3.11+ with ML dependencies +- System: `libgtk-3-dev`, `libwebkit2gtk-4.1-dev` (Linux) + +### Getting Started + +```bash +# Install frontend dependencies +npm install + +# Install Python sidecar dependencies +cd python && pip install -e . && cd .. + +# Run in dev mode (uses system Python for the sidecar) +npm run tauri:dev +``` + +### Building for Distribution + +```bash +# Build the frozen Python sidecar +npm run sidecar:build + +# Build the Tauri app (requires sidecar in src-tauri/binaries/) +npm run tauri build +``` + +### CI/CD + +Gitea Actions workflows are in `.gitea/workflows/`. The build pipeline: + +1. **Build sidecar** — PyInstaller-frozen Python binary per platform (CPU-only PyTorch) +2. **Build Tauri app** — Bundles the sidecar via `externalBin`, produces .deb/.AppImage (Linux), .msi (Windows), .dmg (macOS) + +#### Required Secrets + +| Secret | Purpose | Required? | +|--------|---------|-----------| +| `TAURI_SIGNING_PRIVATE_KEY` | Signs Tauri update bundles | Optional (for auto-updates) | + +No other secrets are needed for building. AI provider API keys and HuggingFace tokens are configured by end users in the app's Settings. 
+ +### Project Structure + +``` +src/ # Svelte 5 frontend +src-tauri/ # Rust backend (Tauri commands, sidecar manager, SQLite) +python/ # Python sidecar (transcription, diarization, AI) + voice_to_notes/ # Python package + build_sidecar.py # PyInstaller build script + voice_to_notes.spec # PyInstaller spec +.gitea/workflows/ # Gitea Actions CI/CD +``` ## License diff --git a/package.json b/package.json index 543d205..3a5ca97 100644 --- a/package.json +++ b/package.json @@ -11,7 +11,9 @@ "check:watch": "svelte-kit sync && svelte-check --tsconfig ./tsconfig.json --watch", "lint": "eslint .", "test": "vitest", - "tauri": "tauri" + "tauri": "tauri", + "tauri:dev": "VOICE_TO_NOTES_DEV=1 tauri dev", + "sidecar:build": "cd python && python3 build_sidecar.py" }, "license": "MIT", "dependencies": { diff --git a/python/build_sidecar.py b/python/build_sidecar.py new file mode 100644 index 0000000..3b855ad --- /dev/null +++ b/python/build_sidecar.py @@ -0,0 +1,215 @@ +#!/usr/bin/env python3 +"""Build the Voice to Notes sidecar as a standalone binary using PyInstaller. + +Usage: + python build_sidecar.py [--cpu-only] + +Produces a directory `dist/voice-to-notes-sidecar/` containing the frozen +sidecar binary and all dependencies. The main binary is renamed to include +the Tauri target triple for externalBin resolution. 
def get_target_triple() -> str:
    """Return the Tauri-style target triple describing the host platform.

    The CPU name reported by ``platform.machine()`` is normalised to either
    ``x86_64`` or ``aarch64`` when it is one of the known spellings; any other
    value is used as-is (lower-cased). Linux, macOS and Windows map to their
    conventional vendor/OS suffixes; every other system falls back to
    ``<arch>-unknown-<system>``.
    """
    cpu = platform.machine().lower()
    os_name = platform.system().lower()

    # Collapse the alternative spellings different OSes use for the same CPU.
    if cpu in ("x86_64", "amd64"):
        cpu = "x86_64"
    elif cpu in ("aarch64", "arm64"):
        cpu = "aarch64"

    suffix_by_os = {
        "linux": "unknown-linux-gnu",
        "darwin": "apple-darwin",
        "windows": "pc-windows-msvc",
    }
    suffix = suffix_by_os.get(os_name)
    if suffix is None:
        # Unknown OS: keep its lower-cased name in the triple.
        suffix = f"unknown-{os_name}"
    return f"{cpu}-{suffix}"
def download_ffmpeg(output_dir: Path) -> None:
    """Download static ffmpeg/ffprobe binaries for the current platform.

    Looks up the archive URL for ``"<sys.platform>-<arch>"`` in ``FFMPEG_URLS``,
    downloads it to a temporary file inside ``output_dir`` and copies every
    ``ffmpeg``/``ffprobe`` executable found in the archive directly into
    ``output_dir`` (flattening any directory structure in the archive).

    The step is best-effort: an unsupported platform, a failed download, an
    unrecognised archive or an archive lacking one of the tools only prints a
    ``[build] Warning`` line; no exception is propagated to the caller.

    Args:
        output_dir: Existing directory that receives the binaries.
    """
    import tarfile  # not among the module-level imports; kept function-local

    machine = platform.machine().lower()
    if machine in ("amd64", "x86_64"):
        machine = "x86_64"
    elif machine in ("aarch64", "arm64"):
        machine = "arm64"

    key = f"{sys.platform}-{machine}"
    url = FFMPEG_URLS.get(key)
    if not url:
        print(f"[build] Warning: No ffmpeg download URL for platform {key}, skipping")
        return

    wanted = {"ffmpeg", "ffprobe", "ffmpeg.exe", "ffprobe.exe"}
    extracted = set()

    def _install(basename, stream) -> None:
        """Copy one archive member into output_dir and mark it executable."""
        dest = output_dir / basename
        with open(dest, "wb") as out:
            shutil.copyfileobj(stream, out)
        if sys.platform != "win32":
            dest.chmod(dest.stat().st_mode | stat.S_IEXEC)
        extracted.add(basename)

    print(f"[build] Downloading ffmpeg for {key}")
    tmp_path = output_dir / "ffmpeg_download"
    try:
        urllib.request.urlretrieve(url, str(tmp_path))

        # Detect the archive type from the file content rather than the URL
        # suffix: the macOS entries in FFMPEG_URLS end in "/zip", not ".zip",
        # so a suffix check silently skipped extraction for them.
        if tarfile.is_tarfile(str(tmp_path)):
            with tarfile.open(str(tmp_path), "r:*") as tar:
                for member in tar.getmembers():
                    basename = os.path.basename(member.name)
                    # Only regular files: never follow links or create dirs.
                    if not member.isfile() or basename not in wanted:
                        continue
                    stream = tar.extractfile(member)
                    if stream is None:
                        continue
                    with stream:
                        _install(basename, stream)
        elif zipfile.is_zipfile(str(tmp_path)):
            with zipfile.ZipFile(str(tmp_path), "r") as zf:
                for info in zf.infolist():
                    basename = os.path.basename(info.filename)
                    if basename not in wanted:
                        continue
                    with zf.open(info) as stream:
                        _install(basename, stream)
        else:
            raise ValueError(f"unrecognized archive format from {url}")

        # Report honestly: success only when both tools were actually found.
        tools = {os.path.splitext(name)[0] for name in extracted}
        missing = [tool for tool in ("ffmpeg", "ffprobe") if tool not in tools]
        if missing:
            print(
                "[build] Warning: ffmpeg archive did not provide: "
                + ", ".join(missing)
            )
        else:
            print("[build] ffmpeg downloaded successfully")
    except Exception as e:
        # Deliberately non-fatal: the sidecar build continues without ffmpeg.
        print(f"[build] Warning: Failed to download ffmpeg: {e}")
    finally:
        if tmp_path.exists():
            tmp_path.unlink()
run_pyinstaller(python) + download_ffmpeg(output_dir) + rename_binary(output_dir, target_triple) + + print(f"\n[build] Done! Sidecar built at: {output_dir}") + print(f"[build] Copy contents to src-tauri/binaries/ for Tauri bundling") + + +if __name__ == "__main__": + main() diff --git a/python/pyproject.toml b/python/pyproject.toml index 62f118a..d7f3f8f 100644 --- a/python/pyproject.toml +++ b/python/pyproject.toml @@ -13,6 +13,8 @@ dependencies = [ "faster-whisper>=1.1.0", "pyannote.audio>=3.1.0", "pysubs2>=1.7.0", + "openai>=1.0.0", + "anthropic>=0.20.0", ] [project.optional-dependencies] @@ -20,6 +22,7 @@ dev = [ "ruff>=0.8.0", "pytest>=8.0.0", "pytest-asyncio>=0.24.0", + "pyinstaller>=6.0", ] [tool.ruff] diff --git a/python/tests/test_diarize.py b/python/tests/test_diarize.py index 3ceccc6..b3f75f0 100644 --- a/python/tests/test_diarize.py +++ b/python/tests/test_diarize.py @@ -1,7 +1,13 @@ """Tests for diarization service data structures and payload conversion.""" +import time +from unittest.mock import MagicMock, patch + +import pytest + from voice_to_notes.services.diarize import ( DiarizationResult, + DiarizeService, SpeakerSegment, diarization_to_payload, ) @@ -31,3 +37,74 @@ def test_diarization_to_payload_empty(): assert payload["num_speakers"] == 0 assert payload["speaker_segments"] == [] assert payload["speakers"] == [] + + +def test_diarize_threading_progress(monkeypatch): + """Test that diarization emits progress while running in background thread.""" + # Track written messages + written_messages = [] + def mock_write(msg): + written_messages.append(msg) + + # Mock pipeline that takes ~5 seconds + def slow_pipeline(file_path, **kwargs): + time.sleep(5) + # Return a mock diarization result (use spec=object to prevent + # hasattr returning True for speaker_diarization) + mock_result = MagicMock(spec=[]) + mock_track = MagicMock() + mock_track.start = 0.0 + mock_track.end = 5.0 + mock_result.itertracks = MagicMock(return_value=[(mock_track, None, 
"SPEAKER_00")]) + return mock_result + + mock_pipeline_obj = MagicMock() + mock_pipeline_obj.side_effect = slow_pipeline + + service = DiarizeService() + service._pipeline = mock_pipeline_obj + + with patch("voice_to_notes.services.diarize.write_message", mock_write): + result = service.diarize( + request_id="req-1", + file_path="/fake/audio.wav", + audio_duration_sec=60.0, + ) + + # Filter for diarizing progress messages (not loading_diarization or done) + diarizing_msgs = [ + m for m in written_messages + if m.type == "progress" and m.payload.get("stage") == "diarizing" + and "elapsed" in m.payload.get("message", "") + ] + + # Should have at least 1 progress message (5s sleep / 2s interval = ~2 messages) + assert len(diarizing_msgs) >= 1, ( + f"Expected at least 1 diarizing progress message, got {len(diarizing_msgs)}" + ) + + # Progress percent should be between 20 and 85 + for msg in diarizing_msgs: + pct = msg.payload["percent"] + assert 20 <= pct <= 85, f"Progress {pct} out of expected range 20-85" + + # Result should be valid + assert result.num_speakers == 1 + assert result.speakers == ["SPEAKER_00"] + + +def test_diarize_threading_error_propagation(monkeypatch): + """Test that errors from the background thread are properly raised.""" + mock_pipeline_obj = MagicMock() + mock_pipeline_obj.side_effect = RuntimeError("Pipeline crashed") + + service = DiarizeService() + service._pipeline = mock_pipeline_obj + + with patch("voice_to_notes.services.diarize.write_message", lambda m: None): + with pytest.raises(RuntimeError, match="Pipeline crashed"): + service.diarize( + request_id="req-1", + file_path="/fake/audio.wav", + audio_duration_sec=30.0, + ) diff --git a/python/tests/test_messages.py b/python/tests/test_messages.py index 12b71db..eb2a75d 100644 --- a/python/tests/test_messages.py +++ b/python/tests/test_messages.py @@ -3,8 +3,10 @@ from voice_to_notes.ipc.messages import ( IPCMessage, error_message, + partial_segment_message, progress_message, 
ready_message, + speaker_update_message, ) @@ -48,3 +50,16 @@ def test_ready_message(): assert msg.type == "ready" assert msg.id == "system" assert "version" in msg.payload + + +def test_partial_segment_message(): + msg = partial_segment_message("req-1", {"index": 0, "text": "hello"}) + assert msg.type == "pipeline.segment" + assert msg.payload["index"] == 0 + assert msg.payload["text"] == "hello" + + +def test_speaker_update_message(): + msg = speaker_update_message("req-1", [{"index": 0, "speaker": "SPEAKER_00"}]) + assert msg.type == "pipeline.speaker_update" + assert msg.payload["updates"][0]["speaker"] == "SPEAKER_00" diff --git a/python/tests/test_pipeline.py b/python/tests/test_pipeline.py index a1d3fef..33789aa 100644 --- a/python/tests/test_pipeline.py +++ b/python/tests/test_pipeline.py @@ -88,3 +88,18 @@ def test_merge_results_no_speaker_segments(): result = service._merge_results(transcription, []) assert result.segments[0].speaker is None + + +def test_speaker_update_generation(): + """Test that speaker updates are generated after merge.""" + result = PipelineResult( + segments=[ + PipelineSegment(text="Hello", start_ms=0, end_ms=1000, speaker="SPEAKER_00"), + PipelineSegment(text="World", start_ms=1000, end_ms=2000, speaker="SPEAKER_01"), + PipelineSegment(text="Foo", start_ms=2000, end_ms=3000, speaker=None), + ], + ) + updates = [{"index": i, "speaker": seg.speaker} for i, seg in enumerate(result.segments) if seg.speaker] + assert len(updates) == 2 + assert updates[0] == {"index": 0, "speaker": "SPEAKER_00"} + assert updates[1] == {"index": 1, "speaker": "SPEAKER_01"} diff --git a/python/tests/test_protocol.py b/python/tests/test_protocol.py index 95a588a..688c41f 100644 --- a/python/tests/test_protocol.py +++ b/python/tests/test_protocol.py @@ -5,16 +5,23 @@ import json from voice_to_notes.ipc.messages import IPCMessage from voice_to_notes.ipc.protocol import read_message, write_message +import voice_to_notes.ipc.protocol as protocol -def 
test_write_message(capsys): - msg = IPCMessage(id="req-1", type="pong", payload={"ok": True}) - write_message(msg) - captured = capsys.readouterr() - parsed = json.loads(captured.out.strip()) - assert parsed["id"] == "req-1" - assert parsed["type"] == "pong" - assert parsed["payload"]["ok"] is True +def test_write_message(): + buf = io.StringIO() + # Temporarily replace the IPC output stream + old_out = protocol._ipc_out + protocol._ipc_out = buf + try: + msg = IPCMessage(id="req-1", type="pong", payload={"ok": True}) + write_message(msg) + parsed = json.loads(buf.getvalue().strip()) + assert parsed["id"] == "req-1" + assert parsed["type"] == "pong" + assert parsed["payload"]["ok"] is True + finally: + protocol._ipc_out = old_out def test_read_message(monkeypatch): diff --git a/python/tests/test_transcribe.py b/python/tests/test_transcribe.py index b9e4220..4365c36 100644 --- a/python/tests/test_transcribe.py +++ b/python/tests/test_transcribe.py @@ -1,7 +1,10 @@ """Tests for transcription service.""" +import inspect + from voice_to_notes.services.transcribe import ( SegmentResult, + TranscribeService, TranscriptionResult, WordResult, result_to_payload, @@ -49,3 +52,149 @@ def test_result_to_payload_empty(): assert payload["segments"] == [] assert payload["language"] == "" assert payload["duration_ms"] == 0 + + +def test_on_segment_callback(): + """Test that on_segment callback is invoked with correct SegmentResult and index.""" + callback_args = [] + + def mock_callback(seg: SegmentResult, index: int): + callback_args.append((seg.text, index)) + + # Test that passing on_segment doesn't break the function signature + # (Full integration test would require mocking WhisperModel) + service = TranscribeService() + # Verify the parameter exists by checking the signature + sig = inspect.signature(service.transcribe) + assert "on_segment" in sig.parameters + + +def test_progress_every_segment(monkeypatch): + """Verify a progress message is sent for every segment, not just 
every 5th.""" + from unittest.mock import MagicMock, patch + from voice_to_notes.services.transcribe import TranscribeService + + # Mock WhisperModel + mock_model = MagicMock() + + # Create mock segments (8 of them to test > 5) + mock_segments = [] + for i in range(8): + seg = MagicMock() + seg.start = i * 1.0 + seg.end = (i + 1) * 1.0 + seg.text = f"Segment {i}" + seg.words = [] + mock_segments.append(seg) + + # Mock info object + mock_info = MagicMock() + mock_info.language = "en" + mock_info.language_probability = 0.99 + mock_info.duration = 8.0 + + mock_model.transcribe.return_value = (iter(mock_segments), mock_info) + + # Track write_message calls + written_messages = [] + + def mock_write(msg): + written_messages.append(msg) + + service = TranscribeService() + service._model = mock_model + service._current_model_name = "base" + service._current_device = "cpu" + service._current_compute_type = "int8" + + with patch("voice_to_notes.services.transcribe.write_message", mock_write): + service.transcribe("req-1", "/fake/audio.wav") + + # Filter for "transcribing" stage progress messages + transcribing_msgs = [ + m for m in written_messages + if m.type == "progress" and m.payload.get("stage") == "transcribing" + ] + + # Should have one per segment (8) + the initial "Starting transcription..." message + # The initial "Starting transcription..." 
is also stage "transcribing" — so 8 + 1 = 9 + assert len(transcribing_msgs) >= 8, ( + f"Expected at least 8 transcribing progress messages (one per segment), got {len(transcribing_msgs)}" + ) + + +def test_chunk_report_size_progress(): + """Test CHUNK_REPORT_SIZE progress emission.""" + from voice_to_notes.services.transcribe import CHUNK_REPORT_SIZE + assert CHUNK_REPORT_SIZE == 10 + + +def test_transcribe_chunked_with_mocked_ffmpeg(monkeypatch): + """Test transcribe_chunked with mocked ffmpeg/ffprobe and mocked WhisperModel.""" + from unittest.mock import MagicMock, patch + from voice_to_notes.services.transcribe import TranscribeService, SegmentResult, WordResult + + # Mock subprocess.run for ffprobe (returns duration of 700s = ~2 chunks at 300s each) + original_run = __import__("subprocess").run + + def mock_subprocess_run(cmd, **kwargs): + if "ffprobe" in cmd: + result = MagicMock() + result.stdout = "700.0\n" + result.returncode = 0 + return result + elif "ffmpeg" in cmd: + # Create an empty temp file (simulate chunk extraction) + # The output file is the last argument + import pathlib + output_file = cmd[-1] + pathlib.Path(output_file).touch() + result = MagicMock() + result.returncode = 0 + return result + return original_run(cmd, **kwargs) + + # Mock WhisperModel + mock_model = MagicMock() + def mock_transcribe_call(file_path, **kwargs): + mock_segments = [] + for i in range(3): + seg = MagicMock() + seg.start = i * 1.0 + seg.end = (i + 1) * 1.0 + seg.text = f"Segment {i}" + seg.words = [] + mock_segments.append(seg) + mock_info = MagicMock() + mock_info.language = "en" + mock_info.language_probability = 0.99 + mock_info.duration = 300.0 + return iter(mock_segments), mock_info + + mock_model.transcribe = mock_transcribe_call + + service = TranscribeService() + service._model = mock_model + service._current_model_name = "base" + service._current_device = "cpu" + service._current_compute_type = "int8" + + written_messages = [] + def mock_write(msg): + 
written_messages.append(msg) + + with patch("subprocess.run", mock_subprocess_run), \ + patch("voice_to_notes.services.transcribe.write_message", mock_write): + result = service.transcribe_chunked("req-1", "/fake/long_audio.wav") + + # Should have segments from multiple chunks + assert len(result.segments) > 0 + + # Verify timestamp offsets — segments from chunk 1 should start at 0, + # segments from chunk 2 should be offset by 300000ms + if len(result.segments) > 3: + # Chunk 2 segments should have offset timestamps + assert result.segments[3].start_ms >= 300000 + + assert result.duration_ms == 700000 + assert result.language == "en" diff --git a/python/voice_to_notes.spec b/python/voice_to_notes.spec new file mode 100644 index 0000000..0687bb1 --- /dev/null +++ b/python/voice_to_notes.spec @@ -0,0 +1,67 @@ +# -*- mode: python ; coding: utf-8 -*- +"""PyInstaller spec for the Voice to Notes sidecar binary.""" + +from PyInstaller.utils.hooks import collect_all + +block_cipher = None + +# Collect all files for packages that have shared libraries / data files +# PyInstaller often misses these for ML packages +ctranslate2_datas, ctranslate2_binaries, ctranslate2_hiddenimports = collect_all("ctranslate2") +faster_whisper_datas, faster_whisper_binaries, faster_whisper_hiddenimports = collect_all( + "faster_whisper" +) +pyannote_datas, pyannote_binaries, pyannote_hiddenimports = collect_all("pyannote") + +a = Analysis( + ["voice_to_notes/main.py"], + pathex=[], + binaries=ctranslate2_binaries + faster_whisper_binaries + pyannote_binaries, + datas=ctranslate2_datas + faster_whisper_datas + pyannote_datas, + hiddenimports=[ + "torch", + "torchaudio", + "huggingface_hub", + "pysubs2", + "openai", + "anthropic", + "litellm", + ] + + ctranslate2_hiddenimports + + faster_whisper_hiddenimports + + pyannote_hiddenimports, + hookspath=[], + hooksconfig={}, + runtime_hooks=[], + excludes=["tkinter", "test", "unittest", "pip", "setuptools"], + win_no_prefer_redirects=False, + 
win_private_assemblies=False, + cipher=block_cipher, + noarchive=False, +) + +pyz = PYZ(a.pure, a.zipped_data, cipher=block_cipher) + +exe = EXE( + pyz, + a.scripts, + [], + exclude_binaries=True, + name="voice-to-notes-sidecar", + debug=False, + bootloader_ignore_signals=False, + strip=False, + upx=True, + console=True, +) + +coll = COLLECT( + exe, + a.binaries, + a.zipfiles, + a.datas, + strip=False, + upx=True, + upx_exclude=[], + name="voice-to-notes-sidecar", +) diff --git a/python/voice_to_notes/hardware/detect.py b/python/voice_to_notes/hardware/detect.py index f69c401..408d26a 100644 --- a/python/voice_to_notes/hardware/detect.py +++ b/python/voice_to_notes/hardware/detect.py @@ -2,7 +2,10 @@ from __future__ import annotations +import ctypes import os +import platform +import subprocess import sys from dataclasses import dataclass @@ -21,6 +24,77 @@ class HardwareInfo: recommended_compute_type: str = "int8" +def _detect_ram_mb() -> int: + """Detect total system RAM in MB (cross-platform). + + Tries platform-specific methods in order: + 1. Linux: read /proc/meminfo + 2. macOS: sysctl hw.memsize + 3. Windows: GlobalMemoryStatusEx via ctypes + 4. Fallback: os.sysconf (most Unix systems) + + Returns 0 if all methods fail. 
+ """ + # Linux: read /proc/meminfo + if sys.platform == "linux": + try: + with open("/proc/meminfo") as f: + for line in f: + if line.startswith("MemTotal:"): + # Value is in kB + return int(line.split()[1]) // 1024 + except (FileNotFoundError, ValueError, OSError): + pass + + # macOS: sysctl hw.memsize (returns bytes) + if sys.platform == "darwin": + try: + result = subprocess.run( + ["sysctl", "-n", "hw.memsize"], + capture_output=True, + text=True, + check=True, + ) + return int(result.stdout.strip()) // (1024 * 1024) + except (subprocess.SubprocessError, ValueError, OSError): + pass + + # Windows: GlobalMemoryStatusEx via ctypes + if sys.platform == "win32": + try: + + class MEMORYSTATUSEX(ctypes.Structure): + _fields_ = [ + ("dwLength", ctypes.c_ulong), + ("dwMemoryLoad", ctypes.c_ulong), + ("ullTotalPhys", ctypes.c_ulonglong), + ("ullAvailPhys", ctypes.c_ulonglong), + ("ullTotalPageFile", ctypes.c_ulonglong), + ("ullAvailPageFile", ctypes.c_ulonglong), + ("ullTotalVirtual", ctypes.c_ulonglong), + ("ullAvailVirtual", ctypes.c_ulonglong), + ("ullAvailExtendedVirtual", ctypes.c_ulonglong), + ] + + mem_status = MEMORYSTATUSEX() + mem_status.dwLength = ctypes.sizeof(MEMORYSTATUSEX) + if ctypes.windll.kernel32.GlobalMemoryStatusEx(ctypes.byref(mem_status)): + return int(mem_status.ullTotalPhys) // (1024 * 1024) + except (AttributeError, OSError): + pass + + # Fallback: os.sysconf (works on most Unix systems) + try: + page_size = os.sysconf("SC_PAGE_SIZE") + phys_pages = os.sysconf("SC_PHYS_PAGES") + if page_size > 0 and phys_pages > 0: + return (page_size * phys_pages) // (1024 * 1024) + except (ValueError, OSError, AttributeError): + pass + + return 0 + + def detect_hardware() -> HardwareInfo: """Detect available hardware and recommend model configuration.""" info = HardwareInfo() @@ -28,16 +102,8 @@ def detect_hardware() -> HardwareInfo: # CPU info info.cpu_cores = os.cpu_count() or 1 - # RAM info - try: - with open("/proc/meminfo") as f: - for line in f: - if 
line.startswith("MemTotal:"): - # Value is in kB - info.ram_mb = int(line.split()[1]) // 1024 - break - except (FileNotFoundError, ValueError): - pass + # RAM info (cross-platform) + info.ram_mb = _detect_ram_mb() # CUDA detection try: diff --git a/python/voice_to_notes/ipc/handlers.py b/python/voice_to_notes/ipc/handlers.py index 5d27734..dcc20ef 100644 --- a/python/voice_to_notes/ipc/handlers.py +++ b/python/voice_to_notes/ipc/handlers.py @@ -88,6 +88,79 @@ def make_diarize_handler() -> HandlerFunc: return handler +def make_diarize_download_handler() -> HandlerFunc: + """Create a handler that downloads/validates the diarization model.""" + import os + + def handler(msg: IPCMessage) -> IPCMessage: + payload = msg.payload + hf_token = payload.get("hf_token") + + try: + import huggingface_hub + + # Disable pyannote telemetry (has a bug in v4.0.4) + os.environ.setdefault("PYANNOTE_METRICS_ENABLED", "false") + from pyannote.audio import Pipeline + + # Persist token globally so ALL huggingface_hub downloads use auth. + # Setting env var alone isn't enough — pyannote's internal sub-downloads + # (e.g. PLDA.from_pretrained) don't forward the token= parameter. + # login() writes the token to ~/.cache/huggingface/token which + # huggingface_hub reads automatically for all downloads. + if hf_token: + os.environ["HF_TOKEN"] = hf_token + huggingface_hub.login(token=hf_token, add_to_git_credential=False) + + # Pre-download sub-models that pyannote loads internally. + # This ensures they're cached before Pipeline.from_pretrained + # tries to load them (where token forwarding can fail). 
+ sub_models = [ + "pyannote/segmentation-3.0", + "pyannote/speaker-diarization-community-1", + ] + for model_id in sub_models: + print(f"[sidecar] Pre-downloading {model_id}...", file=sys.stderr, flush=True) + huggingface_hub.snapshot_download(model_id, token=hf_token) + + print("[sidecar] Downloading diarization pipeline...", file=sys.stderr, flush=True) + pipeline = Pipeline.from_pretrained( + "pyannote/speaker-diarization-3.1", + token=hf_token, + ) + print("[sidecar] Diarization model downloaded successfully", file=sys.stderr, flush=True) + return IPCMessage( + id=msg.id, + type="diarize.download.result", + payload={"ok": True}, + ) + except Exception as e: + error_msg = str(e) + print(f"[sidecar] Model download error: {error_msg}", file=sys.stderr, flush=True) + # Make common errors more user-friendly + if "403" in error_msg or "gated" in error_msg.lower(): + # Try to extract the specific model name from the error + import re + model_match = re.search(r"pyannote/[\w-]+", error_msg) + if model_match: + model_name = model_match.group(0) + error_msg = ( + f"Access denied for {model_name}. " + f"Please visit huggingface.co/{model_name} " + f"and accept the license agreement, then try again." + ) + else: + error_msg = ( + "Access denied. Please accept the license agreements for all " + "required pyannote models on HuggingFace." + ) + elif "401" in error_msg: + error_msg = "Invalid token. Please check your HuggingFace token." 
+ return error_message(msg.id, "download_error", error_msg) + + return handler + + def make_pipeline_handler() -> HandlerFunc: """Create a full pipeline handler (transcribe + diarize + merge).""" from voice_to_notes.services.pipeline import PipelineService, pipeline_result_to_payload @@ -107,6 +180,7 @@ def make_pipeline_handler() -> HandlerFunc: min_speakers=payload.get("min_speakers"), max_speakers=payload.get("max_speakers"), skip_diarization=payload.get("skip_diarization", False), + hf_token=payload.get("hf_token"), ) return IPCMessage( id=msg.id, @@ -186,10 +260,12 @@ def make_ai_chat_handler() -> HandlerFunc: model=config.get("model", "claude-sonnet-4-6"), )) elif provider_name == "litellm": - from voice_to_notes.providers.litellm_provider import LiteLLMProvider + from voice_to_notes.providers.litellm_provider import OpenAICompatibleProvider - service.register_provider("litellm", LiteLLMProvider( + service.register_provider("litellm", OpenAICompatibleProvider( model=config.get("model", "gpt-4o-mini"), + api_key=config.get("api_key"), + api_base=config.get("api_base"), )) return IPCMessage( id=msg.id, diff --git a/python/voice_to_notes/ipc/messages.py b/python/voice_to_notes/ipc/messages.py index 6abc3d5..4e08df9 100644 --- a/python/voice_to_notes/ipc/messages.py +++ b/python/voice_to_notes/ipc/messages.py @@ -34,6 +34,14 @@ def progress_message(request_id: str, percent: int, stage: str, message: str) -> ) +def partial_segment_message(request_id: str, segment_data: dict) -> IPCMessage: + return IPCMessage(id=request_id, type="pipeline.segment", payload=segment_data) + + +def speaker_update_message(request_id: str, updates: list[dict]) -> IPCMessage: + return IPCMessage(id=request_id, type="pipeline.speaker_update", payload={"updates": updates}) + + def error_message(request_id: str, code: str, message: str) -> IPCMessage: return IPCMessage( id=request_id, diff --git a/python/voice_to_notes/ipc/protocol.py b/python/voice_to_notes/ipc/protocol.py index 
e55393f..57054d7 100644 --- a/python/voice_to_notes/ipc/protocol.py +++ b/python/voice_to_notes/ipc/protocol.py @@ -1,13 +1,53 @@ -"""JSON-line protocol reader/writer over stdin/stdout.""" +"""JSON-line protocol reader/writer over stdin/stdout. + +IMPORTANT: stdout is reserved exclusively for IPC messages. +At init time we save the real stdout, then redirect sys.stdout → stderr +so that any rogue print() calls from libraries don't corrupt the IPC stream. +""" from __future__ import annotations +import io import json +import os import sys from typing import Any from voice_to_notes.ipc.messages import IPCMessage +# Save the real stdout fd for IPC before any library can pollute it. +# Then redirect sys.stdout to stderr so library prints go to stderr. +_ipc_out: io.TextIOWrapper | None = None + + +def init_ipc() -> None: + """Capture real stdout for IPC and redirect sys.stdout to stderr. + + Must be called once at sidecar startup, before importing any ML libraries. + """ + global _ipc_out + if _ipc_out is not None: + return # already initialised + + # Duplicate the real stdout fd so we keep it even after redirect + real_stdout_fd = os.dup(sys.stdout.fileno()) + _ipc_out = io.TextIOWrapper( + io.BufferedWriter(io.FileIO(real_stdout_fd, "w")), + encoding="utf-8", + line_buffering=True, + ) + + # Redirect sys.stdout → stderr so print() from libraries goes to stderr + sys.stdout = sys.stderr + + +def _get_ipc_out() -> io.TextIOWrapper: + """Return the IPC output stream, falling back to sys.__stdout__.""" + if _ipc_out is not None: + return _ipc_out + # Fallback if init_ipc() was never called (e.g. in tests) + return sys.__stdout__ + def read_message() -> IPCMessage | None: """Read a single JSON-line message from stdin. 
Returns None on EOF.""" @@ -29,17 +69,19 @@ def read_message() -> IPCMessage | None: def write_message(msg: IPCMessage) -> None: - """Write a JSON-line message to stdout.""" + """Write a JSON-line message to the IPC channel (real stdout).""" + out = _get_ipc_out() line = json.dumps(msg.to_dict(), separators=(",", ":")) - sys.stdout.write(line + "\n") - sys.stdout.flush() + out.write(line + "\n") + out.flush() def write_dict(data: dict[str, Any]) -> None: - """Write a raw dict as a JSON-line message to stdout.""" + """Write a raw dict as a JSON-line message to the IPC channel.""" + out = _get_ipc_out() line = json.dumps(data, separators=(",", ":")) - sys.stdout.write(line + "\n") - sys.stdout.flush() + out.write(line + "\n") + out.flush() def _log(message: str) -> None: diff --git a/python/voice_to_notes/main.py b/python/voice_to_notes/main.py index fedff95..d72d1df 100644 --- a/python/voice_to_notes/main.py +++ b/python/voice_to_notes/main.py @@ -5,18 +5,25 @@ from __future__ import annotations import signal import sys -from voice_to_notes.ipc.handlers import ( +# CRITICAL: Capture real stdout for IPC *before* importing any ML libraries +# that might print to stdout and corrupt the JSON-line protocol. 
+from voice_to_notes.ipc.protocol import init_ipc + +init_ipc() + +from voice_to_notes.ipc.handlers import ( # noqa: E402 HandlerRegistry, hardware_detect_handler, make_ai_chat_handler, + make_diarize_download_handler, make_diarize_handler, make_export_handler, make_pipeline_handler, make_transcribe_handler, ping_handler, ) -from voice_to_notes.ipc.messages import ready_message -from voice_to_notes.ipc.protocol import read_message, write_message +from voice_to_notes.ipc.messages import ready_message # noqa: E402 +from voice_to_notes.ipc.protocol import read_message, write_message # noqa: E402 def create_registry() -> HandlerRegistry: @@ -26,6 +33,7 @@ def create_registry() -> HandlerRegistry: registry.register("transcribe.start", make_transcribe_handler()) registry.register("hardware.detect", hardware_detect_handler) registry.register("diarize.start", make_diarize_handler()) + registry.register("diarize.download", make_diarize_download_handler()) registry.register("pipeline.start", make_pipeline_handler()) registry.register("export.start", make_export_handler()) registry.register("ai.chat", make_ai_chat_handler()) diff --git a/python/voice_to_notes/providers/litellm_provider.py b/python/voice_to_notes/providers/litellm_provider.py index fd48a0a..faa91fe 100644 --- a/python/voice_to_notes/providers/litellm_provider.py +++ b/python/voice_to_notes/providers/litellm_provider.py @@ -1,4 +1,4 @@ -"""LiteLLM provider — multi-provider gateway.""" +"""OpenAI-compatible provider — works with any OpenAI-compatible API endpoint.""" from __future__ import annotations @@ -7,36 +7,44 @@ from typing import Any from voice_to_notes.providers.base import AIProvider -class LiteLLMProvider(AIProvider): - """Routes through LiteLLM for access to 100+ LLM providers.""" +class OpenAICompatibleProvider(AIProvider): + """Connects to any OpenAI-compatible API (LiteLLM proxy, Ollama, vLLM, etc.).""" - def __init__(self, model: str = "gpt-4o-mini", **kwargs: Any) -> None: + def __init__( + 
self, + api_key: str | None = None, + api_base: str | None = None, + model: str = "gpt-4o-mini", + **kwargs: Any, + ) -> None: + self._api_key = api_key or "sk-no-key" + self._api_base = api_base self._model = model self._extra_kwargs = kwargs def chat(self, messages: list[dict[str, str]], **kwargs: Any) -> str: - try: - import litellm - except ImportError: - raise RuntimeError("litellm package is required. Install with: pip install litellm") + from openai import OpenAI - merged_kwargs = {**self._extra_kwargs, **kwargs} - response = litellm.completion( - model=merged_kwargs.get("model", self._model), + client_kwargs: dict[str, Any] = {"api_key": self._api_key} + if self._api_base: + client_kwargs["base_url"] = self._api_base + + client = OpenAI(**client_kwargs) + response = client.chat.completions.create( + model=kwargs.get("model", self._model), messages=messages, - temperature=merged_kwargs.get("temperature", 0.7), - max_tokens=merged_kwargs.get("max_tokens", 2048), + temperature=kwargs.get("temperature", 0.7), + max_tokens=kwargs.get("max_tokens", 2048), ) return response.choices[0].message.content or "" def is_available(self) -> bool: try: - import litellm # noqa: F401 - - return True + import openai # noqa: F401 + return bool(self._api_key and self._api_base) except ImportError: return False @property def name(self) -> str: - return "LiteLLM" + return "OpenAI Compatible" diff --git a/python/voice_to_notes/services/ai_provider.py b/python/voice_to_notes/services/ai_provider.py index 6fa4f51..5e8f14f 100644 --- a/python/voice_to_notes/services/ai_provider.py +++ b/python/voice_to_notes/services/ai_provider.py @@ -92,7 +92,7 @@ class AIProviderService: def create_default_service() -> AIProviderService: """Create an AIProviderService with all supported providers registered.""" from voice_to_notes.providers.anthropic_provider import AnthropicProvider - from voice_to_notes.providers.litellm_provider import LiteLLMProvider + from 
voice_to_notes.providers.litellm_provider import OpenAICompatibleProvider from voice_to_notes.providers.local_provider import LocalProvider from voice_to_notes.providers.openai_provider import OpenAIProvider @@ -100,5 +100,5 @@ def create_default_service() -> AIProviderService: service.register_provider("local", LocalProvider()) service.register_provider("openai", OpenAIProvider()) service.register_provider("anthropic", AnthropicProvider()) - service.register_provider("litellm", LiteLLMProvider()) + service.register_provider("litellm", OpenAICompatibleProvider()) return service diff --git a/python/voice_to_notes/services/diarize.py b/python/voice_to_notes/services/diarize.py index 201ca9c..6ac5f51 100644 --- a/python/voice_to_notes/services/diarize.py +++ b/python/voice_to_notes/services/diarize.py @@ -2,15 +2,69 @@ from __future__ import annotations +import os +import subprocess import sys +import tempfile +import threading import time from dataclasses import dataclass, field +from pathlib import Path from typing import Any +# Disable pyannote telemetry — it has a bug in v4.0.4 where +# np.isfinite(None) crashes when max_speakers is not set. +os.environ.setdefault("PYANNOTE_METRICS_ENABLED", "false") + +from voice_to_notes.utils.ffmpeg import get_ffmpeg_path from voice_to_notes.ipc.messages import progress_message from voice_to_notes.ipc.protocol import write_message +def _ensure_wav(file_path: str) -> tuple[str, str | None]: + """Convert audio to 16kHz mono WAV if needed. + + pyannote.audio v4.0.4 has a bug where its AudioDecoder returns + duration=None for some formats (FLAC, etc.), causing crashes. + Converting to WAV ensures the duration header is always present. + + Returns: + (path_to_use, temp_path_or_None) + If conversion was needed, temp_path is the WAV file to clean up. 
+ """ + ext = Path(file_path).suffix.lower() + if ext == ".wav": + return file_path, None + + tmp = tempfile.NamedTemporaryFile(suffix=".wav", delete=False) + tmp.close() + try: + subprocess.run( + [ + get_ffmpeg_path(), "-y", "-i", file_path, + "-ar", "16000", "-ac", "1", "-c:a", "pcm_s16le", + tmp.name, + ], + check=True, + capture_output=True, + ) + print( + f"[sidecar] Converted {ext} to WAV for diarization", + file=sys.stderr, + flush=True, + ) + return tmp.name, tmp.name + except (subprocess.CalledProcessError, FileNotFoundError) as e: + # ffmpeg not available or failed — try original file and hope for the best + print( + f"[sidecar] WAV conversion failed ({e}), using original file", + file=sys.stderr, + flush=True, + ) + os.unlink(tmp.name) + return file_path, None + + @dataclass class SpeakerSegment: """A time span assigned to a speaker.""" @@ -35,45 +89,59 @@ class DiarizeService: def __init__(self) -> None: self._pipeline: Any = None - def _ensure_pipeline(self) -> Any: + def _ensure_pipeline(self, hf_token: str | None = None) -> Any: """Load the pyannote diarization pipeline (lazy).""" if self._pipeline is not None: return self._pipeline print("[sidecar] Loading pyannote diarization pipeline...", file=sys.stderr, flush=True) - try: - from pyannote.audio import Pipeline + # Use token from argument, fall back to environment variable + if not hf_token: + hf_token = os.environ.get("HF_TOKEN") or os.environ.get("HUGGING_FACE_HUB_TOKEN") or None - self._pipeline = Pipeline.from_pretrained( - "pyannote/speaker-diarization-3.1", - use_auth_token=False, - ) - except Exception: - # Fall back to a simpler approach if the model isn't available - # pyannote requires HuggingFace token for some models - # Try the community model first + # Persist token globally so ALL huggingface_hub sub-downloads use auth. + # Pyannote has internal dependencies that don't forward the token= param. 
+ if hf_token: + os.environ["HF_TOKEN"] = hf_token + import huggingface_hub + huggingface_hub.login(token=hf_token, add_to_git_credential=False) + + models = [ + "pyannote/speaker-diarization-3.1", + "pyannote/speaker-diarization", + ] + + last_error: Exception | None = None + for model_name in models: try: from pyannote.audio import Pipeline - self._pipeline = Pipeline.from_pretrained( - "pyannote/speaker-diarization", - use_auth_token=False, - ) + self._pipeline = Pipeline.from_pretrained(model_name, token=hf_token) + print(f"[sidecar] Loaded diarization model: {model_name}", file=sys.stderr, flush=True) + # Move pipeline to GPU if available + try: + import torch + if torch.cuda.is_available(): + self._pipeline = self._pipeline.to(torch.device("cuda")) + print(f"[sidecar] Diarization pipeline moved to GPU", file=sys.stderr, flush=True) + except Exception as e: + print(f"[sidecar] GPU not available for diarization: {e}", file=sys.stderr, flush=True) + return self._pipeline except Exception as e: + last_error = e print( - f"[sidecar] Warning: Could not load pyannote pipeline: {e}", + f"[sidecar] Warning: Could not load {model_name}: {e}", file=sys.stderr, flush=True, ) - raise RuntimeError( - "pyannote.audio pipeline not available. " - "You may need to accept the model license at " - "https://huggingface.co/pyannote/speaker-diarization-3.1 " - "and set a HF_TOKEN environment variable." - ) from e - return self._pipeline + raise RuntimeError( + "pyannote.audio pipeline not available. " + "You may need to accept the model license at " + "https://huggingface.co/pyannote/speaker-diarization-3.1 " + "and set a HF_TOKEN environment variable." + ) from last_error def diarize( self, @@ -82,6 +150,8 @@ class DiarizeService: num_speakers: int | None = None, min_speakers: int | None = None, max_speakers: int | None = None, + hf_token: str | None = None, + audio_duration_sec: float | None = None, ) -> DiarizationResult: """Run speaker diarization on an audio file. 
@@ -99,7 +169,7 @@ class DiarizeService: progress_message(request_id, 0, "loading_diarization", "Loading diarization model...") ) - pipeline = self._ensure_pipeline() + pipeline = self._ensure_pipeline(hf_token=hf_token) write_message( progress_message(request_id, 20, "diarizing", "Running speaker diarization...") @@ -116,8 +186,55 @@ class DiarizeService: if max_speakers is not None: kwargs["max_speakers"] = max_speakers - # Run diarization - diarization = pipeline(file_path, **kwargs) + # Convert to WAV to work around pyannote v4.0.4 duration bug + audio_path, temp_wav = _ensure_wav(file_path) + + print( + f"[sidecar] Running diarization on {audio_path} with kwargs: {kwargs}", + file=sys.stderr, + flush=True, + ) + + # Run diarization in background thread for progress reporting + result_holder: list = [None] + error_holder: list[Exception | None] = [None] + done_event = threading.Event() + + def _run(): + try: + result_holder[0] = pipeline(audio_path, **kwargs) + except Exception as e: + error_holder[0] = e + finally: + done_event.set() + + thread = threading.Thread(target=_run, daemon=True) + thread.start() + + elapsed = 0.0 + estimated_total = max(audio_duration_sec * 0.5, 30.0) if audio_duration_sec else 120.0 + while not done_event.wait(timeout=2.0): + elapsed += 2.0 + pct = min(20 + int((elapsed / estimated_total) * 65), 85) + write_message(progress_message( + request_id, pct, "diarizing", + f"Analyzing speakers ({int(elapsed)}s elapsed)...")) + + thread.join() + + # Clean up temp file + if temp_wav: + os.unlink(temp_wav) + + if error_holder[0] is not None: + raise error_holder[0] + raw_result = result_holder[0] + + # pyannote 4.0+ returns DiarizeOutput; older versions return Annotation directly + if hasattr(raw_result, "speaker_diarization"): + diarization = raw_result.speaker_diarization + else: + diarization = raw_result # Convert pyannote output to our format result = DiarizationResult() diff --git a/python/voice_to_notes/services/pipeline.py 
b/python/voice_to_notes/services/pipeline.py index 2d1f66b..903fb4c 100644 --- a/python/voice_to_notes/services/pipeline.py +++ b/python/voice_to_notes/services/pipeline.py @@ -2,13 +2,19 @@ from __future__ import annotations +import concurrent.futures import sys import time from dataclasses import dataclass, field from typing import Any -from voice_to_notes.ipc.messages import progress_message +from voice_to_notes.ipc.messages import ( + partial_segment_message, + progress_message, + speaker_update_message, +) from voice_to_notes.ipc.protocol import write_message +from voice_to_notes.utils.ffmpeg import get_ffprobe_path from voice_to_notes.services.diarize import DiarizeService, SpeakerSegment from voice_to_notes.services.transcribe import ( SegmentResult, @@ -60,6 +66,7 @@ class PipelineService: min_speakers: int | None = None, max_speakers: int | None = None, skip_diarization: bool = False, + hf_token: str | None = None, ) -> PipelineResult: """Run the full transcription + diarization pipeline. 
@@ -77,22 +84,59 @@ class PipelineService: """ start_time = time.time() - # Step 1: Transcribe + # Step 0: Probe audio duration for conditional chunked transcription write_message( progress_message(request_id, 0, "pipeline", "Starting transcription pipeline...") ) - transcription = self._transcribe_service.transcribe( - request_id=request_id, - file_path=file_path, - model_name=model_name, - device=device, - compute_type=compute_type, - language=language, - ) + def _emit_segment(seg: SegmentResult, index: int) -> None: + write_message(partial_segment_message(request_id, { + "index": index, + "text": seg.text, + "start_ms": seg.start_ms, + "end_ms": seg.end_ms, + "words": [{"word": w.word, "start_ms": w.start_ms, "end_ms": w.end_ms, "confidence": w.confidence} for w in seg.words], + })) + + audio_duration_sec = None + try: + import subprocess + probe_result = subprocess.run( + [get_ffprobe_path(), "-v", "quiet", "-show_entries", "format=duration", + "-of", "default=noprint_wrappers=1:nokey=1", file_path], + capture_output=True, text=True, check=True, + ) + audio_duration_sec = float(probe_result.stdout.strip()) + except (subprocess.CalledProcessError, FileNotFoundError, ValueError): + pass + + def _run_transcription() -> TranscriptionResult: + """Run transcription (chunked or standard based on duration).""" + from voice_to_notes.services.transcribe import LARGE_FILE_THRESHOLD_SEC + if audio_duration_sec and audio_duration_sec > LARGE_FILE_THRESHOLD_SEC: + return self._transcribe_service.transcribe_chunked( + request_id=request_id, + file_path=file_path, + model_name=model_name, + device=device, + compute_type=compute_type, + language=language, + on_segment=_emit_segment, + ) + else: + return self._transcribe_service.transcribe( + request_id=request_id, + file_path=file_path, + model_name=model_name, + device=device, + compute_type=compute_type, + language=language, + on_segment=_emit_segment, + ) if skip_diarization: - # Convert transcription directly without 
speaker labels + # Sequential: transcribe only, no diarization needed + transcription = _run_transcription() result = PipelineResult( language=transcription.language, language_probability=transcription.language_probability, @@ -110,27 +154,83 @@ class PipelineService: ) return result - # Step 2: Diarize + # Parallel execution: run transcription (0-45%) and diarization (45-90%) + # concurrently, then merge (90-100%). write_message( - progress_message(request_id, 50, "pipeline", "Starting speaker diarization...") + progress_message( + request_id, 0, "pipeline", + "Starting transcription and diarization in parallel..." + ) ) - diarization = self._diarize_service.diarize( - request_id=request_id, - file_path=file_path, - num_speakers=num_speakers, - min_speakers=min_speakers, - max_speakers=max_speakers, - ) + diarization = None + diarization_error = None - # Step 3: Merge - write_message( - progress_message(request_id, 90, "pipeline", "Merging transcript with speakers...") - ) + with concurrent.futures.ThreadPoolExecutor(max_workers=2) as executor: + transcription_future = executor.submit(_run_transcription) - result = self._merge_results(transcription, diarization.speaker_segments) - result.speakers = diarization.speakers - result.num_speakers = diarization.num_speakers + # Use probed audio_duration_sec for diarization progress estimation + # (transcription hasn't finished yet, so we can't use transcription.duration_ms) + diarization_future = executor.submit( + self._diarize_service.diarize, + request_id=request_id, + file_path=file_path, + num_speakers=num_speakers, + min_speakers=min_speakers, + max_speakers=max_speakers, + hf_token=hf_token, + audio_duration_sec=audio_duration_sec, + ) + + # Wait for both futures. We need the transcription result regardless, + # but diarization may fail gracefully. 
+ transcription = transcription_future.result() + write_message( + progress_message(request_id, 45, "pipeline", "Transcription complete") + ) + + try: + diarization = diarization_future.result() + except Exception as e: + import traceback + diarization_error = e + print( + f"[sidecar] Diarization failed, falling back to transcription-only: {e}", + file=sys.stderr, + flush=True, + ) + traceback.print_exc(file=sys.stderr) + write_message( + progress_message( + request_id, 80, "pipeline", + f"Diarization failed ({e}), using transcription only..." + ) + ) + + # Step 3: Merge (or skip if diarization failed) + if diarization is not None: + write_message( + progress_message(request_id, 90, "merging", "Merging transcript with speakers...") + ) + result = self._merge_results(transcription, diarization.speaker_segments) + result.speakers = diarization.speakers + result.num_speakers = diarization.num_speakers + else: + result = PipelineResult( + language=transcription.language, + language_probability=transcription.language_probability, + duration_ms=transcription.duration_ms, + ) + for seg in transcription.segments: + result.segments.append( + PipelineSegment( + text=seg.text, + start_ms=seg.start_ms, + end_ms=seg.end_ms, + speaker=None, + words=seg.words, + ) + ) elapsed = time.time() - start_time print( @@ -140,6 +240,10 @@ class PipelineService: flush=True, ) + updates = [{"index": i, "speaker": seg.speaker} for i, seg in enumerate(result.segments) if seg.speaker] + if updates: + write_message(speaker_update_message(request_id, updates)) + write_message( progress_message(request_id, 100, "done", "Pipeline complete") ) diff --git a/python/voice_to_notes/services/transcribe.py b/python/voice_to_notes/services/transcribe.py index 2539cfc..87bb01d 100644 --- a/python/voice_to_notes/services/transcribe.py +++ b/python/voice_to_notes/services/transcribe.py @@ -4,6 +4,7 @@ from __future__ import annotations import sys import time +from collections.abc import Callable from 
dataclasses import dataclass, field from typing import Any @@ -11,6 +12,10 @@ from faster_whisper import WhisperModel from voice_to_notes.ipc.messages import progress_message from voice_to_notes.ipc.protocol import write_message +from voice_to_notes.utils.ffmpeg import get_ffmpeg_path, get_ffprobe_path + +CHUNK_REPORT_SIZE = 10 +LARGE_FILE_THRESHOLD_SEC = 3600 # 1 hour @dataclass @@ -90,6 +95,7 @@ class TranscribeService: device: str = "cpu", compute_type: str = "int8", language: str | None = None, + on_segment: Callable[[SegmentResult, int], None] | None = None, ) -> TranscriptionResult: """Transcribe an audio file with word-level timestamps. @@ -145,16 +151,23 @@ class TranscribeService: ) ) - # Send progress every few segments - if segment_count % 5 == 0: - write_message( - progress_message( - request_id, - progress_pct, - "transcribing", - f"Processed {segment_count} segments...", - ) + if on_segment: + on_segment(result.segments[-1], segment_count - 1) + + write_message( + progress_message( + request_id, + progress_pct, + "transcribing", + f"Transcribing segment {segment_count} ({progress_pct}% of audio)...", ) + ) + + if segment_count % CHUNK_REPORT_SIZE == 0: + write_message(progress_message( + request_id, progress_pct, "transcribing", + f"Completed chunk of {CHUNK_REPORT_SIZE} segments " + f"({segment_count} total, {progress_pct}% of audio)...")) elapsed = time.time() - start_time print( @@ -166,6 +179,113 @@ class TranscribeService: write_message(progress_message(request_id, 100, "done", "Transcription complete")) return result + def transcribe_chunked( + self, + request_id: str, + file_path: str, + model_name: str = "base", + device: str = "cpu", + compute_type: str = "int8", + language: str | None = None, + on_segment: Callable[[SegmentResult, int], None] | None = None, + chunk_duration_sec: int = 300, + ) -> TranscriptionResult: + """Transcribe a large audio file by splitting into chunks. 
+ + Uses ffmpeg to split the file into chunks, transcribes each chunk, + then merges the results with corrected timestamps. + + Falls back to standard transcribe() if ffmpeg is not available. + """ + import subprocess + import tempfile + + # Get total duration via ffprobe + try: + probe_result = subprocess.run( + [get_ffprobe_path(), "-v", "quiet", "-show_entries", "format=duration", + "-of", "default=noprint_wrappers=1:nokey=1", file_path], + capture_output=True, text=True, check=True, + ) + total_duration = float(probe_result.stdout.strip()) + except (subprocess.CalledProcessError, FileNotFoundError, ValueError): + # ffprobe not available or failed — fall back to standard transcription + write_message(progress_message( + request_id, 5, "transcribing", + "ffmpeg not available, using standard transcription...")) + return self.transcribe(request_id, file_path, model_name, device, + compute_type, language, on_segment=on_segment) + + num_chunks = max(1, int(total_duration / chunk_duration_sec) + 1) + write_message(progress_message( + request_id, 5, "transcribing", + f"Splitting {total_duration:.0f}s file into {num_chunks} chunks...")) + + merged_result = TranscriptionResult() + global_segment_index = 0 + + for chunk_idx in range(num_chunks): + chunk_start = chunk_idx * chunk_duration_sec + if chunk_start >= total_duration: + break + + chunk_start_ms = int(chunk_start * 1000) + + # Extract chunk to temp file + tmp = tempfile.NamedTemporaryFile(suffix=".wav", delete=False) + tmp.close() + try: + subprocess.run( + [get_ffmpeg_path(), "-y", "-ss", str(chunk_start), + "-t", str(chunk_duration_sec), + "-i", file_path, + "-ar", "16000", "-ac", "1", "-c:a", "pcm_s16le", + tmp.name], + capture_output=True, check=True, + ) + + # Wrap on_segment to offset the index + chunk_on_segment = None + if on_segment: + base_index = global_segment_index + def chunk_on_segment(seg: SegmentResult, idx: int, _base=base_index) -> None: + on_segment(seg, _base + idx) + + chunk_result = 
self.transcribe( + request_id, tmp.name, model_name, device, + compute_type, language, on_segment=chunk_on_segment, + ) + + # Offset timestamps and merge + for seg in chunk_result.segments: + seg.start_ms += chunk_start_ms + seg.end_ms += chunk_start_ms + for word in seg.words: + word.start_ms += chunk_start_ms + word.end_ms += chunk_start_ms + merged_result.segments.append(seg) + + global_segment_index += len(chunk_result.segments) + + # Take language from first chunk + if chunk_idx == 0: + merged_result.language = chunk_result.language + merged_result.language_probability = chunk_result.language_probability + + finally: + import os + os.unlink(tmp.name) + + # Chunk progress + chunk_pct = min(10 + int(((chunk_idx + 1) / num_chunks) * 80), 90) + write_message(progress_message( + request_id, chunk_pct, "transcribing", + f"Completed chunk {chunk_idx + 1}/{num_chunks}...")) + + merged_result.duration_ms = int(total_duration * 1000) + write_message(progress_message(request_id, 100, "done", "Transcription complete")) + return merged_result + def result_to_payload(result: TranscriptionResult) -> dict[str, Any]: """Convert TranscriptionResult to IPC payload dict.""" diff --git a/python/voice_to_notes/utils/ffmpeg.py b/python/voice_to_notes/utils/ffmpeg.py new file mode 100644 index 0000000..6232449 --- /dev/null +++ b/python/voice_to_notes/utils/ffmpeg.py @@ -0,0 +1,43 @@ +"""Resolve ffmpeg/ffprobe paths for both frozen and development builds.""" + +from __future__ import annotations + +import os +import sys + + +def get_ffmpeg_path() -> str: + """Return the path to the ffmpeg binary. + + When running as a frozen PyInstaller bundle, looks next to sys.executable. + Otherwise falls back to the system PATH. 
+ """ + if getattr(sys, "frozen", False): + # Frozen PyInstaller bundle — ffmpeg is next to the sidecar binary + bundle_dir = os.path.dirname(sys.executable) + candidates = [ + os.path.join(bundle_dir, "ffmpeg.exe" if sys.platform == "win32" else "ffmpeg"), + os.path.join(bundle_dir, "ffmpeg"), + ] + for path in candidates: + if os.path.isfile(path): + return path + return "ffmpeg" + + +def get_ffprobe_path() -> str: + """Return the path to the ffprobe binary. + + When running as a frozen PyInstaller bundle, looks next to sys.executable. + Otherwise falls back to the system PATH. + """ + if getattr(sys, "frozen", False): + bundle_dir = os.path.dirname(sys.executable) + candidates = [ + os.path.join(bundle_dir, "ffprobe.exe" if sys.platform == "win32" else "ffprobe"), + os.path.join(bundle_dir, "ffprobe"), + ] + for path in candidates: + if os.path.isfile(path): + return path + return "ffprobe" diff --git a/src-tauri/binaries/.gitkeep b/src-tauri/binaries/.gitkeep new file mode 100644 index 0000000..e69de29 diff --git a/src-tauri/src/commands/ai.rs b/src-tauri/src/commands/ai.rs index 99e699e..a102e81 100644 --- a/src-tauri/src/commands/ai.rs +++ b/src-tauri/src/commands/ai.rs @@ -39,7 +39,11 @@ pub fn ai_chat( if response.msg_type == "error" { return Err(format!( "AI error: {}", - response.payload.get("message").and_then(|v| v.as_str()).unwrap_or("unknown") + response + .payload + .get("message") + .and_then(|v| v.as_str()) + .unwrap_or("unknown") )); } diff --git a/src-tauri/src/commands/export.rs b/src-tauri/src/commands/export.rs index fb91026..9d2f4a1 100644 --- a/src-tauri/src/commands/export.rs +++ b/src-tauri/src/commands/export.rs @@ -33,7 +33,11 @@ pub fn export_transcript( if response.msg_type == "error" { return Err(format!( "Export error: {}", - response.payload.get("message").and_then(|v| v.as_str()).unwrap_or("unknown") + response + .payload + .get("message") + .and_then(|v| v.as_str()) + .unwrap_or("unknown") )); } diff --git 
a/src-tauri/src/commands/system.rs b/src-tauri/src/commands/system.rs index daed78d..985b465 100644 --- a/src-tauri/src/commands/system.rs +++ b/src-tauri/src/commands/system.rs @@ -22,9 +22,7 @@ pub fn llama_start( threads: Option, ) -> Result { let config = LlamaConfig { - binary_path: PathBuf::from( - binary_path.unwrap_or_else(|| "llama-server".to_string()), - ), + binary_path: PathBuf::from(binary_path.unwrap_or_else(|| "llama-server".to_string())), model_path: PathBuf::from(model_path), port: port.unwrap_or(0), n_gpu_layers: n_gpu_layers.unwrap_or(0), diff --git a/src-tauri/src/commands/transcribe.rs b/src-tauri/src/commands/transcribe.rs index 9e2239a..f1521ad 100644 --- a/src-tauri/src/commands/transcribe.rs +++ b/src-tauri/src/commands/transcribe.rs @@ -1,4 +1,5 @@ use serde_json::{json, Value}; +use tauri::{AppHandle, Emitter}; use crate::sidecar::messages::IPCMessage; use crate::sidecar::sidecar; @@ -32,16 +33,48 @@ pub fn transcribe_file( if response.msg_type == "error" { return Err(format!( "Transcription error: {}", - response.payload.get("message").and_then(|v| v.as_str()).unwrap_or("unknown") + response + .payload + .get("message") + .and_then(|v| v.as_str()) + .unwrap_or("unknown") )); } Ok(response.payload) } +/// Download and validate the diarization model via the Python sidecar. +#[tauri::command] +pub fn download_diarize_model(hf_token: String) -> Result { + let manager = sidecar(); + manager.ensure_running()?; + + let request_id = uuid::Uuid::new_v4().to_string(); + let msg = IPCMessage::new( + &request_id, + "diarize.download", + json!({ + "hf_token": hf_token, + }), + ); + + let response = manager.send_and_receive(&msg)?; + + if response.msg_type == "error" { + return Ok(json!({ + "ok": false, + "error": response.payload.get("message").and_then(|v| v.as_str()).unwrap_or("unknown"), + })); + } + + Ok(json!({ "ok": true })) +} + /// Run the full transcription + diarization pipeline via the Python sidecar. 
#[tauri::command] -pub fn run_pipeline( +pub async fn run_pipeline( + app: AppHandle, file_path: String, model: Option, device: Option, @@ -50,6 +83,7 @@ pub fn run_pipeline( min_speakers: Option, max_speakers: Option, skip_diarization: Option, + hf_token: Option, ) -> Result { let manager = sidecar(); manager.ensure_running()?; @@ -68,17 +102,38 @@ pub fn run_pipeline( "min_speakers": min_speakers, "max_speakers": max_speakers, "skip_diarization": skip_diarization.unwrap_or(false), + "hf_token": hf_token, }), ); - let response = manager.send_and_receive(&msg)?; + // Run the blocking sidecar I/O on a separate thread so the async runtime + // can deliver emitted events to the webview while processing is ongoing. + let app_handle = app.clone(); + tauri::async_runtime::spawn_blocking(move || { + let response = manager.send_and_receive_with_progress(&msg, |msg| { + let event_name = match msg.msg_type.as_str() { + "pipeline.segment" => "pipeline-segment", + "pipeline.speaker_update" => "pipeline-speaker-update", + _ => "pipeline-progress", + }; + if let Err(e) = app_handle.emit(event_name, &msg.payload) { + eprintln!("[sidecar-rs] Failed to emit {event_name}: {e}"); + } + })?; - if response.msg_type == "error" { - return Err(format!( - "Pipeline error: {}", - response.payload.get("message").and_then(|v| v.as_str()).unwrap_or("unknown") - )); - } + if response.msg_type == "error" { + return Err(format!( + "Pipeline error: {}", + response + .payload + .get("message") + .and_then(|v| v.as_str()) + .unwrap_or("unknown") + )); + } - Ok(response.payload) + Ok(response.payload) + }) + .await + .map_err(|e| format!("Pipeline task failed: {e}"))? 
} diff --git a/src-tauri/src/db/schema.rs b/src-tauri/src/db/schema.rs index 000eded..70494f7 100644 --- a/src-tauri/src/db/schema.rs +++ b/src-tauri/src/db/schema.rs @@ -96,11 +96,7 @@ pub fn create_tables(conn: &Connection) -> Result<(), DatabaseError> { )?; // Initialize schema version if empty - let count: i32 = conn.query_row( - "SELECT COUNT(*) FROM schema_version", - [], - |row| row.get(0), - )?; + let count: i32 = conn.query_row("SELECT COUNT(*) FROM schema_version", [], |row| row.get(0))?; if count == 0 { conn.execute( "INSERT INTO schema_version (version) VALUES (?1)", diff --git a/src-tauri/src/lib.rs b/src-tauri/src/lib.rs index ff523ae..0f3e476 100644 --- a/src-tauri/src/lib.rs +++ b/src-tauri/src/lib.rs @@ -4,12 +4,15 @@ pub mod llama; pub mod sidecar; pub mod state; +use tauri::window::Color; +use tauri::Manager; + use commands::ai::{ai_chat, ai_configure, ai_list_providers}; use commands::export::export_transcript; use commands::project::{create_project, get_project, list_projects}; use commands::settings::{load_settings, save_settings}; use commands::system::{get_data_dir, llama_list_models, llama_start, llama_status, llama_stop}; -use commands::transcribe::{run_pipeline, transcribe_file}; +use commands::transcribe::{download_diarize_model, run_pipeline, transcribe_file}; use state::AppState; #[cfg_attr(mobile, tauri::mobile_entry_point)] @@ -20,12 +23,20 @@ pub fn run() { .plugin(tauri_plugin_opener::init()) .plugin(tauri_plugin_dialog::init()) .manage(app_state) + .setup(|app| { + // Set the webview background to match the app's dark theme + if let Some(window) = app.get_webview_window("main") { + let _ = window.set_background_color(Some(Color(10, 10, 35, 255))); + } + Ok(()) + }) .invoke_handler(tauri::generate_handler![ create_project, get_project, list_projects, transcribe_file, run_pipeline, + download_diarize_model, export_transcript, ai_chat, ai_list_providers, diff --git a/src-tauri/src/llama/mod.rs b/src-tauri/src/llama/mod.rs index 
4cebccf..2cb5b81 100644 --- a/src-tauri/src/llama/mod.rs +++ b/src-tauri/src/llama/mod.rs @@ -237,11 +237,7 @@ impl LlamaManager { /// Get the current status. pub fn status(&self) -> LlamaStatus { - let running = self - .process - .lock() - .ok() - .map_or(false, |p| p.is_some()); + let running = self.process.lock().ok().map_or(false, |p| p.is_some()); let port = self.port.lock().ok().map_or(0, |p| *p); let model = self .model_path diff --git a/src-tauri/src/sidecar/mod.rs b/src-tauri/src/sidecar/mod.rs index dd60840..56140a4 100644 --- a/src-tauri/src/sidecar/mod.rs +++ b/src-tauri/src/sidecar/mod.rs @@ -13,8 +13,13 @@ pub fn sidecar() -> &'static SidecarManager { INSTANCE.get_or_init(SidecarManager::new) } -/// Manages the Python sidecar process lifecycle. -/// Uses separated stdin/stdout ownership to avoid BufReader conflicts. +/// Manages the sidecar process lifecycle. +/// +/// Supports two modes: +/// - **Production**: spawns a frozen PyInstaller binary (no Python required) +/// - **Dev mode**: spawns system Python with `-m voice_to_notes.main` +/// +/// Dev mode is active when compiled in debug mode or when `VOICE_TO_NOTES_DEV=1`. pub struct SidecarManager { process: Mutex>, stdin: Mutex>, @@ -30,38 +35,141 @@ impl SidecarManager { } } + /// Check if we should use dev mode (system Python). + fn is_dev_mode() -> bool { + cfg!(debug_assertions) || std::env::var("VOICE_TO_NOTES_DEV").is_ok() + } + + /// Resolve the frozen sidecar binary path (production mode). 
+ fn resolve_sidecar_path() -> Result { + let exe = std::env::current_exe().map_err(|e| format!("Cannot get current exe: {e}"))?; + let exe_dir = exe + .parent() + .ok_or_else(|| "Cannot get exe parent directory".to_string())?; + + let binary_name = if cfg!(target_os = "windows") { + "voice-to-notes-sidecar.exe" + } else { + "voice-to-notes-sidecar" + }; + + // Tauri places externalBin next to the app binary + let path = exe_dir.join(binary_name); + if path.exists() { + return Ok(path); + } + + // Also check inside a subdirectory (onedir PyInstaller output) + let subdir_path = exe_dir.join("voice-to-notes-sidecar").join(binary_name); + if subdir_path.exists() { + return Ok(subdir_path); + } + + Err(format!( + "Sidecar binary not found. Looked for:\n {}\n {}", + path.display(), + subdir_path.display(), + )) + } + + /// Find a working Python command for the current platform. + fn find_python_command() -> &'static str { + if cfg!(target_os = "windows") { + "python" + } else { + "python3" + } + } + + /// Resolve the Python sidecar directory for dev mode. + fn resolve_python_dir() -> Result { + let manifest_dir = env!("CARGO_MANIFEST_DIR"); + let python_dir = std::path::Path::new(manifest_dir) + .join("../python") + .canonicalize() + .map_err(|e| format!("Cannot find python directory: {e}"))?; + + if python_dir.exists() { + return Ok(python_dir); + } + + // Fallback: relative to current exe + let exe = std::env::current_exe().map_err(|e| e.to_string())?; + let alt = exe + .parent() + .ok_or_else(|| "No parent dir".to_string())? + .join("../python") + .canonicalize() + .map_err(|e| format!("Cannot find python directory: {e}"))?; + + Ok(alt) + } + /// Ensure the sidecar is running, starting it if needed. pub fn ensure_running(&self) -> Result<(), String> { if self.is_running() { return Ok(()); } - let python_path = std::env::current_dir() - .map_err(|e| e.to_string())? 
- .join("../python") - .canonicalize() - .map_err(|e| format!("Cannot find python directory: {e}"))?; - - self.start(&python_path.to_string_lossy()) + if Self::is_dev_mode() { + self.start_python_dev() + } else { + match Self::resolve_sidecar_path() { + Ok(path) => self.start_binary(&path), + Err(e) => { + eprintln!( + "[sidecar-rs] Frozen binary not found ({e}), falling back to dev mode" + ); + self.start_python_dev() + } + } + } } - /// Spawn the Python sidecar process. - pub fn start(&self, python_path: &str) -> Result<(), String> { - // Stop existing process if any + /// Spawn the frozen sidecar binary (production mode). + fn start_binary(&self, path: &std::path::Path) -> Result<(), String> { self.stop().ok(); + eprintln!("[sidecar-rs] Starting frozen sidecar: {}", path.display()); - let mut child = Command::new("python3") - .arg("-m") - .arg("voice_to_notes.main") - .current_dir(python_path) - .env("PYTHONPATH", python_path) + let child = Command::new(path) .stdin(Stdio::piped()) .stdout(Stdio::piped()) .stderr(Stdio::inherit()) .spawn() - .map_err(|e| format!("Failed to start sidecar: {e}"))?; + .map_err(|e| format!("Failed to start sidecar binary: {e}"))?; - // Take ownership of stdin and stdout separately + self.attach(child)?; + self.wait_for_ready() + } + + /// Spawn the Python sidecar in dev mode (system Python). 
+ fn start_python_dev(&self) -> Result<(), String> { + self.stop().ok(); + let python_dir = Self::resolve_python_dir()?; + let python_cmd = Self::find_python_command(); + eprintln!( + "[sidecar-rs] Starting dev sidecar: {} -m voice_to_notes.main ({})", + python_cmd, + python_dir.display() + ); + + let child = Command::new(python_cmd) + .arg("-m") + .arg("voice_to_notes.main") + .current_dir(&python_dir) + .env("PYTHONPATH", &python_dir) + .stdin(Stdio::piped()) + .stdout(Stdio::piped()) + .stderr(Stdio::inherit()) + .spawn() + .map_err(|e| format!("Failed to start Python sidecar: {e}"))?; + + self.attach(child)?; + self.wait_for_ready() + } + + /// Take ownership of a spawned child's stdin/stdout and store the process handle. + fn attach(&self, mut child: Child) -> Result<(), String> { let stdin = child.stdin.take().ok_or("Failed to get sidecar stdin")?; let stdout = child.stdout.take().ok_or("Failed to get sidecar stdout")?; let buf_reader = BufReader::new(stdout); @@ -78,10 +186,6 @@ impl SidecarManager { let mut r = self.reader.lock().map_err(|e| e.to_string())?; *r = Some(buf_reader); } - - // Wait for the "ready" message - self.wait_for_ready()?; - Ok(()) } @@ -107,16 +211,33 @@ impl SidecarManager { return Ok(()); } } - // Non-ready message: something is wrong - break; + // Non-JSON or non-ready line — skip and keep waiting + eprintln!( + "[sidecar-rs] Skipping pre-ready line: {}", + &trimmed[..trimmed.len().min(200)] + ); + continue; } } Err("Sidecar did not send ready message".to_string()) } /// Send a message to the sidecar and read the response. - /// This is a blocking call. + /// This is a blocking call. Progress messages are skipped. pub fn send_and_receive(&self, msg: &IPCMessage) -> Result { + self.send_and_receive_with_progress(msg, |_| {}) + } + + /// Send a message and receive the response, calling a callback for intermediate messages. + /// Intermediate messages include progress, pipeline.segment, and pipeline.speaker_update. 
+ pub fn send_and_receive_with_progress( + &self, + msg: &IPCMessage, + on_intermediate: F, + ) -> Result + where + F: Fn(&IPCMessage), + { // Write to stdin { let mut stdin_guard = self.stdin.lock().map_err(|e| e.to_string())?; @@ -151,11 +272,17 @@ impl SidecarManager { if trimmed.is_empty() { continue; } - let response: IPCMessage = serde_json::from_str(trimmed) - .map_err(|e| format!("Parse error: {e}"))?; + let response: IPCMessage = + serde_json::from_str(trimmed).map_err(|e| format!("Parse error: {e}"))?; - // Skip progress messages, return the final result/error - if response.msg_type != "progress" { + // Forward intermediate messages via callback, return the final result/error + let is_intermediate = matches!( + response.msg_type.as_str(), + "progress" | "pipeline.segment" | "pipeline.speaker_update" + ); + if is_intermediate { + on_intermediate(&response); + } else { return Ok(response); } } diff --git a/src-tauri/src/state.rs b/src-tauri/src/state.rs index 2ad86db..320a22a 100644 --- a/src-tauri/src/state.rs +++ b/src-tauri/src/state.rs @@ -15,12 +15,10 @@ pub struct AppState { impl AppState { pub fn new() -> Result { let data_dir = LlamaManager::data_dir(); - std::fs::create_dir_all(&data_dir) - .map_err(|e| format!("Cannot create data dir: {e}"))?; + std::fs::create_dir_all(&data_dir).map_err(|e| format!("Cannot create data dir: {e}"))?; let db_path = data_dir.join("voice_to_notes.db"); - let conn = db::open_database(&db_path) - .map_err(|e| format!("Cannot open database: {e}"))?; + let conn = db::open_database(&db_path).map_err(|e| format!("Cannot open database: {e}"))?; Ok(Self { db: Mutex::new(conn), diff --git a/src-tauri/tauri.conf.json b/src-tauri/tauri.conf.json index 5d02585..be4cb48 100644 --- a/src-tauri/tauri.conf.json +++ b/src-tauri/tauri.conf.json @@ -16,7 +16,9 @@ "width": 1200, "height": 800, "minWidth": 800, - "minHeight": 600 + "minHeight": 600, + "decorations": true, + "transparent": false } ], "security": { @@ -44,7 +46,7 @@ 
"license": "MIT", "linux": { "deb": { - "depends": ["python3", "python3-pip"] + "depends": [] }, "appimage": { "bundleMediaFramework": true diff --git a/src/app.html b/src/app.html index b155e3e..5ad1b28 100644 --- a/src/app.html +++ b/src/app.html @@ -1,5 +1,5 @@ - + @@ -7,7 +7,7 @@ Voice to Notes %sveltekit.head% - +
%sveltekit.body%
diff --git a/src/lib/components/AIChatPanel.svelte b/src/lib/components/AIChatPanel.svelte index 81a328f..1a53ae0 100644 --- a/src/lib/components/AIChatPanel.svelte +++ b/src/lib/components/AIChatPanel.svelte @@ -1,6 +1,7 @@ {#if visible}
-

{stage}

-
-
+
+
+

{displayStage}

-

{percent}% — {message}

+ +
+ {#each pipelineSteps as step, idx} + {@const status = getStepStatus(idx)} +
+ + {#if status === 'done'} + ✓ + {:else if status === 'active'} + ⟳ + {:else} + · + {/if} + + {step.label} +
+ {/each} +
+ +

{message || 'Please wait...'}

+

This may take several minutes for large files

{/if} @@ -25,34 +97,81 @@ .overlay { position: fixed; inset: 0; - background: rgba(0, 0, 0, 0.7); + background: rgba(0, 0, 0, 0.8); display: flex; align-items: center; justify-content: center; - z-index: 1000; + z-index: 9999; } .progress-card { background: #16213e; - padding: 2rem; + padding: 2rem 2.5rem; border-radius: 12px; - min-width: 400px; + min-width: 380px; + max-width: 440px; color: #e0e0e0; + border: 1px solid #2a3a5e; + box-shadow: 0 8px 32px rgba(0, 0, 0, 0.5); } - h3 { margin: 0 0 1rem; text-transform: capitalize; } - .bar-track { - height: 8px; - background: #0f3460; - border-radius: 4px; - overflow: hidden; + .spinner-row { + display: flex; + align-items: center; + gap: 0.75rem; + margin-bottom: 1.25rem; } - .bar-fill { - height: 100%; - background: #e94560; - transition: width 0.3s; + .spinner { + width: 20px; + height: 20px; + border: 3px solid #2a3a5e; + border-top-color: #e94560; + border-radius: 50%; + animation: spin 0.8s linear infinite; + flex-shrink: 0; } - p { + @keyframes spin { + to { transform: rotate(360deg); } + } + h3 { + margin: 0; + font-size: 1.1rem; + } + .steps { + display: flex; + flex-direction: column; + gap: 0.4rem; + margin-bottom: 1rem; + } + .step { + display: flex; + align-items: center; + gap: 0.5rem; + font-size: 0.85rem; + color: #555; + } + .step-done { + color: #4ecdc4; + } + .step-active { + color: #e0e0e0; + font-weight: 500; + } + .step-icon { + width: 1.2rem; + text-align: center; + flex-shrink: 0; + } + .step-active .step-icon { + animation: spin 1.5s linear infinite; + display: inline-block; + } + .status-text { + margin: 0.75rem 0 0; + font-size: 0.85rem; + color: #b0b0b0; + } + .hint-text { margin: 0.5rem 0 0; - font-size: 0.875rem; - color: #999; + font-size: 0.75rem; + color: #555; } diff --git a/src/lib/components/SettingsModal.svelte b/src/lib/components/SettingsModal.svelte index 659b9e7..7f8de81 100644 --- a/src/lib/components/SettingsModal.svelte +++ b/src/lib/components/SettingsModal.svelte @@ -1,4 
+1,6 @@ @@ -95,11 +106,17 @@
- - + - + {currentTime} / {totalTime}
@@ -129,9 +146,13 @@ cursor: pointer; font-size: 1rem; } - .control-btn:hover { + .control-btn:hover:not(:disabled) { background: #1a4a7a; } + .control-btn:disabled { + opacity: 0.4; + cursor: not-allowed; + } .play-btn { padding: 0.4rem 1rem; font-size: 1.2rem; diff --git a/src/lib/stores/settings.ts b/src/lib/stores/settings.ts index 32da0ee..86262c9 100644 --- a/src/lib/stores/settings.ts +++ b/src/lib/stores/settings.ts @@ -8,12 +8,16 @@ export interface AppSettings { openai_model: string; anthropic_model: string; litellm_model: string; + litellm_api_key: string; + litellm_api_base: string; local_model_path: string; local_binary_path: string; transcription_model: string; transcription_device: string; transcription_language: string; skip_diarization: boolean; + hf_token: string; + num_speakers: number | null; } const defaults: AppSettings = { @@ -23,12 +27,16 @@ const defaults: AppSettings = { openai_model: 'gpt-4o-mini', anthropic_model: 'claude-sonnet-4-6', litellm_model: 'gpt-4o-mini', + litellm_api_key: '', + litellm_api_base: '', local_model_path: '', local_binary_path: 'llama-server', transcription_model: 'base', transcription_device: 'cpu', transcription_language: '', skip_diarization: false, + hf_token: '', + num_speakers: null, }; export const settings = writable({ ...defaults }); @@ -45,4 +53,20 @@ export async function loadSettings(): Promise { export async function saveSettings(s: AppSettings): Promise { settings.set(s); await invoke('save_settings', { settings: s }); + + // Configure the AI provider in the Python sidecar + const configMap: Record> = { + openai: { api_key: s.openai_api_key, model: s.openai_model }, + anthropic: { api_key: s.anthropic_api_key, model: s.anthropic_model }, + litellm: { api_key: s.litellm_api_key, api_base: s.litellm_api_base, model: s.litellm_model }, + local: { model: s.local_model_path, base_url: 'http://localhost:8080' }, + }; + const config = configMap[s.ai_provider]; + if (config) { + try { + await 
invoke('ai_configure', { provider: s.ai_provider, config }); + } catch { + // Sidecar may not be running yet — provider will be configured on first use + } + } } diff --git a/src/routes/+layout.svelte b/src/routes/+layout.svelte index a655c08..2112a61 100644 --- a/src/routes/+layout.svelte +++ b/src/routes/+layout.svelte @@ -10,6 +10,7 @@ padding: 0; background: #0a0a23; color: #e0e0e0; + color-scheme: dark; font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, Oxygen, Ubuntu, Cantarell, sans-serif; overflow: hidden; diff --git a/src/routes/+page.svelte b/src/routes/+page.svelte index 9b139a0..5e0f382 100644 --- a/src/routes/+page.svelte +++ b/src/routes/+page.svelte @@ -1,5 +1,6 @@ -
-

Voice to Notes

-
- - - {#if $segments.length > 0} -
- - {#if showExportMenu} -
- {#each exportFormats as fmt} - - {/each} -
+{#if !appReady} +
+

Voice to Notes

+

Loading...

+
+
+{:else} +
+
+

Voice to Notes

+
+
- {/if} + + + {#if $segments.length > 0} +
+ + {#if showExportMenu} +
+ {#each exportFormats as fmt} + + {/each} +
+ {/if} +
+ {/if} +
-
-
-
- - +
+
+ + +
+
- -
- + - showSettings = false} -/> + showSettings = false} + /> +{/if}