Cross-platform distribution, UI improvements, and performance optimizations
- PyInstaller frozen sidecar: spec file, build script, and ffmpeg path resolver for self-contained distribution without Python prerequisites
- Dual-mode sidecar launcher: frozen binary (production) with dev-mode fallback
- Parallel transcription + diarization pipeline (~30-40% faster)
- GPU auto-detection for diarization (CUDA when available)
- Async run_pipeline command for real-time progress event delivery
- Web Audio API backend for instant playback and seeking
- OpenAI-compatible provider replacing LiteLLM client-side routing
- Cross-platform RAM detection (Linux/macOS/Windows)
- Settings: speaker count hint, token reveal toggles, dark dropdown styling
- Loading splash screen, flexbox layout fix for viewport overflow
- Gitea Actions CI/CD pipeline (Linux, Windows, macOS ARM)
- Updated README and CLAUDE.md documentation

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
215
python/build_sidecar.py
Normal file
215
python/build_sidecar.py
Normal file
@@ -0,0 +1,215 @@
|
||||
#!/usr/bin/env python3
|
||||
"""Build the Voice to Notes sidecar as a standalone binary using PyInstaller.
|
||||
|
||||
Usage:
|
||||
python build_sidecar.py [--cpu-only]
|
||||
|
||||
Produces a directory `dist/voice-to-notes-sidecar/` containing the frozen
|
||||
sidecar binary and all dependencies. The main binary is renamed to include
|
||||
the Tauri target triple for externalBin resolution.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import os
|
||||
import platform
|
||||
import shutil
|
||||
import stat
|
||||
import subprocess
|
||||
import sys
|
||||
import urllib.request
|
||||
import zipfile
|
||||
from pathlib import Path
|
||||
|
||||
SCRIPT_DIR = Path(__file__).resolve().parent
|
||||
DIST_DIR = SCRIPT_DIR / "dist"
|
||||
BUILD_DIR = SCRIPT_DIR / "build"
|
||||
SPEC_FILE = SCRIPT_DIR / "voice_to_notes.spec"
|
||||
|
||||
# Static ffmpeg download URLs (GPL-licensed builds)
|
||||
FFMPEG_URLS: dict[str, str] = {
|
||||
"linux-x86_64": "https://johnvansickle.com/ffmpeg/releases/ffmpeg-release-amd64-static.tar.xz",
|
||||
"darwin-x86_64": "https://evermeet.cx/ffmpeg/getrelease/zip",
|
||||
"darwin-arm64": "https://evermeet.cx/ffmpeg/getrelease/zip",
|
||||
"win32-x86_64": "https://www.gyan.dev/ffmpeg/builds/ffmpeg-release-essentials.zip",
|
||||
}
|
||||
|
||||
|
||||
def get_target_triple() -> str:
    """Determine the Tauri-compatible target triple for the current platform.

    Combines a normalized CPU architecture (from ``platform.machine``) with
    the Rust-style vendor/OS suffix Tauri expects for externalBin names.
    """
    raw_machine = platform.machine().lower()
    system = platform.system().lower()

    # Normalize vendor-specific architecture spellings to Rust names.
    arch = {
        "x86_64": "x86_64",
        "amd64": "x86_64",
        "aarch64": "aarch64",
        "arm64": "aarch64",
    }.get(raw_machine, raw_machine)

    # Map the OS to its conventional vendor/OS triple suffix.
    if system == "linux":
        suffix = "unknown-linux-gnu"
    elif system == "darwin":
        suffix = "apple-darwin"
    elif system == "windows":
        suffix = "pc-windows-msvc"
    else:
        # Unrecognized OS: fall back to a generic "unknown" vendor.
        suffix = f"unknown-{system}"

    return f"{arch}-{suffix}"
|
||||
|
||||
|
||||
def create_venv_and_install(cpu_only: bool) -> Path:
    """Create a fresh build venv and install all sidecar dependencies.

    Args:
        cpu_only: When True, install CPU-only PyTorch wheels to avoid
            bundling ~2GB of CUDA libraries into the frozen sidecar.

    Returns:
        Path to the Python interpreter inside the new venv.

    Raises:
        subprocess.CalledProcessError: If any install step fails.
    """
    venv_dir = BUILD_DIR / "sidecar-venv"
    if venv_dir.exists():
        # Always start clean so stale packages never leak into the bundle.
        shutil.rmtree(venv_dir)

    print(f"[build] Creating venv at {venv_dir}")
    subprocess.run([sys.executable, "-m", "venv", str(venv_dir)], check=True)

    # Interpreter location differs between Windows and POSIX venv layouts.
    bin_dir = venv_dir / ("Scripts" if sys.platform == "win32" else "bin")
    python = str(bin_dir / "python")

    # Fix: invoke pip via `python -m pip` instead of the pip executable.
    # On Windows, pip.exe cannot upgrade itself while it is running; the
    # module invocation is the documented-safe form on every platform.
    pip = [python, "-m", "pip"]
    subprocess.run([*pip, "install", "--upgrade", "pip"], check=True)

    # Install torch first so the chosen index takes precedence over the
    # project's transitive dependency resolution.
    if cpu_only:
        print("[build] Installing PyTorch (CPU-only)")
        subprocess.run(
            [*pip, "install", "torch", "torchaudio",
             "--index-url", "https://download.pytorch.org/whl/cpu"],
            check=True,
        )
    else:
        print("[build] Installing PyTorch (default, may include CUDA)")
        subprocess.run([*pip, "install", "torch", "torchaudio"], check=True)

    # Install the project and its dev extras (includes pyinstaller).
    print("[build] Installing project dependencies")
    subprocess.run([*pip, "install", "-e", f"{SCRIPT_DIR}[dev]"], check=True)

    return Path(python)
|
||||
|
||||
|
||||
def run_pyinstaller(python: Path) -> Path:
    """Freeze the sidecar with PyInstaller and return the output directory.

    Args:
        python: Interpreter inside the build venv (has PyInstaller installed).

    Raises:
        RuntimeError: If PyInstaller exited successfully but produced no
            output directory.
    """
    print("[build] Running PyInstaller")
    cmd = [str(python), "-m", "PyInstaller", "--clean", "--noconfirm", str(SPEC_FILE)]
    # Run from the script directory so relative paths in the spec resolve.
    subprocess.run(cmd, cwd=str(SCRIPT_DIR), check=True)

    bundle_dir = DIST_DIR / "voice-to-notes-sidecar"
    if not bundle_dir.exists():
        raise RuntimeError(f"PyInstaller output not found at {bundle_dir}")
    return bundle_dir
|
||||
|
||||
|
||||
def download_ffmpeg(output_dir: Path) -> None:
    """Download a static ffmpeg/ffprobe binary for the current platform.

    Best-effort: on any failure (no URL for this platform, network error,
    unexpected archive layout) a warning is printed and the build continues
    without bundled ffmpeg — the runtime resolver then falls back to PATH.

    Args:
        output_dir: The frozen-bundle directory; binaries are placed directly
            inside it, next to the sidecar executable.
    """
    # Build a "system-arch" key matching the FFMPEG_URLS table.
    system = sys.platform
    machine = platform.machine().lower()
    if machine in ("amd64", "x86_64"):
        machine = "x86_64"
    elif machine in ("aarch64", "arm64"):
        machine = "arm64"

    key = f"{system}-{machine}"
    # These reassignments produce the same value as above; kept for clarity
    # of which platforms are explicitly supported.
    if system == "win32":
        key = f"win32-{machine}"
    elif system == "linux":
        key = f"linux-{machine}"

    url = FFMPEG_URLS.get(key)
    if not url:
        print(f"[build] Warning: No ffmpeg download URL for platform {key}, skipping")
        return

    print(f"[build] Downloading ffmpeg for {key}")
    tmp_path = output_dir / "ffmpeg_download"
    try:
        urllib.request.urlretrieve(url, str(tmp_path))

        if url.endswith(".tar.xz"):
            # Linux static build
            import tarfile
            with tarfile.open(str(tmp_path), "r:xz") as tar:
                for member in tar.getmembers():
                    basename = os.path.basename(member.name)
                    if basename in ("ffmpeg", "ffprobe"):
                        # Rewrite the member name so extraction flattens the
                        # archive's internal directory structure into
                        # output_dir (also prevents path traversal).
                        member.name = basename
                        tar.extract(member, path=str(output_dir))
                        dest = output_dir / basename
                        # Ensure the extracted binary is executable.
                        dest.chmod(dest.stat().st_mode | stat.S_IEXEC)
        elif url.endswith(".zip"):
            # macOS (evermeet) and Windows (gyan.dev) ship zip archives.
            # NOTE(review): the evermeet release zip appears to contain only
            # ffmpeg; ffprobe may require a separate download — verify.
            with zipfile.ZipFile(str(tmp_path), "r") as zf:
                for name in zf.namelist():
                    basename = os.path.basename(name)
                    if basename in ("ffmpeg", "ffprobe", "ffmpeg.exe", "ffprobe.exe"):
                        data = zf.read(name)
                        dest = output_dir / basename
                        dest.write_bytes(data)
                        if sys.platform != "win32":
                            # write_bytes does not set the exec bit.
                            dest.chmod(dest.stat().st_mode | stat.S_IEXEC)
        print("[build] ffmpeg downloaded successfully")
    except Exception as e:
        # Deliberate best-effort: a missing bundled ffmpeg is non-fatal.
        print(f"[build] Warning: Failed to download ffmpeg: {e}")
    finally:
        # Always remove the downloaded archive, even on failure.
        if tmp_path.exists():
            tmp_path.unlink()
|
||||
|
||||
|
||||
def rename_binary(output_dir: Path, target_triple: str) -> None:
    """Suffix the frozen sidecar binary with the Tauri target triple.

    Tauri's externalBin lookup expects binaries named
    ``<name>-<target-triple>[.exe]``, so the PyInstaller output is renamed
    in place. A missing source binary is reported but not fatal.
    """
    ext = ".exe" if sys.platform == "win32" else ""
    src = output_dir / f"voice-to-notes-sidecar{ext}"
    dst = output_dir / f"voice-to-notes-sidecar-{target_triple}{ext}"

    if not src.exists():
        print(f"[build] Warning: Expected binary not found at {src}")
        return

    print(f"[build] Renaming {src.name} -> {dst.name}")
    src.rename(dst)
|
||||
|
||||
|
||||
def _build_arg_parser() -> argparse.ArgumentParser:
    """Construct the command-line parser for the sidecar build script."""
    parser = argparse.ArgumentParser(description="Build the Voice to Notes sidecar binary")
    parser.add_argument(
        "--cpu-only",
        action="store_true",
        default=True,
        help="Install CPU-only PyTorch (default: True, avoids bundling CUDA)",
    )
    parser.add_argument(
        "--with-cuda",
        action="store_true",
        help="Install PyTorch with CUDA support",
    )
    return parser


def main() -> None:
    """Build the sidecar: venv setup, PyInstaller freeze, ffmpeg, rename."""
    args = _build_arg_parser().parse_args()
    # --with-cuda wins over the (default-on) --cpu-only flag.
    cpu_only = not args.with_cuda

    target_triple = get_target_triple()
    print(f"[build] Target triple: {target_triple}")
    print(f"[build] CPU-only: {cpu_only}")

    python = create_venv_and_install(cpu_only)
    output_dir = run_pyinstaller(python)
    download_ffmpeg(output_dir)
    rename_binary(output_dir, target_triple)

    print(f"\n[build] Done! Sidecar built at: {output_dir}")
    print("[build] Copy contents to src-tauri/binaries/ for Tauri bundling")


if __name__ == "__main__":
    main()
|
||||
@@ -13,6 +13,8 @@ dependencies = [
|
||||
"faster-whisper>=1.1.0",
|
||||
"pyannote.audio>=3.1.0",
|
||||
"pysubs2>=1.7.0",
|
||||
"openai>=1.0.0",
|
||||
"anthropic>=0.20.0",
|
||||
]
|
||||
|
||||
[project.optional-dependencies]
|
||||
@@ -20,6 +22,7 @@ dev = [
|
||||
"ruff>=0.8.0",
|
||||
"pytest>=8.0.0",
|
||||
"pytest-asyncio>=0.24.0",
|
||||
"pyinstaller>=6.0",
|
||||
]
|
||||
|
||||
[tool.ruff]
|
||||
|
||||
67
python/voice_to_notes.spec
Normal file
67
python/voice_to_notes.spec
Normal file
@@ -0,0 +1,67 @@
|
||||
# -*- mode: python ; coding: utf-8 -*-
"""PyInstaller spec for the Voice to Notes sidecar binary.

Targets PyInstaller >= 6.0 (as pinned in pyproject's dev extras): the
``cipher``/``block_cipher`` bytecode-encryption options and the
``win_no_prefer_redirects``/``win_private_assemblies`` Analysis options
were removed in 6.0 and are no longer passed.
"""

from PyInstaller.utils.hooks import collect_all

# Collect data files, shared libraries, and hidden imports for ML packages
# whose native components PyInstaller's static analysis routinely misses.
ctranslate2_datas, ctranslate2_binaries, ctranslate2_hiddenimports = collect_all("ctranslate2")
faster_whisper_datas, faster_whisper_binaries, faster_whisper_hiddenimports = collect_all(
    "faster_whisper"
)
# NOTE(review): "pyannote" is a namespace package; collect_all("pyannote.audio")
# may be needed instead — verify the collected file list is non-empty.
pyannote_datas, pyannote_binaries, pyannote_hiddenimports = collect_all("pyannote")

a = Analysis(
    ["voice_to_notes/main.py"],
    pathex=[],
    binaries=ctranslate2_binaries + faster_whisper_binaries + pyannote_binaries,
    datas=ctranslate2_datas + faster_whisper_datas + pyannote_datas,
    hiddenimports=[
        "torch",
        "torchaudio",
        "huggingface_hub",
        "pysubs2",
        "openai",
        "anthropic",
        # "litellm" removed: the LiteLLM client provider was replaced by the
        # OpenAI-compatible provider and litellm is not a declared dependency.
    ]
    + ctranslate2_hiddenimports
    + faster_whisper_hiddenimports
    + pyannote_hiddenimports,
    hookspath=[],
    hooksconfig={},
    runtime_hooks=[],
    # Trim packages the frozen sidecar never needs.
    excludes=["tkinter", "test", "unittest", "pip", "setuptools"],
    noarchive=False,
)

pyz = PYZ(a.pure)

exe = EXE(
    pyz,
    a.scripts,
    [],
    exclude_binaries=True,
    name="voice-to-notes-sidecar",
    debug=False,
    bootloader_ignore_signals=False,
    strip=False,
    upx=True,
    # Console mode: the sidecar communicates over stdio with the Tauri host.
    console=True,
)

coll = COLLECT(
    exe,
    a.binaries,
    a.zipfiles,
    a.datas,
    strip=False,
    upx=True,
    upx_exclude=[],
    name="voice-to-notes-sidecar",
)
|
||||
@@ -2,7 +2,10 @@
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import ctypes
|
||||
import os
|
||||
import platform
|
||||
import subprocess
|
||||
import sys
|
||||
from dataclasses import dataclass
|
||||
|
||||
@@ -21,6 +24,77 @@ class HardwareInfo:
|
||||
recommended_compute_type: str = "int8"
|
||||
|
||||
|
||||
def _detect_ram_mb() -> int:
|
||||
"""Detect total system RAM in MB (cross-platform).
|
||||
|
||||
Tries platform-specific methods in order:
|
||||
1. Linux: read /proc/meminfo
|
||||
2. macOS: sysctl hw.memsize
|
||||
3. Windows: GlobalMemoryStatusEx via ctypes
|
||||
4. Fallback: os.sysconf (most Unix systems)
|
||||
|
||||
Returns 0 if all methods fail.
|
||||
"""
|
||||
# Linux: read /proc/meminfo
|
||||
if sys.platform == "linux":
|
||||
try:
|
||||
with open("/proc/meminfo") as f:
|
||||
for line in f:
|
||||
if line.startswith("MemTotal:"):
|
||||
# Value is in kB
|
||||
return int(line.split()[1]) // 1024
|
||||
except (FileNotFoundError, ValueError, OSError):
|
||||
pass
|
||||
|
||||
# macOS: sysctl hw.memsize (returns bytes)
|
||||
if sys.platform == "darwin":
|
||||
try:
|
||||
result = subprocess.run(
|
||||
["sysctl", "-n", "hw.memsize"],
|
||||
capture_output=True,
|
||||
text=True,
|
||||
check=True,
|
||||
)
|
||||
return int(result.stdout.strip()) // (1024 * 1024)
|
||||
except (subprocess.SubprocessError, ValueError, OSError):
|
||||
pass
|
||||
|
||||
# Windows: GlobalMemoryStatusEx via ctypes
|
||||
if sys.platform == "win32":
|
||||
try:
|
||||
|
||||
class MEMORYSTATUSEX(ctypes.Structure):
|
||||
_fields_ = [
|
||||
("dwLength", ctypes.c_ulong),
|
||||
("dwMemoryLoad", ctypes.c_ulong),
|
||||
("ullTotalPhys", ctypes.c_ulonglong),
|
||||
("ullAvailPhys", ctypes.c_ulonglong),
|
||||
("ullTotalPageFile", ctypes.c_ulonglong),
|
||||
("ullAvailPageFile", ctypes.c_ulonglong),
|
||||
("ullTotalVirtual", ctypes.c_ulonglong),
|
||||
("ullAvailVirtual", ctypes.c_ulonglong),
|
||||
("ullAvailExtendedVirtual", ctypes.c_ulonglong),
|
||||
]
|
||||
|
||||
mem_status = MEMORYSTATUSEX()
|
||||
mem_status.dwLength = ctypes.sizeof(MEMORYSTATUSEX)
|
||||
if ctypes.windll.kernel32.GlobalMemoryStatusEx(ctypes.byref(mem_status)):
|
||||
return int(mem_status.ullTotalPhys) // (1024 * 1024)
|
||||
except (AttributeError, OSError):
|
||||
pass
|
||||
|
||||
# Fallback: os.sysconf (works on most Unix systems)
|
||||
try:
|
||||
page_size = os.sysconf("SC_PAGE_SIZE")
|
||||
phys_pages = os.sysconf("SC_PHYS_PAGES")
|
||||
if page_size > 0 and phys_pages > 0:
|
||||
return (page_size * phys_pages) // (1024 * 1024)
|
||||
except (ValueError, OSError, AttributeError):
|
||||
pass
|
||||
|
||||
return 0
|
||||
|
||||
|
||||
def detect_hardware() -> HardwareInfo:
|
||||
"""Detect available hardware and recommend model configuration."""
|
||||
info = HardwareInfo()
|
||||
@@ -28,16 +102,8 @@ def detect_hardware() -> HardwareInfo:
|
||||
# CPU info
|
||||
info.cpu_cores = os.cpu_count() or 1
|
||||
|
||||
# RAM info
|
||||
try:
|
||||
with open("/proc/meminfo") as f:
|
||||
for line in f:
|
||||
if line.startswith("MemTotal:"):
|
||||
# Value is in kB
|
||||
info.ram_mb = int(line.split()[1]) // 1024
|
||||
break
|
||||
except (FileNotFoundError, ValueError):
|
||||
pass
|
||||
# RAM info (cross-platform)
|
||||
info.ram_mb = _detect_ram_mb()
|
||||
|
||||
# CUDA detection
|
||||
try:
|
||||
|
||||
@@ -260,10 +260,12 @@ def make_ai_chat_handler() -> HandlerFunc:
|
||||
model=config.get("model", "claude-sonnet-4-6"),
|
||||
))
|
||||
elif provider_name == "litellm":
|
||||
from voice_to_notes.providers.litellm_provider import LiteLLMProvider
|
||||
from voice_to_notes.providers.litellm_provider import OpenAICompatibleProvider
|
||||
|
||||
service.register_provider("litellm", LiteLLMProvider(
|
||||
service.register_provider("litellm", OpenAICompatibleProvider(
|
||||
model=config.get("model", "gpt-4o-mini"),
|
||||
api_key=config.get("api_key"),
|
||||
api_base=config.get("api_base"),
|
||||
))
|
||||
return IPCMessage(
|
||||
id=msg.id,
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
"""LiteLLM provider — multi-provider gateway."""
|
||||
"""OpenAI-compatible provider — works with any OpenAI-compatible API endpoint."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
@@ -7,36 +7,44 @@ from typing import Any
|
||||
from voice_to_notes.providers.base import AIProvider
|
||||
|
||||
|
||||
class OpenAICompatibleProvider(AIProvider):
    """Connects to any OpenAI-compatible API (LiteLLM proxy, Ollama, vLLM, etc.)."""

    def __init__(
        self,
        api_key: str | None = None,
        api_base: str | None = None,
        model: str = "gpt-4o-mini",
        **kwargs: Any,
    ) -> None:
        """Configure the endpoint.

        Args:
            api_key: API key; local servers usually ignore it, but the client
                requires a non-empty value, so a placeholder is substituted.
            api_base: Base URL of the OpenAI-compatible endpoint.
            model: Default model name used when a call does not override it.
            **kwargs: Default chat parameters (e.g. ``temperature``,
                ``max_tokens``) applied to every call; per-call kwargs win.
        """
        self._api_key = api_key or "sk-no-key"
        self._api_base = api_base
        self._model = model
        self._extra_kwargs = kwargs

    def chat(self, messages: list[dict[str, str]], **kwargs: Any) -> str:
        """Send a chat completion request and return the assistant's text."""
        try:
            from openai import OpenAI
        except ImportError as exc:
            raise RuntimeError(
                "openai package is required. Install with: pip install openai"
            ) from exc

        client_kwargs: dict[str, Any] = {"api_key": self._api_key}
        if self._api_base:
            client_kwargs["base_url"] = self._api_base

        # Fix: constructor kwargs were stored in _extra_kwargs but never
        # applied. Merge them as call defaults (per-call kwargs take
        # precedence), restoring the previous merged-kwargs behavior.
        merged_kwargs = {**self._extra_kwargs, **kwargs}

        client = OpenAI(**client_kwargs)
        response = client.chat.completions.create(
            model=merged_kwargs.get("model", self._model),
            messages=messages,
            temperature=merged_kwargs.get("temperature", 0.7),
            max_tokens=merged_kwargs.get("max_tokens", 2048),
        )
        return response.choices[0].message.content or ""

    def is_available(self) -> bool:
        """Report whether this provider can be used.

        NOTE(review): ``_api_key`` defaults to a truthy placeholder, so this
        effectively gates on ``api_base`` only — confirm that is intended
        (the official OpenAI endpoint needs no api_base).
        """
        try:
            import openai  # noqa: F401
            return bool(self._api_key and self._api_base)
        except ImportError:
            return False

    @property
    def name(self) -> str:
        return "OpenAI Compatible"
|
||||
|
||||
@@ -92,7 +92,7 @@ class AIProviderService:
|
||||
def create_default_service() -> AIProviderService:
|
||||
"""Create an AIProviderService with all supported providers registered."""
|
||||
from voice_to_notes.providers.anthropic_provider import AnthropicProvider
|
||||
from voice_to_notes.providers.litellm_provider import LiteLLMProvider
|
||||
from voice_to_notes.providers.litellm_provider import OpenAICompatibleProvider
|
||||
from voice_to_notes.providers.local_provider import LocalProvider
|
||||
from voice_to_notes.providers.openai_provider import OpenAIProvider
|
||||
|
||||
@@ -100,5 +100,5 @@ def create_default_service() -> AIProviderService:
|
||||
service.register_provider("local", LocalProvider())
|
||||
service.register_provider("openai", OpenAIProvider())
|
||||
service.register_provider("anthropic", AnthropicProvider())
|
||||
service.register_provider("litellm", LiteLLMProvider())
|
||||
service.register_provider("litellm", OpenAICompatibleProvider())
|
||||
return service
|
||||
|
||||
@@ -16,6 +16,7 @@ from typing import Any
|
||||
# np.isfinite(None) crashes when max_speakers is not set.
|
||||
os.environ.setdefault("PYANNOTE_METRICS_ENABLED", "false")
|
||||
|
||||
from voice_to_notes.utils.ffmpeg import get_ffmpeg_path
|
||||
from voice_to_notes.ipc.messages import progress_message
|
||||
from voice_to_notes.ipc.protocol import write_message
|
||||
|
||||
@@ -40,7 +41,7 @@ def _ensure_wav(file_path: str) -> tuple[str, str | None]:
|
||||
try:
|
||||
subprocess.run(
|
||||
[
|
||||
"ffmpeg", "-y", "-i", file_path,
|
||||
get_ffmpeg_path(), "-y", "-i", file_path,
|
||||
"-ar", "16000", "-ac", "1", "-c:a", "pcm_s16le",
|
||||
tmp.name,
|
||||
],
|
||||
@@ -118,6 +119,14 @@ class DiarizeService:
|
||||
|
||||
self._pipeline = Pipeline.from_pretrained(model_name, token=hf_token)
|
||||
print(f"[sidecar] Loaded diarization model: {model_name}", file=sys.stderr, flush=True)
|
||||
# Move pipeline to GPU if available
|
||||
try:
|
||||
import torch
|
||||
if torch.cuda.is_available():
|
||||
self._pipeline = self._pipeline.to(torch.device("cuda"))
|
||||
print(f"[sidecar] Diarization pipeline moved to GPU", file=sys.stderr, flush=True)
|
||||
except Exception as e:
|
||||
print(f"[sidecar] GPU not available for diarization: {e}", file=sys.stderr, flush=True)
|
||||
return self._pipeline
|
||||
except Exception as e:
|
||||
last_error = e
|
||||
|
||||
@@ -2,6 +2,7 @@
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import concurrent.futures
|
||||
import sys
|
||||
import time
|
||||
from dataclasses import dataclass, field
|
||||
@@ -13,6 +14,7 @@ from voice_to_notes.ipc.messages import (
|
||||
speaker_update_message,
|
||||
)
|
||||
from voice_to_notes.ipc.protocol import write_message
|
||||
from voice_to_notes.utils.ffmpeg import get_ffprobe_path
|
||||
from voice_to_notes.services.diarize import DiarizeService, SpeakerSegment
|
||||
from voice_to_notes.services.transcribe import (
|
||||
SegmentResult,
|
||||
@@ -82,7 +84,7 @@ class PipelineService:
|
||||
"""
|
||||
start_time = time.time()
|
||||
|
||||
# Step 1: Transcribe
|
||||
# Step 0: Probe audio duration for conditional chunked transcription
|
||||
write_message(
|
||||
progress_message(request_id, 0, "pipeline", "Starting transcription pipeline...")
|
||||
)
|
||||
@@ -96,12 +98,11 @@ class PipelineService:
|
||||
"words": [{"word": w.word, "start_ms": w.start_ms, "end_ms": w.end_ms, "confidence": w.confidence} for w in seg.words],
|
||||
}))
|
||||
|
||||
# Probe audio duration for conditional chunked transcription
|
||||
audio_duration_sec = None
|
||||
try:
|
||||
import subprocess
|
||||
probe_result = subprocess.run(
|
||||
["ffprobe", "-v", "quiet", "-show_entries", "format=duration",
|
||||
[get_ffprobe_path(), "-v", "quiet", "-show_entries", "format=duration",
|
||||
"-of", "default=noprint_wrappers=1:nokey=1", file_path],
|
||||
capture_output=True, text=True, check=True,
|
||||
)
|
||||
@@ -109,30 +110,33 @@ class PipelineService:
|
||||
except (subprocess.CalledProcessError, FileNotFoundError, ValueError):
|
||||
pass
|
||||
|
||||
from voice_to_notes.services.transcribe import LARGE_FILE_THRESHOLD_SEC
|
||||
if audio_duration_sec and audio_duration_sec > LARGE_FILE_THRESHOLD_SEC:
|
||||
transcription = self._transcribe_service.transcribe_chunked(
|
||||
request_id=request_id,
|
||||
file_path=file_path,
|
||||
model_name=model_name,
|
||||
device=device,
|
||||
compute_type=compute_type,
|
||||
language=language,
|
||||
on_segment=_emit_segment,
|
||||
)
|
||||
else:
|
||||
transcription = self._transcribe_service.transcribe(
|
||||
request_id=request_id,
|
||||
file_path=file_path,
|
||||
model_name=model_name,
|
||||
device=device,
|
||||
compute_type=compute_type,
|
||||
language=language,
|
||||
on_segment=_emit_segment,
|
||||
)
|
||||
def _run_transcription() -> TranscriptionResult:
|
||||
"""Run transcription (chunked or standard based on duration)."""
|
||||
from voice_to_notes.services.transcribe import LARGE_FILE_THRESHOLD_SEC
|
||||
if audio_duration_sec and audio_duration_sec > LARGE_FILE_THRESHOLD_SEC:
|
||||
return self._transcribe_service.transcribe_chunked(
|
||||
request_id=request_id,
|
||||
file_path=file_path,
|
||||
model_name=model_name,
|
||||
device=device,
|
||||
compute_type=compute_type,
|
||||
language=language,
|
||||
on_segment=_emit_segment,
|
||||
)
|
||||
else:
|
||||
return self._transcribe_service.transcribe(
|
||||
request_id=request_id,
|
||||
file_path=file_path,
|
||||
model_name=model_name,
|
||||
device=device,
|
||||
compute_type=compute_type,
|
||||
language=language,
|
||||
on_segment=_emit_segment,
|
||||
)
|
||||
|
||||
if skip_diarization:
|
||||
# Convert transcription directly without speaker labels
|
||||
# Sequential: transcribe only, no diarization needed
|
||||
transcription = _run_transcription()
|
||||
result = PipelineResult(
|
||||
language=transcription.language,
|
||||
language_probability=transcription.language_probability,
|
||||
@@ -150,37 +154,59 @@ class PipelineService:
|
||||
)
|
||||
return result
|
||||
|
||||
# Step 2: Diarize (with graceful fallback)
|
||||
# Parallel execution: run transcription (0-45%) and diarization (45-90%)
|
||||
# concurrently, then merge (90-100%).
|
||||
write_message(
|
||||
progress_message(request_id, 50, "pipeline", "Starting speaker diarization...")
|
||||
progress_message(
|
||||
request_id, 0, "pipeline",
|
||||
"Starting transcription and diarization in parallel..."
|
||||
)
|
||||
)
|
||||
|
||||
diarization = None
|
||||
try:
|
||||
diarization = self._diarize_service.diarize(
|
||||
diarization_error = None
|
||||
|
||||
with concurrent.futures.ThreadPoolExecutor(max_workers=2) as executor:
|
||||
transcription_future = executor.submit(_run_transcription)
|
||||
|
||||
# Use probed audio_duration_sec for diarization progress estimation
|
||||
# (transcription hasn't finished yet, so we can't use transcription.duration_ms)
|
||||
diarization_future = executor.submit(
|
||||
self._diarize_service.diarize,
|
||||
request_id=request_id,
|
||||
file_path=file_path,
|
||||
num_speakers=num_speakers,
|
||||
min_speakers=min_speakers,
|
||||
max_speakers=max_speakers,
|
||||
hf_token=hf_token,
|
||||
audio_duration_sec=transcription.duration_ms / 1000.0,
|
||||
audio_duration_sec=audio_duration_sec,
|
||||
)
|
||||
except Exception as e:
|
||||
import traceback
|
||||
print(
|
||||
f"[sidecar] Diarization failed, falling back to transcription-only: {e}",
|
||||
file=sys.stderr,
|
||||
flush=True,
|
||||
)
|
||||
traceback.print_exc(file=sys.stderr)
|
||||
|
||||
# Wait for both futures. We need the transcription result regardless,
|
||||
# but diarization may fail gracefully.
|
||||
transcription = transcription_future.result()
|
||||
write_message(
|
||||
progress_message(
|
||||
request_id, 80, "pipeline",
|
||||
f"Diarization failed ({e}), using transcription only..."
|
||||
)
|
||||
progress_message(request_id, 45, "pipeline", "Transcription complete")
|
||||
)
|
||||
|
||||
try:
|
||||
diarization = diarization_future.result()
|
||||
except Exception as e:
|
||||
import traceback
|
||||
diarization_error = e
|
||||
print(
|
||||
f"[sidecar] Diarization failed, falling back to transcription-only: {e}",
|
||||
file=sys.stderr,
|
||||
flush=True,
|
||||
)
|
||||
traceback.print_exc(file=sys.stderr)
|
||||
write_message(
|
||||
progress_message(
|
||||
request_id, 80, "pipeline",
|
||||
f"Diarization failed ({e}), using transcription only..."
|
||||
)
|
||||
)
|
||||
|
||||
# Step 3: Merge (or skip if diarization failed)
|
||||
if diarization is not None:
|
||||
write_message(
|
||||
|
||||
@@ -12,6 +12,7 @@ from faster_whisper import WhisperModel
|
||||
|
||||
from voice_to_notes.ipc.messages import progress_message
|
||||
from voice_to_notes.ipc.protocol import write_message
|
||||
from voice_to_notes.utils.ffmpeg import get_ffmpeg_path, get_ffprobe_path
|
||||
|
||||
CHUNK_REPORT_SIZE = 10
|
||||
LARGE_FILE_THRESHOLD_SEC = 3600 # 1 hour
|
||||
@@ -202,7 +203,7 @@ class TranscribeService:
|
||||
# Get total duration via ffprobe
|
||||
try:
|
||||
probe_result = subprocess.run(
|
||||
["ffprobe", "-v", "quiet", "-show_entries", "format=duration",
|
||||
[get_ffprobe_path(), "-v", "quiet", "-show_entries", "format=duration",
|
||||
"-of", "default=noprint_wrappers=1:nokey=1", file_path],
|
||||
capture_output=True, text=True, check=True,
|
||||
)
|
||||
@@ -235,7 +236,7 @@ class TranscribeService:
|
||||
tmp.close()
|
||||
try:
|
||||
subprocess.run(
|
||||
["ffmpeg", "-y", "-ss", str(chunk_start),
|
||||
[get_ffmpeg_path(), "-y", "-ss", str(chunk_start),
|
||||
"-t", str(chunk_duration_sec),
|
||||
"-i", file_path,
|
||||
"-ar", "16000", "-ac", "1", "-c:a", "pcm_s16le",
|
||||
|
||||
43
python/voice_to_notes/utils/ffmpeg.py
Normal file
43
python/voice_to_notes/utils/ffmpeg.py
Normal file
@@ -0,0 +1,43 @@
|
||||
"""Resolve ffmpeg/ffprobe paths for both frozen and development builds."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import os
|
||||
import sys
|
||||
|
||||
|
||||
def get_ffmpeg_path() -> str:
|
||||
"""Return the path to the ffmpeg binary.
|
||||
|
||||
When running as a frozen PyInstaller bundle, looks next to sys.executable.
|
||||
Otherwise falls back to the system PATH.
|
||||
"""
|
||||
if getattr(sys, "frozen", False):
|
||||
# Frozen PyInstaller bundle — ffmpeg is next to the sidecar binary
|
||||
bundle_dir = os.path.dirname(sys.executable)
|
||||
candidates = [
|
||||
os.path.join(bundle_dir, "ffmpeg.exe" if sys.platform == "win32" else "ffmpeg"),
|
||||
os.path.join(bundle_dir, "ffmpeg"),
|
||||
]
|
||||
for path in candidates:
|
||||
if os.path.isfile(path):
|
||||
return path
|
||||
return "ffmpeg"
|
||||
|
||||
|
||||
def get_ffprobe_path() -> str:
|
||||
"""Return the path to the ffprobe binary.
|
||||
|
||||
When running as a frozen PyInstaller bundle, looks next to sys.executable.
|
||||
Otherwise falls back to the system PATH.
|
||||
"""
|
||||
if getattr(sys, "frozen", False):
|
||||
bundle_dir = os.path.dirname(sys.executable)
|
||||
candidates = [
|
||||
os.path.join(bundle_dir, "ffprobe.exe" if sys.platform == "win32" else "ffprobe"),
|
||||
os.path.join(bundle_dir, "ffprobe"),
|
||||
]
|
||||
for path in candidates:
|
||||
if os.path.isfile(path):
|
||||
return path
|
||||
return "ffprobe"
|
||||
Reference in New Issue
Block a user