Cross-platform distribution, UI improvements, and performance optimizations

- PyInstaller frozen sidecar: spec file, build script, and ffmpeg path resolver
  for self-contained distribution without Python prerequisites (see the spec sketch below)
- Dual-mode sidecar launcher: frozen binary (production) with dev mode fallback
- Parallel transcription + diarization pipeline (~30-40% faster)
- GPU auto-detection for diarization (CUDA when available)
- Async run_pipeline command for real-time progress event delivery
- Web Audio API backend for instant playback and seeking
- OpenAI-compatible provider replacing LiteLLM client-side routing
- Cross-platform RAM detection (Linux/macOS/Windows)
- Settings: speaker count hint, token reveal toggles, dark dropdown styling
- Loading splash screen, flexbox layout fix for viewport overflow
- Gitea Actions CI/CD pipeline (Linux, Windows, macOS ARM)
- Updated README and CLAUDE.md documentation

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
Author: Claude
Date: 2026-03-20 21:33:43 -07:00
parent 42ccd3e21d
commit 58faa83cb3
27 changed files with 1301 additions and 283 deletions
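
The spec file and build script from the first bullet are not among the diffs below. As a rough, hedged sketch of what a minimal sidecar spec could look like (the entry-point path and names here are assumptions, not taken from this commit):

```python
# sidecar.spec -- hypothetical sketch only; the actual spec file is not shown
# in this commit view. Build with: pyinstaller sidecar.spec
a = Analysis(
    ["voice_to_notes/__main__.py"],                 # assumed sidecar entry point
    binaries=[("ffmpeg", "."), ("ffprobe", ".")],   # bundle binaries next to the exe
)
pyz = PYZ(a.pure)
exe = EXE(pyz, a.scripts, a.binaries, a.datas, name="sidecar", console=True)
```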

View File

@@ -2,7 +2,10 @@
 from __future__ import annotations
 
+import ctypes
 import os
+import platform
+import subprocess
 import sys
 
 from dataclasses import dataclass
 
@@ -21,6 +24,77 @@ class HardwareInfo:
     recommended_compute_type: str = "int8"
 
 
+def _detect_ram_mb() -> int:
+    """Detect total system RAM in MB (cross-platform).
+
+    Tries platform-specific methods in order:
+    1. Linux: read /proc/meminfo
+    2. macOS: sysctl hw.memsize
+    3. Windows: GlobalMemoryStatusEx via ctypes
+    4. Fallback: os.sysconf (most Unix systems)
+
+    Returns 0 if all methods fail.
+    """
+    # Linux: read /proc/meminfo
+    if sys.platform == "linux":
+        try:
+            with open("/proc/meminfo") as f:
+                for line in f:
+                    if line.startswith("MemTotal:"):
+                        # Value is in kB
+                        return int(line.split()[1]) // 1024
+        except (FileNotFoundError, ValueError, OSError):
+            pass
+
+    # macOS: sysctl hw.memsize (returns bytes)
+    if sys.platform == "darwin":
+        try:
+            result = subprocess.run(
+                ["sysctl", "-n", "hw.memsize"],
+                capture_output=True,
+                text=True,
+                check=True,
+            )
+            return int(result.stdout.strip()) // (1024 * 1024)
+        except (subprocess.SubprocessError, ValueError, OSError):
+            pass
+
+    # Windows: GlobalMemoryStatusEx via ctypes
+    if sys.platform == "win32":
+        try:
+            class MEMORYSTATUSEX(ctypes.Structure):
+                _fields_ = [
+                    ("dwLength", ctypes.c_ulong),
+                    ("dwMemoryLoad", ctypes.c_ulong),
+                    ("ullTotalPhys", ctypes.c_ulonglong),
+                    ("ullAvailPhys", ctypes.c_ulonglong),
+                    ("ullTotalPageFile", ctypes.c_ulonglong),
+                    ("ullAvailPageFile", ctypes.c_ulonglong),
+                    ("ullTotalVirtual", ctypes.c_ulonglong),
+                    ("ullAvailVirtual", ctypes.c_ulonglong),
+                    ("ullAvailExtendedVirtual", ctypes.c_ulonglong),
+                ]
+
+            mem_status = MEMORYSTATUSEX()
+            mem_status.dwLength = ctypes.sizeof(MEMORYSTATUSEX)
+            if ctypes.windll.kernel32.GlobalMemoryStatusEx(ctypes.byref(mem_status)):
+                return int(mem_status.ullTotalPhys) // (1024 * 1024)
+        except (AttributeError, OSError):
+            pass
+
+    # Fallback: os.sysconf (works on most Unix systems)
+    try:
+        page_size = os.sysconf("SC_PAGE_SIZE")
+        phys_pages = os.sysconf("SC_PHYS_PAGES")
+        if page_size > 0 and phys_pages > 0:
+            return (page_size * phys_pages) // (1024 * 1024)
+    except (ValueError, OSError, AttributeError):
+        pass
+
+    return 0
+
+
 def detect_hardware() -> HardwareInfo:
     """Detect available hardware and recommend model configuration."""
     info = HardwareInfo()
@@ -28,16 +102,8 @@ def detect_hardware() -> HardwareInfo:
     # CPU info
     info.cpu_cores = os.cpu_count() or 1
 
-    # RAM info
-    try:
-        with open("/proc/meminfo") as f:
-            for line in f:
-                if line.startswith("MemTotal:"):
-                    # Value is in kB
-                    info.ram_mb = int(line.split()[1]) // 1024
-                    break
-    except (FileNotFoundError, ValueError):
-        pass
+    # RAM info (cross-platform)
+    info.ram_mb = _detect_ram_mb()
 
     # CUDA detection
     try:
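
The 0 fallback lets callers treat "detection failed" as "assume low memory". A hypothetical consumer (not in this commit; thresholds are illustrative) might map `ram_mb` to a faster-whisper model size:

```python
# Hypothetical helper showing how detect_hardware()'s ram_mb could drive
# model choice; the function name and thresholds are illustrative.
def recommend_model(ram_mb: int) -> str:
    if ram_mb >= 16_000:
        return "large-v3"
    if ram_mb >= 8_000:
        return "medium"
    return "small"  # also the safe default when ram_mb == 0 (detection failed)
```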

View File

@@ -260,10 +260,12 @@ def make_ai_chat_handler() -> HandlerFunc:
                 model=config.get("model", "claude-sonnet-4-6"),
             ))
         elif provider_name == "litellm":
-            from voice_to_notes.providers.litellm_provider import LiteLLMProvider
-            service.register_provider("litellm", LiteLLMProvider(
+            from voice_to_notes.providers.litellm_provider import OpenAICompatibleProvider
+            service.register_provider("litellm", OpenAICompatibleProvider(
                 model=config.get("model", "gpt-4o-mini"),
+                api_key=config.get("api_key"),
+                api_base=config.get("api_base"),
             ))
 
     return IPCMessage(
         id=msg.id,
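
For illustration, a config payload the updated "litellm" branch would accept might look like this (the endpoint URL and key are placeholders, not values from the commit):

```python
# Example ai_chat config for the "litellm" provider branch above.
config = {
    "model": "gpt-4o-mini",
    "api_key": "sk-example",                 # forwarded to OpenAICompatibleProvider
    "api_base": "http://localhost:4000/v1",  # e.g. a LiteLLM proxy or other
}                                            # OpenAI-compatible server
```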

View File

@@ -1,4 +1,4 @@
-"""LiteLLM provider — multi-provider gateway."""
+"""OpenAI-compatible provider — works with any OpenAI-compatible API endpoint."""
 
 from __future__ import annotations
@@ -7,36 +7,44 @@ from typing import Any
 
 from voice_to_notes.providers.base import AIProvider
 
 
-class LiteLLMProvider(AIProvider):
-    """Routes through LiteLLM for access to 100+ LLM providers."""
+class OpenAICompatibleProvider(AIProvider):
+    """Connects to any OpenAI-compatible API (LiteLLM proxy, Ollama, vLLM, etc.)."""
 
-    def __init__(self, model: str = "gpt-4o-mini", **kwargs: Any) -> None:
+    def __init__(
+        self,
+        api_key: str | None = None,
+        api_base: str | None = None,
+        model: str = "gpt-4o-mini",
+        **kwargs: Any,
+    ) -> None:
+        self._api_key = api_key or "sk-no-key"
+        self._api_base = api_base
         self._model = model
         self._extra_kwargs = kwargs
 
     def chat(self, messages: list[dict[str, str]], **kwargs: Any) -> str:
-        try:
-            import litellm
-        except ImportError:
-            raise RuntimeError("litellm package is required. Install with: pip install litellm")
+        from openai import OpenAI
 
         merged_kwargs = {**self._extra_kwargs, **kwargs}
-        response = litellm.completion(
-            model=merged_kwargs.get("model", self._model),
+        client_kwargs: dict[str, Any] = {"api_key": self._api_key}
+        if self._api_base:
+            client_kwargs["base_url"] = self._api_base
+        client = OpenAI(**client_kwargs)
+        response = client.chat.completions.create(
+            model=kwargs.get("model", self._model),
             messages=messages,
-            temperature=merged_kwargs.get("temperature", 0.7),
-            max_tokens=merged_kwargs.get("max_tokens", 2048),
+            temperature=kwargs.get("temperature", 0.7),
+            max_tokens=kwargs.get("max_tokens", 2048),
        )
        return response.choices[0].message.content or ""
 
     def is_available(self) -> bool:
         try:
-            import litellm  # noqa: F401
-            return True
+            import openai  # noqa: F401
+            return bool(self._api_key and self._api_base)
         except ImportError:
             return False
 
     @property
     def name(self) -> str:
-        return "LiteLLM"
+        return "OpenAI Compatible"

View File

@@ -92,7 +92,7 @@ class AIProviderService:
 def create_default_service() -> AIProviderService:
     """Create an AIProviderService with all supported providers registered."""
     from voice_to_notes.providers.anthropic_provider import AnthropicProvider
-    from voice_to_notes.providers.litellm_provider import LiteLLMProvider
+    from voice_to_notes.providers.litellm_provider import OpenAICompatibleProvider
     from voice_to_notes.providers.local_provider import LocalProvider
     from voice_to_notes.providers.openai_provider import OpenAIProvider
@@ -100,5 +100,5 @@ def create_default_service() -> AIProviderService:
     service.register_provider("local", LocalProvider())
     service.register_provider("openai", OpenAIProvider())
     service.register_provider("anthropic", AnthropicProvider())
-    service.register_provider("litellm", LiteLLMProvider())
+    service.register_provider("litellm", OpenAICompatibleProvider())
     return service
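
The registry pattern behind `AIProviderService` is only implied by these calls; a hedged sketch of the shape it suggests (the class body is not shown in this diff):

```python
# Hypothetical registry sketch -- only register_provider() appears in the diff.
from voice_to_notes.providers.base import AIProvider

class ProviderRegistry:
    """Name -> provider lookup, as create_default_service() implies."""

    def __init__(self) -> None:
        self._providers: dict[str, AIProvider] = {}

    def register_provider(self, name: str, provider: AIProvider) -> None:
        self._providers[name] = provider

    def get(self, name: str) -> AIProvider:
        return self._providers[name]
```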

View File

@@ -16,6 +16,7 @@ from typing import Any
 # np.isfinite(None) crashes when max_speakers is not set.
 os.environ.setdefault("PYANNOTE_METRICS_ENABLED", "false")
 
+from voice_to_notes.utils.ffmpeg import get_ffmpeg_path
 from voice_to_notes.ipc.messages import progress_message
 from voice_to_notes.ipc.protocol import write_message
@@ -40,7 +41,7 @@ def _ensure_wav(file_path: str) -> tuple[str, str | None]:
     try:
         subprocess.run(
             [
-                "ffmpeg", "-y", "-i", file_path,
+                get_ffmpeg_path(), "-y", "-i", file_path,
                 "-ar", "16000", "-ac", "1", "-c:a", "pcm_s16le",
                 tmp.name,
             ],
@@ -118,6 +119,14 @@ class DiarizeService:
                 self._pipeline = Pipeline.from_pretrained(model_name, token=hf_token)
                 print(f"[sidecar] Loaded diarization model: {model_name}", file=sys.stderr, flush=True)
+                # Move pipeline to GPU if available
+                try:
+                    import torch
+                    if torch.cuda.is_available():
+                        self._pipeline = self._pipeline.to(torch.device("cuda"))
+                        print(f"[sidecar] Diarization pipeline moved to GPU", file=sys.stderr, flush=True)
+                except Exception as e:
+                    print(f"[sidecar] GPU not available for diarization: {e}", file=sys.stderr, flush=True)
                 return self._pipeline
             except Exception as e:
                 last_error = e
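
The GPU move uses the standard torch check; the same pattern pulled out into a reusable helper would look like this (a sketch, not part of the commit):

```python
# Sketch of the CUDA-if-available pattern used above, as a standalone helper.
import torch

def best_device() -> torch.device:
    """Prefer CUDA when present; fall back to CPU."""
    return torch.device("cuda" if torch.cuda.is_available() else "cpu")
```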

View File

@@ -2,6 +2,7 @@
 from __future__ import annotations
 
+import concurrent.futures
 import sys
 import time
 from dataclasses import dataclass, field
@@ -13,6 +14,7 @@ from voice_to_notes.ipc.messages import (
     speaker_update_message,
 )
 from voice_to_notes.ipc.protocol import write_message
+from voice_to_notes.utils.ffmpeg import get_ffprobe_path
 from voice_to_notes.services.diarize import DiarizeService, SpeakerSegment
 from voice_to_notes.services.transcribe import (
     SegmentResult,
@@ -82,7 +84,7 @@ class PipelineService:
         """
         start_time = time.time()
 
-        # Step 1: Transcribe
+        # Step 0: Probe audio duration for conditional chunked transcription
         write_message(
             progress_message(request_id, 0, "pipeline", "Starting transcription pipeline...")
         )
@@ -96,12 +98,11 @@ class PipelineService:
                 "words": [{"word": w.word, "start_ms": w.start_ms, "end_ms": w.end_ms, "confidence": w.confidence} for w in seg.words],
             }))
 
-        # Probe audio duration for conditional chunked transcription
         audio_duration_sec = None
         try:
             import subprocess
             probe_result = subprocess.run(
-                ["ffprobe", "-v", "quiet", "-show_entries", "format=duration",
+                [get_ffprobe_path(), "-v", "quiet", "-show_entries", "format=duration",
                  "-of", "default=noprint_wrappers=1:nokey=1", file_path],
                 capture_output=True, text=True, check=True,
             )
@@ -109,30 +110,33 @@ class PipelineService:
         except (subprocess.CalledProcessError, FileNotFoundError, ValueError):
             pass
 
-        from voice_to_notes.services.transcribe import LARGE_FILE_THRESHOLD_SEC
-        if audio_duration_sec and audio_duration_sec > LARGE_FILE_THRESHOLD_SEC:
-            transcription = self._transcribe_service.transcribe_chunked(
-                request_id=request_id,
-                file_path=file_path,
-                model_name=model_name,
-                device=device,
-                compute_type=compute_type,
-                language=language,
-                on_segment=_emit_segment,
-            )
-        else:
-            transcription = self._transcribe_service.transcribe(
-                request_id=request_id,
-                file_path=file_path,
-                model_name=model_name,
-                device=device,
-                compute_type=compute_type,
-                language=language,
-                on_segment=_emit_segment,
-            )
+        def _run_transcription() -> TranscriptionResult:
+            """Run transcription (chunked or standard based on duration)."""
+            from voice_to_notes.services.transcribe import LARGE_FILE_THRESHOLD_SEC
+            if audio_duration_sec and audio_duration_sec > LARGE_FILE_THRESHOLD_SEC:
+                return self._transcribe_service.transcribe_chunked(
+                    request_id=request_id,
+                    file_path=file_path,
+                    model_name=model_name,
+                    device=device,
+                    compute_type=compute_type,
+                    language=language,
+                    on_segment=_emit_segment,
+                )
+            else:
+                return self._transcribe_service.transcribe(
+                    request_id=request_id,
+                    file_path=file_path,
+                    model_name=model_name,
+                    device=device,
+                    compute_type=compute_type,
+                    language=language,
+                    on_segment=_emit_segment,
+                )
 
         if skip_diarization:
-            # Convert transcription directly without speaker labels
+            # Sequential: transcribe only, no diarization needed
+            transcription = _run_transcription()
             result = PipelineResult(
                 language=transcription.language,
                 language_probability=transcription.language_probability,
@@ -150,37 +154,59 @@ class PipelineService:
             )
             return result
 
-        # Step 2: Diarize (with graceful fallback)
+        # Parallel execution: run transcription (0-45%) and diarization (45-90%)
+        # concurrently, then merge (90-100%).
         write_message(
-            progress_message(request_id, 50, "pipeline", "Starting speaker diarization...")
+            progress_message(
+                request_id, 0, "pipeline",
+                "Starting transcription and diarization in parallel..."
+            )
         )
 
         diarization = None
-        try:
-            diarization = self._diarize_service.diarize(
+        diarization_error = None
+        with concurrent.futures.ThreadPoolExecutor(max_workers=2) as executor:
+            transcription_future = executor.submit(_run_transcription)
+            # Use probed audio_duration_sec for diarization progress estimation
+            # (transcription hasn't finished yet, so we can't use transcription.duration_ms)
+            diarization_future = executor.submit(
+                self._diarize_service.diarize,
                 request_id=request_id,
                 file_path=file_path,
                 num_speakers=num_speakers,
                 min_speakers=min_speakers,
                 max_speakers=max_speakers,
                 hf_token=hf_token,
-                audio_duration_sec=transcription.duration_ms / 1000.0,
+                audio_duration_sec=audio_duration_sec,
             )
-        except Exception as e:
-            import traceback
-            print(
-                f"[sidecar] Diarization failed, falling back to transcription-only: {e}",
-                file=sys.stderr,
-                flush=True,
-            )
-            traceback.print_exc(file=sys.stderr)
+
+            # Wait for both futures. We need the transcription result regardless,
+            # but diarization may fail gracefully.
+            transcription = transcription_future.result()
             write_message(
-                progress_message(
-                    request_id, 80, "pipeline",
-                    f"Diarization failed ({e}), using transcription only..."
-                )
+                progress_message(request_id, 45, "pipeline", "Transcription complete")
             )
+            try:
+                diarization = diarization_future.result()
+            except Exception as e:
+                import traceback
+                diarization_error = e
+                print(
+                    f"[sidecar] Diarization failed, falling back to transcription-only: {e}",
+                    file=sys.stderr,
+                    flush=True,
+                )
+                traceback.print_exc(file=sys.stderr)
+                write_message(
+                    progress_message(
                        request_id, 80, "pipeline",
+                        f"Diarization failed ({e}), using transcription only..."
+                    )
+                )
 
         # Step 3: Merge (or skip if diarization failed)
         if diarization is not None:
             write_message(
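
The shape of this change is a fan-out/fan-in: the transcription result is required (its errors propagate), while diarization is optional and degrades to `None`. Reduced to a minimal sketch (function names here are illustrative, not from the commit):

```python
# Fan-out/fan-in sketch of the parallel pipeline above.
import concurrent.futures

def run_parallel(transcribe, diarize):
    """Run both jobs concurrently; diarization failure degrades gracefully."""
    with concurrent.futures.ThreadPoolExecutor(max_workers=2) as executor:
        t_future = executor.submit(transcribe)
        d_future = executor.submit(diarize)
        transcription = t_future.result()    # required -- propagate errors
        try:
            diarization = d_future.result()  # optional -- fall back to None
        except Exception:
            diarization = None
    return transcription, diarization
```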

View File

@@ -12,6 +12,7 @@ from faster_whisper import WhisperModel
 
 from voice_to_notes.ipc.messages import progress_message
 from voice_to_notes.ipc.protocol import write_message
+from voice_to_notes.utils.ffmpeg import get_ffmpeg_path, get_ffprobe_path
 
 CHUNK_REPORT_SIZE = 10
 LARGE_FILE_THRESHOLD_SEC = 3600  # 1 hour
@@ -202,7 +203,7 @@ class TranscribeService:
         # Get total duration via ffprobe
         try:
             probe_result = subprocess.run(
-                ["ffprobe", "-v", "quiet", "-show_entries", "format=duration",
+                [get_ffprobe_path(), "-v", "quiet", "-show_entries", "format=duration",
                  "-of", "default=noprint_wrappers=1:nokey=1", file_path],
                 capture_output=True, text=True, check=True,
             )
@@ -235,7 +236,7 @@
         tmp.close()
         try:
             subprocess.run(
-                ["ffmpeg", "-y", "-ss", str(chunk_start),
+                [get_ffmpeg_path(), "-y", "-ss", str(chunk_start),
                  "-t", str(chunk_duration_sec),
                  "-i", file_path,
                  "-ar", "16000", "-ac", "1", "-c:a", "pcm_s16le",

View File

@@ -0,0 +1,43 @@
+"""Resolve ffmpeg/ffprobe paths for both frozen and development builds."""
+
+from __future__ import annotations
+
+import os
+import sys
+
+
+def get_ffmpeg_path() -> str:
+    """Return the path to the ffmpeg binary.
+
+    When running as a frozen PyInstaller bundle, looks next to sys.executable.
+    Otherwise falls back to the system PATH.
+    """
+    if getattr(sys, "frozen", False):
+        # Frozen PyInstaller bundle — ffmpeg is next to the sidecar binary
+        bundle_dir = os.path.dirname(sys.executable)
+        candidates = [
+            os.path.join(bundle_dir, "ffmpeg.exe" if sys.platform == "win32" else "ffmpeg"),
+            os.path.join(bundle_dir, "ffmpeg"),
+        ]
+        for path in candidates:
+            if os.path.isfile(path):
+                return path
+    return "ffmpeg"
+
+
+def get_ffprobe_path() -> str:
+    """Return the path to the ffprobe binary.
+
+    When running as a frozen PyInstaller bundle, looks next to sys.executable.
+    Otherwise falls back to the system PATH.
+    """
+    if getattr(sys, "frozen", False):
+        bundle_dir = os.path.dirname(sys.executable)
+        candidates = [
+            os.path.join(bundle_dir, "ffprobe.exe" if sys.platform == "win32" else "ffprobe"),
+            os.path.join(bundle_dir, "ffprobe"),
+        ]
+        for path in candidates:
+            if os.path.isfile(path):
+                return path
+    return "ffprobe"