Cross-platform distribution, UI improvements, and performance optimizations
- PyInstaller frozen sidecar: spec file, build script, and ffmpeg path resolver for self-contained distribution without Python prerequisites
- Dual-mode sidecar launcher: frozen binary (production) with dev mode fallback
- Parallel transcription + diarization pipeline (~30-40% faster)
- GPU auto-detection for diarization (CUDA when available)
- Async run_pipeline command for real-time progress event delivery
- Web Audio API backend for instant playback and seeking
- OpenAI-compatible provider replacing LiteLLM client-side routing
- Cross-platform RAM detection (Linux/macOS/Windows)
- Settings: speaker count hint, token reveal toggles, dark dropdown styling
- Loading splash screen, flexbox layout fix for viewport overflow
- Gitea Actions CI/CD pipeline (Linux, Windows, macOS ARM)
- Updated README and CLAUDE.md documentation

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -2,6 +2,7 @@
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import concurrent.futures
|
||||
import sys
|
||||
import time
|
||||
from dataclasses import dataclass, field
|
||||
@@ -13,6 +14,7 @@ from voice_to_notes.ipc.messages import (
|
||||
speaker_update_message,
|
||||
)
|
||||
from voice_to_notes.ipc.protocol import write_message
|
||||
from voice_to_notes.utils.ffmpeg import get_ffprobe_path
|
||||
from voice_to_notes.services.diarize import DiarizeService, SpeakerSegment
|
||||
from voice_to_notes.services.transcribe import (
|
||||
SegmentResult,
|
||||
@@ -82,7 +84,7 @@ class PipelineService:
|
||||
"""
|
||||
start_time = time.time()
|
||||
|
||||
# Step 1: Transcribe
|
||||
# Step 0: Probe audio duration for conditional chunked transcription
|
||||
write_message(
|
||||
progress_message(request_id, 0, "pipeline", "Starting transcription pipeline...")
|
||||
)
|
||||
@@ -96,12 +98,11 @@ class PipelineService:
|
||||
"words": [{"word": w.word, "start_ms": w.start_ms, "end_ms": w.end_ms, "confidence": w.confidence} for w in seg.words],
|
||||
}))
|
||||
|
||||
# Probe audio duration for conditional chunked transcription
|
||||
audio_duration_sec = None
|
||||
try:
|
||||
import subprocess
|
||||
probe_result = subprocess.run(
|
||||
["ffprobe", "-v", "quiet", "-show_entries", "format=duration",
|
||||
[get_ffprobe_path(), "-v", "quiet", "-show_entries", "format=duration",
|
||||
"-of", "default=noprint_wrappers=1:nokey=1", file_path],
|
||||
capture_output=True, text=True, check=True,
|
||||
)
|
||||
@@ -109,30 +110,33 @@ class PipelineService:
|
||||
except (subprocess.CalledProcessError, FileNotFoundError, ValueError):
|
||||
pass
|
||||
|
||||
from voice_to_notes.services.transcribe import LARGE_FILE_THRESHOLD_SEC
|
||||
if audio_duration_sec and audio_duration_sec > LARGE_FILE_THRESHOLD_SEC:
|
||||
transcription = self._transcribe_service.transcribe_chunked(
|
||||
request_id=request_id,
|
||||
file_path=file_path,
|
||||
model_name=model_name,
|
||||
device=device,
|
||||
compute_type=compute_type,
|
||||
language=language,
|
||||
on_segment=_emit_segment,
|
||||
)
|
||||
else:
|
||||
transcription = self._transcribe_service.transcribe(
|
||||
request_id=request_id,
|
||||
file_path=file_path,
|
||||
model_name=model_name,
|
||||
device=device,
|
||||
compute_type=compute_type,
|
||||
language=language,
|
||||
on_segment=_emit_segment,
|
||||
)
|
||||
def _run_transcription() -> TranscriptionResult:
    """Transcribe the input file, choosing the chunked path for long audio.

    ``transcribe_chunked`` is used when the probed ``audio_duration_sec`` is
    truthy and greater than ``LARGE_FILE_THRESHOLD_SEC``; otherwise the
    standard ``transcribe`` method is used. Both receive the same keyword
    arguments, and the selected method's return value is passed through.
    """
    from voice_to_notes.services.transcribe import LARGE_FILE_THRESHOLD_SEC

    # An unknown (None) or zero duration always takes the standard path.
    use_chunked = (
        bool(audio_duration_sec)
        and audio_duration_sec > LARGE_FILE_THRESHOLD_SEC
    )

    # Resolve only the method that will actually be called.
    transcribe_fn = (
        self._transcribe_service.transcribe_chunked
        if use_chunked
        else self._transcribe_service.transcribe
    )

    return transcribe_fn(
        request_id=request_id,
        file_path=file_path,
        model_name=model_name,
        device=device,
        compute_type=compute_type,
        language=language,
        on_segment=_emit_segment,
    )
|
||||
|
||||
if skip_diarization:
|
||||
# Convert transcription directly without speaker labels
|
||||
# Sequential: transcribe only, no diarization needed
|
||||
transcription = _run_transcription()
|
||||
result = PipelineResult(
|
||||
language=transcription.language,
|
||||
language_probability=transcription.language_probability,
|
||||
@@ -150,37 +154,59 @@ class PipelineService:
|
||||
)
|
||||
return result
|
||||
|
||||
# Step 2: Diarize (with graceful fallback)
|
||||
# Parallel execution: run transcription (0-45%) and diarization (45-90%)
|
||||
# concurrently, then merge (90-100%).
|
||||
write_message(
|
||||
progress_message(request_id, 50, "pipeline", "Starting speaker diarization...")
|
||||
progress_message(
|
||||
request_id, 0, "pipeline",
|
||||
"Starting transcription and diarization in parallel..."
|
||||
)
|
||||
)
|
||||
|
||||
diarization = None
|
||||
try:
|
||||
diarization = self._diarize_service.diarize(
|
||||
diarization_error = None
|
||||
|
||||
with concurrent.futures.ThreadPoolExecutor(max_workers=2) as executor:
|
||||
transcription_future = executor.submit(_run_transcription)
|
||||
|
||||
# Use probed audio_duration_sec for diarization progress estimation
|
||||
# (transcription hasn't finished yet, so we can't use transcription.duration_ms)
|
||||
diarization_future = executor.submit(
|
||||
self._diarize_service.diarize,
|
||||
request_id=request_id,
|
||||
file_path=file_path,
|
||||
num_speakers=num_speakers,
|
||||
min_speakers=min_speakers,
|
||||
max_speakers=max_speakers,
|
||||
hf_token=hf_token,
|
||||
audio_duration_sec=transcription.duration_ms / 1000.0,
|
||||
audio_duration_sec=audio_duration_sec,
|
||||
)
|
||||
except Exception as e:
|
||||
import traceback
|
||||
print(
|
||||
f"[sidecar] Diarization failed, falling back to transcription-only: {e}",
|
||||
file=sys.stderr,
|
||||
flush=True,
|
||||
)
|
||||
traceback.print_exc(file=sys.stderr)
|
||||
|
||||
# Wait for both futures. We need the transcription result regardless,
|
||||
# but diarization may fail gracefully.
|
||||
transcription = transcription_future.result()
|
||||
write_message(
|
||||
progress_message(
|
||||
request_id, 80, "pipeline",
|
||||
f"Diarization failed ({e}), using transcription only..."
|
||||
)
|
||||
progress_message(request_id, 45, "pipeline", "Transcription complete")
|
||||
)
|
||||
|
||||
try:
|
||||
diarization = diarization_future.result()
|
||||
except Exception as e:
|
||||
import traceback
|
||||
diarization_error = e
|
||||
print(
|
||||
f"[sidecar] Diarization failed, falling back to transcription-only: {e}",
|
||||
file=sys.stderr,
|
||||
flush=True,
|
||||
)
|
||||
traceback.print_exc(file=sys.stderr)
|
||||
write_message(
|
||||
progress_message(
|
||||
request_id, 80, "pipeline",
|
||||
f"Diarization failed ({e}), using transcription only..."
|
||||
)
|
||||
)
|
||||
|
||||
# Step 3: Merge (or skip if diarization failed)
|
||||
if diarization is not None:
|
||||
write_message(
|
||||
|
||||
Reference in New Issue
Block a user