Cross-platform distribution, UI improvements, and performance optimizations

- PyInstaller frozen sidecar: spec file, build script, and ffmpeg path resolver for self-contained distribution without Python prerequisites - Dual-mode sidecar launcher: frozen binary (production) with dev mode fallback - Parallel transcription + diarization pipeline (~30-40% faster) - GPU auto-detection for diarization (CUDA when available) - Async run_pipeline command for real-time progress event delivery - Web Audio API backend for instant playback and seeking - OpenAI-compatible provider replacing LiteLLM client-side routing - Cross-platform RAM detection (Linux/macOS/Windows) - Settings: speaker count hint, token reveal toggles, dark dropdown styling - Loading splash screen, flexbox layout fix for viewport overflow - Gitea Actions CI/CD pipeline (Linux, Windows, macOS ARM) - Updated README and CLAUDE.md documentation Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-03-20 21:33:43 -07:00
parent 42ccd3e21d
commit 58faa83cb3
27 changed files with 1301 additions and 283 deletions
--- a/python/voice_to_notes/services/pipeline.py
+++ b/python/voice_to_notes/services/pipeline.py
@@ -2,6 +2,7 @@

 from __future__ import annotations

+import concurrent.futures
 import sys
 import time
 from dataclasses import dataclass, field
@@ -13,6 +14,7 @@ from voice_to_notes.ipc.messages import (
    speaker_update_message,
 )
 from voice_to_notes.ipc.protocol import write_message
+from voice_to_notes.utils.ffmpeg import get_ffprobe_path
 from voice_to_notes.services.diarize import DiarizeService, SpeakerSegment
 from voice_to_notes.services.transcribe import (
    SegmentResult,
@@ -82,7 +84,7 @@ class PipelineService:
        """
        start_time = time.time()

-        # Step 1: Transcribe
+        # Step 0: Probe audio duration for conditional chunked transcription
        write_message(
            progress_message(request_id, 0, "pipeline", "Starting transcription pipeline...")
        )
@@ -96,12 +98,11 @@ class PipelineService:
                "words": [{"word": w.word, "start_ms": w.start_ms, "end_ms": w.end_ms, "confidence": w.confidence} for w in seg.words],
            }))

-        # Probe audio duration for conditional chunked transcription
        audio_duration_sec = None
        try:
            import subprocess
            probe_result = subprocess.run(
-                ["ffprobe", "-v", "quiet", "-show_entries", "format=duration",
+                [get_ffprobe_path(), "-v", "quiet", "-show_entries", "format=duration",
                 "-of", "default=noprint_wrappers=1:nokey=1", file_path],
                capture_output=True, text=True, check=True,
            )
@@ -109,30 +110,33 @@ class PipelineService:
        except (subprocess.CalledProcessError, FileNotFoundError, ValueError):
            pass

-        from voice_to_notes.services.transcribe import LARGE_FILE_THRESHOLD_SEC
-        if audio_duration_sec and audio_duration_sec > LARGE_FILE_THRESHOLD_SEC:
-            transcription = self._transcribe_service.transcribe_chunked(
-                request_id=request_id,
-                file_path=file_path,
-                model_name=model_name,
-                device=device,
-                compute_type=compute_type,
-                language=language,
-                on_segment=_emit_segment,
-            )
-        else:
-            transcription = self._transcribe_service.transcribe(
-                request_id=request_id,
-                file_path=file_path,
-                model_name=model_name,
-                device=device,
-                compute_type=compute_type,
-                language=language,
-                on_segment=_emit_segment,
-            )
+        def _run_transcription() -> TranscriptionResult:
+            """Run transcription (chunked or standard based on duration)."""
+            from voice_to_notes.services.transcribe import LARGE_FILE_THRESHOLD_SEC
+            if audio_duration_sec and audio_duration_sec > LARGE_FILE_THRESHOLD_SEC:
+                return self._transcribe_service.transcribe_chunked(
+                    request_id=request_id,
+                    file_path=file_path,
+                    model_name=model_name,
+                    device=device,
+                    compute_type=compute_type,
+                    language=language,
+                    on_segment=_emit_segment,
+                )
+            else:
+                return self._transcribe_service.transcribe(
+                    request_id=request_id,
+                    file_path=file_path,
+                    model_name=model_name,
+                    device=device,
+                    compute_type=compute_type,
+                    language=language,
+                    on_segment=_emit_segment,
+                )

        if skip_diarization:
-            # Convert transcription directly without speaker labels
+            # Sequential: transcribe only, no diarization needed
+            transcription = _run_transcription()
            result = PipelineResult(
                language=transcription.language,
                language_probability=transcription.language_probability,
@@ -150,37 +154,59 @@ class PipelineService:
                )
            return result

-        # Step 2: Diarize (with graceful fallback)
+        # Parallel execution: run transcription (0-45%) and diarization (45-90%)
+        # concurrently, then merge (90-100%).
        write_message(
-            progress_message(request_id, 50, "pipeline", "Starting speaker diarization...")
+            progress_message(
+                request_id, 0, "pipeline",
+                "Starting transcription and diarization in parallel..."
+            )
        )

        diarization = None
-        try:
-            diarization = self._diarize_service.diarize(
+        diarization_error = None
+
+        with concurrent.futures.ThreadPoolExecutor(max_workers=2) as executor:
+            transcription_future = executor.submit(_run_transcription)
+
+            # Use probed audio_duration_sec for diarization progress estimation
+            # (transcription hasn't finished yet, so we can't use transcription.duration_ms)
+            diarization_future = executor.submit(
+                self._diarize_service.diarize,
                request_id=request_id,
                file_path=file_path,
                num_speakers=num_speakers,
                min_speakers=min_speakers,
                max_speakers=max_speakers,
                hf_token=hf_token,
-                audio_duration_sec=transcription.duration_ms / 1000.0,
+                audio_duration_sec=audio_duration_sec,
            )
-        except Exception as e:
-            import traceback
-            print(
-                f"[sidecar] Diarization failed, falling back to transcription-only: {e}",
-                file=sys.stderr,
-                flush=True,
-            )
-            traceback.print_exc(file=sys.stderr)
+
+            # Wait for both futures. We need the transcription result regardless,
+            # but diarization may fail gracefully.
+            transcription = transcription_future.result()
            write_message(
-                progress_message(
-                    request_id, 80, "pipeline",
-                    f"Diarization failed ({e}), using transcription only..."
-                )
+                progress_message(request_id, 45, "pipeline", "Transcription complete")
            )

+            try:
+                diarization = diarization_future.result()
+            except Exception as e:
+                import traceback
+                diarization_error = e
+                print(
+                    f"[sidecar] Diarization failed, falling back to transcription-only: {e}",
+                    file=sys.stderr,
+                    flush=True,
+                )
+                traceback.print_exc(file=sys.stderr)
+                write_message(
+                    progress_message(
+                        request_id, 80, "pipeline",
+                        f"Diarization failed ({e}), using transcription only..."
+                    )
+                )
+
        # Step 3: Merge (or skip if diarization failed)
        if diarization is not None:
            write_message(