From 669d88f1432c8a4cbceaabae0a7baa6fc30d834f Mon Sep 17 00:00:00 2001 From: Josh Knapp Date: Thu, 26 Feb 2026 17:14:25 -0800 Subject: [PATCH 01/14] Fix progress feedback, diarization fallback, and dropdown readability - Stream pipeline progress to frontend via Tauri events so the progress overlay updates in real time during transcription/diarization - Gracefully fall back to transcription-only when diarization fails (e.g. pyannote not installed) instead of erroring the whole pipeline - Add color-scheme: dark to fix native select/option elements rendering with unreadable white backgrounds Co-Authored-By: Claude Opus 4.6 --- python/voice_to_notes/services/pipeline.py | 62 ++++++++++++++++------ src-tauri/src/commands/transcribe.rs | 6 ++- src-tauri/src/sidecar/mod.rs | 18 +++++-- src/routes/+layout.svelte | 1 + src/routes/+page.svelte | 15 ++++++ 5 files changed, 81 insertions(+), 21 deletions(-) diff --git a/python/voice_to_notes/services/pipeline.py b/python/voice_to_notes/services/pipeline.py index 2d1f66b..fe4bf04 100644 --- a/python/voice_to_notes/services/pipeline.py +++ b/python/voice_to_notes/services/pipeline.py @@ -110,27 +110,57 @@ class PipelineService: ) return result - # Step 2: Diarize + # Step 2: Diarize (with graceful fallback) write_message( progress_message(request_id, 50, "pipeline", "Starting speaker diarization...") ) - diarization = self._diarize_service.diarize( - request_id=request_id, - file_path=file_path, - num_speakers=num_speakers, - min_speakers=min_speakers, - max_speakers=max_speakers, - ) + diarization = None + try: + diarization = self._diarize_service.diarize( + request_id=request_id, + file_path=file_path, + num_speakers=num_speakers, + min_speakers=min_speakers, + max_speakers=max_speakers, + ) + except Exception as e: + print( + f"[sidecar] Diarization failed, falling back to transcription-only: {e}", + file=sys.stderr, + flush=True, + ) + write_message( + progress_message( + request_id, 80, "pipeline", + "Diarization 
unavailable, using transcription only..." + ) + ) - # Step 3: Merge - write_message( - progress_message(request_id, 90, "pipeline", "Merging transcript with speakers...") - ) - - result = self._merge_results(transcription, diarization.speaker_segments) - result.speakers = diarization.speakers - result.num_speakers = diarization.num_speakers + # Step 3: Merge (or skip if diarization failed) + if diarization is not None: + write_message( + progress_message(request_id, 90, "pipeline", "Merging transcript with speakers...") + ) + result = self._merge_results(transcription, diarization.speaker_segments) + result.speakers = diarization.speakers + result.num_speakers = diarization.num_speakers + else: + result = PipelineResult( + language=transcription.language, + language_probability=transcription.language_probability, + duration_ms=transcription.duration_ms, + ) + for seg in transcription.segments: + result.segments.append( + PipelineSegment( + text=seg.text, + start_ms=seg.start_ms, + end_ms=seg.end_ms, + speaker=None, + words=seg.words, + ) + ) elapsed = time.time() - start_time print( diff --git a/src-tauri/src/commands/transcribe.rs b/src-tauri/src/commands/transcribe.rs index 9e2239a..2f2e64a 100644 --- a/src-tauri/src/commands/transcribe.rs +++ b/src-tauri/src/commands/transcribe.rs @@ -1,4 +1,5 @@ use serde_json::{json, Value}; +use tauri::{AppHandle, Emitter}; use crate::sidecar::messages::IPCMessage; use crate::sidecar::sidecar; @@ -42,6 +43,7 @@ pub fn transcribe_file( /// Run the full transcription + diarization pipeline via the Python sidecar. 
#[tauri::command] pub fn run_pipeline( + app: AppHandle, file_path: String, model: Option, device: Option, @@ -71,7 +73,9 @@ pub fn run_pipeline( }), ); - let response = manager.send_and_receive(&msg)?; + let response = manager.send_and_receive_with_progress(&msg, |progress| { + let _ = app.emit("pipeline-progress", &progress.payload); + })?; if response.msg_type == "error" { return Err(format!( diff --git a/src-tauri/src/sidecar/mod.rs b/src-tauri/src/sidecar/mod.rs index dd60840..c8cc33e 100644 --- a/src-tauri/src/sidecar/mod.rs +++ b/src-tauri/src/sidecar/mod.rs @@ -115,8 +115,17 @@ impl SidecarManager { } /// Send a message to the sidecar and read the response. - /// This is a blocking call. + /// This is a blocking call. Progress messages are skipped. pub fn send_and_receive(&self, msg: &IPCMessage) -> Result { + self.send_and_receive_with_progress(msg, |_| {}) + } + + /// Send a message and read the response, calling on_progress for each progress message. + pub fn send_and_receive_with_progress( + &self, + msg: &IPCMessage, + on_progress: impl Fn(&IPCMessage), + ) -> Result { // Write to stdin { let mut stdin_guard = self.stdin.lock().map_err(|e| e.to_string())?; @@ -154,10 +163,11 @@ impl SidecarManager { let response: IPCMessage = serde_json::from_str(trimmed) .map_err(|e| format!("Parse error: {e}"))?; - // Skip progress messages, return the final result/error - if response.msg_type != "progress" { - return Ok(response); + if response.msg_type == "progress" { + on_progress(&response); + continue; } + return Ok(response); } } else { Err("Sidecar stdout not available".to_string()) diff --git a/src/routes/+layout.svelte b/src/routes/+layout.svelte index a655c08..2112a61 100644 --- a/src/routes/+layout.svelte +++ b/src/routes/+layout.svelte @@ -10,6 +10,7 @@ padding: 0; background: #0a0a23; color: #e0e0e0; + color-scheme: dark; font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, Oxygen, Ubuntu, Cantarell, sans-serif; overflow: hidden; diff 
--git a/src/routes/+page.svelte b/src/routes/+page.svelte index 9b139a0..2260060 100644 --- a/src/routes/+page.svelte +++ b/src/routes/+page.svelte @@ -1,5 +1,6 @@ {#if visible} @@ -14,12 +46,28 @@
-

{stage || 'Processing...'}

+

{displayStage}

-
-
+ +
+ {#each pipelineSteps as step} + {@const status = getStepStatus(step.key, stage)} +
+ + {#if status === 'done'} + ✓ + {:else if status === 'active'} + ⟳ + {:else} + · + {/if} + + {step.label} +
+ {/each}
-

{percent}% — {message || 'Please wait...'}

+ +

{message || 'Please wait...'}

This may take several minutes for large files

@@ -39,7 +87,8 @@ background: #16213e; padding: 2rem 2.5rem; border-radius: 12px; - min-width: 420px; + min-width: 380px; + max-width: 440px; color: #e0e0e0; border: 1px solid #2a3a5e; box-shadow: 0 8px 32px rgba(0, 0, 0, 0.5); @@ -57,35 +106,52 @@ border-top-color: #e94560; border-radius: 50%; animation: spin 0.8s linear infinite; + flex-shrink: 0; } @keyframes spin { to { transform: rotate(360deg); } } h3 { margin: 0; - text-transform: capitalize; font-size: 1.1rem; } - .bar-track { - height: 10px; - background: #0f3460; - border-radius: 5px; - overflow: hidden; + .steps { + display: flex; + flex-direction: column; + gap: 0.4rem; + margin-bottom: 1rem; } - .bar-fill { - height: 100%; - background: linear-gradient(90deg, #e94560, #ff6b81); - transition: width 0.3s; - border-radius: 5px; + .step { + display: flex; + align-items: center; + gap: 0.5rem; + font-size: 0.85rem; + color: #555; + } + .step-done { + color: #4ecdc4; + } + .step-active { + color: #e0e0e0; + font-weight: 500; + } + .step-icon { + width: 1.2rem; + text-align: center; + flex-shrink: 0; + } + .step-active .step-icon { + animation: spin 1.5s linear infinite; + display: inline-block; } .status-text { margin: 0.75rem 0 0; - font-size: 0.9rem; + font-size: 0.85rem; color: #b0b0b0; } .hint-text { margin: 0.5rem 0 0; font-size: 0.75rem; - color: #666; + color: #555; } diff --git a/src/lib/components/SpeakerManager.svelte b/src/lib/components/SpeakerManager.svelte index c5a925b..a3c773a 100644 --- a/src/lib/components/SpeakerManager.svelte +++ b/src/lib/components/SpeakerManager.svelte @@ -34,7 +34,11 @@

Speakers

{#if $speakers.length === 0} -

No speakers detected yet

+

No speakers detected

+

+ Speaker detection requires a HuggingFace token. + Set the HF_TOKEN environment variable and restart. +

{:else}
    {#each $speakers as speaker (speaker.id)} @@ -78,6 +82,19 @@ .empty-hint { color: #666; font-size: 0.875rem; + margin-bottom: 0.25rem; + } + .setup-hint { + color: #555; + font-size: 0.75rem; + line-height: 1.4; + } + .setup-hint code { + background: rgba(233, 69, 96, 0.15); + color: #e94560; + padding: 0.1rem 0.3rem; + border-radius: 3px; + font-size: 0.7rem; } .speaker-list { list-style: none; diff --git a/src/lib/components/WaveformPlayer.svelte b/src/lib/components/WaveformPlayer.svelte index 10c230c..ae45220 100644 --- a/src/lib/components/WaveformPlayer.svelte +++ b/src/lib/components/WaveformPlayer.svelte @@ -12,6 +12,7 @@ let container: HTMLDivElement; let wavesurfer: WaveSurfer | null = $state(null); + let isReady = $state(false); let currentTime = $state('0:00'); let totalTime = $state('0:00'); @@ -39,6 +40,7 @@ }); wavesurfer.on('ready', () => { + isReady = true; const dur = wavesurfer!.getDuration(); durationMs.set(Math.round(dur * 1000)); totalTime = formatTime(dur); @@ -48,6 +50,10 @@ wavesurfer.on('pause', () => isPlaying.set(false)); wavesurfer.on('finish', () => isPlaying.set(false)); + wavesurfer.on('loading', () => { + isReady = false; + }); + if (audioUrl) { wavesurfer.load(audioUrl); } @@ -57,20 +63,21 @@ wavesurfer?.destroy(); }); - /** Toggle play/pause. Exposed for keyboard shortcuts. */ + /** Toggle play/pause from current position. Exposed for keyboard shortcuts. */ export function togglePlayPause() { - wavesurfer?.playPause(); + if (!wavesurfer || !isReady) return; + wavesurfer.playPause(); } function skipBack() { - if (wavesurfer) { + if (wavesurfer && isReady) { const time = Math.max(0, wavesurfer.getCurrentTime() - 5); wavesurfer.setTime(time); } } function skipForward() { - if (wavesurfer) { + if (wavesurfer && isReady) { const time = Math.min(wavesurfer.getDuration(), wavesurfer.getCurrentTime() + 5); wavesurfer.setTime(time); } @@ -78,17 +85,20 @@ /** Seek to a specific time in milliseconds. Called from transcript click-to-seek. 
*/ export function seekTo(timeMs: number) { - console.log('[voice-to-notes] seekTo called:', timeMs, 'ms, wavesurfer:', !!wavesurfer, 'duration:', wavesurfer?.getDuration()); - if (wavesurfer) { - wavesurfer.setTime(timeMs / 1000); - if (!wavesurfer.isPlaying()) { - wavesurfer.play(); - } + if (!wavesurfer || !isReady) { + console.warn('[voice-to-notes] seekTo ignored — audio not ready yet'); + return; + } + const timeSec = timeMs / 1000; + wavesurfer.setTime(timeSec); + if (!wavesurfer.isPlaying()) { + wavesurfer.play(); } } /** Load a new audio file. */ export function loadAudio(url: string) { + isReady = false; wavesurfer?.load(url); } @@ -96,11 +106,17 @@
    - - + - + {currentTime} / {totalTime}
    @@ -130,9 +146,13 @@ cursor: pointer; font-size: 1rem; } - .control-btn:hover { + .control-btn:hover:not(:disabled) { background: #1a4a7a; } + .control-btn:disabled { + opacity: 0.4; + cursor: not-allowed; + } .play-btn { padding: 0.4rem 1rem; font-size: 1.2rem; diff --git a/src/routes/+page.svelte b/src/routes/+page.svelte index d1e1eba..8b88227 100644 --- a/src/routes/+page.svelte +++ b/src/routes/+page.svelte @@ -102,6 +102,7 @@ stage: string; message: string; }>('pipeline-progress', (event) => { + console.log('[voice-to-notes] Progress event:', event.payload); const { percent, stage, message } = event.payload; if (typeof percent === 'number') transcriptionProgress = percent; if (typeof stage === 'string') transcriptionStage = stage; @@ -387,7 +388,8 @@ display: flex; gap: 1rem; padding: 1rem; - height: calc(100vh - 3.5rem); + height: calc(100vh - 3rem); + overflow: hidden; background: #0a0a23; } .main-content { -- 2.47.3 From baf820286ff8f8a2eacb1e793e4300d76a730ccd Mon Sep 17 00:00:00 2001 From: Josh Knapp Date: Thu, 26 Feb 2026 18:08:51 -0800 Subject: [PATCH 05/14] Add HuggingFace token setting for speaker detection MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Add "Speakers" tab in Settings with HF token input field - Include step-by-step instructions for obtaining the token - Pass hf_token from settings through Rust → Python pipeline → diarize - Token can also be set via HF_TOKEN environment variable as fallback - Move skip_diarization checkbox to Speakers tab Co-Authored-By: Claude Opus 4.6 --- python/voice_to_notes/ipc/handlers.py | 1 + python/voice_to_notes/services/diarize.py | 9 ++-- python/voice_to_notes/services/pipeline.py | 2 + src-tauri/src/commands/transcribe.rs | 2 + src/lib/components/SettingsModal.svelte | 55 +++++++++++++++++++++- src/lib/stores/settings.ts | 2 + src/routes/+page.svelte | 1 + 7 files changed, 67 insertions(+), 5 deletions(-) diff --git a/python/voice_to_notes/ipc/handlers.py 
b/python/voice_to_notes/ipc/handlers.py index 5d27734..4334f23 100644 --- a/python/voice_to_notes/ipc/handlers.py +++ b/python/voice_to_notes/ipc/handlers.py @@ -107,6 +107,7 @@ def make_pipeline_handler() -> HandlerFunc: min_speakers=payload.get("min_speakers"), max_speakers=payload.get("max_speakers"), skip_diarization=payload.get("skip_diarization", False), + hf_token=payload.get("hf_token"), ) return IPCMessage( id=msg.id, diff --git a/python/voice_to_notes/services/diarize.py b/python/voice_to_notes/services/diarize.py index 77079c3..c29fdd0 100644 --- a/python/voice_to_notes/services/diarize.py +++ b/python/voice_to_notes/services/diarize.py @@ -35,7 +35,7 @@ class DiarizeService: def __init__(self) -> None: self._pipeline: Any = None - def _ensure_pipeline(self) -> Any: + def _ensure_pipeline(self, hf_token: str | None = None) -> Any: """Load the pyannote diarization pipeline (lazy).""" if self._pipeline is not None: return self._pipeline @@ -44,7 +44,9 @@ class DiarizeService: print("[sidecar] Loading pyannote diarization pipeline...", file=sys.stderr, flush=True) - hf_token = os.environ.get("HF_TOKEN") or os.environ.get("HUGGING_FACE_HUB_TOKEN") or None + # Use token from argument, fall back to environment variable + if not hf_token: + hf_token = os.environ.get("HF_TOKEN") or os.environ.get("HUGGING_FACE_HUB_TOKEN") or None models = [ "pyannote/speaker-diarization-3.1", @@ -81,6 +83,7 @@ class DiarizeService: num_speakers: int | None = None, min_speakers: int | None = None, max_speakers: int | None = None, + hf_token: str | None = None, ) -> DiarizationResult: """Run speaker diarization on an audio file. 
@@ -98,7 +101,7 @@ class DiarizeService: progress_message(request_id, 0, "loading_diarization", "Loading diarization model...") ) - pipeline = self._ensure_pipeline() + pipeline = self._ensure_pipeline(hf_token=hf_token) write_message( progress_message(request_id, 20, "diarizing", "Running speaker diarization...") diff --git a/python/voice_to_notes/services/pipeline.py b/python/voice_to_notes/services/pipeline.py index 59f8186..7ca728e 100644 --- a/python/voice_to_notes/services/pipeline.py +++ b/python/voice_to_notes/services/pipeline.py @@ -60,6 +60,7 @@ class PipelineService: min_speakers: int | None = None, max_speakers: int | None = None, skip_diarization: bool = False, + hf_token: str | None = None, ) -> PipelineResult: """Run the full transcription + diarization pipeline. @@ -123,6 +124,7 @@ class PipelineService: num_speakers=num_speakers, min_speakers=min_speakers, max_speakers=max_speakers, + hf_token=hf_token, ) except Exception as e: print( diff --git a/src-tauri/src/commands/transcribe.rs b/src-tauri/src/commands/transcribe.rs index 2f2e64a..d4c90ea 100644 --- a/src-tauri/src/commands/transcribe.rs +++ b/src-tauri/src/commands/transcribe.rs @@ -52,6 +52,7 @@ pub fn run_pipeline( min_speakers: Option, max_speakers: Option, skip_diarization: Option, + hf_token: Option, ) -> Result { let manager = sidecar(); manager.ensure_running()?; @@ -70,6 +71,7 @@ pub fn run_pipeline( "min_speakers": min_speakers, "max_speakers": max_speakers, "skip_diarization": skip_diarization.unwrap_or(false), + "hf_token": hf_token, }), ); diff --git a/src/lib/components/SettingsModal.svelte b/src/lib/components/SettingsModal.svelte index 659b9e7..1794071 100644 --- a/src/lib/components/SettingsModal.svelte +++ b/src/lib/components/SettingsModal.svelte @@ -9,7 +9,7 @@ let { visible, onClose }: Props = $props(); let localSettings = $state({ ...$settings }); - let activeTab = $state<'transcription' | 'ai' | 'local'>('transcription'); + let activeTab = $state<'transcription' | 
'speakers' | 'ai' | 'local'>('transcription'); // Sync when settings store changes $effect(() => { @@ -46,6 +46,9 @@ + @@ -77,10 +80,27 @@
+ {:else if activeTab === 'speakers'} +
+ + +
+
+

Why is this needed?

+

Speaker detection uses the pyannote.audio model, which is hosted on HuggingFace and requires accepting a license agreement.

+

How to get a token:

+
    +
  1. Create a free account at huggingface.co
  2. +
  3. Go to huggingface.co/pyannote/speaker-diarization-3.1 and accept the license
  4. +
  5. Go to huggingface.co/settings/tokens and create a token with read access
  6. +
  7. Paste the token above and click Save
  8. +
+

The model will be downloaded automatically on first use (~100 MB).

+
{:else if activeTab === 'ai'} @@ -252,6 +272,37 @@ color: #666; line-height: 1.4; } + .info-box { + background: rgba(233, 69, 96, 0.05); + border: 1px solid #2a3a5e; + border-radius: 6px; + padding: 0.75rem 1rem; + margin-bottom: 1rem; + font-size: 0.8rem; + color: #b0b0b0; + line-height: 1.5; + } + .info-box p { + margin: 0 0 0.5rem; + } + .info-box p:last-child { + margin-bottom: 0; + } + .info-box .info-title { + color: #e0e0e0; + font-weight: 600; + font-size: 0.8rem; + } + .info-box ol { + margin: 0.25rem 0 0.5rem; + padding-left: 1.25rem; + } + .info-box li { + margin-bottom: 0.25rem; + } + .info-box strong { + color: #e0e0e0; + } .modal-footer { display: flex; justify-content: flex-end; diff --git a/src/lib/stores/settings.ts b/src/lib/stores/settings.ts index 32da0ee..9eab830 100644 --- a/src/lib/stores/settings.ts +++ b/src/lib/stores/settings.ts @@ -14,6 +14,7 @@ export interface AppSettings { transcription_device: string; transcription_language: string; skip_diarization: boolean; + hf_token: string; } const defaults: AppSettings = { @@ -29,6 +30,7 @@ const defaults: AppSettings = { transcription_device: 'cpu', transcription_language: '', skip_diarization: false, + hf_token: '', }; export const settings = writable({ ...defaults }); diff --git a/src/routes/+page.svelte b/src/routes/+page.svelte index 8b88227..9f8a457 100644 --- a/src/routes/+page.svelte +++ b/src/routes/+page.svelte @@ -133,6 +133,7 @@ device: $settings.transcription_device || undefined, language: $settings.transcription_language || undefined, skipDiarization: $settings.skip_diarization || undefined, + hfToken: $settings.hf_token || undefined, }); // Create speaker entries from pipeline result -- 2.47.3 From a3612c986d3eb1811b6d377916458be90b89c3a0 Mon Sep 17 00:00:00 2001 From: Josh Knapp Date: Thu, 26 Feb 2026 18:21:42 -0800 Subject: [PATCH 06/14] Add Test & Download button for diarization model, clickable links - Add diarize.download IPC handler that downloads the pyannote model and 
returns user-friendly error messages (missing license, bad token) - Add download_diarize_model Tauri command - Add "Test & Download Model" button in Speakers settings tab - Update instructions to list both required model licenses (speaker-diarization-3.1 AND segmentation-3.0) - Make all HuggingFace URLs clickable (opens in system browser) Co-Authored-By: Claude Opus 4.6 --- python/voice_to_notes/ipc/handlers.py | 51 +++++++++++ python/voice_to_notes/main.py | 2 + src-tauri/src/commands/transcribe.rs | 29 +++++++ src-tauri/src/lib.rs | 3 +- src/lib/components/SettingsModal.svelte | 111 ++++++++++++++++++++++-- 5 files changed, 186 insertions(+), 10 deletions(-) diff --git a/python/voice_to_notes/ipc/handlers.py b/python/voice_to_notes/ipc/handlers.py index 4334f23..35a98b5 100644 --- a/python/voice_to_notes/ipc/handlers.py +++ b/python/voice_to_notes/ipc/handlers.py @@ -88,6 +88,57 @@ def make_diarize_handler() -> HandlerFunc: return handler +def make_diarize_download_handler() -> HandlerFunc: + """Create a handler that downloads/validates the diarization model.""" + + def handler(msg: IPCMessage) -> IPCMessage: + payload = msg.payload + hf_token = payload.get("hf_token") + + try: + from pyannote.audio import Pipeline + + print("[sidecar] Downloading diarization model...", file=sys.stderr, flush=True) + pipeline = Pipeline.from_pretrained( + "pyannote/speaker-diarization-3.1", + token=hf_token, + ) + print("[sidecar] Diarization model downloaded successfully", file=sys.stderr, flush=True) + return IPCMessage( + id=msg.id, + type="diarize.download.result", + payload={"ok": True}, + ) + except Exception as e: + error_msg = str(e) + # Make common errors more user-friendly + if "403" in error_msg and "gated" in error_msg.lower(): + # Extract which model needs access + if "segmentation" in error_msg: + error_msg = ( + "Access denied for pyannote/segmentation-3.0. " + "Please visit huggingface.co/pyannote/segmentation-3.0 " + "and accept the license agreement." 
+ ) + elif "speaker-diarization" in error_msg: + error_msg = ( + "Access denied for pyannote/speaker-diarization-3.1. " + "Please visit huggingface.co/pyannote/speaker-diarization-3.1 " + "and accept the license agreement." + ) + else: + error_msg = ( + "Access denied. Please accept the license agreements at: " + "huggingface.co/pyannote/speaker-diarization-3.1 and " + "huggingface.co/pyannote/segmentation-3.0" + ) + elif "401" in error_msg: + error_msg = "Invalid token. Please check your HuggingFace token." + return error_message(msg.id, "download_error", error_msg) + + return handler + + def make_pipeline_handler() -> HandlerFunc: """Create a full pipeline handler (transcribe + diarize + merge).""" from voice_to_notes.services.pipeline import PipelineService, pipeline_result_to_payload diff --git a/python/voice_to_notes/main.py b/python/voice_to_notes/main.py index 77e9e7d..d72d1df 100644 --- a/python/voice_to_notes/main.py +++ b/python/voice_to_notes/main.py @@ -15,6 +15,7 @@ from voice_to_notes.ipc.handlers import ( # noqa: E402 HandlerRegistry, hardware_detect_handler, make_ai_chat_handler, + make_diarize_download_handler, make_diarize_handler, make_export_handler, make_pipeline_handler, @@ -32,6 +33,7 @@ def create_registry() -> HandlerRegistry: registry.register("transcribe.start", make_transcribe_handler()) registry.register("hardware.detect", hardware_detect_handler) registry.register("diarize.start", make_diarize_handler()) + registry.register("diarize.download", make_diarize_download_handler()) registry.register("pipeline.start", make_pipeline_handler()) registry.register("export.start", make_export_handler()) registry.register("ai.chat", make_ai_chat_handler()) diff --git a/src-tauri/src/commands/transcribe.rs b/src-tauri/src/commands/transcribe.rs index d4c90ea..0dc625d 100644 --- a/src-tauri/src/commands/transcribe.rs +++ b/src-tauri/src/commands/transcribe.rs @@ -40,6 +40,35 @@ pub fn transcribe_file( Ok(response.payload) } +/// Download and validate 
the diarization model via the Python sidecar. +#[tauri::command] +pub fn download_diarize_model( + hf_token: String, +) -> Result { + let manager = sidecar(); + manager.ensure_running()?; + + let request_id = uuid::Uuid::new_v4().to_string(); + let msg = IPCMessage::new( + &request_id, + "diarize.download", + json!({ + "hf_token": hf_token, + }), + ); + + let response = manager.send_and_receive(&msg)?; + + if response.msg_type == "error" { + return Ok(json!({ + "ok": false, + "error": response.payload.get("message").and_then(|v| v.as_str()).unwrap_or("unknown"), + })); + } + + Ok(json!({ "ok": true })) +} + /// Run the full transcription + diarization pipeline via the Python sidecar. #[tauri::command] pub fn run_pipeline( diff --git a/src-tauri/src/lib.rs b/src-tauri/src/lib.rs index e13b003..0f3e476 100644 --- a/src-tauri/src/lib.rs +++ b/src-tauri/src/lib.rs @@ -12,7 +12,7 @@ use commands::export::export_transcript; use commands::project::{create_project, get_project, list_projects}; use commands::settings::{load_settings, save_settings}; use commands::system::{get_data_dir, llama_list_models, llama_start, llama_status, llama_stop}; -use commands::transcribe::{run_pipeline, transcribe_file}; +use commands::transcribe::{download_diarize_model, run_pipeline, transcribe_file}; use state::AppState; #[cfg_attr(mobile, tauri::mobile_entry_point)] @@ -36,6 +36,7 @@ pub fn run() { list_projects, transcribe_file, run_pipeline, + download_diarize_model, export_transcript, ai_chat, ai_list_providers, diff --git a/src/lib/components/SettingsModal.svelte b/src/lib/components/SettingsModal.svelte index 1794071..85ed628 100644 --- a/src/lib/components/SettingsModal.svelte +++ b/src/lib/components/SettingsModal.svelte @@ -1,4 +1,6 @@ @@ -50,8 +70,8 @@
- {#each pipelineSteps as step} - {@const status = getStepStatus(step.key, stage)} + {#each pipelineSteps as step, idx} + {@const status = getStepStatus(idx)}
{#if status === 'done'} diff --git a/src/lib/components/SettingsModal.svelte b/src/lib/components/SettingsModal.svelte index 0b95d09..7f8de81 100644 --- a/src/lib/components/SettingsModal.svelte +++ b/src/lib/components/SettingsModal.svelte @@ -14,6 +14,7 @@ let activeTab = $state<'transcription' | 'speakers' | 'ai' | 'local'>('transcription'); let modelStatus = $state<'idle' | 'downloading' | 'success' | 'error'>('idle'); let modelError = $state(''); + let revealedFields = $state>(new Set()); async function testAndDownloadModel() { if (!localSettings.hf_token) { @@ -111,7 +112,10 @@ {:else if activeTab === 'speakers'}
- +
+ + +

Setup (one-time)

@@ -150,6 +154,23 @@ {#if modelStatus === 'error'}

{modelError}

{/if} +
+ + +

Hint the expected number of speakers to speed up diarization clustering.

+
{#if localSettings.ai_provider === 'openai'}
- +
+ + +
@@ -179,13 +203,27 @@ {:else if localSettings.ai_provider === 'anthropic'}
- +
+ + +
{:else if localSettings.ai_provider === 'litellm'} +
+ + +
+
+ +
+ + +
+
@@ -293,11 +331,36 @@ color: #aaa; margin-bottom: 0.3rem; } + .input-reveal { + display: flex; + gap: 0; + } + .input-reveal input { + flex: 1; + border-top-right-radius: 0; + border-bottom-right-radius: 0; + } + .reveal-btn { + background: #0f3460; + border: 1px solid #4a5568; + border-left: none; + color: #aaa; + padding: 0.5rem 0.6rem; + border-radius: 0 4px 4px 0; + cursor: pointer; + font-size: 0.75rem; + white-space: nowrap; + } + .reveal-btn:hover { + color: #e0e0e0; + background: #1a4a7a; + } .field input, .field select { width: 100%; background: #1a1a2e; color: #e0e0e0; + color-scheme: dark; border: 1px solid #4a5568; border-radius: 4px; padding: 0.5rem; diff --git a/src/lib/components/WaveformPlayer.svelte b/src/lib/components/WaveformPlayer.svelte index ae45220..4ffeda9 100644 --- a/src/lib/components/WaveformPlayer.svelte +++ b/src/lib/components/WaveformPlayer.svelte @@ -13,6 +13,7 @@ let container: HTMLDivElement; let wavesurfer: WaveSurfer | null = $state(null); let isReady = $state(false); + let isLoading = $state(false); let currentTime = $state('0:00'); let totalTime = $state('0:00'); @@ -32,6 +33,7 @@ barWidth: 2, barGap: 1, barRadius: 2, + backend: 'WebAudio', }); wavesurfer.on('timeupdate', (time: number) => { @@ -41,6 +43,7 @@ wavesurfer.on('ready', () => { isReady = true; + isLoading = false; const dur = wavesurfer!.getDuration(); durationMs.set(Math.round(dur * 1000)); totalTime = formatTime(dur); @@ -55,7 +58,7 @@ }); if (audioUrl) { - wavesurfer.load(audioUrl); + loadAudio(audioUrl); } }); @@ -89,16 +92,13 @@ console.warn('[voice-to-notes] seekTo ignored — audio not ready yet'); return; } - const timeSec = timeMs / 1000; - wavesurfer.setTime(timeSec); - if (!wavesurfer.isPlaying()) { - wavesurfer.play(); - } + wavesurfer.setTime(timeMs / 1000); } /** Load a new audio file. 
*/ export function loadAudio(url: string) { isReady = false; + isLoading = true; wavesurfer?.load(url); } diff --git a/src/lib/stores/settings.ts b/src/lib/stores/settings.ts index 9eab830..86262c9 100644 --- a/src/lib/stores/settings.ts +++ b/src/lib/stores/settings.ts @@ -8,6 +8,8 @@ export interface AppSettings { openai_model: string; anthropic_model: string; litellm_model: string; + litellm_api_key: string; + litellm_api_base: string; local_model_path: string; local_binary_path: string; transcription_model: string; @@ -15,6 +17,7 @@ export interface AppSettings { transcription_language: string; skip_diarization: boolean; hf_token: string; + num_speakers: number | null; } const defaults: AppSettings = { @@ -24,6 +27,8 @@ const defaults: AppSettings = { openai_model: 'gpt-4o-mini', anthropic_model: 'claude-sonnet-4-6', litellm_model: 'gpt-4o-mini', + litellm_api_key: '', + litellm_api_base: '', local_model_path: '', local_binary_path: 'llama-server', transcription_model: 'base', @@ -31,6 +36,7 @@ const defaults: AppSettings = { transcription_language: '', skip_diarization: false, hf_token: '', + num_speakers: null, }; export const settings = writable({ ...defaults }); @@ -47,4 +53,20 @@ export async function loadSettings(): Promise { export async function saveSettings(s: AppSettings): Promise { settings.set(s); await invoke('save_settings', { settings: s }); + + // Configure the AI provider in the Python sidecar + const configMap: Record> = { + openai: { api_key: s.openai_api_key, model: s.openai_model }, + anthropic: { api_key: s.anthropic_api_key, model: s.anthropic_model }, + litellm: { api_key: s.litellm_api_key, api_base: s.litellm_api_base, model: s.litellm_model }, + local: { model: s.local_model_path, base_url: 'http://localhost:8080' }, + }; + const config = configMap[s.ai_provider]; + if (config) { + try { + await invoke('ai_configure', { provider: s.ai_provider, config }); + } catch { + // Sidecar may not be running yet — provider will be configured on 
first use + } + } } diff --git a/src/routes/+page.svelte b/src/routes/+page.svelte index 02efd76..5e0f382 100644 --- a/src/routes/+page.svelte +++ b/src/routes/+page.svelte @@ -13,6 +13,7 @@ import type { Segment, Speaker } from '$lib/types/transcript'; import { onMount, tick } from 'svelte'; + let appReady = $state(false); let waveformPlayer: WaveformPlayer; let audioUrl = $state(''); let showSettings = $state(false); @@ -54,6 +55,8 @@ document.addEventListener('keydown', handleKeyDown); document.addEventListener('click', handleClickOutside); + appReady = true; + return () => { document.removeEventListener('keydown', handleKeyDown); document.removeEventListener('click', handleClickOutside); @@ -200,6 +203,7 @@ language: $settings.transcription_language || undefined, skipDiarization: $settings.skip_diarization || undefined, hfToken: $settings.hf_token || undefined, + numSpeakers: $settings.num_speakers && $settings.num_speakers > 0 ? $settings.num_speakers : undefined, }); // Create speaker entries from pipeline result @@ -303,60 +307,70 @@ } -
-

Voice to Notes

-
- - - {#if $segments.length > 0} -
- - {#if showExportMenu} -
- {#each exportFormats as fmt} - - {/each} -
+{#if !appReady} +
+

Voice to Notes

+

Loading...

+
+
+{:else} +
+
+

Voice to Notes

+
+
- {/if} + + + {#if $segments.length > 0} +
+ + {#if showExportMenu} +
+ {#each exportFormats as fmt} + + {/each} +
+ {/if} +
+ {/if} +
-
-
-
- - +
+
+ + +
+
- -
- + - showSettings = false} -/> + showSettings = false} + /> +{/if} -- 2.47.3 From 09d7c2064fa922df849ee7a64d2441c772a8161a Mon Sep 17 00:00:00 2001 From: Claude Date: Fri, 20 Mar 2026 21:52:21 -0700 Subject: [PATCH 14/14] Add release job to Gitea Actions workflow Creates a pre-release with all platform artifacts on every push to main. Uses BUILD_TOKEN secret for Gitea API authentication. Co-Authored-By: Claude Opus 4.6 (1M context) --- .gitea/workflows/build.yml | 43 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 43 insertions(+) diff --git a/.gitea/workflows/build.yml b/.gitea/workflows/build.yml index 92000a5..bc9e21a 100644 --- a/.gitea/workflows/build.yml +++ b/.gitea/workflows/build.yml @@ -134,3 +134,46 @@ jobs: src-tauri/target/release/bundle/dmg/*.dmg src-tauri/target/release/bundle/macos/*.app retention-days: 30 + + release: + name: Create Release + needs: build-tauri + if: github.ref == 'refs/heads/main' + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + + - name: Download all app artifacts + uses: actions/download-artifact@v4 + with: + path: artifacts/ + pattern: app-* + + - name: Generate release tag + id: tag + run: echo "tag=build-$(date +%Y%m%d-%H%M%S)" >> $GITHUB_OUTPUT + + - name: Create release + env: + BUILD_TOKEN: ${{ secrets.BUILD_TOKEN }} + TAG: ${{ steps.tag.outputs.tag }} + run: | + # Create the release + RELEASE_ID=$(curl -s -X POST \ + -H "Authorization: token ${BUILD_TOKEN}" \ + -H "Content-Type: application/json" \ + -d "{\"tag_name\": \"${TAG}\", \"name\": \"Voice to Notes ${TAG}\", \"body\": \"Automated build from main branch.\", \"draft\": false, \"prerelease\": true}" \ + "${GITHUB_SERVER_URL}/api/v1/repos/${GITHUB_REPOSITORY}/releases" | jq -r '.id') + + echo "Release ID: ${RELEASE_ID}" + + # Upload all artifacts + find artifacts/ -type f \( -name "*.deb" -o -name "*.AppImage" -o -name "*.msi" -o -name "*.exe" -o -name "*.dmg" \) | while read file; do + filename=$(basename "$file") + echo "Uploading 
${filename}..." + curl -s -X POST \ + -H "Authorization: token ${BUILD_TOKEN}" \ + -H "Content-Type: application/octet-stream" \ + --data-binary "@${file}" \ + "${GITHUB_SERVER_URL}/api/v1/repos/${GITHUB_REPOSITORY}/releases/${RELEASE_ID}/assets?name=${filename}" + done -- 2.47.3