From baf820286ff8f8a2eacb1e793e4300d76a730ccd Mon Sep 17 00:00:00 2001 From: Josh Knapp Date: Thu, 26 Feb 2026 18:08:51 -0800 Subject: [PATCH] Add HuggingFace token setting for speaker detection MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Add "Speakers" tab in Settings with HF token input field - Include step-by-step instructions for obtaining the token - Pass hf_token from settings through Rust → Python pipeline → diarize - Token can also be set via HF_TOKEN environment variable as fallback - Move skip_diarization checkbox to Speakers tab Co-Authored-By: Claude Opus 4.6 --- python/voice_to_notes/ipc/handlers.py | 1 + python/voice_to_notes/services/diarize.py | 9 ++-- python/voice_to_notes/services/pipeline.py | 2 + src-tauri/src/commands/transcribe.rs | 2 + src/lib/components/SettingsModal.svelte | 55 +++++++++++++++++++++- src/lib/stores/settings.ts | 2 + src/routes/+page.svelte | 1 + 7 files changed, 67 insertions(+), 5 deletions(-) diff --git a/python/voice_to_notes/ipc/handlers.py b/python/voice_to_notes/ipc/handlers.py index 5d27734..4334f23 100644 --- a/python/voice_to_notes/ipc/handlers.py +++ b/python/voice_to_notes/ipc/handlers.py @@ -107,6 +107,7 @@ def make_pipeline_handler() -> HandlerFunc: min_speakers=payload.get("min_speakers"), max_speakers=payload.get("max_speakers"), skip_diarization=payload.get("skip_diarization", False), + hf_token=payload.get("hf_token"), ) return IPCMessage( id=msg.id, diff --git a/python/voice_to_notes/services/diarize.py b/python/voice_to_notes/services/diarize.py index 77079c3..c29fdd0 100644 --- a/python/voice_to_notes/services/diarize.py +++ b/python/voice_to_notes/services/diarize.py @@ -35,7 +35,7 @@ class DiarizeService: def __init__(self) -> None: self._pipeline: Any = None - def _ensure_pipeline(self) -> Any: + def _ensure_pipeline(self, hf_token: str | None = None) -> Any: """Load the pyannote diarization pipeline (lazy).""" if self._pipeline is not None: return 
self._pipeline @@ -44,7 +44,9 @@ class DiarizeService: print("[sidecar] Loading pyannote diarization pipeline...", file=sys.stderr, flush=True) - hf_token = os.environ.get("HF_TOKEN") or os.environ.get("HUGGING_FACE_HUB_TOKEN") or None + # Use token from argument, fall back to environment variable + if not hf_token: + hf_token = os.environ.get("HF_TOKEN") or os.environ.get("HUGGING_FACE_HUB_TOKEN") or None models = [ "pyannote/speaker-diarization-3.1", @@ -81,6 +83,7 @@ class DiarizeService: num_speakers: int | None = None, min_speakers: int | None = None, max_speakers: int | None = None, + hf_token: str | None = None, ) -> DiarizationResult: """Run speaker diarization on an audio file. @@ -98,7 +101,7 @@ class DiarizeService: progress_message(request_id, 0, "loading_diarization", "Loading diarization model...") ) - pipeline = self._ensure_pipeline() + pipeline = self._ensure_pipeline(hf_token=hf_token) write_message( progress_message(request_id, 20, "diarizing", "Running speaker diarization...") diff --git a/python/voice_to_notes/services/pipeline.py b/python/voice_to_notes/services/pipeline.py index 59f8186..7ca728e 100644 --- a/python/voice_to_notes/services/pipeline.py +++ b/python/voice_to_notes/services/pipeline.py @@ -60,6 +60,7 @@ class PipelineService: min_speakers: int | None = None, max_speakers: int | None = None, skip_diarization: bool = False, + hf_token: str | None = None, ) -> PipelineResult: """Run the full transcription + diarization pipeline. 
@@ -123,6 +124,7 @@ class PipelineService: num_speakers=num_speakers, min_speakers=min_speakers, max_speakers=max_speakers, + hf_token=hf_token, ) except Exception as e: print( diff --git a/src-tauri/src/commands/transcribe.rs b/src-tauri/src/commands/transcribe.rs index 2f2e64a..d4c90ea 100644 --- a/src-tauri/src/commands/transcribe.rs +++ b/src-tauri/src/commands/transcribe.rs @@ -52,6 +52,7 @@ pub fn run_pipeline( min_speakers: Option, max_speakers: Option, skip_diarization: Option, + hf_token: Option, ) -> Result { let manager = sidecar(); manager.ensure_running()?; @@ -70,6 +71,7 @@ pub fn run_pipeline( "min_speakers": min_speakers, "max_speakers": max_speakers, "skip_diarization": skip_diarization.unwrap_or(false), + "hf_token": hf_token, }), ); diff --git a/src/lib/components/SettingsModal.svelte b/src/lib/components/SettingsModal.svelte index 659b9e7..1794071 100644 --- a/src/lib/components/SettingsModal.svelte +++ b/src/lib/components/SettingsModal.svelte @@ -9,7 +9,7 @@ let { visible, onClose }: Props = $props(); let localSettings = $state({ ...$settings }); - let activeTab = $state<'transcription' | 'ai' | 'local'>('transcription'); + let activeTab = $state<'transcription' | 'speakers' | 'ai' | 'local'>('transcription'); // Sync when settings store changes $effect(() => { @@ -46,6 +46,9 @@ + @@ -77,10 +80,27 @@ + {:else if activeTab === 'speakers'} +
+ + +
+
+              <div class="info-box">
+                <p class="info-title">Why is this needed?</p>
+                <p>
+                  Speaker detection uses the pyannote.audio model, which is hosted on
+                  HuggingFace and requires accepting a license agreement.
+                </p>
+                <p class="info-title">How to get a token:</p>
+                <ol>
+                  <li>Create a free account at huggingface.co</li>
+                  <li>Go to huggingface.co/pyannote/speaker-diarization-3.1 and accept the license</li>
+                  <li>Go to huggingface.co/settings/tokens and create a token with read access</li>
+                  <li>Paste the token above and click Save</li>
+                </ol>
+                <p>The model will be downloaded automatically on first use (~100 MB).</p>
+              </div>
{:else if activeTab === 'ai'} @@ -252,6 +272,37 @@ color: #666; line-height: 1.4; } + .info-box { + background: rgba(233, 69, 96, 0.05); + border: 1px solid #2a3a5e; + border-radius: 6px; + padding: 0.75rem 1rem; + margin-bottom: 1rem; + font-size: 0.8rem; + color: #b0b0b0; + line-height: 1.5; + } + .info-box p { + margin: 0 0 0.5rem; + } + .info-box p:last-child { + margin-bottom: 0; + } + .info-box .info-title { + color: #e0e0e0; + font-weight: 600; + font-size: 0.8rem; + } + .info-box ol { + margin: 0.25rem 0 0.5rem; + padding-left: 1.25rem; + } + .info-box li { + margin-bottom: 0.25rem; + } + .info-box strong { + color: #e0e0e0; + } .modal-footer { display: flex; justify-content: flex-end; diff --git a/src/lib/stores/settings.ts b/src/lib/stores/settings.ts index 32da0ee..9eab830 100644 --- a/src/lib/stores/settings.ts +++ b/src/lib/stores/settings.ts @@ -14,6 +14,7 @@ export interface AppSettings { transcription_device: string; transcription_language: string; skip_diarization: boolean; + hf_token: string; } const defaults: AppSettings = { @@ -29,6 +30,7 @@ const defaults: AppSettings = { transcription_device: 'cpu', transcription_language: '', skip_diarization: false, + hf_token: '', }; export const settings = writable({ ...defaults }); diff --git a/src/routes/+page.svelte b/src/routes/+page.svelte index 8b88227..9f8a457 100644 --- a/src/routes/+page.svelte +++ b/src/routes/+page.svelte @@ -133,6 +133,7 @@ device: $settings.transcription_device || undefined, language: $settings.transcription_language || undefined, skipDiarization: $settings.skip_diarization || undefined, + hfToken: $settings.hf_token || undefined, }); // Create speaker entries from pipeline result