From baf820286ff8f8a2eacb1e793e4300d76a730ccd Mon Sep 17 00:00:00 2001 From: Josh Knapp Date: Thu, 26 Feb 2026 18:08:51 -0800 Subject: [PATCH] Add HuggingFace token setting for speaker detection MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Add "Speakers" tab in Settings with HF token input field - Include step-by-step instructions for obtaining the token - Pass hf_token from settings through Rust → Python pipeline → diarize - Token can also be set via HF_TOKEN environment variable as fallback - Move skip_diarization checkbox to Speakers tab Co-Authored-By: Claude Opus 4.6 --- python/voice_to_notes/ipc/handlers.py | 1 + python/voice_to_notes/services/diarize.py | 9 ++-- python/voice_to_notes/services/pipeline.py | 2 + src-tauri/src/commands/transcribe.rs | 2 + src/lib/components/SettingsModal.svelte | 55 +++++++++++++++++++++- src/lib/stores/settings.ts | 2 + src/routes/+page.svelte | 1 + 7 files changed, 67 insertions(+), 5 deletions(-) diff --git a/python/voice_to_notes/ipc/handlers.py b/python/voice_to_notes/ipc/handlers.py index 5d27734..4334f23 100644 --- a/python/voice_to_notes/ipc/handlers.py +++ b/python/voice_to_notes/ipc/handlers.py @@ -107,6 +107,7 @@ def make_pipeline_handler() -> HandlerFunc: min_speakers=payload.get("min_speakers"), max_speakers=payload.get("max_speakers"), skip_diarization=payload.get("skip_diarization", False), + hf_token=payload.get("hf_token"), ) return IPCMessage( id=msg.id, diff --git a/python/voice_to_notes/services/diarize.py b/python/voice_to_notes/services/diarize.py index 77079c3..c29fdd0 100644 --- a/python/voice_to_notes/services/diarize.py +++ b/python/voice_to_notes/services/diarize.py @@ -35,7 +35,7 @@ class DiarizeService: def __init__(self) -> None: self._pipeline: Any = None - def _ensure_pipeline(self) -> Any: + def _ensure_pipeline(self, hf_token: str | None = None) -> Any: """Load the pyannote diarization pipeline (lazy).""" if self._pipeline is not None: return 
self._pipeline @@ -44,7 +44,9 @@ class DiarizeService: print("[sidecar] Loading pyannote diarization pipeline...", file=sys.stderr, flush=True) - hf_token = os.environ.get("HF_TOKEN") or os.environ.get("HUGGING_FACE_HUB_TOKEN") or None + # Use token from argument, fall back to environment variable + if not hf_token: + hf_token = os.environ.get("HF_TOKEN") or os.environ.get("HUGGING_FACE_HUB_TOKEN") or None models = [ "pyannote/speaker-diarization-3.1", @@ -81,6 +83,7 @@ class DiarizeService: num_speakers: int | None = None, min_speakers: int | None = None, max_speakers: int | None = None, + hf_token: str | None = None, ) -> DiarizationResult: """Run speaker diarization on an audio file. @@ -98,7 +101,7 @@ class DiarizeService: progress_message(request_id, 0, "loading_diarization", "Loading diarization model...") ) - pipeline = self._ensure_pipeline() + pipeline = self._ensure_pipeline(hf_token=hf_token) write_message( progress_message(request_id, 20, "diarizing", "Running speaker diarization...") diff --git a/python/voice_to_notes/services/pipeline.py b/python/voice_to_notes/services/pipeline.py index 59f8186..7ca728e 100644 --- a/python/voice_to_notes/services/pipeline.py +++ b/python/voice_to_notes/services/pipeline.py @@ -60,6 +60,7 @@ class PipelineService: min_speakers: int | None = None, max_speakers: int | None = None, skip_diarization: bool = False, + hf_token: str | None = None, ) -> PipelineResult: """Run the full transcription + diarization pipeline. 
@@ -123,6 +124,7 @@ class PipelineService: num_speakers=num_speakers, min_speakers=min_speakers, max_speakers=max_speakers, + hf_token=hf_token, ) except Exception as e: print( diff --git a/src-tauri/src/commands/transcribe.rs b/src-tauri/src/commands/transcribe.rs index 2f2e64a..d4c90ea 100644 --- a/src-tauri/src/commands/transcribe.rs +++ b/src-tauri/src/commands/transcribe.rs @@ -52,6 +52,7 @@ pub fn run_pipeline( min_speakers: Option, max_speakers: Option, skip_diarization: Option, + hf_token: Option, ) -> Result { let manager = sidecar(); manager.ensure_running()?; @@ -70,6 +71,7 @@ pub fn run_pipeline( "min_speakers": min_speakers, "max_speakers": max_speakers, "skip_diarization": skip_diarization.unwrap_or(false), + "hf_token": hf_token, }), ); diff --git a/src/lib/components/SettingsModal.svelte b/src/lib/components/SettingsModal.svelte index 659b9e7..1794071 100644 --- a/src/lib/components/SettingsModal.svelte +++ b/src/lib/components/SettingsModal.svelte @@ -9,7 +9,7 @@ let { visible, onClose }: Props = $props(); let localSettings = $state({ ...$settings }); - let activeTab = $state<'transcription' | 'ai' | 'local'>('transcription'); + let activeTab = $state<'transcription' | 'speakers' | 'ai' | 'local'>('transcription'); // Sync when settings store changes $effect(() => { @@ -46,6 +46,9 @@ + @@ -77,10 +80,27 @@ + {:else if activeTab === 'speakers'} +
+ + +
+
+              <div class="info-box">
+                <p class="info-title">Why is this needed?</p>
+                <p>
+                  Speaker detection uses the pyannote.audio model, which is hosted on
+                  HuggingFace and requires accepting a license agreement.
+                </p>
+                <p class="info-title">How to get a token:</p>
+                <ol>
+                  <li>Create a free account at huggingface.co</li>
+                  <li>Go to huggingface.co/pyannote/speaker-diarization-3.1 and accept the license</li>
+                  <li>Go to huggingface.co/settings/tokens and create a token with read access</li>
+                  <li>Paste the token above and click Save</li>
+                </ol>
+                <p>The model will be downloaded automatically on first use (~100 MB).</p>
+              </div>
{:else if activeTab === 'ai'} @@ -252,6 +272,37 @@ color: #666; line-height: 1.4; } + .info-box { + background: rgba(233, 69, 96, 0.05); + border: 1px solid #2a3a5e; + border-radius: 6px; + padding: 0.75rem 1rem; + margin-bottom: 1rem; + font-size: 0.8rem; + color: #b0b0b0; + line-height: 1.5; + } + .info-box p { + margin: 0 0 0.5rem; + } + .info-box p:last-child { + margin-bottom: 0; + } + .info-box .info-title { + color: #e0e0e0; + font-weight: 600; + font-size: 0.8rem; + } + .info-box ol { + margin: 0.25rem 0 0.5rem; + padding-left: 1.25rem; + } + .info-box li { + margin-bottom: 0.25rem; + } + .info-box strong { + color: #e0e0e0; + } .modal-footer { display: flex; justify-content: flex-end; diff --git a/src/lib/stores/settings.ts b/src/lib/stores/settings.ts index 32da0ee..9eab830 100644 --- a/src/lib/stores/settings.ts +++ b/src/lib/stores/settings.ts @@ -14,6 +14,7 @@ export interface AppSettings { transcription_device: string; transcription_language: string; skip_diarization: boolean; + hf_token: string; } const defaults: AppSettings = { @@ -29,6 +30,7 @@ const defaults: AppSettings = { transcription_device: 'cpu', transcription_language: '', skip_diarization: false, + hf_token: '', }; export const settings = writable({ ...defaults }); diff --git a/src/routes/+page.svelte b/src/routes/+page.svelte index 8b88227..9f8a457 100644 --- a/src/routes/+page.svelte +++ b/src/routes/+page.svelte @@ -133,6 +133,7 @@ device: $settings.transcription_device || undefined, language: $settings.transcription_language || undefined, skipDiarization: $settings.skip_diarization || undefined, + hfToken: $settings.hf_token || undefined, }); // Create speaker entries from pipeline result