Add HuggingFace token setting for speaker detection

- Add "Speakers" tab in Settings with HF token input field
- Include step-by-step instructions for obtaining the token
- Pass hf_token from settings through Rust → Python pipeline → diarize
- Token can also be set via the HF_TOKEN (or HUGGING_FACE_HUB_TOKEN) environment variable, used as a fallback when no token is provided from settings
- Move skip_diarization checkbox to Speakers tab

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-02-26 18:08:51 -08:00
parent ed626b8ba0
commit baf820286f
7 changed files with 67 additions and 5 deletions

View File

@@ -107,6 +107,7 @@ def make_pipeline_handler() -> HandlerFunc:
min_speakers=payload.get("min_speakers"), min_speakers=payload.get("min_speakers"),
max_speakers=payload.get("max_speakers"), max_speakers=payload.get("max_speakers"),
skip_diarization=payload.get("skip_diarization", False), skip_diarization=payload.get("skip_diarization", False),
hf_token=payload.get("hf_token"),
) )
return IPCMessage( return IPCMessage(
id=msg.id, id=msg.id,

View File

@@ -35,7 +35,7 @@ class DiarizeService:
def __init__(self) -> None: def __init__(self) -> None:
self._pipeline: Any = None self._pipeline: Any = None
def _ensure_pipeline(self) -> Any: def _ensure_pipeline(self, hf_token: str | None = None) -> Any:
"""Load the pyannote diarization pipeline (lazy).""" """Load the pyannote diarization pipeline (lazy)."""
if self._pipeline is not None: if self._pipeline is not None:
return self._pipeline return self._pipeline
@@ -44,7 +44,9 @@ class DiarizeService:
print("[sidecar] Loading pyannote diarization pipeline...", file=sys.stderr, flush=True) print("[sidecar] Loading pyannote diarization pipeline...", file=sys.stderr, flush=True)
hf_token = os.environ.get("HF_TOKEN") or os.environ.get("HUGGING_FACE_HUB_TOKEN") or None # Use token from argument, fall back to environment variable
if not hf_token:
hf_token = os.environ.get("HF_TOKEN") or os.environ.get("HUGGING_FACE_HUB_TOKEN") or None
models = [ models = [
"pyannote/speaker-diarization-3.1", "pyannote/speaker-diarization-3.1",
@@ -81,6 +83,7 @@ class DiarizeService:
num_speakers: int | None = None, num_speakers: int | None = None,
min_speakers: int | None = None, min_speakers: int | None = None,
max_speakers: int | None = None, max_speakers: int | None = None,
hf_token: str | None = None,
) -> DiarizationResult: ) -> DiarizationResult:
"""Run speaker diarization on an audio file. """Run speaker diarization on an audio file.
@@ -98,7 +101,7 @@ class DiarizeService:
progress_message(request_id, 0, "loading_diarization", "Loading diarization model...") progress_message(request_id, 0, "loading_diarization", "Loading diarization model...")
) )
pipeline = self._ensure_pipeline() pipeline = self._ensure_pipeline(hf_token=hf_token)
write_message( write_message(
progress_message(request_id, 20, "diarizing", "Running speaker diarization...") progress_message(request_id, 20, "diarizing", "Running speaker diarization...")

View File

@@ -60,6 +60,7 @@ class PipelineService:
min_speakers: int | None = None, min_speakers: int | None = None,
max_speakers: int | None = None, max_speakers: int | None = None,
skip_diarization: bool = False, skip_diarization: bool = False,
hf_token: str | None = None,
) -> PipelineResult: ) -> PipelineResult:
"""Run the full transcription + diarization pipeline. """Run the full transcription + diarization pipeline.
@@ -123,6 +124,7 @@ class PipelineService:
num_speakers=num_speakers, num_speakers=num_speakers,
min_speakers=min_speakers, min_speakers=min_speakers,
max_speakers=max_speakers, max_speakers=max_speakers,
hf_token=hf_token,
) )
except Exception as e: except Exception as e:
print( print(

View File

@@ -52,6 +52,7 @@ pub fn run_pipeline(
min_speakers: Option<u32>, min_speakers: Option<u32>,
max_speakers: Option<u32>, max_speakers: Option<u32>,
skip_diarization: Option<bool>, skip_diarization: Option<bool>,
hf_token: Option<String>,
) -> Result<Value, String> { ) -> Result<Value, String> {
let manager = sidecar(); let manager = sidecar();
manager.ensure_running()?; manager.ensure_running()?;
@@ -70,6 +71,7 @@ pub fn run_pipeline(
"min_speakers": min_speakers, "min_speakers": min_speakers,
"max_speakers": max_speakers, "max_speakers": max_speakers,
"skip_diarization": skip_diarization.unwrap_or(false), "skip_diarization": skip_diarization.unwrap_or(false),
"hf_token": hf_token,
}), }),
); );

View File

@@ -9,7 +9,7 @@
let { visible, onClose }: Props = $props(); let { visible, onClose }: Props = $props();
let localSettings = $state<AppSettings>({ ...$settings }); let localSettings = $state<AppSettings>({ ...$settings });
let activeTab = $state<'transcription' | 'ai' | 'local'>('transcription'); let activeTab = $state<'transcription' | 'speakers' | 'ai' | 'local'>('transcription');
// Sync when settings store changes // Sync when settings store changes
$effect(() => { $effect(() => {
@@ -46,6 +46,9 @@
<button class="tab" class:active={activeTab === 'transcription'} onclick={() => activeTab = 'transcription'}> <button class="tab" class:active={activeTab === 'transcription'} onclick={() => activeTab = 'transcription'}>
Transcription Transcription
</button> </button>
<button class="tab" class:active={activeTab === 'speakers'} onclick={() => activeTab = 'speakers'}>
Speakers
</button>
<button class="tab" class:active={activeTab === 'ai'} onclick={() => activeTab = 'ai'}> <button class="tab" class:active={activeTab === 'ai'} onclick={() => activeTab = 'ai'}>
AI Provider AI Provider
</button> </button>
@@ -77,10 +80,27 @@
<label for="stt-lang">Language (blank = auto-detect)</label> <label for="stt-lang">Language (blank = auto-detect)</label>
<input id="stt-lang" type="text" bind:value={localSettings.transcription_language} placeholder="e.g., en, es, fr" /> <input id="stt-lang" type="text" bind:value={localSettings.transcription_language} placeholder="e.g., en, es, fr" />
</div> </div>
{:else if activeTab === 'speakers'}
<div class="field">
<label for="hf-token">HuggingFace Token</label>
<input id="hf-token" type="password" bind:value={localSettings.hf_token} placeholder="hf_..." />
</div>
<div class="info-box">
<p class="info-title">Why is this needed?</p>
<p>Speaker detection uses the <strong>pyannote.audio</strong> model, which is hosted on HuggingFace and requires accepting a license agreement.</p>
<p class="info-title">How to get a token:</p>
<ol>
<li>Create a free account at <strong>huggingface.co</strong></li>
<li>Go to <strong>huggingface.co/pyannote/speaker-diarization-3.1</strong> and accept the license</li>
<li>Go to <strong>huggingface.co/settings/tokens</strong> and create a token with <em>read</em> access</li>
<li>Paste the token above and click Save</li>
</ol>
<p>The model will be downloaded automatically on first use (~100 MB).</p>
</div>
<div class="field checkbox"> <div class="field checkbox">
<label> <label>
<input type="checkbox" bind:checked={localSettings.skip_diarization} /> <input type="checkbox" bind:checked={localSettings.skip_diarization} />
Skip speaker diarization (faster, no speaker labels) Skip speaker detection (faster, no speaker labels)
</label> </label>
</div> </div>
{:else if activeTab === 'ai'} {:else if activeTab === 'ai'}
@@ -252,6 +272,37 @@
color: #666; color: #666;
line-height: 1.4; line-height: 1.4;
} }
/* Explanatory panel styles: container for the "Why is this needed?" /
   "How to get a token" help text rendered below the HuggingFace token field. */
.info-box {
background: rgba(233, 69, 96, 0.05);
border: 1px solid #2a3a5e;
border-radius: 6px;
padding: 0.75rem 1rem;
margin-bottom: 1rem;
font-size: 0.8rem;
color: #b0b0b0;
line-height: 1.5;
}
/* Paragraphs inside the panel: no top margin, small gap below each one. */
.info-box p {
margin: 0 0 0.5rem;
}
/* Drop the trailing gap when a paragraph is the last child of its parent. */
.info-box p:last-child {
margin-bottom: 0;
}
/* Section headings inside the panel: brighter and bolder than body text. */
.info-box .info-title {
color: #e0e0e0;
font-weight: 600;
font-size: 0.8rem;
}
/* Numbered step list: tight vertical spacing, indented for the markers. */
.info-box ol {
margin: 0.25rem 0 0.5rem;
padding-left: 1.25rem;
}
/* Small gap between individual list items. */
.info-box li {
margin-bottom: 0.25rem;
}
/* Emphasized terms use the same brighter color as the headings. */
.info-box strong {
color: #e0e0e0;
}
.modal-footer { .modal-footer {
display: flex; display: flex;
justify-content: flex-end; justify-content: flex-end;

View File

@@ -14,6 +14,7 @@ export interface AppSettings {
transcription_device: string; transcription_device: string;
transcription_language: string; transcription_language: string;
skip_diarization: boolean; skip_diarization: boolean;
hf_token: string;
} }
const defaults: AppSettings = { const defaults: AppSettings = {
@@ -29,6 +30,7 @@ const defaults: AppSettings = {
transcription_device: 'cpu', transcription_device: 'cpu',
transcription_language: '', transcription_language: '',
skip_diarization: false, skip_diarization: false,
hf_token: '',
}; };
export const settings = writable<AppSettings>({ ...defaults }); export const settings = writable<AppSettings>({ ...defaults });

View File

@@ -133,6 +133,7 @@
device: $settings.transcription_device || undefined, device: $settings.transcription_device || undefined,
language: $settings.transcription_language || undefined, language: $settings.transcription_language || undefined,
skipDiarization: $settings.skip_diarization || undefined, skipDiarization: $settings.skip_diarization || undefined,
hfToken: $settings.hf_token || undefined,
}); });
// Create speaker entries from pipeline result // Create speaker entries from pipeline result