Fix speaker diarization: WAV conversion, pyannote 4.0 compat, telemetry bug

- Convert non-WAV audio to 16kHz mono WAV before diarization (pyannote
  v4.0.4 AudioDecoder returns None duration for FLAC, causing crash)
- Handle pyannote 4.0 DiarizeOutput return type (unwrap .speaker_diarization)
- Disable pyannote telemetry (np.isfinite(None) bug with max_speakers)
- Use huggingface_hub.login() to persist token for all sub-downloads
- Pre-download sub-models (segmentation-3.0, speaker-diarization-community-1)
- Add third required model license link in settings UI
- Improve SpeakerManager hints based on settings state
- Add word-wrap to transcript text

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-02-26 19:46:07 -08:00
parent a3612c986d
commit 585411f402
6 changed files with 133 additions and 25 deletions

View File

@@ -90,15 +90,40 @@ def make_diarize_handler() -> HandlerFunc:
def make_diarize_download_handler() -> HandlerFunc: def make_diarize_download_handler() -> HandlerFunc:
"""Create a handler that downloads/validates the diarization model.""" """Create a handler that downloads/validates the diarization model."""
import os
def handler(msg: IPCMessage) -> IPCMessage: def handler(msg: IPCMessage) -> IPCMessage:
payload = msg.payload payload = msg.payload
hf_token = payload.get("hf_token") hf_token = payload.get("hf_token")
try: try:
import huggingface_hub
# Disable pyannote telemetry (has a bug in v4.0.4)
os.environ.setdefault("PYANNOTE_METRICS_ENABLED", "false")
from pyannote.audio import Pipeline from pyannote.audio import Pipeline
print("[sidecar] Downloading diarization model...", file=sys.stderr, flush=True) # Persist token globally so ALL huggingface_hub downloads use auth.
# Setting env var alone isn't enough — pyannote's internal sub-downloads
# (e.g. PLDA.from_pretrained) don't forward the token= parameter.
# login() writes the token to ~/.cache/huggingface/token which
# huggingface_hub reads automatically for all downloads.
if hf_token:
os.environ["HF_TOKEN"] = hf_token
huggingface_hub.login(token=hf_token, add_to_git_credential=False)
# Pre-download sub-models that pyannote loads internally.
# This ensures they're cached before Pipeline.from_pretrained
# tries to load them (where token forwarding can fail).
sub_models = [
"pyannote/segmentation-3.0",
"pyannote/speaker-diarization-community-1",
]
for model_id in sub_models:
print(f"[sidecar] Pre-downloading {model_id}...", file=sys.stderr, flush=True)
huggingface_hub.snapshot_download(model_id, token=hf_token)
print("[sidecar] Downloading diarization pipeline...", file=sys.stderr, flush=True)
pipeline = Pipeline.from_pretrained( pipeline = Pipeline.from_pretrained(
"pyannote/speaker-diarization-3.1", "pyannote/speaker-diarization-3.1",
token=hf_token, token=hf_token,
@@ -111,26 +136,23 @@ def make_diarize_download_handler() -> HandlerFunc:
) )
except Exception as e: except Exception as e:
error_msg = str(e) error_msg = str(e)
print(f"[sidecar] Model download error: {error_msg}", file=sys.stderr, flush=True)
# Make common errors more user-friendly # Make common errors more user-friendly
if "403" in error_msg and "gated" in error_msg.lower(): if "403" in error_msg or "gated" in error_msg.lower():
# Extract which model needs access # Try to extract the specific model name from the error
if "segmentation" in error_msg: import re
model_match = re.search(r"pyannote/[\w-]+", error_msg)
if model_match:
model_name = model_match.group(0)
error_msg = ( error_msg = (
"Access denied for pyannote/segmentation-3.0. " f"Access denied for {model_name}. "
"Please visit huggingface.co/pyannote/segmentation-3.0 " f"Please visit huggingface.co/{model_name} "
"and accept the license agreement." f"and accept the license agreement, then try again."
)
elif "speaker-diarization" in error_msg:
error_msg = (
"Access denied for pyannote/speaker-diarization-3.1. "
"Please visit huggingface.co/pyannote/speaker-diarization-3.1 "
"and accept the license agreement."
) )
else: else:
error_msg = ( error_msg = (
"Access denied. Please accept the license agreements at: " "Access denied. Please accept the license agreements for all "
"huggingface.co/pyannote/speaker-diarization-3.1 and " "required pyannote models on HuggingFace."
"huggingface.co/pyannote/segmentation-3.0"
) )
elif "401" in error_msg: elif "401" in error_msg:
error_msg = "Invalid token. Please check your HuggingFace token." error_msg = "Invalid token. Please check your HuggingFace token."

View File

@@ -2,15 +2,67 @@
from __future__ import annotations from __future__ import annotations
import os
import subprocess
import sys import sys
import tempfile
import time import time
from dataclasses import dataclass, field from dataclasses import dataclass, field
from pathlib import Path
from typing import Any from typing import Any
# Disable pyannote telemetry — it has a bug in v4.0.4 where
# np.isfinite(None) crashes when max_speakers is not set.
os.environ.setdefault("PYANNOTE_METRICS_ENABLED", "false")
from voice_to_notes.ipc.messages import progress_message from voice_to_notes.ipc.messages import progress_message
from voice_to_notes.ipc.protocol import write_message from voice_to_notes.ipc.protocol import write_message
def _ensure_wav(file_path: str) -> tuple[str, str | None]:
"""Convert audio to 16kHz mono WAV if needed.
pyannote.audio v4.0.4 has a bug where its AudioDecoder returns
duration=None for some formats (FLAC, etc.), causing crashes.
Converting to WAV ensures the duration header is always present.
Returns:
(path_to_use, temp_path_or_None)
If conversion was needed, temp_path is the WAV file to clean up.
"""
ext = Path(file_path).suffix.lower()
if ext == ".wav":
return file_path, None
tmp = tempfile.NamedTemporaryFile(suffix=".wav", delete=False)
tmp.close()
try:
subprocess.run(
[
"ffmpeg", "-y", "-i", file_path,
"-ar", "16000", "-ac", "1", "-c:a", "pcm_s16le",
tmp.name,
],
check=True,
capture_output=True,
)
print(
f"[sidecar] Converted {ext} to WAV for diarization",
file=sys.stderr,
flush=True,
)
return tmp.name, tmp.name
except (subprocess.CalledProcessError, FileNotFoundError) as e:
# ffmpeg not available or failed — try original file and hope for the best
print(
f"[sidecar] WAV conversion failed ({e}), using original file",
file=sys.stderr,
flush=True,
)
os.unlink(tmp.name)
return file_path, None
@dataclass @dataclass
class SpeakerSegment: class SpeakerSegment:
"""A time span assigned to a speaker.""" """A time span assigned to a speaker."""
@@ -40,14 +92,19 @@ class DiarizeService:
if self._pipeline is not None: if self._pipeline is not None:
return self._pipeline return self._pipeline
import os
print("[sidecar] Loading pyannote diarization pipeline...", file=sys.stderr, flush=True) print("[sidecar] Loading pyannote diarization pipeline...", file=sys.stderr, flush=True)
# Use token from argument, fall back to environment variable # Use token from argument, fall back to environment variable
if not hf_token: if not hf_token:
hf_token = os.environ.get("HF_TOKEN") or os.environ.get("HUGGING_FACE_HUB_TOKEN") or None hf_token = os.environ.get("HF_TOKEN") or os.environ.get("HUGGING_FACE_HUB_TOKEN") or None
# Persist token globally so ALL huggingface_hub sub-downloads use auth.
# Pyannote has internal dependencies that don't forward the token= param.
if hf_token:
os.environ["HF_TOKEN"] = hf_token
import huggingface_hub
huggingface_hub.login(token=hf_token, add_to_git_credential=False)
models = [ models = [
"pyannote/speaker-diarization-3.1", "pyannote/speaker-diarization-3.1",
"pyannote/speaker-diarization", "pyannote/speaker-diarization",
@@ -118,8 +175,27 @@ class DiarizeService:
if max_speakers is not None: if max_speakers is not None:
kwargs["max_speakers"] = max_speakers kwargs["max_speakers"] = max_speakers
# Convert to WAV to work around pyannote v4.0.4 duration bug
audio_path, temp_wav = _ensure_wav(file_path)
print(
f"[sidecar] Running diarization on {audio_path} with kwargs: {kwargs}",
file=sys.stderr,
flush=True,
)
# Run diarization # Run diarization
diarization = pipeline(file_path, **kwargs) try:
raw_result = pipeline(audio_path, **kwargs)
finally:
if temp_wav:
os.unlink(temp_wav)
# pyannote 4.0+ returns DiarizeOutput; older versions return Annotation directly
if hasattr(raw_result, "speaker_diarization"):
diarization = raw_result.speaker_diarization
else:
diarization = raw_result
# Convert pyannote output to our format # Convert pyannote output to our format
result = DiarizationResult() result = DiarizationResult()

View File

@@ -127,15 +127,17 @@ class PipelineService:
hf_token=hf_token, hf_token=hf_token,
) )
except Exception as e: except Exception as e:
import traceback
print( print(
f"[sidecar] Diarization failed, falling back to transcription-only: {e}", f"[sidecar] Diarization failed, falling back to transcription-only: {e}",
file=sys.stderr, file=sys.stderr,
flush=True, flush=True,
) )
traceback.print_exc(file=sys.stderr)
write_message( write_message(
progress_message( progress_message(
request_id, 80, "pipeline", request_id, 80, "pipeline",
"Diarization unavailable, using transcription only..." f"Diarization failed ({e}), using transcription only..."
) )
) )

View File

@@ -118,12 +118,14 @@
<p>Speaker detection uses <strong>pyannote.audio</strong> models hosted on HuggingFace. You must accept the license for each model:</p> <p>Speaker detection uses <strong>pyannote.audio</strong> models hosted on HuggingFace. You must accept the license for each model:</p>
<ol> <ol>
<li>Create a free account at <!-- svelte-ignore a11y_no_static_element_interactions --><a class="ext-link" onclick={() => openUrl('https://huggingface.co/join')}>huggingface.co</a></li> <li>Create a free account at <!-- svelte-ignore a11y_no_static_element_interactions --><a class="ext-link" onclick={() => openUrl('https://huggingface.co/join')}>huggingface.co</a></li>
<li>Accept the license on <strong>each</strong> of these pages: <li>Accept the license on <strong>all three</strong> of these pages:
<ul> <ul>
<!-- svelte-ignore a11y_no_static_element_interactions --> <!-- svelte-ignore a11y_no_static_element_interactions -->
<li><a class="ext-link" onclick={() => openUrl('https://huggingface.co/pyannote/speaker-diarization-3.1')}>pyannote/speaker-diarization-3.1</a></li> <li><a class="ext-link" onclick={() => openUrl('https://huggingface.co/pyannote/speaker-diarization-3.1')}>pyannote/speaker-diarization-3.1</a></li>
<!-- svelte-ignore a11y_no_static_element_interactions --> <!-- svelte-ignore a11y_no_static_element_interactions -->
<li><a class="ext-link" onclick={() => openUrl('https://huggingface.co/pyannote/segmentation-3.0')}>pyannote/segmentation-3.0</a></li> <li><a class="ext-link" onclick={() => openUrl('https://huggingface.co/pyannote/segmentation-3.0')}>pyannote/segmentation-3.0</a></li>
<!-- svelte-ignore a11y_no_static_element_interactions -->
<li><a class="ext-link" onclick={() => openUrl('https://huggingface.co/pyannote/speaker-diarization-community-1')}>pyannote/speaker-diarization-community-1</a></li>
</ul> </ul>
</li> </li>
<!-- svelte-ignore a11y_no_static_element_interactions --> <!-- svelte-ignore a11y_no_static_element_interactions -->

View File

@@ -1,5 +1,6 @@
<script lang="ts"> <script lang="ts">
import { speakers } from '$lib/stores/transcript'; import { speakers } from '$lib/stores/transcript';
import { settings } from '$lib/stores/settings';
import type { Speaker } from '$lib/types/transcript'; import type { Speaker } from '$lib/types/transcript';
let editingSpeakerId = $state<string | null>(null); let editingSpeakerId = $state<string | null>(null);
@@ -35,10 +36,13 @@
<h3>Speakers</h3> <h3>Speakers</h3>
{#if $speakers.length === 0} {#if $speakers.length === 0}
<p class="empty-hint">No speakers detected</p> <p class="empty-hint">No speakers detected</p>
<p class="setup-hint"> {#if $settings.skip_diarization}
Speaker detection requires a HuggingFace token. <p class="setup-hint">Speaker detection is disabled. Enable it in Settings &gt; Speakers.</p>
Set the <code>HF_TOKEN</code> environment variable and restart. {:else if !$settings.hf_token}
</p> <p class="setup-hint">Speaker detection requires a HuggingFace token. Configure it in Settings &gt; Speakers.</p>
{:else}
<p class="setup-hint">Speaker detection ran but found no distinct speakers, or the model may need to be downloaded. Check Settings &gt; Speakers.</p>
{/if}
{:else} {:else}
<ul class="speaker-list"> <ul class="speaker-list">
{#each $speakers as speaker (speaker.id)} {#each $speakers as speaker (speaker.id)}

View File

@@ -217,6 +217,8 @@
.segment-text { .segment-text {
line-height: 1.6; line-height: 1.6;
padding-left: 0.75rem; padding-left: 0.75rem;
word-wrap: break-word;
overflow-wrap: break-word;
} }
.word { .word {
cursor: pointer; cursor: pointer;