From 68524cbbd69cc2e7def473fe2703884d71dce327 Mon Sep 17 00:00:00 2001 From: Claude Date: Sun, 22 Mar 2026 17:37:57 -0700 Subject: [PATCH] Also patch Audio.crop to fix diarization embedding extraction MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The previous patch only replaced Audio.__call__ (segmentation), but pyannote also calls Audio.crop during speaker embedding extraction. crop loads a time segment of audio — patched to load full file via soundfile then slice the tensor to the requested time range. Co-Authored-By: Claude Opus 4.6 --- python/voice_to_notes/services/diarize.py | 30 +++++++++++++++++------ 1 file changed, 22 insertions(+), 8 deletions(-) diff --git a/python/voice_to_notes/services/diarize.py b/python/voice_to_notes/services/diarize.py index 0f8691a..49a88b9 100644 --- a/python/voice_to_notes/services/diarize.py +++ b/python/voice_to_notes/services/diarize.py @@ -41,20 +41,34 @@ def _patch_pyannote_audio() -> None: import torch from pyannote.audio.core.io import Audio - def _soundfile_call(self: Audio, file: dict) -> tuple: - """Load audio via soundfile (bypasses torchaudio/torchcodec entirely).""" - audio_path = str(file["audio"]) - data, sample_rate = sf.read(audio_path, dtype="float32") - # soundfile returns (samples,) for mono, (samples, channels) for stereo - # pyannote expects (channels, samples) torch tensor + def _sf_load(audio_path: str) -> tuple: + """Load audio via soundfile, return (channels, samples) tensor + sample_rate.""" + data, sample_rate = sf.read(str(audio_path), dtype="float32") waveform = torch.from_numpy(np.array(data)) if waveform.ndim == 1: - waveform = waveform.unsqueeze(0) # (samples,) -> (1, samples) + waveform = waveform.unsqueeze(0) else: - waveform = waveform.T # (samples, channels) -> (channels, samples) + waveform = waveform.T return waveform, sample_rate + def _soundfile_call(self, file: dict) -> tuple: + """Replacement for Audio.__call__.""" + return _sf_load(file["audio"]) + + def _soundfile_crop(self, file: dict, segment, **kwargs) -> tuple: + """Replacement for Audio.crop — load full file then slice.""" + waveform, sample_rate = _sf_load(file["audio"]) + # Convert segment (seconds) to sample indices + start_sample = int(segment.start * sample_rate) + end_sample = int(segment.end * sample_rate) + # Clamp to bounds + start_sample = max(0, start_sample) + end_sample = min(waveform.shape[-1], end_sample) + cropped = waveform[:, start_sample:end_sample] + return cropped, sample_rate + Audio.__call__ = _soundfile_call # type: ignore[assignment] + Audio.crop = _soundfile_crop # type: ignore[assignment] print("[sidecar] Patched pyannote Audio to use soundfile", file=sys.stderr, flush=True) except Exception as e: print(f"[sidecar] Warning: Could not patch pyannote Audio: {e}", file=sys.stderr, flush=True)