From f9226ee4d05e1494cc8089202fc45dfb7e9841d9 Mon Sep 17 00:00:00 2001 From: Claude Date: Sun, 22 Mar 2026 11:49:36 -0700 Subject: [PATCH] Fix diarization: use soundfile instead of torchaudio for audio loading torchaudio 2.10 unconditionally delegates load() to torchcodec, ignoring the backend parameter. Since torchcodec is excluded from PyInstaller, this broke our pyannote Audio monkey-patch. Fix: replace torchaudio.load() with soundfile.read() + torch.from_numpy(). soundfile handles WAV natively (audio is pre-converted to WAV), has no torchcodec dependency, and is already a transitive dependency. Also added soundfile to PyInstaller hiddenimports. Co-Authored-By: Claude Opus 4.6 --- python/voice_to_notes.spec | 1 + python/voice_to_notes/services/diarize.py | 23 ++++++++++++++++------- 2 files changed, 17 insertions(+), 7 deletions(-) diff --git a/python/voice_to_notes.spec b/python/voice_to_notes.spec index ed82974..aaa7348 100644 --- a/python/voice_to_notes.spec +++ b/python/voice_to_notes.spec @@ -21,6 +21,7 @@ a = Analysis( hiddenimports=[ "torch", "torchaudio", + "soundfile", "huggingface_hub", "pysubs2", "openai", diff --git a/python/voice_to_notes/services/diarize.py b/python/voice_to_notes/services/diarize.py index 3a8ed69..0f8691a 100644 --- a/python/voice_to_notes/services/diarize.py +++ b/python/voice_to_notes/services/diarize.py @@ -36,17 +36,26 @@ def _patch_pyannote_audio() -> None: _patched = True try: - import torchaudio + import numpy as np + import soundfile as sf + import torch from pyannote.audio.core.io import Audio - def _torchaudio_call(self: Audio, file: dict) -> tuple: - audio_path = file["audio"] - waveform, sample_rate = torchaudio.load(str(audio_path)) - # pyannote expects (channel, time) tensor and sample_rate + def _soundfile_call(self: Audio, file: dict) -> tuple: + """Load audio via soundfile (bypasses torchaudio/torchcodec entirely).""" + audio_path = str(file["audio"]) + data, sample_rate = sf.read(audio_path, dtype="float32") + # soundfile returns (samples,) for mono, (samples, channels) for stereo + # pyannote expects (channels, samples) torch tensor + waveform = torch.from_numpy(np.array(data)) + if waveform.ndim == 1: + waveform = waveform.unsqueeze(0) # (samples,) -> (1, samples) + else: + waveform = waveform.T # (samples, channels) -> (channels, samples) return waveform, sample_rate - Audio.__call__ = _torchaudio_call # type: ignore[assignment] - print("[sidecar] Patched pyannote Audio to use torchaudio", file=sys.stderr, flush=True) + Audio.__call__ = _soundfile_call # type: ignore[assignment] + print("[sidecar] Patched pyannote Audio to use soundfile", file=sys.stderr, flush=True) except Exception as e: print(f"[sidecar] Warning: Could not patch pyannote Audio: {e}", file=sys.stderr, flush=True)