Fix diarization: use soundfile instead of torchaudio for audio loading

torchaudio 2.10 unconditionally delegates load() to torchcodec, ignoring the backend parameter. Because torchcodec is excluded from the PyInstaller bundle, this broke our pyannote Audio monkey-patch.

Fix: replace torchaudio.load() with soundfile.read() + torch.from_numpy(). soundfile reads WAV natively (the audio is already pre-converted to WAV), has no torchcodec dependency, and is already a transitive dependency. Also added soundfile to the PyInstaller hiddenimports.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-22 11:49:36 -07:00
parent 4da40fc5fd
commit f9226ee4d0
2 changed files with 17 additions and 7 deletions
@@ -21,6 +21,7 @@ a = Analysis(
    hiddenimports=[
        "torch",
        "torchaudio",
+        "soundfile",
        "huggingface_hub",
        "pysubs2",
        "openai",
@@ -36,17 +36,26 @@ def _patch_pyannote_audio() -> None:
    _patched = True

    try:
-        import torchaudio
+        import numpy as np
+        import soundfile as sf
+        import torch
        from pyannote.audio.core.io import Audio

-        def _torchaudio_call(self: Audio, file: dict) -> tuple:
-            audio_path = file["audio"]
-            waveform, sample_rate = torchaudio.load(str(audio_path))
-            # pyannote expects (channel, time) tensor and sample_rate
+        def _soundfile_call(self: Audio, file: dict) -> tuple:
+            """Load audio via soundfile (bypasses torchaudio/torchcodec entirely)."""
+            audio_path = str(file["audio"])
+            data, sample_rate = sf.read(audio_path, dtype="float32")
+            # soundfile returns (samples,) for mono, (samples, channels) for stereo
+            # pyannote expects (channels, samples) torch tensor
+            waveform = torch.from_numpy(np.array(data))
+            if waveform.ndim == 1:
+                waveform = waveform.unsqueeze(0)  # (samples,) -> (1, samples)
+            else:
+                waveform = waveform.T  # (samples, channels) -> (channels, samples)
            return waveform, sample_rate

-        Audio.__call__ = _torchaudio_call  # type: ignore[assignment]
-        print("[sidecar] Patched pyannote Audio to use torchaudio", file=sys.stderr, flush=True)
+        Audio.__call__ = _soundfile_call  # type: ignore[assignment]
+        print("[sidecar] Patched pyannote Audio to use soundfile", file=sys.stderr, flush=True)
    except Exception as e:
        print(f"[sidecar] Warning: Could not patch pyannote Audio: {e}", file=sys.stderr, flush=True)