Fix diarization: use soundfile instead of torchaudio for audio loading

torchaudio 2.10 unconditionally delegates load() to torchcodec, ignoring the backend parameter. Because torchcodec is excluded from the PyInstaller build, this broke our pyannote Audio monkey-patch, which relied on torchaudio.load(). Fix: replace torchaudio.load() with soundfile.read() + torch.from_numpy(). soundfile handles WAV natively (audio is pre-converted to WAV), has no torchcodec dependency, and is already a transitive dependency. Also add soundfile to the PyInstaller hiddenimports. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-22 11:49:36 -07:00
parent 4da40fc5fd
commit f9226ee4d0
2 changed files with 17 additions and 7 deletions
@@ -21,6 +21,7 @@ a = Analysis(
    hiddenimports=[
        "torch",
        "torchaudio",
        "soundfile",
        "huggingface_hub",
        "pysubs2",
        "openai",
@@ -36,17 +36,26 @@ def _patch_pyannote_audio() -> None:
    _patched = True
    try:
-        import torchaudio
+        import numpy as np
        import soundfile as sf
        import torch
        from pyannote.audio.core.io import Audio
-        def _torchaudio_call(self: Audio, file: dict) -> tuple:
+        def _soundfile_call(self: Audio, file: dict) -> tuple:
-            audio_path = file["audio"]
+            """Load audio via soundfile (bypasses torchaudio/torchcodec entirely)."""
-            waveform, sample_rate = torchaudio.load(str(audio_path))
+            audio_path = str(file["audio"])
-            # pyannote expects (channel, time) tensor and sample_rate
+            data, sample_rate = sf.read(audio_path, dtype="float32")
            # soundfile returns (samples,) for mono, (samples, channels) for stereo
            # pyannote expects (channels, samples) torch tensor
            waveform = torch.from_numpy(np.array(data))
            if waveform.ndim == 1:
                waveform = waveform.unsqueeze(0)  # (samples,) -> (1, samples)
            else:
                waveform = waveform.T  # (samples, channels) -> (channels, samples)
            return waveform, sample_rate
-        Audio.__call__ = _torchaudio_call  # type: ignore[assignment]
+        Audio.__call__ = _soundfile_call  # type: ignore[assignment]
-        print("[sidecar] Patched pyannote Audio to use torchaudio", file=sys.stderr, flush=True)
+        print("[sidecar] Patched pyannote Audio to use soundfile", file=sys.stderr, flush=True)
    except Exception as e:
        print(f"[sidecar] Warning: Could not patch pyannote Audio: {e}", file=sys.stderr, flush=True)