From f9226ee4d05e1494cc8089202fc45dfb7e9841d9 Mon Sep 17 00:00:00 2001
From: Claude <claude@anthropic.com>
Date: Sun, 22 Mar 2026 11:49:36 -0700
Subject: [PATCH] Fix diarization: use soundfile instead of torchaudio for
 audio loading

torchaudio 2.10 unconditionally delegates load() to torchcodec, ignoring
the backend parameter. Since torchcodec is excluded from the PyInstaller
bundle, this broke our pyannote Audio monkey-patch, which called
torchaudio.load().

Fix: replace torchaudio.load() with soundfile.read() +
torch.from_numpy(), converting the result to the (channels, samples)
layout pyannote expects. soundfile handles WAV natively (audio is
pre-converted to WAV), has no torchcodec dependency, and is already a
transitive dependency.

Also add soundfile to the PyInstaller hiddenimports so it is bundled.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 python/voice_to_notes.spec                |  1 +
 python/voice_to_notes/services/diarize.py | 23 ++++++++++++++++-------
 2 files changed, 17 insertions(+), 7 deletions(-)

diff --git a/python/voice_to_notes.spec b/python/voice_to_notes.spec
index ed82974..aaa7348 100644
--- a/python/voice_to_notes.spec
+++ b/python/voice_to_notes.spec
@@ -21,6 +21,7 @@ a = Analysis(
     hiddenimports=[
         "torch",
         "torchaudio",
+        "soundfile",
         "huggingface_hub",
         "pysubs2",
         "openai",
diff --git a/python/voice_to_notes/services/diarize.py b/python/voice_to_notes/services/diarize.py
index 3a8ed69..0f8691a 100644
--- a/python/voice_to_notes/services/diarize.py
+++ b/python/voice_to_notes/services/diarize.py
@@ -36,17 +36,26 @@ def _patch_pyannote_audio() -> None:
     _patched = True
 
     try:
-        import torchaudio
+        import numpy as np
+        import soundfile as sf
+        import torch
         from pyannote.audio.core.io import Audio
 
-        def _torchaudio_call(self: Audio, file: dict) -> tuple:
-            audio_path = file["audio"]
-            waveform, sample_rate = torchaudio.load(str(audio_path))
-            # pyannote expects (channel, time) tensor and sample_rate
+        def _soundfile_call(self: Audio, file: dict) -> tuple:
+            """Load audio via soundfile (bypasses torchaudio/torchcodec entirely)."""
+            audio_path = str(file["audio"])
+            data, sample_rate = sf.read(audio_path, dtype="float32")
+            # soundfile returns (samples,) for mono, (samples, channels) for stereo
+            # pyannote expects (channels, samples) torch tensor
+            waveform = torch.from_numpy(np.array(data))
+            if waveform.ndim == 1:
+                waveform = waveform.unsqueeze(0)  # (samples,) -> (1, samples)
+            else:
+                waveform = waveform.T  # (samples, channels) -> (channels, samples)
             return waveform, sample_rate
 
-        Audio.__call__ = _torchaudio_call  # type: ignore[assignment]
-        print("[sidecar] Patched pyannote Audio to use torchaudio", file=sys.stderr, flush=True)
+        Audio.__call__ = _soundfile_call  # type: ignore[assignment]
+        print("[sidecar] Patched pyannote Audio to use soundfile", file=sys.stderr, flush=True)
     except Exception as e:
         print(f"[sidecar] Warning: Could not patch pyannote Audio: {e}", file=sys.stderr, flush=True)