diff --git a/python/voice_to_notes.spec b/python/voice_to_notes.spec
index ed82974..aaa7348 100644
--- a/python/voice_to_notes.spec
+++ b/python/voice_to_notes.spec
@@ -21,6 +21,7 @@ a = Analysis(
     hiddenimports=[
         "torch",
         "torchaudio",
+        "soundfile",
         "huggingface_hub",
         "pysubs2",
         "openai",
diff --git a/python/voice_to_notes/services/diarize.py b/python/voice_to_notes/services/diarize.py
index 3a8ed69..0f8691a 100644
--- a/python/voice_to_notes/services/diarize.py
+++ b/python/voice_to_notes/services/diarize.py
@@ -36,17 +36,30 @@ def _patch_pyannote_audio() -> None:
     _patched = True
 
     try:
-        import torchaudio
+        import numpy as np
+        import soundfile as sf
+        import torch
         from pyannote.audio.core.io import Audio
 
-        def _torchaudio_call(self: Audio, file: dict) -> tuple:
-            audio_path = file["audio"]
-            waveform, sample_rate = torchaudio.load(str(audio_path))
-            # pyannote expects (channel, time) tensor and sample_rate
+        def _soundfile_call(self: Audio, file: dict) -> tuple:
+            """Load audio via soundfile (bypasses torchaudio/torchcodec entirely).
+
+            Returns ``(waveform, sample_rate)``: a float32 torch tensor shaped
+            (channels, samples) and the sample rate reported by soundfile.
+            """
+            # NOTE(review): ``self`` is unused, so no Audio instance settings are
+            # applied to the loaded waveform -- confirm callers do not rely on them.
+            audio_path = str(file["audio"])
+            # always_2d=True yields (samples, channels) even for mono input, so
+            # no separate 1-D branch is needed.
+            data, sample_rate = sf.read(audio_path, dtype="float32", always_2d=True)
+            # pyannote expects (channels, samples); materialise the transposed
+            # view contiguously so downstream ops never see a strided tensor.
+            waveform = torch.from_numpy(np.ascontiguousarray(data.T))
             return waveform, sample_rate
 
-        Audio.__call__ = _torchaudio_call  # type: ignore[assignment]
-        print("[sidecar] Patched pyannote Audio to use torchaudio", file=sys.stderr, flush=True)
+        Audio.__call__ = _soundfile_call  # type: ignore[assignment]
+        print("[sidecar] Patched pyannote Audio to use soundfile", file=sys.stderr, flush=True)
     except Exception as e:
         print(f"[sidecar] Warning: Could not patch pyannote Audio: {e}", file=sys.stderr, flush=True)
 