Also patch Audio.crop to fix diarization embedding extraction
Some checks failed
Build Sidecars / Bump sidecar version and tag (push) Successful in 4s
Release / Bump version and tag (push) Successful in 3s
Build Sidecars / Build Sidecar (Windows) (push) Has started running
Build Sidecars / Build Sidecar (Linux) (push) Has been cancelled
Release / Build App (Linux) (push) Has been cancelled
Release / Build App (Windows) (push) Has been cancelled
Release / Build App (macOS) (push) Has been cancelled
Build Sidecars / Build Sidecar (macOS) (push) Has been cancelled
Some checks failed
Build Sidecars / Bump sidecar version and tag (push) Successful in 4s
Release / Bump version and tag (push) Successful in 3s
Build Sidecars / Build Sidecar (Windows) (push) Has started running
Build Sidecars / Build Sidecar (Linux) (push) Has been cancelled
Release / Build App (Linux) (push) Has been cancelled
Release / Build App (Windows) (push) Has been cancelled
Release / Build App (macOS) (push) Has been cancelled
Build Sidecars / Build Sidecar (macOS) (push) Has been cancelled
The previous patch only replaced Audio.__call__ (segmentation), but pyannote also calls Audio.crop during speaker embedding extraction. crop loads a time segment of audio — it is now patched to load the full file via soundfile and then slice the tensor to the requested time range. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -41,20 +41,34 @@ def _patch_pyannote_audio() -> None:
|
||||
import torch
|
||||
from pyannote.audio.core.io import Audio
|
||||
|
||||
def _soundfile_call(self: Audio, file: dict) -> tuple:
|
||||
"""Load audio via soundfile (bypasses torchaudio/torchcodec entirely)."""
|
||||
audio_path = str(file["audio"])
|
||||
data, sample_rate = sf.read(audio_path, dtype="float32")
|
||||
# soundfile returns (samples,) for mono, (samples, channels) for stereo
|
||||
# pyannote expects (channels, samples) torch tensor
|
||||
def _sf_load(audio_path: str) -> tuple:
    """Load an audio file via soundfile, bypassing torchaudio/torchcodec.

    Parameters
    ----------
    audio_path:
        Path to the audio file (anything soundfile can open).

    Returns
    -------
    tuple
        ``(waveform, sample_rate)`` where ``waveform`` is a float32 torch
        tensor shaped ``(channels, samples)`` — the layout pyannote
        expects — and ``sample_rate`` is the file's native sample rate.
    """
    # soundfile returns (samples,) for mono and (samples, channels) for
    # multi-channel audio; dtype="float32" guarantees a float32 ndarray.
    data, sample_rate = sf.read(str(audio_path), dtype="float32")
    # sf.read already yields an ndarray — wrap it directly instead of
    # routing through np.array(), which made a redundant full copy.
    waveform = torch.from_numpy(np.ascontiguousarray(data))
    if waveform.ndim == 1:
        # (samples,) -> (1, samples)
        waveform = waveform.unsqueeze(0)
    else:
        # (samples, channels) -> (channels, samples); .contiguous() so
        # downstream ops get a normal row-major tensor, not a view with
        # transposed strides.
        waveform = waveform.T.contiguous()
    return waveform, sample_rate
|
||||
|
||||
def _soundfile_call(self, file: dict) -> tuple:
    """Replacement for Audio.__call__: decode the whole file with soundfile.

    ``self`` is the patched pyannote ``Audio`` instance and is
    intentionally unused — all work is delegated to the shared loader.
    """
    audio_source = file["audio"]
    return _sf_load(audio_source)
|
||||
|
||||
def _soundfile_crop(self, file: dict, segment, **kwargs) -> tuple:
    """Replacement for Audio.crop: load the whole file, then slice it.

    The full waveform is decoded via the shared soundfile loader and the
    requested time range is cut out of the resulting tensor.

    NOTE(review): extra pyannote crop kwargs (e.g. ``mode``/``duration``)
    are accepted but ignored here — confirm callers tolerate that.
    """
    waveform, sample_rate = _sf_load(file["audio"])
    # Map the segment boundaries (seconds) onto sample indices, clamping
    # both ends to the valid range of the loaded waveform.
    first = max(0, int(segment.start * sample_rate))
    last = min(waveform.shape[-1], int(segment.end * sample_rate))
    return waveform[:, first:last], sample_rate
|
||||
|
||||
# Install the soundfile-backed implementations on pyannote's Audio class
# so both segmentation (__call__) and embedding extraction (crop) avoid
# the torchaudio/torchcodec decode path.
Audio.__call__ = _soundfile_call  # type: ignore[assignment]
Audio.crop = _soundfile_crop  # type: ignore[assignment]
# Diagnostic goes to stderr so it does not pollute the sidecar's stdout
# protocol; flush so it appears even if the process dies shortly after.
print("[sidecar] Patched pyannote Audio to use soundfile", file=sys.stderr, flush=True)
except Exception as e:
|
||||
print(f"[sidecar] Warning: Could not patch pyannote Audio: {e}", file=sys.stderr, flush=True)
|
||||
|
||||
Reference in New Issue
Block a user