Also patch Audio.crop to fix diarization embedding extraction
Some checks failed
Build Sidecars / Bump sidecar version and tag (push) Successful in 4s
Release / Bump version and tag (push) Successful in 3s
Build Sidecars / Build Sidecar (Windows) (push) Has started running
Build Sidecars / Build Sidecar (Linux) (push) Has been cancelled
Release / Build App (Linux) (push) Has been cancelled
Release / Build App (Windows) (push) Has been cancelled
Release / Build App (macOS) (push) Has been cancelled
Build Sidecars / Build Sidecar (macOS) (push) Has been cancelled
Some checks failed
Build Sidecars / Bump sidecar version and tag (push) Successful in 4s
Release / Bump version and tag (push) Successful in 3s
Build Sidecars / Build Sidecar (Windows) (push) Has started running
Build Sidecars / Build Sidecar (Linux) (push) Has been cancelled
Release / Build App (Linux) (push) Has been cancelled
Release / Build App (Windows) (push) Has been cancelled
Release / Build App (macOS) (push) Has been cancelled
Build Sidecars / Build Sidecar (macOS) (push) Has been cancelled
The previous patch only replaced Audio.__call__ (segmentation), but pyannote also calls Audio.crop during speaker embedding extraction. Audio.crop loads a time segment of audio; it is now patched to load the full file via soundfile and then slice the tensor to the requested time range. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -41,20 +41,34 @@ def _patch_pyannote_audio() -> None:
|
|||||||
import torch
|
import torch
|
||||||
from pyannote.audio.core.io import Audio
|
from pyannote.audio.core.io import Audio
|
||||||
|
|
||||||
def _soundfile_call(self: Audio, file: dict) -> tuple:
|
def _sf_load(audio_path: str) -> tuple:
|
||||||
"""Load audio via soundfile (bypasses torchaudio/torchcodec entirely)."""
|
"""Load audio via soundfile, return (channels, samples) tensor + sample_rate."""
|
||||||
audio_path = str(file["audio"])
|
data, sample_rate = sf.read(str(audio_path), dtype="float32")
|
||||||
data, sample_rate = sf.read(audio_path, dtype="float32")
|
|
||||||
# soundfile returns (samples,) for mono, (samples, channels) for stereo
|
|
||||||
# pyannote expects (channels, samples) torch tensor
|
|
||||||
waveform = torch.from_numpy(np.array(data))
|
waveform = torch.from_numpy(np.array(data))
|
||||||
if waveform.ndim == 1:
|
if waveform.ndim == 1:
|
||||||
waveform = waveform.unsqueeze(0) # (samples,) -> (1, samples)
|
waveform = waveform.unsqueeze(0)
|
||||||
else:
|
else:
|
||||||
waveform = waveform.T # (samples, channels) -> (channels, samples)
|
waveform = waveform.T
|
||||||
return waveform, sample_rate
|
return waveform, sample_rate
|
||||||
|
|
||||||
|
def _soundfile_call(self, file: dict) -> tuple:
    """Stand-in for ``Audio.__call__`` that reads the file through soundfile.

    Args:
        file: Mapping whose ``"audio"`` entry is handed to ``_sf_load``.

    Returns:
        Whatever ``_sf_load`` returns for that entry: a
        ``(channels, samples)`` tensor together with its sample rate.
    """
    # ``self`` is unused; it is only present so the function can be bound
    # as a method on ``Audio``.
    source = file["audio"]
    loaded = _sf_load(source)
    return loaded
|
||||||
|
|
||||||
|
def _soundfile_crop(self, file: dict, segment, **kwargs) -> tuple:
    """Replacement for Audio.crop — load the full file, then slice it.

    Args:
        file: Mapping whose ``"audio"`` entry is handed to ``_sf_load``.
        segment: Object exposing ``start`` and ``end`` in seconds.
        **kwargs: Optional ``duration`` (seconds) and ``mode``. When
            ``duration`` is given, the excerpt length is derived from it
            instead of from ``segment.end``, so every chunk requested with
            the same duration has the same number of samples. When
            ``mode == "pad"``, parts of the requested range that fall
            outside the file are filled with zeros. Any other keyword is
            ignored.

    Returns:
        ``(waveform, sample_rate)`` where ``waveform`` is the
        ``(channels, samples)`` tensor from ``_sf_load`` restricted to the
        requested range along its last axis.
    """
    waveform, sample_rate = _sf_load(file["audio"])
    total_samples = waveform.shape[-1]

    duration = kwargs.get("duration")
    pad_mode = kwargs.get("mode") == "pad"

    # Convert the requested range (seconds) to sample indices. A missing or
    # zero duration falls back to the segment's own end.
    start_sample = int(segment.start * sample_rate)
    if duration:
        end_sample = start_sample + int(duration * sample_rate)
    else:
        end_sample = int(segment.end * sample_rate)
    requested = max(0, end_sample - start_sample)

    # Clamp both bounds into [0, total_samples]. Clamping the end from below
    # matters: a negative stop index would otherwise be interpreted by the
    # slice as "counted from the end of the tensor".
    first = min(max(start_sample, 0), total_samples)
    last = min(max(end_sample, first), total_samples)
    cropped = waveform[:, first:last]

    if pad_mode:
        # Zero-fill whatever part of the requested range lies before the
        # start or past the end of the file so the result has exactly
        # ``requested`` samples.
        pad_start = min(max(-start_sample, 0), requested)
        pad_end = requested - pad_start - cropped.shape[-1]
        if pad_start or pad_end:
            cropped = torch.nn.functional.pad(cropped, (pad_start, pad_end))

    # NOTE(review): outside pad mode an out-of-range request is clamped
    # silently (as before) rather than raising — confirm callers rely on
    # this best-effort behaviour.
    return cropped, sample_rate
|
||||||
|
|
||||||
Audio.__call__ = _soundfile_call # type: ignore[assignment]
|
Audio.__call__ = _soundfile_call # type: ignore[assignment]
|
||||||
|
Audio.crop = _soundfile_crop # type: ignore[assignment]
|
||||||
print("[sidecar] Patched pyannote Audio to use soundfile", file=sys.stderr, flush=True)
|
print("[sidecar] Patched pyannote Audio to use soundfile", file=sys.stderr, flush=True)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
print(f"[sidecar] Warning: Could not patch pyannote Audio: {e}", file=sys.stderr, flush=True)
|
print(f"[sidecar] Warning: Could not patch pyannote Audio: {e}", file=sys.stderr, flush=True)
|
||||||
|
|||||||
Reference in New Issue
Block a user