Fix diarization performance for long files + better progress

- Cache loaded audio in _sf_load() — previously the entire WAV file was re-read from disk for every 10s crop call. For a 3-hour file with 1000+ chunks, this meant ~345GB of disk reads. Now read once, cached. - Better progress messages for long files: show elapsed time in m:ss format, warn "(180min audio, this may take a while)" for files >10min - Increased progress poll interval from 2s to 5s (less noise) - Better time estimate: use 0.8x audio duration (was 0.5x) Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-23 06:24:18 -07:00
parent 999bdaa671
commit 806586ae3d
1 changed files with 23 additions and 7 deletions
--- a/python/voice_to_notes/services/diarize.py
+++ b/python/voice_to_notes/services/diarize.py
@@ -41,14 +41,23 @@ def _patch_pyannote_audio() -> None:
        import torch
        from pyannote.audio.core.io import Audio

+        # Cache loaded audio to avoid re-reading the entire file for every crop call.
+        # For a 3-hour file, crop is called 1000+ times — without caching, each call
+        # reads ~345MB from disk.
+        _audio_cache: dict[str, tuple] = {}
+
        def _sf_load(audio_path: str) -> tuple:
-            """Load audio via soundfile, return (channels, samples) tensor + sample_rate."""
-            data, sample_rate = sf.read(str(audio_path), dtype="float32")
+            """Load audio via soundfile with caching."""
+            key = str(audio_path)
+            if key in _audio_cache:
+                return _audio_cache[key]
+            data, sample_rate = sf.read(key, dtype="float32")
            waveform = torch.from_numpy(np.array(data))
            if waveform.ndim == 1:
                waveform = waveform.unsqueeze(0)
            else:
                waveform = waveform.T
+            _audio_cache[key] = (waveform, sample_rate)
            return waveform, sample_rate

        def _soundfile_call(self, file: dict) -> tuple:
@@ -56,7 +65,7 @@ def _patch_pyannote_audio() -> None:
            return _sf_load(file["audio"])

        def _soundfile_crop(self, file: dict, segment, **kwargs) -> tuple:
-            """Replacement for Audio.crop — load full file then slice.
+            """Replacement for Audio.crop — load file once (cached) then slice.

            Pads short segments with zeros to match the expected duration,
            which pyannote requires for batched embedding extraction.
@@ -279,13 +288,20 @@ class DiarizeService:
        thread.start()

        elapsed = 0.0
-        estimated_total = max(audio_duration_sec * 0.5, 30.0) if audio_duration_sec else 120.0
-        while not done_event.wait(timeout=2.0):
-            elapsed += 2.0
+        estimated_total = max(audio_duration_sec * 0.8, 30.0) if audio_duration_sec else 120.0
+        duration_str = ""
+        if audio_duration_sec and audio_duration_sec > 600:
+            mins = int(audio_duration_sec / 60)
+            duration_str = f" ({mins}min audio, this may take a while)"
+        while not done_event.wait(timeout=5.0):
+            elapsed += 5.0
            pct = min(20 + int((elapsed / estimated_total) * 65), 85)
+            elapsed_min = int(elapsed / 60)
+            elapsed_sec = int(elapsed % 60)
+            time_str = f"{elapsed_min}m{elapsed_sec:02d}s" if elapsed_min > 0 else f"{int(elapsed)}s"
            write_message(progress_message(
                request_id, pct, "diarizing",
-                f"Analyzing speakers ({int(elapsed)}s elapsed)..."))
+                f"Analyzing speakers ({time_str} elapsed){duration_str}"))

        thread.join()