From 806586ae3d5bdc8f0a057a5d4e072ebc50ac6c15 Mon Sep 17 00:00:00 2001
From: Claude
Date: Mon, 23 Mar 2026 06:24:18 -0700
Subject: [PATCH] Fix diarization performance for long files + better progress
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- Cache loaded audio in _sf_load() — previously the entire WAV file was
  re-read from disk for every 10s crop call. For a 3-hour file with 1000+
  chunks, this meant ~345GB of disk reads. Now read once, cached.
- Better progress messages for long files: show elapsed time in m:ss
  format, warn "(180min audio, this may take a while)" for files >10min
- Increased progress poll interval from 2s to 5s (less noise)
- Better time estimate: use 0.8x audio duration (was 0.5x)

Co-Authored-By: Claude Opus 4.6
---
 python/voice_to_notes/services/diarize.py | 30 +++++++++++++++++------
 1 file changed, 23 insertions(+), 7 deletions(-)

diff --git a/python/voice_to_notes/services/diarize.py b/python/voice_to_notes/services/diarize.py
index bd8afb2..272792a 100644
--- a/python/voice_to_notes/services/diarize.py
+++ b/python/voice_to_notes/services/diarize.py
@@ -41,14 +41,23 @@ def _patch_pyannote_audio() -> None:
     import torch
     from pyannote.audio.core.io import Audio
 
+    # Cache loaded audio to avoid re-reading the entire file for every crop call.
+    # For a 3-hour file, crop is called 1000+ times — without caching, each call
+    # reads ~345MB from disk.
+    _audio_cache: dict[str, tuple] = {}
+
     def _sf_load(audio_path: str) -> tuple:
-        """Load audio via soundfile, return (channels, samples) tensor + sample_rate."""
-        data, sample_rate = sf.read(str(audio_path), dtype="float32")
+        """Load audio via soundfile with caching."""
+        key = str(audio_path)
+        if key in _audio_cache:
+            return _audio_cache[key]
+        data, sample_rate = sf.read(key, dtype="float32")
         waveform = torch.from_numpy(np.array(data))
         if waveform.ndim == 1:
             waveform = waveform.unsqueeze(0)
         else:
             waveform = waveform.T
+        _audio_cache[key] = (waveform, sample_rate)
         return waveform, sample_rate
 
     def _soundfile_call(self, file: dict) -> tuple:
@@ -56,7 +65,7 @@ def _patch_pyannote_audio() -> None:
         return _sf_load(file["audio"])
 
     def _soundfile_crop(self, file: dict, segment, **kwargs) -> tuple:
-        """Replacement for Audio.crop — load full file then slice.
+        """Replacement for Audio.crop — load file once (cached) then slice.
 
         Pads short segments with zeros to match the expected duration,
         which pyannote requires for batched embedding extraction.
@@ -279,13 +288,20 @@ class DiarizeService:
         thread.start()
 
         elapsed = 0.0
-        estimated_total = max(audio_duration_sec * 0.5, 30.0) if audio_duration_sec else 120.0
-        while not done_event.wait(timeout=2.0):
-            elapsed += 2.0
+        estimated_total = max(audio_duration_sec * 0.8, 30.0) if audio_duration_sec else 120.0
+        duration_str = ""
+        if audio_duration_sec and audio_duration_sec > 600:
+            mins = int(audio_duration_sec / 60)
+            duration_str = f" ({mins}min audio, this may take a while)"
+        while not done_event.wait(timeout=5.0):
+            elapsed += 5.0
             pct = min(20 + int((elapsed / estimated_total) * 65), 85)
+            elapsed_min = int(elapsed / 60)
+            elapsed_sec = int(elapsed % 60)
+            time_str = f"{elapsed_min}m{elapsed_sec:02d}s" if elapsed_min > 0 else f"{int(elapsed)}s"
             write_message(progress_message(
                 request_id, pct, "diarizing",
-                f"Analyzing speakers ({int(elapsed)}s elapsed)..."))
+                f"Analyzing speakers ({time_str} elapsed){duration_str}"))
         thread.join()