Phase 3: Speaker diarization and full transcription pipeline

- Implement DiarizeService with pyannote.audio speaker detection
- Build PipelineService combining transcribe → diarize → merge with
  overlap-based speaker assignment per segment
- Add pipeline.start and diarize.start IPC handlers
- Add run_pipeline Tauri command for full pipeline execution
- Wire frontend to use pipeline: speakers auto-created with colors,
  segments assigned to detected speakers
- Build SpeakerManager with rename support (double-click or edit button)
- Add speaker color coding throughout transcript display
- Add pyannote.audio dependency
- Tests: 24 Python (including merge logic), 6 Rust, 0 Svelte errors

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-02-26 16:09:48 -08:00
parent 842f8d5f90
commit 44480906a4
12 changed files with 806 additions and 24 deletions

View File

@@ -7,7 +7,7 @@
import AIChatPanel from '$lib/components/AIChatPanel.svelte';
import ProgressOverlay from '$lib/components/ProgressOverlay.svelte';
import { segments, speakers } from '$lib/stores/transcript';
import type { Segment, Word } from '$lib/types/transcript';
import type { Segment, Speaker } from '$lib/types/transcript';
let waveformPlayer: WaveformPlayer;
let audioUrl = $state('');
@@ -16,6 +16,9 @@
let transcriptionStage = $state('');
let transcriptionMessage = $state('');
// Speaker color palette for auto-assignment
const speakerColors = ['#e94560', '#4ecdc4', '#ffe66d', '#a8e6cf', '#ff8b94', '#c7ceea', '#ffd93d', '#6bcb77'];
// Jump playback to a clicked word's start time. Safe to call before the
// waveform player has mounted: if the component ref is still nullish, do nothing.
function handleWordClick(timeMs: number) {
  if (waveformPlayer == null) return;
  waveformPlayer.seekTo(timeMs);
}
@@ -32,11 +35,10 @@
if (!filePath) return;
// Convert file path to URL for wavesurfer
// In Tauri, we can use convertFileSrc or asset protocol
audioUrl = `asset://localhost/${encodeURIComponent(filePath)}`;
waveformPlayer?.loadAudio(audioUrl);
// Start transcription
// Start pipeline (transcription + diarization)
isTranscribing = true;
transcriptionProgress = 0;
transcriptionStage = 'Starting...';
@@ -47,6 +49,7 @@
text: string;
start_ms: number;
end_ms: number;
speaker: string | null;
words: Array<{
word: string;
start_ms: number;
@@ -56,14 +59,29 @@
}>;
language: string;
duration_ms: number;
}>('transcribe_file', { filePath });
speakers: string[];
num_speakers: number;
}>('run_pipeline', { filePath });
// Create speaker entries from pipeline result
const newSpeakers: Speaker[] = (result.speakers || []).map((label, idx) => ({
id: `speaker-${idx}`,
project_id: '',
label,
display_name: null,
color: speakerColors[idx % speakerColors.length],
}));
speakers.set(newSpeakers);
// Build speaker label → id lookup
const speakerLookup = new Map(newSpeakers.map(s => [s.label, s.id]));
// Convert result to our store format
const newSegments: Segment[] = result.segments.map((seg, idx) => ({
id: `seg-${idx}`,
project_id: '',
media_file_id: '',
speaker_id: null,
speaker_id: seg.speaker ? (speakerLookup.get(seg.speaker) ?? null) : null,
start_ms: seg.start_ms,
end_ms: seg.end_ms,
text: seg.text,
@@ -85,8 +103,8 @@
segments.set(newSegments);
} catch (err) {
console.error('Transcription failed:', err);
alert(`Transcription failed: ${err}`);
console.error('Pipeline failed:', err);
alert(`Pipeline failed: ${err}`);
} finally {
isTranscribing = false;
}