Phase 3: Speaker diarization and full transcription pipeline

- Implement DiarizeService with pyannote.audio speaker detection
- Build PipelineService combining transcribe → diarize → merge with overlap-based speaker assignment per segment
- Add pipeline.start and diarize.start IPC handlers
- Add run_pipeline Tauri command for full pipeline execution
- Wire frontend to use pipeline: speakers auto-created with colors, segments assigned to detected speakers
- Build SpeakerManager with rename support (double-click or edit button)
- Add speaker color coding throughout transcript display
- Add pyannote.audio dependency
- Tests: 24 Python (including merge logic), 6 Rust, 0 Svelte errors

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-26 16:09:48 -08:00
parent 842f8d5f90
commit 44480906a4
12 changed files with 806 additions and 24 deletions
--- a/src/lib/components/SpeakerManager.svelte
+++ b/src/lib/components/SpeakerManager.svelte
@@ -1,6 +1,67 @@
+<script lang="ts">
+  import { speakers } from '$lib/stores/transcript';
+  import type { Speaker } from '$lib/types/transcript';
+
+  let editingSpeakerId = $state<string | null>(null);
+  let editName = $state('');
+
+  /**
+   * Put `speaker` into edit mode and pre-fill the rename input with the name
+   * currently shown for it (`display_name`, or `label` when that is empty).
+   */
+  function startRename(speaker: Speaker) {
+    const { id, display_name, label } = speaker;
+    editName = display_name || label;
+    editingSpeakerId = id;
+  }
+
+  /**
+   * Commit the pending rename for `speakerId` and leave edit mode.
+   *
+   * The trimmed `editName` is stored as the speaker's `display_name`; input
+   * that is empty after trimming leaves the speaker unchanged.
+   *
+   * The call is a no-op unless `speakerId` is the speaker currently being
+   * edited. The rename input invokes this from `onblur` as well as from the
+   * Enter key, and `blur` can arrive after the editor was already closed
+   * (e.g. when the focused input is removed following Enter or Escape).
+   * Without this guard a late `blur` would apply the update a second time or
+   * save an edit that was cancelled with Escape.
+   */
+  function finishRename(speakerId: string) {
+    if (editingSpeakerId !== speakerId) return;
+
+    const trimmed = editName.trim();
+    if (trimmed) {
+      speakers.update(list => list.map(s => (
+        s.id === speakerId ? { ...s, display_name: trimmed } : s
+      )));
+    }
+    editingSpeakerId = null;
+  }
+
+  /**
+   * Key handling for the rename input: Enter suppresses the default action
+   * and calls `finishRename`; Escape clears `editingSpeakerId` without
+   * calling it. Every other key is left alone.
+   */
+  function handleKeydown(e: KeyboardEvent, speakerId: string) {
+    switch (e.key) {
+      case 'Enter':
+        e.preventDefault();
+        finishRename(speakerId);
+        break;
+      case 'Escape':
+        editingSpeakerId = null;
+        break;
+    }
+  }
+</script>
+
 <div class="speaker-manager">
  <h3>Speakers</h3>
-  <p class="placeholder">Speaker list with rename/color controls</p>
+  {#if $speakers.length === 0}
+    <p class="empty-hint">No speakers detected yet</p>
+  {:else}
+    <ul class="speaker-list">
+      {#each $speakers as speaker (speaker.id)}
+        <li class="speaker-item">
+          <span class="speaker-color" style="background: {speaker.color}"></span>
+          {#if editingSpeakerId === speaker.id}
+            <input
+              class="rename-input"
+              type="text"
+              bind:value={editName}
+              onblur={() => finishRename(speaker.id)}
+              onkeydown={(e) => handleKeydown(e, speaker.id)}
+            />
+          {:else}
+            <!-- svelte-ignore a11y_no_static_element_interactions -->
+            <span class="speaker-name" ondblclick={() => startRename(speaker)}>
+              {speaker.display_name || speaker.label}
+            </span>
+            <button class="rename-btn" onclick={() => startRename(speaker)} title="Rename speaker">
+              ✏
+            </button>
+          {/if}
+        </li>
+      {/each}
+    </ul>
+    <p class="speaker-hint">Double-click a name to rename</p>
+  {/if}
 </div>

 <style>
@@ -10,9 +71,72 @@
    border-radius: 8px;
    color: #e0e0e0;
  }
-  h3 { margin: 0 0 0.5rem; }
-  .placeholder {
+  h3 {
+    margin: 0 0 0.5rem;
+    font-size: 0.95rem;
+  }
+  .empty-hint {
    color: #666;
    font-size: 0.875rem;
  }
+  .speaker-list {
+    list-style: none;
+    padding: 0;
+    margin: 0;
+    display: flex;
+    flex-direction: column;
+    gap: 0.5rem;
+  }
+  .speaker-item {
+    display: flex;
+    align-items: center;
+    gap: 0.5rem;
+    padding: 0.35rem 0.5rem;
+    background: rgba(255,255,255,0.03);
+    border-radius: 4px;
+  }
+  .speaker-color {
+    width: 12px;
+    height: 12px;
+    border-radius: 50%;
+    flex-shrink: 0;
+  }
+  .speaker-name {
+    flex: 1;
+    cursor: pointer;
+    font-size: 0.875rem;
+  }
+  .rename-btn {
+    background: none;
+    border: none;
+    color: #666;
+    cursor: pointer;
+    font-size: 0.75rem;
+    padding: 0.15rem 0.3rem;
+    border-radius: 3px;
+  }
+  .rename-btn:hover {
+    background: rgba(255,255,255,0.1);
+    color: #e0e0e0;
+  }
+  .rename-input {
+    flex: 1;
+    background: #1a1a2e;
+    color: #e0e0e0;
+    border: 1px solid #e94560;
+    border-radius: 3px;
+    padding: 0.2rem 0.4rem;
+    font-size: 0.875rem;
+    font-family: inherit;
+  }
+  .rename-input:focus {
+    outline: none;
+    border-color: #ff6b81;
+  }
+  .speaker-hint {
+    color: #555;
+    font-size: 0.7rem;
+    margin-top: 0.5rem;
+    margin-bottom: 0;
+  }
 </style>
--- a/src/lib/services/tauri-bridge.ts
+++ b/src/lib/services/tauri-bridge.ts
@@ -38,3 +38,35 @@ export async function transcribeFile(
 ): Promise<TranscriptionResult> {
  return invoke('transcribe_file', { filePath, model, device, language });
 }
+
+/**
+ * Shape returned by the `run_pipeline` command: a `TranscriptionResult`
+ * whose segments each also carry a `speaker` value (`null` when none is
+ * set), together with the list of speakers and their count.
+ */
+export interface PipelineResult extends TranscriptionResult {
+  num_speakers: number;
+  speakers: string[];
+  segments: (TranscriptionResult['segments'][0] & { speaker: string | null })[];
+}
+
+/**
+ * Invoke the `run_pipeline` Tauri command for `filePath`.
+ *
+ * Every field of `options` is forwarded under the same name; fields that are
+ * not supplied (or the whole `options` object being omitted) are passed as
+ * `undefined`.
+ *
+ * @param filePath Path of the file handed to the command.
+ * @param options Optional settings forwarded to the command unchanged.
+ * @returns The promise produced by `invoke`, typed as `PipelineResult`.
+ */
+export async function runPipeline(
+  filePath: string,
+  options?: {
+    model?: string;
+    device?: string;
+    language?: string;
+    numSpeakers?: number;
+    minSpeakers?: number;
+    maxSpeakers?: number;
+    skipDiarization?: boolean;
+  },
+): Promise<PipelineResult> {
+  const {
+    model,
+    device,
+    language,
+    numSpeakers,
+    minSpeakers,
+    maxSpeakers,
+    skipDiarization,
+  } = options ?? {};
+
+  return invoke('run_pipeline', {
+    filePath,
+    model,
+    device,
+    language,
+    numSpeakers,
+    minSpeakers,
+    maxSpeakers,
+    skipDiarization,
+  });
+}
--- a/src/routes/+page.svelte
+++ b/src/routes/+page.svelte
@@ -7,7 +7,7 @@
  import AIChatPanel from '$lib/components/AIChatPanel.svelte';
  import ProgressOverlay from '$lib/components/ProgressOverlay.svelte';
  import { segments, speakers } from '$lib/stores/transcript';
-  import type { Segment, Word } from '$lib/types/transcript';
+  import type { Segment, Speaker } from '$lib/types/transcript';

  let waveformPlayer: WaveformPlayer;
  let audioUrl = $state('');
@@ -16,6 +16,9 @@
  let transcriptionStage = $state('');
  let transcriptionMessage = $state('');

+  // Speaker color palette for auto-assignment
+  const speakerColors = ['#e94560', '#4ecdc4', '#ffe66d', '#a8e6cf', '#ff8b94', '#c7ceea', '#ffd93d', '#6bcb77'];
+
  function handleWordClick(timeMs: number) {
    waveformPlayer?.seekTo(timeMs);
  }
@@ -32,11 +35,10 @@
    if (!filePath) return;

    // Convert file path to URL for wavesurfer
-    // In Tauri, we can use convertFileSrc or asset protocol
    audioUrl = `asset://localhost/${encodeURIComponent(filePath)}`;
    waveformPlayer?.loadAudio(audioUrl);

-    // Start transcription
+    // Start pipeline (transcription + diarization)
    isTranscribing = true;
    transcriptionProgress = 0;
    transcriptionStage = 'Starting...';
@@ -47,6 +49,7 @@
          text: string;
          start_ms: number;
          end_ms: number;
+          speaker: string | null;
          words: Array<{
            word: string;
            start_ms: number;
@@ -56,14 +59,29 @@
        }>;
        language: string;
        duration_ms: number;
-      }>('transcribe_file', { filePath });
+        speakers: string[];
+        num_speakers: number;
+      }>('run_pipeline', { filePath });
+
+      // Create speaker entries from pipeline result
+      const newSpeakers: Speaker[] = (result.speakers || []).map((label, idx) => ({
+        id: `speaker-${idx}`,
+        project_id: '',
+        label,
+        display_name: null,
+        color: speakerColors[idx % speakerColors.length],
+      }));
+      speakers.set(newSpeakers);
+
+      // Build speaker label → id lookup
+      const speakerLookup = new Map(newSpeakers.map(s => [s.label, s.id]));

      // Convert result to our store format
      const newSegments: Segment[] = result.segments.map((seg, idx) => ({
        id: `seg-${idx}`,
        project_id: '',
        media_file_id: '',
-        speaker_id: null,
+        speaker_id: seg.speaker ? (speakerLookup.get(seg.speaker) ?? null) : null,
        start_ms: seg.start_ms,
        end_ms: seg.end_ms,
        text: seg.text,
@@ -85,8 +103,8 @@

      segments.set(newSegments);
    } catch (err) {
-      console.error('Transcription failed:', err);
-      alert(`Transcription failed: ${err}`);
+      console.error('Pipeline failed:', err);
+      alert(`Pipeline failed: ${err}`);
    } finally {
      isTranscribing = false;
    }