Phase 3: Speaker diarization and full transcription pipeline

- Implement DiarizeService with pyannote.audio speaker detection
- Build PipelineService combining transcribe → diarize → merge with
  overlap-based speaker assignment per segment
- Add pipeline.start and diarize.start IPC handlers
- Add run_pipeline Tauri command for full pipeline execution
- Wire frontend to use pipeline: speakers auto-created with colors,
  segments assigned to detected speakers
- Build SpeakerManager with rename support (double-click or edit button)
- Add speaker color coding throughout transcript display
- Add pyannote.audio dependency
- Tests: 24 Python (including merge logic), 6 Rust, 0 Svelte errors

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-02-26 16:09:48 -08:00
parent 842f8d5f90
commit 44480906a4
12 changed files with 806 additions and 24 deletions

View File

@@ -7,7 +7,7 @@
import AIChatPanel from '$lib/components/AIChatPanel.svelte';
import ProgressOverlay from '$lib/components/ProgressOverlay.svelte';
import { segments, speakers } from '$lib/stores/transcript';
import type { Segment, Word } from '$lib/types/transcript';
import type { Segment, Speaker } from '$lib/types/transcript';
let waveformPlayer: WaveformPlayer;
let audioUrl = $state('');
@@ -16,6 +16,9 @@
let transcriptionStage = $state('');
let transcriptionMessage = $state('');
// Speaker color palette for auto-assignment
const speakerColors = ['#e94560', '#4ecdc4', '#ffe66d', '#a8e6cf', '#ff8b94', '#c7ceea', '#ffd93d', '#6bcb77'];
// Jump playback to a clicked word's start time. Safe to call before the
// waveform player has mounted: if the component ref is still nullish, do nothing.
function handleWordClick(timeMs: number) {
  if (waveformPlayer == null) return;
  waveformPlayer.seekTo(timeMs);
}
@@ -32,11 +35,10 @@
if (!filePath) return;
// Convert file path to URL for wavesurfer
// In Tauri, we can use convertFileSrc or asset protocol
audioUrl = `asset://localhost/${encodeURIComponent(filePath)}`;
waveformPlayer?.loadAudio(audioUrl);
// Start transcription
// Start pipeline (transcription + diarization)
isTranscribing = true;
transcriptionProgress = 0;
transcriptionStage = 'Starting...';
@@ -47,6 +49,7 @@
text: string;
start_ms: number;
end_ms: number;
speaker: string | null;
words: Array<{
word: string;
start_ms: number;
@@ -56,14 +59,29 @@
}>;
language: string;
duration_ms: number;
}>('transcribe_file', { filePath });
speakers: string[];
num_speakers: number;
}>('run_pipeline', { filePath });
// Create speaker entries from pipeline result
const newSpeakers: Speaker[] = (result.speakers || []).map((label, idx) => ({
id: `speaker-${idx}`,
project_id: '',
label,
display_name: null,
color: speakerColors[idx % speakerColors.length],
}));
speakers.set(newSpeakers);
// Build speaker label → id lookup
const speakerLookup = new Map(newSpeakers.map(s => [s.label, s.id]));
// Convert result to our store format
const newSegments: Segment[] = result.segments.map((seg, idx) => ({
id: `seg-${idx}`,
project_id: '',
media_file_id: '',
speaker_id: null,
speaker_id: seg.speaker ? (speakerLookup.get(seg.speaker) ?? null) : null,
start_ms: seg.start_ms,
end_ms: seg.end_ms,
text: seg.text,
@@ -85,8 +103,8 @@
segments.set(newSegments);
} catch (err) {
console.error('Transcription failed:', err);
alert(`Transcription failed: ${err}`);
console.error('Pipeline failed:', err);
alert(`Pipeline failed: ${err}`);
} finally {
isTranscribing = false;
}