Phase 3: Speaker diarization and full transcription pipeline

- Implement DiarizeService with pyannote.audio speaker detection
- Build PipelineService combining transcribe → diarize → merge with
  overlap-based speaker assignment per segment
- Add pipeline.start and diarize.start IPC handlers
- Add run_pipeline Tauri command for full pipeline execution
- Wire frontend to use pipeline: speakers auto-created with colors,
  segments assigned to detected speakers
- Build SpeakerManager with rename support (double-click or edit button)
- Add speaker color coding throughout transcript display
- Add pyannote.audio dependency
- Tests: 24 Python (including merge logic), 6 Rust, 0 Svelte errors

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-02-26 16:09:48 -08:00
parent 842f8d5f90
commit 44480906a4
12 changed files with 806 additions and 24 deletions

View File

@@ -1,6 +1,67 @@
<script lang="ts">
import { speakers } from '$lib/stores/transcript';
import type { Speaker } from '$lib/types/transcript';
// Id of the speaker whose name is currently being edited, or null when no rename is in progress.
let editingSpeakerId = $state<string | null>(null);
// Draft text bound to the rename input; seeded by startRename and read by finishRename.
let editName = $state('');
/**
 * Puts the given speaker into rename mode.
 *
 * The draft text is seeded with the speaker's display name, falling back to
 * its label when the display name is empty or unset, and the speaker is
 * recorded as the one being edited.
 */
function startRename(speaker: Speaker) {
  const { id, display_name, label } = speaker;
  editName = display_name || label;
  editingSpeakerId = id;
}
/**
 * Commits the in-progress rename for `speakerId` and leaves edit mode.
 *
 * The trimmed draft is written to the speaker's `display_name`; a blank draft
 * leaves the existing name untouched. Calls for a speaker that is not the one
 * currently being edited are ignored, so a late `blur` arriving after the
 * edit was already committed (Enter) or cancelled (Escape) cannot apply a
 * stale draft.
 */
function finishRename(speakerId: string) {
  // The rename input's onblur may still fire in some browsers while the
  // input is being removed after Enter/Escape; by then the edit is over.
  if (editingSpeakerId !== speakerId) return;

  const trimmed = editName.trim();
  if (trimmed) {
    speakers.update(list =>
      list.map(s => (s.id === speakerId ? { ...s, display_name: trimmed } : s)),
    );
  }
  editingSpeakerId = null;
}
/**
 * Keyboard handling for the rename input.
 *
 * Enter suppresses the key's default action and commits through
 * finishRename; Escape clears the editing state without calling
 * finishRename. Every other key is left alone.
 */
function handleKeydown(e: KeyboardEvent, speakerId: string) {
  switch (e.key) {
    case 'Enter':
      e.preventDefault();
      finishRename(speakerId);
      break;
    case 'Escape':
      editingSpeakerId = null;
      break;
  }
}
</script>
<!-- Speaker panel: lists the entries of the `speakers` store and lets each one be renamed inline. -->
<div class="speaker-manager">
<h3>Speakers</h3>
<!-- NOTE(review): this placeholder paragraph looks like a leftover from an earlier stub of the component; confirm whether it should still render. -->
<p class="placeholder">Speaker list with rename/color controls</p>
{#if $speakers.length === 0}
<!-- Empty state: shown while the store holds no speakers. -->
<p class="empty-hint">No speakers detected yet</p>
{:else}
<!-- One row per speaker, keyed by speaker.id. -->
<ul class="speaker-list">
{#each $speakers as speaker (speaker.id)}
<li class="speaker-item">
<!-- Color swatch filled with the speaker's assigned color. -->
<span class="speaker-color" style="background: {speaker.color}"></span>
{#if editingSpeakerId === speaker.id}
<!-- Edit mode: text input bound to the draft name; blur calls finishRename, key presses go through handleKeydown. -->
<input
class="rename-input"
type="text"
bind:value={editName}
onblur={() => finishRename(speaker.id)}
onkeydown={(e) => handleKeydown(e, speaker.id)}
/>
{:else}
<!-- Read mode: the name (display_name, else label) enters edit mode on double-click; the button below does the same on click. -->
<!-- svelte-ignore a11y_no_static_element_interactions -->
<span class="speaker-name" ondblclick={() => startRename(speaker)}>
{speaker.display_name || speaker.label}
</span>
<!-- NOTE(review): the button has no visible content here (an icon or glyph may have been lost); confirm it has visible/accessible text. -->
<button class="rename-btn" onclick={() => startRename(speaker)} title="Rename speaker">
</button>
{/if}
</li>
{/each}
</ul>
<p class="speaker-hint">Double-click a name to rename</p>
{/if}
</div>
<style>
@@ -10,9 +71,72 @@
border-radius: 8px;
color: #e0e0e0;
}
h3 { margin: 0 0 0.5rem; }
.placeholder {
/* Panel heading: no top margin, small gap below. */
h3 {
margin: 0 0 0.5rem;
font-size: 0.95rem;
}
/* Muted message shown when there are no speakers. */
.empty-hint {
color: #666;
font-size: 0.875rem;
}
/* Speaker list: bullet-less vertical stack with a uniform gap between rows. */
.speaker-list {
list-style: none;
padding: 0;
margin: 0;
display: flex;
flex-direction: column;
gap: 0.5rem;
}
/* One speaker row: swatch, name/input and button laid out horizontally and vertically centered. */
.speaker-item {
display: flex;
align-items: center;
gap: 0.5rem;
padding: 0.35rem 0.5rem;
background: rgba(255,255,255,0.03);
border-radius: 4px;
}
/* Fixed-size circular color swatch; flex-shrink: 0 keeps it from being squeezed by a long name. */
.speaker-color {
width: 12px;
height: 12px;
border-radius: 50%;
flex-shrink: 0;
}
/* Speaker name takes the remaining row width; pointer cursor signals that it is interactive. */
.speaker-name {
flex: 1;
cursor: pointer;
font-size: 0.875rem;
}
/* Rename button: chrome-less and dimmed until hovered. */
.rename-btn {
background: none;
border: none;
color: #666;
cursor: pointer;
font-size: 0.75rem;
padding: 0.15rem 0.3rem;
border-radius: 3px;
}
/* Hover state: faint highlight and brighter text. */
.rename-btn:hover {
background: rgba(255,255,255,0.1);
color: #e0e0e0;
}
/* Inline rename input: occupies the same flexible slot as the name and uses the same font size. */
.rename-input {
flex: 1;
background: #1a1a2e;
color: #e0e0e0;
border: 1px solid #e94560;
border-radius: 3px;
padding: 0.2rem 0.4rem;
font-size: 0.875rem;
font-family: inherit;
}
/* Focus state: the default outline is replaced by a lighter border color. */
.rename-input:focus {
outline: none;
border-color: #ff6b81;
}
/* Small hint text rendered below the speaker list. */
.speaker-hint {
color: #555;
font-size: 0.7rem;
margin-top: 0.5rem;
margin-bottom: 0;
}
</style>

View File

@@ -38,3 +38,35 @@ export async function transcribeFile(
): Promise<TranscriptionResult> {
return invoke('transcribe_file', { filePath, model, device, language });
}
/**
 * Result shape of the `run_pipeline` command: a transcription result whose
 * segments additionally carry a speaker.
 */
export interface PipelineResult extends TranscriptionResult {
  /** Transcription segments, each extended with a `speaker` string or `null`. */
  segments: (TranscriptionResult['segments'][number] & { speaker: string | null })[];
  /** Speaker strings reported by the pipeline. */
  speakers: string[];
  /** Speaker count reported by the pipeline. */
  num_speakers: number;
}
/**
 * Invokes the `run_pipeline` backend command for a file.
 *
 * Every option is forwarded under its own key; options that are not supplied
 * are sent as `undefined`.
 *
 * @param filePath - Path of the file to process.
 * @param options - Optional pipeline settings forwarded to the command.
 * @returns The promise returned by `invoke` for the `run_pipeline` command.
 */
export async function runPipeline(
  filePath: string,
  options?: {
    model?: string;
    device?: string;
    language?: string;
    numSpeakers?: number;
    minSpeakers?: number;
    maxSpeakers?: number;
    skipDiarization?: boolean;
  },
): Promise<PipelineResult> {
  // Unpack once; a missing options object yields undefined for every field.
  const {
    model,
    device,
    language,
    numSpeakers,
    minSpeakers,
    maxSpeakers,
    skipDiarization,
  } = options ?? {};

  return invoke('run_pipeline', {
    filePath,
    model,
    device,
    language,
    numSpeakers,
    minSpeakers,
    maxSpeakers,
    skipDiarization,
  });
}

View File

@@ -7,7 +7,7 @@
import AIChatPanel from '$lib/components/AIChatPanel.svelte';
import ProgressOverlay from '$lib/components/ProgressOverlay.svelte';
import { segments, speakers } from '$lib/stores/transcript';
import type { Segment, Word } from '$lib/types/transcript';
import type { Segment, Speaker } from '$lib/types/transcript';
let waveformPlayer: WaveformPlayer;
let audioUrl = $state('');
@@ -16,6 +16,9 @@
let transcriptionStage = $state('');
let transcriptionMessage = $state('');
// Speaker color palette for auto-assignment: speakers are given a color by
// index, wrapping around when there are more speakers than palette entries.
const speakerColors = ['#e94560', '#4ecdc4', '#ffe66d', '#a8e6cf', '#ff8b94', '#c7ceea', '#ffd93d', '#6bcb77'];
/** Seeks the waveform player to `timeMs`; does nothing while no player instance is set. */
function handleWordClick(timeMs: number) {
  if (waveformPlayer == null) return;
  waveformPlayer.seekTo(timeMs);
}
@@ -32,11 +35,10 @@
if (!filePath) return;
// Convert file path to URL for wavesurfer
// In Tauri, we can use convertFileSrc or asset protocol
audioUrl = `asset://localhost/${encodeURIComponent(filePath)}`;
waveformPlayer?.loadAudio(audioUrl);
// Start transcription
// Start pipeline (transcription + diarization)
isTranscribing = true;
transcriptionProgress = 0;
transcriptionStage = 'Starting...';
@@ -47,6 +49,7 @@
text: string;
start_ms: number;
end_ms: number;
speaker: string | null;
words: Array<{
word: string;
start_ms: number;
@@ -56,14 +59,29 @@
}>;
language: string;
duration_ms: number;
}>('transcribe_file', { filePath });
speakers: string[];
num_speakers: number;
}>('run_pipeline', { filePath });
// Create speaker entries from pipeline result
const newSpeakers: Speaker[] = (result.speakers || []).map((label, idx) => ({
id: `speaker-${idx}`,
project_id: '',
label,
display_name: null,
color: speakerColors[idx % speakerColors.length],
}));
speakers.set(newSpeakers);
// Build speaker label → id lookup
const speakerLookup = new Map(newSpeakers.map(s => [s.label, s.id]));
// Convert result to our store format
const newSegments: Segment[] = result.segments.map((seg, idx) => ({
id: `seg-${idx}`,
project_id: '',
media_file_id: '',
speaker_id: null,
speaker_id: seg.speaker ? (speakerLookup.get(seg.speaker) ?? null) : null,
start_ms: seg.start_ms,
end_ms: seg.end_ms,
text: seg.text,
@@ -85,8 +103,8 @@
segments.set(newSegments);
} catch (err) {
console.error('Transcription failed:', err);
alert(`Transcription failed: ${err}`);
console.error('Pipeline failed:', err);
alert(`Pipeline failed: ${err}`);
} finally {
isTranscribing = false;
}