Phase 3: Speaker diarization and full transcription pipeline
- Implement DiarizeService with pyannote.audio speaker detection
- Build PipelineService combining transcribe → diarize → merge with overlap-based speaker assignment per segment
- Add pipeline.start and diarize.start IPC handlers
- Add run_pipeline Tauri command for full pipeline execution
- Wire frontend to use pipeline: speakers auto-created with colors, segments assigned to detected speakers
- Build SpeakerManager with rename support (double-click or edit button)
- Add speaker color coding throughout transcript display
- Add pyannote.audio dependency
- Tests: 24 Python (including merge logic), 6 Rust, 0 Svelte errors

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -1,6 +1,67 @@
|
||||
<script lang="ts">
|
||||
import { speakers } from '$lib/stores/transcript';
|
||||
import type { Speaker } from '$lib/types/transcript';
|
||||
|
||||
let editingSpeakerId = $state<string | null>(null);
|
||||
let editName = $state('');
|
||||
|
||||
/**
 * Enter rename mode for `speaker`.
 *
 * Marks the speaker as the one being edited and seeds the edit buffer with
 * its display name, falling back to its label when the display name is
 * empty or unset.
 */
function startRename(speaker: Speaker) {
  const { id, display_name: displayName, label } = speaker;
  editingSpeakerId = id;
  editName = displayName ? displayName : label;
}
|
||||
|
||||
/**
 * Commit the in-progress rename for `speakerId` and leave edit mode.
 *
 * The trimmed edit buffer is stored as the speaker's `display_name` in the
 * `speakers` store. A blank buffer leaves the speaker untouched. Edit mode is
 * closed in both cases.
 *
 * @param speakerId id of the speaker whose rename input triggered the commit
 */
function finishRename(speakerId: string) {
  // This function is wired to both Enter and the input's `blur`. Leaving edit
  // mode removes the focused input, and a webview may still dispatch `blur`
  // for it afterwards. Without this guard such a late call would re-apply the
  // rename after Enter, or commit a rename that was cancelled with Escape.
  if (editingSpeakerId !== speakerId) return;

  const trimmed = editName.trim();
  if (trimmed) {
    speakers.update(list =>
      list.map(s => (s.id === speakerId ? { ...s, display_name: trimmed } : s)),
    );
  }
  editingSpeakerId = null;
}
|
||||
|
||||
/**
 * Keyboard handling for the rename input.
 *
 * - Enter: suppresses the default action and calls `finishRename`.
 * - Escape: clears `editingSpeakerId`, leaving edit mode.
 * - Any other key is left alone.
 */
function handleKeydown(e: KeyboardEvent, speakerId: string) {
  switch (e.key) {
    case 'Enter':
      e.preventDefault();
      finishRename(speakerId);
      break;
    case 'Escape':
      editingSpeakerId = null;
      break;
    default:
      break;
  }
}
|
||||
</script>
|
||||
|
||||
<div class="speaker-manager">
|
||||
<h3>Speakers</h3>
|
||||
<p class="placeholder">Speaker list with rename/color controls</p>
|
||||
{#if $speakers.length === 0}
|
||||
<p class="empty-hint">No speakers detected yet</p>
|
||||
{:else}
|
||||
<ul class="speaker-list">
|
||||
{#each $speakers as speaker (speaker.id)}
|
||||
<li class="speaker-item">
|
||||
<span class="speaker-color" style="background: {speaker.color}"></span>
|
||||
{#if editingSpeakerId === speaker.id}
|
||||
<input
|
||||
class="rename-input"
|
||||
type="text"
|
||||
bind:value={editName}
|
||||
onblur={() => finishRename(speaker.id)}
|
||||
onkeydown={(e) => handleKeydown(e, speaker.id)}
|
||||
/>
|
||||
{:else}
|
||||
<!-- svelte-ignore a11y_no_static_element_interactions -->
|
||||
<span class="speaker-name" ondblclick={() => startRename(speaker)}>
|
||||
{speaker.display_name || speaker.label}
|
||||
</span>
|
||||
<button class="rename-btn" onclick={() => startRename(speaker)} title="Rename speaker">
|
||||
✏
|
||||
</button>
|
||||
{/if}
|
||||
</li>
|
||||
{/each}
|
||||
</ul>
|
||||
<p class="speaker-hint">Double-click a name to rename</p>
|
||||
{/if}
|
||||
</div>
|
||||
|
||||
<style>
|
||||
@@ -10,9 +71,72 @@
|
||||
border-radius: 8px;
|
||||
color: #e0e0e0;
|
||||
}
|
||||
h3 { margin: 0 0 0.5rem; }
|
||||
.placeholder {
|
||||
h3 {
|
||||
margin: 0 0 0.5rem;
|
||||
font-size: 0.95rem;
|
||||
}
|
||||
.empty-hint {
|
||||
color: #666;
|
||||
font-size: 0.875rem;
|
||||
}
|
||||
.speaker-list {
|
||||
list-style: none;
|
||||
padding: 0;
|
||||
margin: 0;
|
||||
display: flex;
|
||||
flex-direction: column;
|
||||
gap: 0.5rem;
|
||||
}
|
||||
.speaker-item {
|
||||
display: flex;
|
||||
align-items: center;
|
||||
gap: 0.5rem;
|
||||
padding: 0.35rem 0.5rem;
|
||||
background: rgba(255,255,255,0.03);
|
||||
border-radius: 4px;
|
||||
}
|
||||
.speaker-color {
|
||||
width: 12px;
|
||||
height: 12px;
|
||||
border-radius: 50%;
|
||||
flex-shrink: 0;
|
||||
}
|
||||
.speaker-name {
|
||||
flex: 1;
|
||||
cursor: pointer;
|
||||
font-size: 0.875rem;
|
||||
}
|
||||
.rename-btn {
|
||||
background: none;
|
||||
border: none;
|
||||
color: #666;
|
||||
cursor: pointer;
|
||||
font-size: 0.75rem;
|
||||
padding: 0.15rem 0.3rem;
|
||||
border-radius: 3px;
|
||||
}
|
||||
.rename-btn:hover {
|
||||
background: rgba(255,255,255,0.1);
|
||||
color: #e0e0e0;
|
||||
}
|
||||
.rename-input {
|
||||
flex: 1;
|
||||
background: #1a1a2e;
|
||||
color: #e0e0e0;
|
||||
border: 1px solid #e94560;
|
||||
border-radius: 3px;
|
||||
padding: 0.2rem 0.4rem;
|
||||
font-size: 0.875rem;
|
||||
font-family: inherit;
|
||||
}
|
||||
.rename-input:focus {
|
||||
outline: none;
|
||||
border-color: #ff6b81;
|
||||
}
|
||||
.speaker-hint {
|
||||
color: #555;
|
||||
font-size: 0.7rem;
|
||||
margin-top: 0.5rem;
|
||||
margin-bottom: 0;
|
||||
}
|
||||
</style>
|
||||
|
||||
@@ -38,3 +38,35 @@ export async function transcribeFile(
|
||||
): Promise<TranscriptionResult> {
|
||||
return invoke('transcribe_file', { filePath, model, device, language });
|
||||
}
|
||||
|
||||
/**
 * Shape returned by `runPipeline`: a `TranscriptionResult` whose segments
 * additionally carry a speaker label, plus the list of speakers.
 */
export interface PipelineResult extends TranscriptionResult {
  /** Transcription segments, each extended with a `speaker` label (or `null`). */
  segments: (TranscriptionResult['segments'][0] & { speaker: string | null })[];
  /** Speaker labels reported for the file. */
  speakers: string[];
  /** Speaker count reported for the file. */
  num_speakers: number;
}
|
||||
|
||||
/**
 * Invoke the `run_pipeline` Tauri command for a file.
 *
 * Every option is forwarded under its own key; options that are not supplied
 * are forwarded as `undefined`.
 *
 * @param filePath path of the file to process
 * @param options optional settings passed through to the command
 * @returns the promise produced by `invoke`, typed as a `PipelineResult`
 */
export async function runPipeline(
  filePath: string,
  options?: {
    model?: string;
    device?: string;
    language?: string;
    numSpeakers?: number;
    minSpeakers?: number;
    maxSpeakers?: number;
    skipDiarization?: boolean;
  },
): Promise<PipelineResult> {
  const opts: NonNullable<typeof options> = options ?? {};
  const {
    model,
    device,
    language,
    numSpeakers,
    minSpeakers,
    maxSpeakers,
    skipDiarization,
  } = opts;

  const args = {
    filePath,
    model,
    device,
    language,
    numSpeakers,
    minSpeakers,
    maxSpeakers,
    skipDiarization,
  };
  return invoke('run_pipeline', args);
}
|
||||
|
||||
@@ -7,7 +7,7 @@
|
||||
import AIChatPanel from '$lib/components/AIChatPanel.svelte';
|
||||
import ProgressOverlay from '$lib/components/ProgressOverlay.svelte';
|
||||
import { segments, speakers } from '$lib/stores/transcript';
|
||||
import type { Segment, Word } from '$lib/types/transcript';
|
||||
import type { Segment, Speaker } from '$lib/types/transcript';
|
||||
|
||||
let waveformPlayer: WaveformPlayer;
|
||||
let audioUrl = $state('');
|
||||
@@ -16,6 +16,9 @@
|
||||
let transcriptionStage = $state('');
|
||||
let transcriptionMessage = $state('');
|
||||
|
||||
// Speaker color palette for auto-assignment
|
||||
const speakerColors = ['#e94560', '#4ecdc4', '#ffe66d', '#a8e6cf', '#ff8b94', '#c7ceea', '#ffd93d', '#6bcb77'];
|
||||
|
||||
/**
 * Forward `timeMs` to the waveform player's `seekTo`.
 * Does nothing when the player reference is not set.
 */
function handleWordClick(timeMs: number) {
  if (waveformPlayer == null) return;
  waveformPlayer.seekTo(timeMs);
}
|
||||
@@ -32,11 +35,10 @@
|
||||
if (!filePath) return;
|
||||
|
||||
// Convert file path to URL for wavesurfer
|
||||
// In Tauri, we can use convertFileSrc or asset protocol
|
||||
audioUrl = `asset://localhost/${encodeURIComponent(filePath)}`;
|
||||
waveformPlayer?.loadAudio(audioUrl);
|
||||
|
||||
// Start transcription
|
||||
// Start pipeline (transcription + diarization)
|
||||
isTranscribing = true;
|
||||
transcriptionProgress = 0;
|
||||
transcriptionStage = 'Starting...';
|
||||
@@ -47,6 +49,7 @@
|
||||
text: string;
|
||||
start_ms: number;
|
||||
end_ms: number;
|
||||
speaker: string | null;
|
||||
words: Array<{
|
||||
word: string;
|
||||
start_ms: number;
|
||||
@@ -56,14 +59,29 @@
|
||||
}>;
|
||||
language: string;
|
||||
duration_ms: number;
|
||||
}>('transcribe_file', { filePath });
|
||||
speakers: string[];
|
||||
num_speakers: number;
|
||||
}>('run_pipeline', { filePath });
|
||||
|
||||
// Create speaker entries from pipeline result
|
||||
const newSpeakers: Speaker[] = (result.speakers || []).map((label, idx) => ({
|
||||
id: `speaker-${idx}`,
|
||||
project_id: '',
|
||||
label,
|
||||
display_name: null,
|
||||
color: speakerColors[idx % speakerColors.length],
|
||||
}));
|
||||
speakers.set(newSpeakers);
|
||||
|
||||
// Build speaker label → id lookup
|
||||
const speakerLookup = new Map(newSpeakers.map(s => [s.label, s.id]));
|
||||
|
||||
// Convert result to our store format
|
||||
const newSegments: Segment[] = result.segments.map((seg, idx) => ({
|
||||
id: `seg-${idx}`,
|
||||
project_id: '',
|
||||
media_file_id: '',
|
||||
speaker_id: null,
|
||||
speaker_id: seg.speaker ? (speakerLookup.get(seg.speaker) ?? null) : null,
|
||||
start_ms: seg.start_ms,
|
||||
end_ms: seg.end_ms,
|
||||
text: seg.text,
|
||||
@@ -85,8 +103,8 @@
|
||||
|
||||
segments.set(newSegments);
|
||||
} catch (err) {
|
||||
console.error('Transcription failed:', err);
|
||||
alert(`Transcription failed: ${err}`);
|
||||
console.error('Pipeline failed:', err);
|
||||
alert(`Pipeline failed: ${err}`);
|
||||
} finally {
|
||||
isTranscribing = false;
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user