Phase 2: Core transcription pipeline and audio playback

- Implement faster-whisper TranscribeService with word-level timestamps,
  progress reporting, and hardware auto-detection
- Wire up Rust SidecarManager for Python process lifecycle (spawn, IPC, shutdown)
- Add transcribe_file Tauri command bridging frontend to Python sidecar
- Integrate wavesurfer.js WaveformPlayer with play/pause, skip, seek controls
- Build TranscriptEditor with word-level click-to-seek and active highlighting
- Connect file import flow: prompt → asset load → transcribe → display
- Add typed tauri-bridge service with TranscriptionResult interface
- Add Python tests for hardware detection and transcription result formatting

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-26 15:53:09 -08:00
parent 503cc6c0cf
commit 48fe41b064
18 changed files with 1775 additions and 32 deletions


@@ -1,18 +1,154 @@
<div class="transcript-editor">
<p>Transcript Editor</p>
<p class="placeholder">TipTap rich text editor will be integrated here</p>
<script lang="ts">
import { segments, speakers } from '$lib/stores/transcript';
import { currentTimeMs } from '$lib/stores/playback';
import type { Segment, Word, Speaker } from '$lib/types/transcript';
interface Props {
onWordClick?: (timeMs: number) => void;
onTextEdit?: (segmentId: string, newText: string) => void;
}
let { onWordClick, onTextEdit }: Props = $props();
let transcriptContainer: HTMLDivElement;
function getSpeakerName(speakerId: string | null, speakerList: Speaker[]): string {
if (!speakerId) return 'Unknown';
const speaker = speakerList.find(s => s.id === speakerId);
return speaker?.display_name || speaker?.label || 'Unknown';
}
function getSpeakerColor(speakerId: string | null, speakerList: Speaker[]): string {
if (!speakerId) return '#888';
const speaker = speakerList.find(s => s.id === speakerId);
return speaker?.color || '#888';
}
function formatTimestamp(ms: number): string {
const totalSeconds = Math.floor(ms / 1000);
const m = Math.floor(totalSeconds / 60);
const s = totalSeconds % 60;
return `${m}:${s.toString().padStart(2, '0')}`;
}
function isWordActive(word: Word, currentMs: number): boolean {
return currentMs >= word.start_ms && currentMs <= word.end_ms;
}
function isSegmentActive(segment: Segment, currentMs: number): boolean {
return currentMs >= segment.start_ms && currentMs <= segment.end_ms;
}
function handleWordClick(word: Word) {
onWordClick?.(word.start_ms);
}
</script>
<div class="transcript-editor" bind:this={transcriptContainer}>
{#if $segments.length === 0}
<div class="empty-state">
<p>No transcript yet</p>
<p class="hint">Import an audio file and run transcription to get started</p>
</div>
{:else}
{#each $segments as segment (segment.id)}
<div
class="segment"
class:active={isSegmentActive(segment, $currentTimeMs)}
>
<div class="segment-header">
<span
class="speaker-label"
style="border-left-color: {getSpeakerColor(segment.speaker_id, $speakers)}"
>
{getSpeakerName(segment.speaker_id, $speakers)}
</span>
<span class="timestamp">{formatTimestamp(segment.start_ms)}</span>
</div>
<div class="segment-text">
{#each segment.words as word (word.id)}
<span
class="word"
class:word-active={isWordActive(word, $currentTimeMs)}
onclick={() => handleWordClick(word)}
role="button"
tabindex="0"
onkeydown={(e) => { if (e.key === 'Enter') handleWordClick(word); }}
>{word.word} </span>
{:else}
<span class="segment-plain-text">{segment.text}</span>
{/each}
</div>
</div>
{/each}
{/if}
</div>
<style>
.transcript-editor {
flex: 1;
overflow-y: auto;
padding: 1rem;
background: #16213e;
border-radius: 8px;
color: #e0e0e0;
}
.empty-state {
display: flex;
flex-direction: column;
align-items: center;
justify-content: center;
height: 100%;
color: #666;
}
.hint {
font-size: 0.875rem;
color: #555;
}
.segment {
margin-bottom: 1rem;
padding: 0.5rem;
border-radius: 4px;
transition: background-color 0.2s;
}
.segment.active {
background: rgba(233, 69, 96, 0.1);
}
.segment-header {
display: flex;
align-items: center;
gap: 0.5rem;
margin-bottom: 0.25rem;
}
.speaker-label {
font-weight: 600;
font-size: 0.875rem;
border-left: 3px solid;
padding-left: 0.5rem;
}
.timestamp {
color: #666;
font-size: 0.75rem;
font-variant-numeric: tabular-nums;
}
.segment-text {
line-height: 1.6;
padding-left: 0.75rem;
}
.word {
cursor: pointer;
border-radius: 2px;
padding: 0 1px;
transition: background-color 0.15s;
}
.word:hover {
background: rgba(233, 69, 96, 0.2);
}
.word-active {
background: rgba(233, 69, 96, 0.35);
color: #fff;
}
.segment-plain-text {
color: #ccc;
}
</style>
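
The `Segment`, `Word`, and `Speaker` types imported from `$lib/types/transcript` are not part of this diff. Judging from the fields the editor reads above and the mapping in the import flow below, they look roughly like this sketch (the canonical definitions may carry more fields):

```ts
// Approximate shapes inferred from this commit's usage; the real
// definitions live in $lib/types/transcript and may differ in detail.
export interface Word {
  id: string;
  segment_id: string;
  word: string;
  start_ms: number;
  end_ms: number;
  confidence: number;
  word_index: number;
}

export interface Speaker {
  id: string;
  label: string;
  display_name: string | null;
  color: string | null;
}

export interface Segment {
  id: string;
  project_id: string;
  media_file_id: string;
  speaker_id: string | null;
  start_ms: number;
  end_ms: number;
  text: string;
  original_text: string | null;
  confidence: number | null;
  is_edited: boolean;
  edited_at: string | null; // timestamp type is a guess
  segment_index: number;
  words: Word[];
}
```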


@@ -1,17 +1,144 @@
<script lang="ts">
import { onMount, onDestroy } from 'svelte';
import WaveSurfer from 'wavesurfer.js';
import { isPlaying, currentTimeMs, durationMs } from '$lib/stores/playback';
interface Props {
audioUrl?: string;
onSeek?: (timeMs: number) => void;
}
let { audioUrl = '', onSeek }: Props = $props();
let container: HTMLDivElement;
let wavesurfer: WaveSurfer | null = $state(null);
let currentTime = $state('0:00');
let totalTime = $state('0:00');
function formatTime(seconds: number): string {
const m = Math.floor(seconds / 60);
const s = Math.floor(seconds % 60);
return `${m}:${s.toString().padStart(2, '0')}`;
}
onMount(() => {
wavesurfer = WaveSurfer.create({
container,
waveColor: '#4a5568',
progressColor: '#e94560',
cursorColor: '#e94560',
height: 80,
barWidth: 2,
barGap: 1,
barRadius: 2,
});
wavesurfer.on('timeupdate', (time: number) => {
currentTimeMs.set(Math.round(time * 1000));
currentTime = formatTime(time);
});
wavesurfer.on('ready', () => {
const dur = wavesurfer!.getDuration();
durationMs.set(Math.round(dur * 1000));
totalTime = formatTime(dur);
});
wavesurfer.on('play', () => isPlaying.set(true));
wavesurfer.on('pause', () => isPlaying.set(false));
wavesurfer.on('finish', () => isPlaying.set(false));
if (audioUrl) {
wavesurfer.load(audioUrl);
}
});
onDestroy(() => {
wavesurfer?.destroy();
});
function togglePlayPause() {
wavesurfer?.playPause();
}
function skipBack() {
if (wavesurfer) {
const time = Math.max(0, wavesurfer.getCurrentTime() - 5);
wavesurfer.setTime(time);
}
}
function skipForward() {
if (wavesurfer) {
const time = Math.min(wavesurfer.getDuration(), wavesurfer.getCurrentTime() + 5);
wavesurfer.setTime(time);
}
}
/** Seek to a specific time in milliseconds. Called from transcript click-to-seek. */
export function seekTo(timeMs: number) {
if (wavesurfer) {
wavesurfer.setTime(timeMs / 1000);
if (!wavesurfer.isPlaying()) {
wavesurfer.play();
}
}
}
/** Load a new audio file. */
export function loadAudio(url: string) {
wavesurfer?.load(url);
}
</script>
<div class="waveform-player">
<div class="waveform-container" bind:this={container}></div>
<div class="controls">
<button class="control-btn" onclick={skipBack} title="Back 5s"></button>
<button class="control-btn play-btn" onclick={togglePlayPause} title="Play/Pause">
{#if $isPlaying}{:else}{/if}
</button>
<button class="control-btn" onclick={skipForward} title="Forward 5s"></button>
<span class="time">{currentTime} / {totalTime}</span>
</div>
</div>
<style>
.waveform-player {
background: #1a1a2e;
border-radius: 8px;
color: #e0e0e0;
padding: 0.75rem;
}
.waveform-container {
border-radius: 4px;
overflow: hidden;
}
.controls {
display: flex;
align-items: center;
gap: 0.5rem;
margin-top: 0.5rem;
}
.control-btn {
background: #0f3460;
border: none;
color: #e0e0e0;
padding: 0.4rem 0.8rem;
border-radius: 4px;
cursor: pointer;
font-size: 1rem;
}
.control-btn:hover {
background: #1a4a7a;
}
.play-btn {
padding: 0.4rem 1rem;
font-size: 1.2rem;
}
.time {
color: #999;
font-size: 0.875rem;
margin-left: auto;
font-variant-numeric: tabular-nums;
}
</style>
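
Since `seekTo` and `loadAudio` are `export`ed from the component script, a parent gets an imperative handle with `bind:this`, exactly as the main page does below. A minimal sketch (the `/demo.wav` URL is a placeholder):

```svelte
<script lang="ts">
  import WaveformPlayer from '$lib/components/WaveformPlayer.svelte';

  let player: WaveformPlayer;
</script>

<WaveformPlayer bind:this={player} audioUrl="/demo.wav" />
<button onclick={() => player.seekTo(42_000)}>Jump to 0:42</button>
```

One loose end: the `onSeek` prop is declared but never invoked, so transcript-driven seeks currently flow only through the exported `seekTo` method.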


@@ -12,3 +12,29 @@ export async function getProject(id: string): Promise<Project | null> {
export async function listProjects(): Promise<Project[]> {
return invoke('list_projects');
}
export interface TranscriptionResult {
segments: Array<{
text: string;
start_ms: number;
end_ms: number;
words: Array<{
word: string;
start_ms: number;
end_ms: number;
confidence: number;
}>;
}>;
language: string;
language_probability: number;
duration_ms: number;
}
export async function transcribeFile(
filePath: string,
model?: string,
device?: string,
language?: string,
): Promise<TranscriptionResult> {
return invoke('transcribe_file', { filePath, model, device, language });
}
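
A quick usage sketch of the wrapper; the module path `$lib/services/tauri-bridge` and the argument values are assumptions, since only the file's contents appear in this diff:

```ts
import { transcribeFile } from '$lib/services/tauri-bridge'; // path assumed

// 'base' and 'auto' are illustrative values for model and device.
const result = await transcribeFile('/path/to/interview.wav', 'base', 'auto');
console.log(
  `${result.segments.length} segments, language ${result.language} ` +
    `(p=${result.language_probability.toFixed(2)}), ${result.duration_ms} ms of audio`,
);
```

The page below still calls `invoke('transcribe_file', …)` directly with a hand-written inline type (which omits `language_probability`); routing it through this wrapper would keep the result shape in one place.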


@@ -1,14 +1,104 @@
<script lang="ts">
import { invoke } from '@tauri-apps/api/core';
import WaveformPlayer from '$lib/components/WaveformPlayer.svelte';
import TranscriptEditor from '$lib/components/TranscriptEditor.svelte';
import SpeakerManager from '$lib/components/SpeakerManager.svelte';
import AIChatPanel from '$lib/components/AIChatPanel.svelte';
import ProgressOverlay from '$lib/components/ProgressOverlay.svelte';
import { segments, speakers } from '$lib/stores/transcript';
import type { Segment, Word } from '$lib/types/transcript';
let waveformPlayer: WaveformPlayer;
let audioUrl = $state('');
let isTranscribing = $state(false);
let transcriptionProgress = $state(0);
let transcriptionStage = $state('');
let transcriptionMessage = $state('');
function handleWordClick(timeMs: number) {
waveformPlayer?.seekTo(timeMs);
}
async function handleFileImport() {
// For now, use a simple prompt — will be replaced with the Tauri file dialog (sketched at the end of this page)
const filePath = prompt('Enter path to audio/video file:');
if (!filePath) return;
// Convert file path to URL for wavesurfer
// In Tauri, we can use convertFileSrc or asset protocol
audioUrl = `asset://localhost/${encodeURIComponent(filePath)}`;
waveformPlayer?.loadAudio(audioUrl);
// Start transcription
isTranscribing = true;
transcriptionProgress = 0;
transcriptionStage = 'Starting...';
try {
const result = await invoke<{
segments: Array<{
text: string;
start_ms: number;
end_ms: number;
words: Array<{
word: string;
start_ms: number;
end_ms: number;
confidence: number;
}>;
}>;
language: string;
duration_ms: number;
}>('transcribe_file', { filePath });
// Convert result to our store format
const newSegments: Segment[] = result.segments.map((seg, idx) => ({
id: `seg-${idx}`,
project_id: '',
media_file_id: '',
speaker_id: null,
start_ms: seg.start_ms,
end_ms: seg.end_ms,
text: seg.text,
original_text: null,
confidence: null,
is_edited: false,
edited_at: null,
segment_index: idx,
words: seg.words.map((w, widx) => ({
id: `word-${idx}-${widx}`,
segment_id: `seg-${idx}`,
word: w.word,
start_ms: w.start_ms,
end_ms: w.end_ms,
confidence: w.confidence,
word_index: widx,
})),
}));
segments.set(newSegments);
} catch (err) {
console.error('Transcription failed:', err);
alert(`Transcription failed: ${err}`);
} finally {
isTranscribing = false;
}
}
</script>
<div class="app-header">
<h1>Voice to Notes</h1>
<div class="header-actions">
<button class="import-btn" onclick={handleFileImport}>
Import Audio/Video
</button>
</div>
</div>
<div class="workspace">
<div class="main-content">
<WaveformPlayer bind:this={waveformPlayer} {audioUrl} />
<TranscriptEditor onWordClick={handleWordClick} />
</div>
<div class="sidebar-right">
<SpeakerManager />
@@ -16,23 +106,58 @@
</div>
</div>
<ProgressOverlay
visible={isTranscribing}
percent={transcriptionProgress}
stage={transcriptionStage}
message={transcriptionMessage}
/>
<style>
.app-header {
display: flex;
align-items: center;
justify-content: space-between;
padding: 0.5rem 1rem;
background: #0f3460;
color: #e0e0e0;
}
h1 {
font-size: 1.25rem;
margin: 0;
}
.import-btn {
background: #e94560;
border: none;
color: white;
padding: 0.5rem 1rem;
border-radius: 6px;
cursor: pointer;
font-size: 0.875rem;
font-weight: 500;
}
.import-btn:hover {
background: #d63851;
}
.workspace {
display: flex;
gap: 1rem;
padding: 1rem;
height: calc(100vh - 3.5rem);
background: #0a0a23;
}
.main-content {
flex: 1;
display: flex;
flex-direction: column;
gap: 1rem;
min-width: 0;
}
.sidebar-right {
width: 300px;
display: flex;
flex-direction: column;
gap: 1rem;
flex-shrink: 0;
}
</style>
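
The comments in `handleFileImport` mark the `prompt()` call and the hand-built `asset://` URL as temporary. A sketch of the intended replacement using the Tauri v2 dialog plugin and `convertFileSrc`; this assumes `@tauri-apps/plugin-dialog` is installed and the asset protocol scope allows the chosen file:

```ts
import { open } from '@tauri-apps/plugin-dialog';
import { convertFileSrc } from '@tauri-apps/api/core';

// Returns a webview-safe URL for the picked media file, or null if cancelled.
async function pickAudioUrl(): Promise<string | null> {
  const filePath = await open({
    multiple: false,
    filters: [{ name: 'Audio/Video', extensions: ['wav', 'mp3', 'm4a', 'flac', 'mp4', 'mkv'] }],
  });
  if (typeof filePath !== 'string') return null;
  // convertFileSrc replaces the hand-assembled `asset://localhost/...` string.
  return convertFileSrc(filePath);
}
```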