Add speech-to-text feature using Faster Whisper container
Some checks failed
Build App / compute-version (pull_request) Successful in 3s
Build App / build-macos (pull_request) Successful in 2m28s
Build STT Container / build-stt-container (pull_request) Successful in 3m18s
Build App / build-windows (pull_request) Successful in 4m40s
Build App / build-linux (pull_request) Failing after 1m46s
Build App / create-tag (pull_request) Has been skipped
Build App / sync-to-github (pull_request) Has been skipped
Some checks failed
Build App / compute-version (pull_request) Successful in 3s
Build App / build-macos (pull_request) Successful in 2m28s
Build STT Container / build-stt-container (pull_request) Successful in 3m18s
Build App / build-windows (pull_request) Successful in 4m40s
Build App / build-linux (pull_request) Failing after 1m46s
Build App / create-tag (pull_request) Has been skipped
Build App / sync-to-github (pull_request) Has been skipped
Adds a mic button to the terminal UI that captures speech, transcribes it via a Faster Whisper sidecar container, and injects the resulting text into the terminal input. Includes a settings panel for model selection (tiny/small/medium), port configuration, and container lifecycle management.

- stt-container/: Dockerfile + FastAPI server for Whisper transcription
- Rust backend: STT container management, transcribe_audio IPC command
- Frontend: useSTT hook, SttButton, SttSettings, WAV encoder
- CI: Gitea Actions workflow for multi-arch STT image builds

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
145
app/src/hooks/useSTT.ts
Normal file
145
app/src/hooks/useSTT.ts
Normal file
@@ -0,0 +1,145 @@
|
||||
import { useCallback, useRef, useState } from "react";
|
||||
import * as commands from "../lib/tauri-commands";
|
||||
import { encodeWav } from "../lib/wav";
|
||||
import { useAppState } from "../store/appState";
|
||||
|
||||
export type SttState = "idle" | "recording" | "transcribing" | "error";
|
||||
|
||||
export function useSTT(sessionId: string, sendInput: (sessionId: string, data: string) => Promise<void>) {
|
||||
const [state, setState] = useState<SttState>("idle");
|
||||
const [error, setError] = useState<string | null>(null);
|
||||
|
||||
const audioContextRef = useRef<AudioContext | null>(null);
|
||||
const streamRef = useRef<MediaStream | null>(null);
|
||||
const workletRef = useRef<AudioWorkletNode | null>(null);
|
||||
const chunksRef = useRef<Int16Array[]>([]);
|
||||
|
||||
const appSettings = useAppState((s) => s.appSettings);
|
||||
const deviceId = appSettings?.default_microphone;
|
||||
|
||||
const startRecording = useCallback(async () => {
|
||||
if (state === "recording" || state === "transcribing") return;
|
||||
setState("recording");
|
||||
setError(null);
|
||||
chunksRef.current = [];
|
||||
|
||||
try {
|
||||
const audioConstraints: MediaTrackConstraints = {
|
||||
channelCount: 1,
|
||||
echoCancellation: true,
|
||||
noiseSuppression: true,
|
||||
autoGainControl: true,
|
||||
};
|
||||
if (deviceId) {
|
||||
audioConstraints.deviceId = { exact: deviceId };
|
||||
}
|
||||
|
||||
const stream = await navigator.mediaDevices.getUserMedia({ audio: audioConstraints });
|
||||
streamRef.current = stream;
|
||||
|
||||
const audioContext = new AudioContext({ sampleRate: 16000 });
|
||||
audioContextRef.current = audioContext;
|
||||
|
||||
await audioContext.audioWorklet.addModule("/audio-capture-processor.js");
|
||||
|
||||
const source = audioContext.createMediaStreamSource(stream);
|
||||
const processor = new AudioWorkletNode(audioContext, "audio-capture-processor");
|
||||
workletRef.current = processor;
|
||||
|
||||
processor.port.onmessage = (event: MessageEvent<ArrayBuffer>) => {
|
||||
chunksRef.current.push(new Int16Array(event.data));
|
||||
};
|
||||
|
||||
source.connect(processor);
|
||||
processor.connect(audioContext.destination);
|
||||
} catch (e) {
|
||||
const msg = e instanceof Error ? e.message : String(e);
|
||||
setError(msg);
|
||||
setState("error");
|
||||
}
|
||||
}, [state, deviceId]);
|
||||
|
||||
const stopRecording = useCallback(async () => {
|
||||
if (state !== "recording") return;
|
||||
|
||||
// Stop audio capture
|
||||
workletRef.current?.disconnect();
|
||||
workletRef.current = null;
|
||||
|
||||
if (audioContextRef.current) {
|
||||
await audioContextRef.current.close().catch(() => {});
|
||||
audioContextRef.current = null;
|
||||
}
|
||||
|
||||
if (streamRef.current) {
|
||||
streamRef.current.getTracks().forEach((t) => t.stop());
|
||||
streamRef.current = null;
|
||||
}
|
||||
|
||||
// Concatenate PCM chunks
|
||||
const chunks = chunksRef.current;
|
||||
chunksRef.current = [];
|
||||
|
||||
if (chunks.length === 0) {
|
||||
setState("idle");
|
||||
return;
|
||||
}
|
||||
|
||||
const totalLength = chunks.reduce((sum, c) => sum + c.length, 0);
|
||||
const pcm = new Int16Array(totalLength);
|
||||
let offset = 0;
|
||||
for (const chunk of chunks) {
|
||||
pcm.set(chunk, offset);
|
||||
offset += chunk.length;
|
||||
}
|
||||
|
||||
// Encode to WAV and transcribe
|
||||
setState("transcribing");
|
||||
try {
|
||||
const wavBlob = encodeWav(pcm, 16000);
|
||||
const wavBuffer = await wavBlob.arrayBuffer();
|
||||
const audioData = Array.from(new Uint8Array(wavBuffer));
|
||||
|
||||
const text = await commands.transcribeAudio(audioData);
|
||||
if (text) {
|
||||
await sendInput(sessionId, text);
|
||||
}
|
||||
setState("idle");
|
||||
} catch (e) {
|
||||
const msg = e instanceof Error ? e.message : String(e);
|
||||
setError(msg);
|
||||
setState("error");
|
||||
// Reset to idle after a brief delay so the UI shows the error
|
||||
setTimeout(() => setState("idle"), 3000);
|
||||
}
|
||||
}, [state, sessionId, sendInput]);
|
||||
|
||||
const cancelRecording = useCallback(async () => {
|
||||
workletRef.current?.disconnect();
|
||||
workletRef.current = null;
|
||||
|
||||
if (audioContextRef.current) {
|
||||
await audioContextRef.current.close().catch(() => {});
|
||||
audioContextRef.current = null;
|
||||
}
|
||||
|
||||
if (streamRef.current) {
|
||||
streamRef.current.getTracks().forEach((t) => t.stop());
|
||||
streamRef.current = null;
|
||||
}
|
||||
|
||||
chunksRef.current = [];
|
||||
setState("idle");
|
||||
setError(null);
|
||||
}, []);
|
||||
|
||||
const toggle = useCallback(async () => {
|
||||
if (state === "recording") {
|
||||
await stopRecording();
|
||||
} else if (state === "idle" || state === "error") {
|
||||
await startRecording();
|
||||
}
|
||||
}, [state, startRecording, stopRecording]);
|
||||
|
||||
return { state, error, startRecording, stopRecording, cancelRecording, toggle };
|
||||
}
|
||||
Reference in New Issue
Block a user