Add speech-to-text feature using Faster Whisper container
Some checks failed
Build App / compute-version (pull_request) Successful in 3s
Build App / build-macos (pull_request) Successful in 2m28s
Build STT Container / build-stt-container (pull_request) Successful in 3m18s
Build App / build-windows (pull_request) Successful in 4m40s
Build App / build-linux (pull_request) Failing after 1m46s
Build App / create-tag (pull_request) Has been skipped
Build App / sync-to-github (pull_request) Has been skipped

Adds a mic button to the terminal UI that captures speech, transcribes
it via a Faster Whisper sidecar container, and injects the text into
the terminal input. Includes a settings panel for model selection
(tiny/small/medium), port configuration, and container lifecycle management.

- stt-container/: Dockerfile + FastAPI server for Whisper transcription
- Rust backend: STT container management, transcribe_audio IPC command
- Frontend: useSTT hook, SttButton, SttSettings, WAV encoder
- CI: Gitea Actions workflow for multi-arch STT image builds

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-12 20:02:39 -07:00
parent 8301fd3690
commit 532de77927
19 changed files with 1121 additions and 2 deletions


@@ -1,5 +1,5 @@
 import { invoke } from "@tauri-apps/api/core";
-import type { Project, ProjectPath, ContainerInfo, SiblingContainer, AppSettings, UpdateInfo, ImageUpdateInfo, McpServer, FileEntry, WebTerminalInfo } from "./types";
+import type { Project, ProjectPath, ContainerInfo, SiblingContainer, AppSettings, UpdateInfo, ImageUpdateInfo, McpServer, FileEntry, WebTerminalInfo, SttStatus } from "./types";
 
 // Docker
 export const checkDocker = () => invoke<boolean>("check_docker");
@@ -98,3 +98,12 @@ export const getWebTerminalStatus = () =>
   invoke<WebTerminalInfo>("get_web_terminal_status");
 export const regenerateWebTerminalToken = () =>
   invoke<WebTerminalInfo>("regenerate_web_terminal_token");
+
+// STT
+export const getSttStatus = () => invoke<SttStatus>("get_stt_status");
+export const startStt = () => invoke<SttStatus>("start_stt");
+export const stopStt = () => invoke<void>("stop_stt");
+export const buildSttImage = () => invoke<void>("build_stt_image");
+export const pullSttImage = () => invoke<void>("pull_stt_image");
+export const transcribeAudio = (audioData: number[]) =>
+  invoke<string>("transcribe_audio", { audioData });
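
A rough sketch (not part of the diff) of how these wrappers might drive the container lifecycle before recording starts. The helper name ensureSttRunning, the "./api" module path, and the build-then-pull fallback ordering are assumptions for illustration; the wrapper names and SttStatus fields come from the diff above.

import { getSttStatus, startStt, buildSttImage, pullSttImage } from "./api";
import type { SttStatus } from "./types";

// Hypothetical helper: make sure the STT sidecar is ready before capturing audio.
export async function ensureSttRunning(): Promise<SttStatus> {
  let status = await getSttStatus();
  if (!status.image_exists) {
    // Assumption: try a local image build first, fall back to pulling a prebuilt image.
    await buildSttImage().catch(() => pullSttImage());
  }
  if (!status.running) {
    status = await startStt(); // creates/starts the Faster Whisper container
  }
  return status;
}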


@@ -119,6 +119,22 @@ export interface AppSettings {
   default_microphone: string | null;
   dismissed_image_digest: string | null;
   web_terminal: WebTerminalSettings;
+  stt: SttSettings;
 }
+
+export interface SttSettings {
+  enabled: boolean;
+  model: string;
+  port: number;
+  language: string | null;
+}
+
+export interface SttStatus {
+  container_exists: boolean;
+  running: boolean;
+  port: number;
+  model: string;
+  image_exists: boolean;
+}
 
 export interface WebTerminalSettings {
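
For context, a hedged sketch of how the SttStatus shape might map onto mic-button state in the UI; the SttUiState type, the state names, and the mapping are illustrative assumptions, not taken from the commit.

import type { SttStatus } from "./types";

// Hypothetical UI-facing state derived from SttStatus (names are illustrative).
export type SttUiState = "ready" | "stopped" | "needs-image";

export function sttUiState(status: SttStatus): SttUiState {
  if (!status.image_exists) return "needs-image"; // settings panel can offer build/pull
  if (!status.running) return "stopped";          // container can be (re)started
  return "ready";                                 // transcription requests can be sent
}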

app/src/lib/wav.ts (new file)

@@ -0,0 +1,40 @@
/**
* Encode PCM Int16 samples into a WAV file blob.
* Assumes mono channel at the given sample rate.
*/
export function encodeWav(samples: Int16Array, sampleRate: number): Blob {
const byteLength = samples.length * 2;
const buffer = new ArrayBuffer(44 + byteLength);
const view = new DataView(buffer);
// RIFF header
writeString(view, 0, "RIFF");
view.setUint32(4, 36 + byteLength, true);
writeString(view, 8, "WAVE");
// fmt chunk
writeString(view, 12, "fmt ");
view.setUint32(16, 16, true); // chunk size
view.setUint16(20, 1, true); // PCM format
view.setUint16(22, 1, true); // mono
view.setUint32(24, sampleRate, true);
view.setUint32(28, sampleRate * 2, true); // byte rate
view.setUint16(32, 2, true); // block align
view.setUint16(34, 16, true); // bits per sample
// data chunk
writeString(view, 36, "data");
view.setUint32(40, byteLength, true);
// PCM samples
const output = new Int16Array(buffer, 44);
output.set(samples);
return new Blob([buffer], { type: "audio/wav" });
}
function writeString(view: DataView, offset: number, str: string) {
for (let i = 0; i < str.length; i++) {
view.setUint8(offset + i, str.charCodeAt(i));
}
}
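
A hedged sketch of how this encoder might feed the transcribe_audio command. The number[] payload and the encodeWav/transcribeAudio signatures follow the diffs above, but the floatToInt16 and transcribe helpers, and the "./api" module path, are assumptions about how the useSTT hook stitches things together.

import { encodeWav } from "./wav";
import { transcribeAudio } from "./api";

// Convert Float32 samples in [-1, 1] (e.g. from an AudioWorklet) to PCM Int16.
function floatToInt16(samples: Float32Array): Int16Array {
  const out = new Int16Array(samples.length);
  for (let i = 0; i < samples.length; i++) {
    const s = Math.max(-1, Math.min(1, samples[i]));
    out[i] = s < 0 ? s * 0x8000 : s * 0x7fff;
  }
  return out;
}

// Encode the recording as WAV and hand the raw bytes to the Rust backend for transcription.
export async function transcribe(samples: Float32Array, sampleRate: number): Promise<string> {
  const wav = encodeWav(floatToInt16(samples), sampleRate);
  const bytes = new Uint8Array(await wav.arrayBuffer());
  return transcribeAudio(Array.from(bytes));
}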