Add speech-to-text feature using Faster Whisper container
Some checks failed
Build App / compute-version (pull_request) Successful in 3s
Build App / build-macos (pull_request) Successful in 2m28s
Build STT Container / build-stt-container (pull_request) Successful in 3m18s
Build App / build-windows (pull_request) Successful in 4m40s
Build App / build-linux (pull_request) Failing after 1m46s
Build App / create-tag (pull_request) Has been skipped
Build App / sync-to-github (pull_request) Has been skipped

Adds a mic button to the terminal UI that captures speech, transcribes
it via a Faster Whisper sidecar container, and injects the text into
the terminal input. Includes settings panel for model selection
(tiny/small/medium), port config, and container lifecycle management.

- stt-container/: Dockerfile + FastAPI server for Whisper transcription
- Rust backend: STT container management, transcribe_audio IPC command
- Frontend: useSTT hook, SttButton, SttSettings, WAV encoder
- CI: Gitea Actions workflow for multi-arch STT image builds

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
2026-04-12 20:02:39 -07:00
parent 8301fd3690
commit 532de77927
19 changed files with 1121 additions and 2 deletions

View File

@@ -9,6 +9,7 @@ import { detectHostTimezone } from "../../lib/tauri-commands";
import type { EnvVar } from "../../lib/types";
import Tooltip from "../ui/Tooltip";
import WebTerminalSettings from "./WebTerminalSettings";
import SttSettings from "./SttSettings";
export default function SettingsPanel() {
const { appSettings, saveSettings } = useSettings();
@@ -120,6 +121,9 @@ export default function SettingsPanel() {
{/* Web Terminal */}
<WebTerminalSettings />
{/* Speech to Text */}
<SttSettings />
{/* Updates section */}
<div>
<label className="block text-sm font-medium mb-2">Updates<Tooltip text="Check for new versions of the Triple-C app and container image." /></label>

View File

@@ -0,0 +1,249 @@
import { useState, useEffect } from "react";
import { useSettings } from "../../hooks/useSettings";
import { getSttStatus, startStt, stopStt, pullSttImage, buildSttImage } from "../../lib/tauri-commands";
import { listen } from "@tauri-apps/api/event";
import type { SttStatus } from "../../lib/types";
import Tooltip from "../ui/Tooltip";
/**
 * Settings section for the speech-to-text (STT) feature.
 *
 * Renders an enable toggle and, while STT is enabled, editors for the model,
 * port and language (each persisted through `saveSettings` when the field
 * loses focus), the status reported by `getSttStatus`, a start/stop control,
 * and buttons that pull or locally build the STT image while showing the most
 * recent progress message.
 */
export default function SttSettings() {
  const { appSettings, saveSettings } = useSettings();
  const [status, setStatus] = useState<SttStatus | null>(null);
  const [loading, setLoading] = useState(false);
  const [pulling, setPulling] = useState(false);
  const [building, setBuilding] = useState(false);
  const [buildLog, setBuildLog] = useState<string | null>(null);
  const [model, setModel] = useState(appSettings?.stt?.model ?? "tiny");
  const [port, setPort] = useState(String(appSettings?.stt?.port ?? 9876));
  const [language, setLanguage] = useState(appSettings?.stt?.language ?? "");

  /**
   * Fetches the current STT status into local state. Failures are logged and
   * never rejected, so callers can await this without their own error handling.
   */
  const refreshStatus = (): Promise<void> =>
    getSttStatus().then(setStatus).catch(console.error);

  // Keep the editable drafts in sync whenever the persisted settings change.
  useEffect(() => {
    setModel(appSettings?.stt?.model ?? "tiny");
    setPort(String(appSettings?.stt?.port ?? 9876));
    setLanguage(appSettings?.stt?.language ?? "");
  }, [appSettings?.stt?.model, appSettings?.stt?.port, appSettings?.stt?.language]);

  // Load the status once when the section mounts.
  useEffect(() => {
    void refreshStatus();
  }, []);

  /** Flips the persisted `stt.enabled` flag. */
  const handleToggleEnabled = async () => {
    if (!appSettings) return;
    const newEnabled = !appSettings.stt.enabled;
    await saveSettings({
      ...appSettings,
      stt: { ...appSettings.stt, enabled: newEnabled },
    });
  };

  /** Persists the currently selected model. */
  const handleSaveModel = async () => {
    if (!appSettings) return;
    await saveSettings({
      ...appSettings,
      stt: { ...appSettings.stt, model },
    });
  };

  /**
   * Persists the port draft when it is a whole number in 1-65535.
   * Any other input is discarded and the field is reset to the saved port, so
   * the input never displays a value that was not actually stored.
   */
  const handleSavePort = async () => {
    if (!appSettings) return;
    // Number() (unlike parseInt) does not silently truncate "1.5" or "1e3" to 1.
    const portNum = Number(port);
    if (!Number.isInteger(portNum) || portNum < 1 || portNum > 65535) {
      setPort(String(appSettings?.stt?.port ?? 9876));
      return;
    }
    await saveSettings({
      ...appSettings,
      stt: { ...appSettings.stt, port: portNum },
    });
  };

  /** Persists the language draft; an empty field is stored as `null`. */
  const handleSaveLanguage = async () => {
    if (!appSettings) return;
    await saveSettings({
      ...appSettings,
      stt: { ...appSettings.stt, language: language || null },
    });
  };

  /**
   * Stops STT when the last known status says it is running, otherwise starts
   * it. The status is refreshed afterwards even if the command failed, and the
   * busy indicator stays on until that refresh has completed.
   */
  const handleStartStop = async () => {
    setLoading(true);
    try {
      if (status?.running) {
        await stopStt();
      } else {
        await startStt();
      }
    } catch (e) {
      console.error("STT toggle failed:", e);
    } finally {
      await refreshStatus();
      setLoading(false);
    }
  };

  /**
   * Shared driver for the pull and build actions.
   *
   * @param progressEvent - Event name whose string payloads are shown in the log area.
   * @param task - The image operation to run.
   * @param setBusy - Setter for the busy flag belonging to this action.
   * @param failureLabel - Prefix used when logging a failure to the console.
   */
  const runImageTask = async (
    progressEvent: string,
    task: () => Promise<unknown>,
    setBusy: (busy: boolean) => void,
    failureLabel: string,
  ) => {
    setBusy(true);
    setBuildLog(null);
    let unlisten: (() => void) | undefined;
    try {
      // Subscribing happens inside the try block so that a failed subscription
      // still reaches `finally` and cannot leave the busy flag stuck on.
      unlisten = await listen<string>(progressEvent, (event) => {
        // Each payload replaces the previous one; only the latest line is shown.
        setBuildLog(event.payload);
      });
      await task();
      await refreshStatus();
    } catch (e) {
      console.error(failureLabel, e);
      setBuildLog(`Error: ${e}`);
    } finally {
      unlisten?.();
      setBusy(false);
    }
  };

  /** Pulls the STT image, streaming "stt-pull-progress" messages into the log. */
  const handlePull = () =>
    runImageTask("stt-pull-progress", pullSttImage, setPulling, "STT image pull failed:");

  /** Builds the STT image locally, streaming "stt-build-progress" messages into the log. */
  const handleBuild = () =>
    runImageTask("stt-build-progress", buildSttImage, setBuilding, "STT image build failed:");

  // Human-readable summary of the last known container/image status.
  let statusLabel = "No image";
  if (status?.image_exists) {
    if (status.running) {
      statusLabel = `Running (port ${status.port}, model: ${status.model})`;
    } else if (status.container_exists) {
      statusLabel = "Stopped";
    } else {
      statusLabel = "Image ready";
    }
  }

  return (
    <div>
      <label className="block text-sm font-medium mb-1">
        Speech to Text
        <Tooltip text="Transcribe speech to text using Faster Whisper in a Docker container. Adds a mic button to the terminal." />
      </label>
      <p className="text-xs text-[var(--text-secondary)] mb-2">
        Click the mic button in the terminal to dictate text via speech recognition.
      </p>
      <div className="space-y-2">
        {/* Enable toggle */}
        <div className="flex items-center gap-2">
          <button
            onClick={handleToggleEnabled}
            className={`px-2 py-0.5 text-xs rounded transition-colors ${
              appSettings?.stt?.enabled
                ? "bg-[var(--success)] text-white"
                : "bg-[var(--bg-primary)] border border-[var(--border-color)] text-[var(--text-secondary)]"
            }`}
          >
            {appSettings?.stt?.enabled ? "ON" : "OFF"}
          </button>
          <span className="text-xs text-[var(--text-secondary)]">
            {appSettings?.stt?.enabled ? "Enabled" : "Disabled"}
          </span>
        </div>
        {appSettings?.stt?.enabled && (
          <>
            {/* Model selector */}
            <div>
              <label className="block text-xs text-[var(--text-secondary)] mb-1">Model</label>
              <select
                value={model}
                onChange={(e) => setModel(e.target.value)}
                onBlur={handleSaveModel}
                className="w-full px-2 py-1 text-sm bg-[var(--bg-primary)] border border-[var(--border-color)] rounded focus:outline-none focus:border-[var(--accent)]"
              >
                <option value="tiny">Tiny (fastest, ~75MB)</option>
                <option value="small">Small (balanced, ~500MB)</option>
                <option value="medium">Medium (most accurate, ~1.5GB)</option>
              </select>
            </div>
            {/* Port */}
            <div>
              <label className="block text-xs text-[var(--text-secondary)] mb-1">Port</label>
              <input
                type="number"
                value={port}
                onChange={(e) => setPort(e.target.value)}
                onBlur={handleSavePort}
                min={1}
                max={65535}
                className="w-full px-2 py-1 text-sm bg-[var(--bg-primary)] border border-[var(--border-color)] rounded focus:outline-none focus:border-[var(--accent)]"
              />
            </div>
            {/* Language */}
            <div>
              <label className="block text-xs text-[var(--text-secondary)] mb-1">Language (optional)</label>
              <input
                type="text"
                value={language}
                onChange={(e) => setLanguage(e.target.value)}
                onBlur={handleSaveLanguage}
                placeholder="Auto-detect"
                className="w-full px-2 py-1 text-sm bg-[var(--bg-primary)] border border-[var(--border-color)] rounded focus:outline-none focus:border-[var(--accent)]"
              />
            </div>
            {/* Container status + controls */}
            <div className="pt-1">
              <label className="block text-xs text-[var(--text-secondary)] mb-1">STT Container</label>
              <div className="flex items-center gap-2 flex-wrap">
                <span className="text-xs text-[var(--text-secondary)]">{statusLabel}</span>
                {status?.image_exists && (
                  <button
                    onClick={handleStartStop}
                    disabled={loading}
                    className={`px-2 py-0.5 text-xs rounded transition-colors ${
                      status?.running
                        ? "text-[var(--error)] hover:bg-[var(--bg-primary)]"
                        : "text-[var(--success)] hover:bg-[var(--bg-primary)]"
                    }`}
                  >
                    {loading ? "..." : status?.running ? "Stop" : "Start"}
                  </button>
                )}
              </div>
              {/* Image actions */}
              <div className="flex items-center gap-2 mt-2">
                <button
                  onClick={handlePull}
                  disabled={pulling || building}
                  className="px-3 py-1 text-xs bg-[var(--bg-primary)] border border-[var(--border-color)] rounded hover:bg-[var(--border-color)] disabled:opacity-50 transition-colors"
                >
                  {pulling ? "Pulling..." : "Pull Image"}
                </button>
                <button
                  onClick={handleBuild}
                  disabled={pulling || building}
                  className="px-3 py-1 text-xs bg-[var(--bg-primary)] border border-[var(--border-color)] rounded hover:bg-[var(--border-color)] disabled:opacity-50 transition-colors"
                >
                  {building ? "Building..." : "Build Locally"}
                </button>
              </div>
              {buildLog && (
                <pre className="mt-2 text-[10px] text-[var(--text-secondary)] bg-[var(--bg-primary)] border border-[var(--border-color)] rounded px-2 py-1 max-h-20 overflow-y-auto whitespace-pre-wrap">
                  {buildLog}
                </pre>
              )}
            </div>
          </>
        )}
      </div>
    </div>
  );
}

View File

@@ -0,0 +1,107 @@
import { useCallback, useEffect, useRef, useState } from "react";
import { useSTT } from "../../hooks/useSTT";
import * as commands from "../../lib/tauri-commands";
/** Props accepted by the SttButton component. */
interface Props {
  /** Identifier of the terminal session; forwarded unchanged to the useSTT hook. */
  sessionId: string;
  /**
   * Async callback forwarded unchanged to the useSTT hook.
   * NOTE(review): presumably writes `data` into the given session's terminal
   * input — confirm against useSTT and useTerminal.
   */
  sendInput: (sessionId: string, data: string) => Promise<void>;
}
/** Formats a whole number of seconds as `m:ss` (seconds zero-padded to two digits). */
const formatTime = (seconds: number) => {
  const m = Math.floor(seconds / 60);
  const s = seconds % 60;
  return `${m}:${s.toString().padStart(2, "0")}`;
};

/**
 * Floating microphone button for a terminal session.
 *
 * A click calls `toggle` from the useSTT hook; when the hook is idle the
 * button first checks the STT status and tries to start STT if it is not
 * running. Right-clicking while recording calls `cancelRecording`. While
 * recording an elapsed-time badge is shown, and in the error state the hook's
 * error message is shown next to the button.
 *
 * @param sessionId - Session identifier handed to useSTT.
 * @param sendInput - Input callback handed to useSTT.
 */
export default function SttButton({ sessionId, sendInput }: Props) {
  const { state, error, toggle, cancelRecording } = useSTT(sessionId, sendInput);
  const [elapsed, setElapsed] = useState(0);
  // True while a click is still waiting on the status check / auto-start.
  const startingRef = useRef(false);

  // Track recording duration: restart the counter at 0 whenever recording
  // begins and tick once per second until the state changes or we unmount.
  useEffect(() => {
    if (state !== "recording") return undefined;
    setElapsed(0);
    const intervalId = setInterval(() => setElapsed((e) => e + 1), 1000);
    return () => clearInterval(intervalId);
  }, [state]);

  const handleClick = useCallback(async () => {
    // `state` stays "idle" while the status check / start is in flight, so
    // without this guard every extra click would queue another start attempt
    // and another toggle() call.
    if (startingRef.current) return;

    // Auto-start STT container if not running
    if (state === "idle") {
      startingRef.current = true;
      try {
        const status = await commands.getSttStatus();
        if (!status.running) {
          await commands.startStt();
        }
      } catch (e) {
        // Best effort: toggle() below is still attempted, but leave a trace
        // so a failing auto-start can be diagnosed.
        console.warn("STT auto-start failed:", e);
      } finally {
        startingRef.current = false;
      }
    }
    await toggle();
  }, [state, toggle]);

  const handleContextMenu = useCallback(
    (e: React.MouseEvent) => {
      // Always suppress the native context menu on the button.
      e.preventDefault();
      if (state === "recording") {
        cancelRecording();
      }
    },
    [state, cancelRecording],
  );

  // Used both as the hover tooltip and as the accessible name of the
  // icon-only button.
  const label =
    state === "recording"
      ? "Click to stop and transcribe (right-click to cancel)"
      : state === "transcribing"
        ? "Transcribing..."
        : "Speech to text";

  return (
    <div className="absolute bottom-4 left-4 z-50 flex items-center gap-2">
      <button
        type="button"
        onClick={handleClick}
        onContextMenu={handleContextMenu}
        disabled={state === "transcribing"}
        className={`w-8 h-8 rounded-full flex items-center justify-center transition-all cursor-pointer ${
          state === "recording"
            ? "bg-[#f85149] text-white shadow-lg animate-pulse"
            : state === "transcribing"
              ? "bg-[#1f2937] text-[#58a6ff] border border-[#30363d] opacity-80"
              : "bg-[#1f2937]/80 text-[#8b949e] border border-[#30363d] hover:text-[#e6edf3] hover:bg-[#2d3748]"
        }`}
        title={label}
        aria-label={label}
      >
        {state === "transcribing" ? (
          <svg className="w-4 h-4 animate-spin" viewBox="0 0 24 24" fill="none">
            <circle cx="12" cy="12" r="10" stroke="currentColor" strokeWidth="2" opacity="0.25" />
            <path d="M12 2a10 10 0 0 1 10 10" stroke="currentColor" strokeWidth="2" strokeLinecap="round" />
          </svg>
        ) : (
          <svg className="w-4 h-4" viewBox="0 0 24 24" fill="currentColor">
            <path d="M12 14c1.66 0 3-1.34 3-3V5c0-1.66-1.34-3-3-3S9 3.34 9 5v6c0 1.66 1.34 3 3 3z" />
            <path d="M17 11c0 2.76-2.24 5-5 5s-5-2.24-5-5H5c0 3.53 2.61 6.43 6 6.92V21h2v-3.08c3.39-.49 6-3.39 6-6.92h-2z" />
          </svg>
        )}
      </button>
      {state === "recording" && (
        <span className="text-xs text-[#f85149] font-mono bg-[#1f2937] px-2 py-0.5 rounded border border-[#30363d]">
          {formatTime(elapsed)}
        </span>
      )}
      {state === "error" && error && (
        <span className="text-xs text-[#f85149] bg-[#1f2937] px-2 py-0.5 rounded border border-[#30363d] max-w-[200px] truncate">
          {error}
        </span>
      )}
    </div>
  );
}

View File

@@ -7,6 +7,7 @@ import { openUrl } from "@tauri-apps/plugin-opener";
import "@xterm/xterm/css/xterm.css";
import { useTerminal } from "../../hooks/useTerminal";
import { useAppState } from "../../store/appState";
import SttButton from "./SttButton";
import { awsSsoRefresh } from "../../lib/tauri-commands";
import { UrlDetector } from "../../lib/urlDetector";
import UrlToast from "./UrlToast";
@@ -25,6 +26,7 @@ export default function TerminalView({ sessionId, active }: Props) {
const detectorRef = useRef<UrlDetector | null>(null);
const { sendInput, pasteImage, resize, onOutput, onExit } = useTerminal();
const setTerminalHasSelection = useAppState(s => s.setTerminalHasSelection);
const sttEnabled = useAppState(s => s.appSettings?.stt?.enabled);
const ssoBufferRef = useRef("");
const ssoTriggeredRef = useRef(false);
@@ -424,6 +426,8 @@ export default function TerminalView({ sessionId, active }: Props) {
>
{isAutoFollow ? "▼ Following" : "▽ Paused"}
</button>
{/* STT mic button - bottom left */}
{sttEnabled && <SttButton sessionId={sessionId} sendInput={sendInput} />}
{/* Jump to Current - bottom right, when scrolled up */}
{!isAtBottom && (
<button