Add speech-to-text feature using Faster Whisper container
Some checks failed
Build App / compute-version (pull_request) Successful in 3s
Build App / build-macos (pull_request) Successful in 2m28s
Build STT Container / build-stt-container (pull_request) Successful in 3m18s
Build App / build-windows (pull_request) Successful in 4m40s
Build App / build-linux (pull_request) Failing after 1m46s
Build App / create-tag (pull_request) Has been skipped
Build App / sync-to-github (pull_request) Has been skipped
Adds a mic button to the terminal UI that captures speech, transcribes it via a Faster Whisper sidecar container, and injects the text into the terminal input. Includes a settings panel for model selection (tiny/small/medium), port configuration, and container lifecycle management.

- stt-container/: Dockerfile + FastAPI server for Whisper transcription
- Rust backend: STT container management, transcribe_audio IPC command
- Frontend: useSTT hook, SttButton, SttSettings, WAV encoder
- CI: Gitea Actions workflow for multi-arch STT image builds

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
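The frontend's WAV encoder itself is not shown in this diff, but the server expects a `.wav` upload, so the format matters. A minimal sketch of producing the kind of mono 16-bit PCM WAV such an endpoint can consume, using only Python's stdlib `wave` module (the `encode_wav` helper, the 16 kHz rate, and the test tone are illustrative assumptions, not the app's actual encoder):

```python
import io
import math
import struct
import wave

def encode_wav(samples: list[float], sample_rate: int = 16000) -> bytes:
    """Encode float samples in [-1.0, 1.0] as a mono 16-bit PCM WAV file."""
    buf = io.BytesIO()
    with wave.open(buf, "wb") as w:
        w.setnchannels(1)           # mono
        w.setsampwidth(2)           # 16-bit samples
        w.setframerate(sample_rate)
        pcm = b"".join(
            struct.pack("<h", int(max(-1.0, min(1.0, s)) * 32767))
            for s in samples
        )
        w.writeframes(pcm)
    return buf.getvalue()

# Example: half a second of a 440 Hz tone at 16 kHz
tone = [math.sin(2 * math.pi * 440 * t / 16000) for t in range(8000)]
wav_bytes = encode_wav(tone)
```

The resulting bytes can be posted as the `file` field of the multipart form that `/transcribe` accepts.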
stt-container/server.py (new file, 41 lines)
@@ -0,0 +1,41 @@
```python
import os
import tempfile

from faster_whisper import WhisperModel
from fastapi import FastAPI, File, Form, UploadFile
from fastapi.responses import JSONResponse

app = FastAPI()

model: WhisperModel | None = None


@app.on_event("startup")
def load_model():
    global model
    model_size = os.environ.get("WHISPER_MODEL", "tiny")
    model = WhisperModel(model_size, device="cpu", compute_type="int8")


@app.post("/transcribe")
async def transcribe(
    file: UploadFile = File(...),
    language: str = Form(None),
):
    if model is None:
        return JSONResponse(status_code=503, content={"error": "Model not loaded"})

    with tempfile.NamedTemporaryFile(suffix=".wav", delete=True) as tmp:
        tmp.write(await file.read())
        tmp.flush()
        kwargs = {}
        if language:
            kwargs["language"] = language
        segments, info = model.transcribe(tmp.name, **kwargs)
        text = " ".join(s.text for s in segments).strip()

    return {"text": text, "language": info.language}


@app.get("/health")
def health():
    return {"status": "ok"}
```