Add speech-to-text via Faster Whisper container #1

Merged
jknapp merged 3 commits from feature/stt into main 2026-04-13 03:35:40 +00:00
19 changed files with 1121 additions and 2 deletions
Showing only changes of commit 532de77927 - Show all commits

View File

@@ -0,0 +1,59 @@
# Build and publish the speech-to-text (Faster Whisper) container image.
name: Build STT Container

on:
  push:
    branches: [main]
    paths:
      - "stt-container/**"
      - ".gitea/workflows/build-stt.yml"
  pull_request:
    branches: [main]
    paths:
      - "stt-container/**"
      - ".gitea/workflows/build-stt.yml"

env:
  REGISTRY: repo.anhonesthost.net
  IMAGE_NAME: cybercovellc/triple-c/triple-c-stt

jobs:
  build-stt-container:
    runs-on: ubuntu-latest
    steps:
      - name: Checkout
        uses: actions/checkout@v4
      # QEMU + Buildx enable the multi-arch (amd64/arm64) build below.
      - name: Set up QEMU
        uses: docker/setup-qemu-action@v3
      - name: Set up Docker Buildx
        uses: docker/setup-buildx-action@v3
      - name: Login to Gitea Container Registry
        uses: docker/login-action@v3
        with:
          registry: ${{ env.REGISTRY }}
          username: ${{ gitea.actor }}
          password: ${{ secrets.REGISTRY_TOKEN }}
      # Second push target: public GHCR mirror used by the app's pull path.
      - name: Login to GitHub Container Registry
        uses: docker/login-action@v3
        with:
          registry: ghcr.io
          username: shadowdao
          password: ${{ secrets.GH_PAT }}
      - name: Build and push STT container image
        uses: docker/build-push-action@v5
        with:
          context: ./stt-container
          file: ./stt-container/Dockerfile
          platforms: linux/amd64,linux/arm64
          # Only push on direct pushes to main; PR builds are build-only.
          push: ${{ gitea.event_name == 'push' }}
          tags: |
            ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}:latest
            ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}:${{ gitea.sha }}
            ghcr.io/shadowdao/triple-c-stt:latest
            ghcr.io/shadowdao/triple-c-stt:${{ gitea.sha }}
          # NOTE(review): type=gha cache needs the GitHub Actions cache
          # service; on a Gitea act_runner this may be unsupported — confirm
          # it works or drop these two lines.
          cache-from: type=gha
          cache-to: type=gha,mode=max

View File

@@ -2345,6 +2345,16 @@ version = "0.3.17"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6877bb514081ee2a7ff5ef9de3281f14a4dd4bceac4c09388074a6b5df8a139a"
[[package]]
name = "mime_guess"
version = "2.0.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f7c44f8e672c00fe5308fa235f821cb4198414e1c77935c1ab6948d3fd78550e"
dependencies = [
"mime",
"unicase",
]
[[package]]
name = "miniz_oxide"
version = "0.8.9"
@@ -3454,6 +3464,7 @@ dependencies = [
"base64 0.22.1",
"bytes",
"futures-core",
"futures-util",
"http",
"http-body",
"http-body-util",
@@ -3462,6 +3473,7 @@ dependencies = [
"hyper-util",
"js-sys",
"log",
"mime_guess",
"percent-encoding",
"pin-project-lite",
"quinn",
@@ -5053,6 +5065,12 @@ dependencies = [
"unic-common",
]
[[package]]
name = "unicase"
version = "2.9.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "dbc4bc3a9f746d862c45cb89d705aa10f187bb96c76001afab07a0d35ce60142"
[[package]]
name = "unicode-ident"
version = "1.0.24"

View File

@@ -29,7 +29,7 @@ log = "0.4"
fern = { version = "0.7", features = ["date-based"] }
tar = "0.4"
include_dir = "0.7"
reqwest = { version = "0.12", default-features = false, features = ["json", "rustls-tls"] }
reqwest = { version = "0.12", default-features = false, features = ["json", "rustls-tls", "multipart"] }
iana-time-zone = "0.1"
sha2 = "0.10"
axum = { version = "0.8", features = ["ws"] }

View File

@@ -5,6 +5,7 @@ pub mod help_commands;
pub mod mcp_commands;
pub mod project_commands;
pub mod settings_commands;
pub mod stt_commands;
pub mod terminal_commands;
pub mod update_commands;
pub mod web_terminal_commands;

View File

@@ -0,0 +1,92 @@
use tauri::{AppHandle, Emitter, State};
use crate::docker::stt;
use crate::models::app_settings::SttStatus;
use crate::AppState;
/// Tauri command: report the current STT image/container status, evaluated
/// against the persisted STT settings (port, model).
#[tauri::command]
pub async fn get_stt_status(state: State<'_, AppState>) -> Result<SttStatus, String> {
    let settings = state.settings_store.get();
    stt::get_stt_status(&settings.stt).await
}
/// Tauri command: start (creating/recreating if needed) the STT container
/// according to the persisted settings, returning the resulting status.
#[tauri::command]
pub async fn start_stt(state: State<'_, AppState>) -> Result<SttStatus, String> {
    let settings = state.settings_store.get();
    stt::ensure_stt_running(&settings.stt).await
}
/// Tauri command: stop the STT container if it is running (no-op otherwise).
#[tauri::command]
pub async fn stop_stt() -> Result<(), String> {
    stt::stop_stt_container().await
}
/// Tauri command: build the STT image locally, streaming each build output
/// line to the frontend via the "stt-build-progress" event.
#[tauri::command]
pub async fn build_stt_image(app_handle: AppHandle) -> Result<(), String> {
    stt::build_stt_image(move |msg| {
        // Emit failures are ignored: progress is best-effort UI feedback.
        let _ = app_handle.emit("stt-build-progress", &msg);
    })
    .await
}
/// Tauri command: pull the prebuilt STT image from the registry, streaming
/// progress to the frontend via the "stt-pull-progress" event.
#[tauri::command]
pub async fn pull_stt_image(app_handle: AppHandle) -> Result<(), String> {
    stt::pull_stt_image(move |msg| {
        // Emit failures are ignored: progress is best-effort UI feedback.
        let _ = app_handle.emit("stt-pull-progress", &msg);
    })
    .await
}
/// Tauri command: POST captured WAV bytes to the local STT container's
/// `/transcribe` endpoint and return the transcribed text.
///
/// Errors with a user-facing message when STT is disabled, the container is
/// unreachable, the server responds non-2xx, or the JSON has no `text` field.
#[tauri::command]
pub async fn transcribe_audio(
    audio_data: Vec<u8>,
    state: State<'_, AppState>,
) -> Result<String, String> {
    let settings = state.settings_store.get();
    if !settings.stt.enabled {
        return Err("STT is not enabled".to_string());
    }

    // Build the multipart body: the audio file plus an optional language hint.
    let endpoint = format!("http://127.0.0.1:{}/transcribe", settings.stt.port);
    let audio_part = reqwest::multipart::Part::bytes(audio_data)
        .file_name("recording.wav")
        .mime_str("audio/wav")
        .map_err(|e| format!("Failed to create multipart: {}", e))?;
    let form = match settings.stt.language {
        Some(ref lang) => reqwest::multipart::Form::new()
            .part("file", audio_part)
            .text("language", lang.clone()),
        None => reqwest::multipart::Form::new().part("file", audio_part),
    };

    let response = reqwest::Client::new()
        .post(&endpoint)
        .multipart(form)
        .send()
        .await
        .map_err(|e| {
            // A refused connection almost always means the container is down.
            if e.is_connect() {
                "STT container is not running. Start it from Settings.".to_string()
            } else {
                format!("Transcription request failed: {}", e)
            }
        })?;

    if !response.status().is_success() {
        let status = response.status();
        let body = response.text().await.unwrap_or_default();
        return Err(format!("Transcription failed ({}): {}", status, body));
    }

    let payload: serde_json::Value = response
        .json()
        .await
        .map_err(|e| format!("Failed to parse transcription response: {}", e))?;
    payload["text"]
        .as_str()
        .map(str::to_string)
        .ok_or_else(|| "No text in transcription response".to_string())
}

View File

@@ -3,7 +3,10 @@ pub mod container;
pub mod image;
pub mod exec;
pub mod network;
pub mod stt;
#[allow(unused_imports)]
pub use stt::*;
#[allow(unused_imports)]
pub use client::*;
#[allow(unused_imports)]

View File

@@ -0,0 +1,266 @@
use bollard::container::{
Config, CreateContainerOptions, ListContainersOptions, RemoveContainerOptions,
StartContainerOptions, StopContainerOptions,
};
use bollard::image::BuildImageOptions;
use bollard::models::{HostConfig, Mount, MountTypeEnum, PortBinding};
use futures_util::StreamExt;
use std::collections::HashMap;
use std::io::Write;
use super::client::get_docker;
use crate::models::app_settings::{SttSettings, SttStatus};
const STT_CONTAINER_NAME: &str = "triple-c-stt";
const STT_MODEL_VOLUME: &str = "triple-c-stt-model-cache";
const STT_REGISTRY_IMAGE: &str = "ghcr.io/shadowdao/triple-c-stt:latest";
const STT_LOCAL_IMAGE: &str = "triple-c-stt:latest";
const STT_DOCKERFILE: &str = include_str!("../../../../stt-container/Dockerfile");
const STT_SERVER: &str = include_str!("../../../../stt-container/server.py");
/// Snapshot the STT subsystem: image availability, container existence and
/// state, and the effective model/port.
pub async fn get_stt_status(settings: &SttSettings) -> Result<SttStatus, String> {
    // Either the registry image or a locally built image counts as present
    // (short-circuits after the first hit).
    let image_exists = super::image::image_exists(STT_REGISTRY_IMAGE).await.unwrap_or(false)
        || super::image::image_exists(STT_LOCAL_IMAGE).await.unwrap_or(false);

    // Prefer the model recorded on an existing container; otherwise fall
    // back to the configured model.
    let (container_exists, running, model) =
        if let Some((_, state, labeled_model)) = find_stt_container().await? {
            (true, state == "running", labeled_model)
        } else {
            (false, false, settings.model.clone())
        };

    Ok(SttStatus {
        container_exists,
        running,
        port: settings.port,
        model,
        image_exists,
    })
}
/// Look up the STT container by name. Returns `Some((id, state, model))` for
/// the first match (running or not), where `model` is read from the
/// `triple-c.stt.model` label with a "tiny" fallback, or `None` if absent.
async fn find_stt_container() -> Result<Option<(String, String, String)>, String> {
    let docker = get_docker()?;
    // Docker's name filter is a substring match; the leading '/' anchors it
    // to the canonical container name.
    let filters: HashMap<String, Vec<String>> = HashMap::from([(
        "name".to_string(),
        vec![format!("/{}", STT_CONTAINER_NAME)],
    )]);
    let containers = docker
        .list_containers(Some(ListContainersOptions {
            all: true,
            filters,
            ..Default::default()
        }))
        .await
        .map_err(|e| format!("Failed to list containers: {}", e))?;
    if let Some(container) = containers.first() {
        let id = container.id.clone().unwrap_or_default();
        let state = container.state.clone().unwrap_or_default();
        // The model is recorded as a label at creation time (not read from
        // the container's env, despite being passed as WHISPER_MODEL there).
        let model = container
            .labels
            .as_ref()
            .and_then(|l| l.get("triple-c.stt.model"))
            .cloned()
            .unwrap_or_else(|| "tiny".to_string());
        return Ok(Some((id, state, model)));
    }
    Ok(None)
}
/// Create (but do not start) the STT container: binds container port 9876 to
/// 127.0.0.1:<settings.port>, mounts a named volume for the HuggingFace model
/// cache, and records model/port as labels for later inspection.
async fn create_stt_container(settings: &SttSettings) -> Result<String, String> {
    let docker = get_docker()?;
    // Try local image first, fall back to registry
    let image = if super::image::image_exists(STT_LOCAL_IMAGE).await.unwrap_or(false) {
        STT_LOCAL_IMAGE.to_string()
    } else if super::image::image_exists(STT_REGISTRY_IMAGE).await.unwrap_or(false) {
        STT_REGISTRY_IMAGE.to_string()
    } else {
        return Err("STT image not found. Please build or pull the image first.".to_string());
    };
    // Bind only on loopback so the STT server is not reachable from the LAN.
    let port_binding = PortBinding {
        host_ip: Some("127.0.0.1".to_string()),
        host_port: Some(settings.port.to_string()),
    };
    let mut port_bindings = HashMap::new();
    port_bindings.insert(
        "9876/tcp".to_string(),
        Some(vec![port_binding]),
    );
    let host_config = HostConfig {
        port_bindings: Some(port_bindings),
        // Persist downloaded Whisper models across container recreations.
        mounts: Some(vec![Mount {
            target: Some("/root/.cache/huggingface".to_string()),
            source: Some(STT_MODEL_VOLUME.to_string()),
            typ: Some(MountTypeEnum::VOLUME),
            ..Default::default()
        }]),
        ..Default::default()
    };
    // Labels let find_stt_container() recover the settings the container was
    // created with.
    let mut labels = HashMap::new();
    labels.insert(
        "triple-c.stt.model".to_string(),
        settings.model.clone(),
    );
    labels.insert(
        "triple-c.stt.port".to_string(),
        settings.port.to_string(),
    );
    let config = Config {
        image: Some(image),
        env: Some(vec![format!("WHISPER_MODEL={}", settings.model)]),
        host_config: Some(host_config),
        labels: Some(labels),
        ..Default::default()
    };
    let options = CreateContainerOptions {
        name: STT_CONTAINER_NAME,
        ..Default::default()
    };
    let response = docker
        .create_container(Some(options), config)
        .await
        .map_err(|e| format!("Failed to create STT container: {}", e))?;
    Ok(response.id)
}
/// Ensure an STT container matching `settings` is running:
/// - model changed   -> stop/remove, then recreate and start
/// - already running -> no-op
/// - exists, stopped -> start it
/// - missing         -> create and start
pub async fn ensure_stt_running(settings: &SttSettings) -> Result<SttStatus, String> {
    let docker = get_docker()?;
    // Check if container exists and if settings match
    if let Some((id, state, model)) = find_stt_container().await? {
        // NOTE(review): only the model is compared here — a changed port does
        // not trigger a recreate, so the old port binding persists. Confirm
        // whether that is intended.
        let needs_recreate = model != settings.model;
        if needs_recreate {
            // Settings changed, recreate
            if state == "running" {
                docker
                    .stop_container(&id, None::<StopContainerOptions>)
                    .await
                    .map_err(|e| format!("Failed to stop STT container: {}", e))?;
            }
            docker
                .remove_container(
                    &id,
                    Some(RemoveContainerOptions {
                        force: true,
                        ..Default::default()
                    }),
                )
                .await
                .map_err(|e| format!("Failed to remove STT container: {}", e))?;
        } else if state == "running" {
            return get_stt_status(settings).await;
        } else {
            // Container exists but stopped, start it
            docker
                .start_container(&id, None::<StartContainerOptions<String>>)
                .await
                .map_err(|e| format!("Failed to start STT container: {}", e))?;
            return get_stt_status(settings).await;
        }
    }
    // Create and start new container
    let id = create_stt_container(settings).await?;
    docker
        .start_container(&id, None::<StartContainerOptions<String>>)
        .await
        .map_err(|e| format!("Failed to start STT container: {}", e))?;
    get_stt_status(settings).await
}
/// Stop the STT container when one exists and is currently running; a
/// missing or already-stopped container is not an error.
pub async fn stop_stt_container() -> Result<(), String> {
    let docker = get_docker()?;
    match find_stt_container().await? {
        Some((id, state, _)) if state == "running" => docker
            .stop_container(&id, None::<StopContainerOptions>)
            .await
            .map_err(|e| format!("Failed to stop STT container: {}", e)),
        _ => Ok(()),
    }
}
/// Pull the prebuilt STT image from the public registry, forwarding progress
/// lines to `on_progress`.
pub async fn pull_stt_image<F>(on_progress: F) -> Result<(), String>
where
    F: Fn(String) + Send + 'static,
{
    super::image::pull_image(STT_REGISTRY_IMAGE, on_progress).await
}
/// Build the STT image locally from the embedded Dockerfile/server.py,
/// tagging it as the local image and streaming each Docker build output line
/// to `on_progress`. Fails on the first error reported by the daemon.
pub async fn build_stt_image<F>(on_progress: F) -> Result<(), String>
where
    F: Fn(String) + Send + 'static,
{
    let docker = get_docker()?;
    let tar_bytes = create_stt_build_context()
        .map_err(|e| format!("Failed to create STT build context: {}", e))?;
    // rm/forcerm: always clean up intermediate containers, even on failure.
    let options = BuildImageOptions {
        t: STT_LOCAL_IMAGE,
        rm: true,
        forcerm: true,
        ..Default::default()
    };
    let mut stream = docker.build_image(options, None, Some(tar_bytes.into()));
    while let Some(result) = stream.next().await {
        match result {
            Ok(output) => {
                if let Some(stream) = output.stream {
                    on_progress(stream);
                }
                if let Some(error) = output.error {
                    return Err(format!("Build error: {}", error));
                }
            }
            Err(e) => return Err(format!("Build stream error: {}", e)),
        }
    }
    Ok(())
}
/// Assemble an in-memory tar archive containing the Dockerfile and server.py
/// that are baked into the binary, for use as a Docker build context.
fn create_stt_build_context() -> Result<Vec<u8>, std::io::Error> {
    let mut buf = Vec::new();
    {
        // The builder borrows `buf`; the scope ends the borrow before return.
        let mut archive = tar::Builder::new(&mut buf);

        let mut dockerfile_header = tar::Header::new_gnu();
        dockerfile_header.set_size(STT_DOCKERFILE.len() as u64);
        dockerfile_header.set_mode(0o644);
        dockerfile_header.set_cksum();
        archive.append_data(&mut dockerfile_header, "Dockerfile", STT_DOCKERFILE.as_bytes())?;

        let mut server_header = tar::Header::new_gnu();
        server_header.set_size(STT_SERVER.len() as u64);
        server_header.set_mode(0o644);
        server_header.set_cksum();
        archive.append_data(&mut server_header, "server.py", STT_SERVER.as_bytes())?;

        // finish() writes the terminating blocks; everything is in `buf` now.
        archive.finish()?;
    }
    // (Removed the previous `let _ = buf.flush();` — `Write::flush` on a
    // `Vec<u8>` is a documented no-op, so it did nothing.)
    Ok(buf)
}

View File

@@ -122,6 +122,8 @@ pub fn run() {
if let Some(server) = server_guard.take() {
server.stop();
}
// Stop STT container
let _ = docker::stt::stop_stt_container().await;
// Close all exec sessions
state.exec_manager.close_all_sessions().await;
});
@@ -181,6 +183,13 @@ pub fn run() {
commands::web_terminal_commands::stop_web_terminal,
commands::web_terminal_commands::get_web_terminal_status,
commands::web_terminal_commands::regenerate_web_terminal_token,
// STT
commands::stt_commands::get_stt_status,
commands::stt_commands::start_stt,
commands::stt_commands::stop_stt,
commands::stt_commands::build_stt_image,
commands::stt_commands::pull_stt_image,
commands::stt_commands::transcribe_audio,
])
.run(tauri::generate_context!())
.expect("error while running tauri application");

View File

@@ -76,6 +76,48 @@ pub struct AppSettings {
pub dismissed_image_digest: Option<String>,
#[serde(default)]
pub web_terminal: WebTerminalSettings,
#[serde(default)]
pub stt: SttSettings,
}
/// Serde default for `SttSettings::model` — the smallest Whisper model.
fn default_stt_model() -> String {
    String::from("tiny")
}
/// Serde default for `SttSettings::port` — the loopback port the STT HTTP
/// server is published on.
fn default_stt_port() -> u16 {
    9876
}
/// User-configurable speech-to-text settings, persisted with the app
/// settings. All fields have serde defaults so older settings files load.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct SttSettings {
    /// Master switch; the terminal mic button is only rendered when true.
    #[serde(default)]
    pub enabled: bool,
    /// Whisper model name (e.g. "tiny", "small", "medium").
    #[serde(default = "default_stt_model")]
    pub model: String,
    /// Host port the STT container's HTTP server is bound to.
    #[serde(default = "default_stt_port")]
    pub port: u16,
    /// Optional fixed transcription language; `None` means auto-detect.
    #[serde(default)]
    pub language: Option<String>,
}
impl Default for SttSettings {
fn default() -> Self {
Self {
enabled: false,
model: default_stt_model(),
port: 9876,
language: None,
}
}
}
/// Snapshot of STT container/image state, returned to the frontend by the
/// STT Tauri commands.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct SttStatus {
    /// A container with the STT name exists (running or stopped).
    pub container_exists: bool,
    pub running: bool,
    /// Port taken from settings (not read back from the container).
    pub port: u16,
    pub model: String,
    /// Either the locally built or the registry image is present.
    pub image_exists: bool,
}
fn default_web_terminal_port() -> u16 {
@@ -120,6 +162,7 @@ impl Default for AppSettings {
default_microphone: None,
dismissed_image_digest: None,
web_terminal: WebTerminalSettings::default(),
stt: SttSettings::default(),
}
}
}

View File

@@ -9,6 +9,7 @@ import { detectHostTimezone } from "../../lib/tauri-commands";
import type { EnvVar } from "../../lib/types";
import Tooltip from "../ui/Tooltip";
import WebTerminalSettings from "./WebTerminalSettings";
import SttSettings from "./SttSettings";
export default function SettingsPanel() {
const { appSettings, saveSettings } = useSettings();
@@ -120,6 +121,9 @@ export default function SettingsPanel() {
{/* Web Terminal */}
<WebTerminalSettings />
{/* Speech to Text */}
<SttSettings />
{/* Updates section */}
<div>
<label className="block text-sm font-medium mb-2">Updates<Tooltip text="Check for new versions of the Triple-C app and container image." /></label>

View File

@@ -0,0 +1,249 @@
import { useState, useEffect } from "react";
import { useSettings } from "../../hooks/useSettings";
import { getSttStatus, startStt, stopStt, pullSttImage, buildSttImage } from "../../lib/tauri-commands";
import { listen } from "@tauri-apps/api/event";
import type { SttStatus } from "../../lib/types";
import Tooltip from "../ui/Tooltip";
// Settings-panel section for speech-to-text: enable/disable, model, port and
// language configuration, plus STT container/image lifecycle controls.
export default function SttSettings() {
  const { appSettings, saveSettings } = useSettings();
  const [status, setStatus] = useState<SttStatus | null>(null);
  const [loading, setLoading] = useState(false);
  const [pulling, setPulling] = useState(false);
  const [building, setBuilding] = useState(false);
  // Last progress line from a pull/build, or an error message.
  const [buildLog, setBuildLog] = useState<string | null>(null);
  // Local draft values; committed to settings on input blur.
  const [model, setModel] = useState(appSettings?.stt?.model ?? "tiny");
  const [port, setPort] = useState(String(appSettings?.stt?.port ?? 9876));
  const [language, setLanguage] = useState(appSettings?.stt?.language ?? "");

  // Re-sync drafts when the persisted settings change elsewhere.
  useEffect(() => {
    setModel(appSettings?.stt?.model ?? "tiny");
    setPort(String(appSettings?.stt?.port ?? 9876));
    setLanguage(appSettings?.stt?.language ?? "");
  }, [appSettings?.stt?.model, appSettings?.stt?.port, appSettings?.stt?.language]);

  // One-time status fetch on mount.
  useEffect(() => {
    refreshStatus();
  }, []);

  const refreshStatus = () => {
    getSttStatus().then(setStatus).catch(console.error);
  };

  const handleToggleEnabled = async () => {
    if (!appSettings) return;
    const newEnabled = !appSettings.stt.enabled;
    await saveSettings({
      ...appSettings,
      stt: { ...appSettings.stt, enabled: newEnabled },
    });
  };

  const handleSaveModel = async () => {
    if (!appSettings) return;
    await saveSettings({
      ...appSettings,
      stt: { ...appSettings.stt, model },
    });
  };

  const handleSavePort = async () => {
    if (!appSettings) return;
    // Silently ignore out-of-range input; the draft keeps the typed value.
    const portNum = parseInt(port, 10);
    if (isNaN(portNum) || portNum < 1 || portNum > 65535) return;
    await saveSettings({
      ...appSettings,
      stt: { ...appSettings.stt, port: portNum },
    });
  };

  const handleSaveLanguage = async () => {
    if (!appSettings) return;
    // Empty string is stored as null (= auto-detect).
    await saveSettings({
      ...appSettings,
      stt: { ...appSettings.stt, language: language || null },
    });
  };

  // Start or stop the STT container depending on current status.
  const handleStartStop = async () => {
    setLoading(true);
    try {
      if (status?.running) {
        await stopStt();
      } else {
        await startStt();
      }
      refreshStatus();
    } catch (e) {
      console.error("STT toggle failed:", e);
    } finally {
      setLoading(false);
    }
  };

  // Pull the prebuilt image, mirroring progress events into buildLog.
  const handlePull = async () => {
    setPulling(true);
    setBuildLog(null);
    const unlisten = await listen<string>("stt-pull-progress", (event) => {
      setBuildLog(event.payload);
    });
    try {
      await pullSttImage();
      refreshStatus();
    } catch (e) {
      console.error("STT image pull failed:", e);
      setBuildLog(`Error: ${e}`);
    } finally {
      setPulling(false);
      unlisten();
    }
  };

  // Build the image locally, mirroring progress events into buildLog.
  const handleBuild = async () => {
    setBuilding(true);
    setBuildLog(null);
    const unlisten = await listen<string>("stt-build-progress", (event) => {
      setBuildLog(event.payload);
    });
    try {
      await buildSttImage();
      refreshStatus();
    } catch (e) {
      console.error("STT image build failed:", e);
      setBuildLog(`Error: ${e}`);
    } finally {
      setBuilding(false);
      unlisten();
    }
  };

  return (
    <div>
      <label className="block text-sm font-medium mb-1">
        Speech to Text
        <Tooltip text="Transcribe speech to text using Faster Whisper in a Docker container. Adds a mic button to the terminal." />
      </label>
      <p className="text-xs text-[var(--text-secondary)] mb-2">
        Click the mic button in the terminal to dictate text via speech recognition.
      </p>
      <div className="space-y-2">
        {/* Enable toggle */}
        <div className="flex items-center gap-2">
          <button
            onClick={handleToggleEnabled}
            className={`px-2 py-0.5 text-xs rounded transition-colors ${
              appSettings?.stt?.enabled
                ? "bg-[var(--success)] text-white"
                : "bg-[var(--bg-primary)] border border-[var(--border-color)] text-[var(--text-secondary)]"
            }`}
          >
            {appSettings?.stt?.enabled ? "ON" : "OFF"}
          </button>
          <span className="text-xs text-[var(--text-secondary)]">
            {appSettings?.stt?.enabled ? "Enabled" : "Disabled"}
          </span>
        </div>
        {/* Detailed controls are only shown when the feature is enabled */}
        {appSettings?.stt?.enabled && (
          <>
            {/* Model selector */}
            <div>
              <label className="block text-xs text-[var(--text-secondary)] mb-1">Model</label>
              <select
                value={model}
                onChange={(e) => setModel(e.target.value)}
                onBlur={handleSaveModel}
                className="w-full px-2 py-1 text-sm bg-[var(--bg-primary)] border border-[var(--border-color)] rounded focus:outline-none focus:border-[var(--accent)]"
              >
                <option value="tiny">Tiny (fastest, ~75MB)</option>
                <option value="small">Small (balanced, ~500MB)</option>
                <option value="medium">Medium (most accurate, ~1.5GB)</option>
              </select>
            </div>
            {/* Port */}
            <div>
              <label className="block text-xs text-[var(--text-secondary)] mb-1">Port</label>
              <input
                type="number"
                value={port}
                onChange={(e) => setPort(e.target.value)}
                onBlur={handleSavePort}
                min={1}
                max={65535}
                className="w-full px-2 py-1 text-sm bg-[var(--bg-primary)] border border-[var(--border-color)] rounded focus:outline-none focus:border-[var(--accent)]"
              />
            </div>
            {/* Language */}
            <div>
              <label className="block text-xs text-[var(--text-secondary)] mb-1">Language (optional)</label>
              <input
                type="text"
                value={language}
                onChange={(e) => setLanguage(e.target.value)}
                onBlur={handleSaveLanguage}
                placeholder="Auto-detect"
                className="w-full px-2 py-1 text-sm bg-[var(--bg-primary)] border border-[var(--border-color)] rounded focus:outline-none focus:border-[var(--accent)]"
              />
            </div>
            {/* Container status + controls */}
            <div className="pt-1">
              <label className="block text-xs text-[var(--text-secondary)] mb-1">STT Container</label>
              <div className="flex items-center gap-2 flex-wrap">
                <span className="text-xs text-[var(--text-secondary)]">
                  {status?.image_exists
                    ? status.running
                      ? `Running (port ${status.port}, model: ${status.model})`
                      : status.container_exists
                        ? "Stopped"
                        : "Image ready"
                    : "No image"}
                </span>
                {status?.image_exists && (
                  <button
                    onClick={handleStartStop}
                    disabled={loading}
                    className={`px-2 py-0.5 text-xs rounded transition-colors ${
                      status?.running
                        ? "text-[var(--error)] hover:bg-[var(--bg-primary)]"
                        : "text-[var(--success)] hover:bg-[var(--bg-primary)]"
                    }`}
                  >
                    {loading ? "..." : status?.running ? "Stop" : "Start"}
                  </button>
                )}
              </div>
              {/* Image actions */}
              <div className="flex items-center gap-2 mt-2">
                <button
                  onClick={handlePull}
                  disabled={pulling || building}
                  className="px-3 py-1 text-xs bg-[var(--bg-primary)] border border-[var(--border-color)] rounded hover:bg-[var(--border-color)] disabled:opacity-50 transition-colors"
                >
                  {pulling ? "Pulling..." : "Pull Image"}
                </button>
                <button
                  onClick={handleBuild}
                  disabled={pulling || building}
                  className="px-3 py-1 text-xs bg-[var(--bg-primary)] border border-[var(--border-color)] rounded hover:bg-[var(--border-color)] disabled:opacity-50 transition-colors"
                >
                  {building ? "Building..." : "Build Locally"}
                </button>
              </div>
              {buildLog && (
                <pre className="mt-2 text-[10px] text-[var(--text-secondary)] bg-[var(--bg-primary)] border border-[var(--border-color)] rounded px-2 py-1 max-h-20 overflow-y-auto whitespace-pre-wrap">
                  {buildLog}
                </pre>
              )}
            </div>
          </>
        )}
      </div>
    </div>
  );
}

View File

@@ -0,0 +1,107 @@
import { useCallback, useEffect, useRef, useState } from "react";
import { useSTT } from "../../hooks/useSTT";
import * as commands from "../../lib/tauri-commands";
interface Props {
  // Terminal session that receives the transcribed text.
  sessionId: string;
  // Sends data (here: the transcription output) to the session's input.
  sendInput: (sessionId: string, data: string) => Promise<void>;
}
// Floating microphone button overlaid on the terminal: click to start
// recording, click again to transcribe; right-click cancels a recording.
export default function SttButton({ sessionId, sendInput }: Props) {
  const { state, error, toggle, cancelRecording } = useSTT(sessionId, sendInput);
  const [elapsed, setElapsed] = useState(0);
  const timerRef = useRef<ReturnType<typeof setInterval> | null>(null);

  // Track recording duration
  useEffect(() => {
    if (state === "recording") {
      setElapsed(0);
      timerRef.current = setInterval(() => setElapsed((e) => e + 1), 1000);
    } else {
      if (timerRef.current) {
        clearInterval(timerRef.current);
        timerRef.current = null;
      }
    }
    return () => {
      if (timerRef.current) clearInterval(timerRef.current);
    };
  }, [state]);

  const handleClick = useCallback(async () => {
    // Auto-start STT container if not running
    if (state === "idle") {
      try {
        const status = await commands.getSttStatus();
        if (!status.running) {
          await commands.startStt();
        }
      } catch {
        // Container start failed, toggle will still attempt transcription
      }
    }
    await toggle();
  }, [state, toggle]);

  const handleContextMenu = useCallback(
    (e: React.MouseEvent) => {
      e.preventDefault();
      if (state === "recording") {
        cancelRecording();
      }
    },
    [state, cancelRecording],
  );

  // mm:ss label for the elapsed-recording badge.
  const formatTime = (seconds: number) => {
    const m = Math.floor(seconds / 60);
    const s = seconds % 60;
    return `${m}:${s.toString().padStart(2, "0")}`;
  };

  return (
    <div className="absolute bottom-4 left-4 z-50 flex items-center gap-2">
      <button
        onClick={handleClick}
        onContextMenu={handleContextMenu}
        disabled={state === "transcribing"}
        className={`w-8 h-8 rounded-full flex items-center justify-center transition-all cursor-pointer ${
          state === "recording"
            ? "bg-[#f85149] text-white shadow-lg animate-pulse"
            : state === "transcribing"
              ? "bg-[#1f2937] text-[#58a6ff] border border-[#30363d] opacity-80"
              : "bg-[#1f2937]/80 text-[#8b949e] border border-[#30363d] hover:text-[#e6edf3] hover:bg-[#2d3748]"
        }`}
        title={
          state === "recording"
            ? "Click to stop and transcribe (right-click to cancel)"
            : state === "transcribing"
              ? "Transcribing..."
              : "Speech to text"
        }
      >
        {/* Spinner while transcribing, mic glyph otherwise */}
        {state === "transcribing" ? (
          <svg className="w-4 h-4 animate-spin" viewBox="0 0 24 24" fill="none">
            <circle cx="12" cy="12" r="10" stroke="currentColor" strokeWidth="2" opacity="0.25" />
            <path d="M12 2a10 10 0 0 1 10 10" stroke="currentColor" strokeWidth="2" strokeLinecap="round" />
          </svg>
        ) : (
          <svg className="w-4 h-4" viewBox="0 0 24 24" fill="currentColor">
            <path d="M12 14c1.66 0 3-1.34 3-3V5c0-1.66-1.34-3-3-3S9 3.34 9 5v6c0 1.66 1.34 3 3 3z" />
            <path d="M17 11c0 2.76-2.24 5-5 5s-5-2.24-5-5H5c0 3.53 2.61 6.43 6 6.92V21h2v-3.08c3.39-.49 6-3.39 6-6.92h-2z" />
          </svg>
        )}
      </button>
      {state === "recording" && (
        <span className="text-xs text-[#f85149] font-mono bg-[#1f2937] px-2 py-0.5 rounded border border-[#30363d]">
          {formatTime(elapsed)}
        </span>
      )}
      {state === "error" && error && (
        <span className="text-xs text-[#f85149] bg-[#1f2937] px-2 py-0.5 rounded border border-[#30363d] max-w-[200px] truncate">
          {error}
        </span>
      )}
    </div>
  );
}

View File

@@ -7,6 +7,7 @@ import { openUrl } from "@tauri-apps/plugin-opener";
import "@xterm/xterm/css/xterm.css";
import { useTerminal } from "../../hooks/useTerminal";
import { useAppState } from "../../store/appState";
import SttButton from "./SttButton";
import { awsSsoRefresh } from "../../lib/tauri-commands";
import { UrlDetector } from "../../lib/urlDetector";
import UrlToast from "./UrlToast";
@@ -25,6 +26,7 @@ export default function TerminalView({ sessionId, active }: Props) {
const detectorRef = useRef<UrlDetector | null>(null);
const { sendInput, pasteImage, resize, onOutput, onExit } = useTerminal();
const setTerminalHasSelection = useAppState(s => s.setTerminalHasSelection);
const sttEnabled = useAppState(s => s.appSettings?.stt?.enabled);
const ssoBufferRef = useRef("");
const ssoTriggeredRef = useRef(false);
@@ -424,6 +426,8 @@ export default function TerminalView({ sessionId, active }: Props) {
>
{isAutoFollow ? "▼ Following" : "▽ Paused"}
</button>
{/* STT mic button - bottom left */}
{sttEnabled && <SttButton sessionId={sessionId} sendInput={sendInput} />}
{/* Jump to Current - bottom right, when scrolled up */}
{!isAtBottom && (
<button

145
app/src/hooks/useSTT.ts Normal file
View File

@@ -0,0 +1,145 @@
import { useCallback, useRef, useState } from "react";
import * as commands from "../lib/tauri-commands";
import { encodeWav } from "../lib/wav";
import { useAppState } from "../store/appState";
export type SttState = "idle" | "recording" | "transcribing" | "error";
/**
 * Speech-to-text recorder hook: captures microphone audio through an
 * AudioWorklet at 16 kHz mono, encodes it as WAV, sends it to the STT
 * backend and types the resulting text into the given terminal session.
 */
export function useSTT(sessionId: string, sendInput: (sessionId: string, data: string) => Promise<void>) {
  const [state, setState] = useState<SttState>("idle");
  const [error, setError] = useState<string | null>(null);
  const audioContextRef = useRef<AudioContext | null>(null);
  const streamRef = useRef<MediaStream | null>(null);
  const workletRef = useRef<AudioWorkletNode | null>(null);
  const chunksRef = useRef<Int16Array[]>([]);
  const appSettings = useAppState((s) => s.appSettings);
  const deviceId = appSettings?.default_microphone;

  // Disconnect the worklet, close the AudioContext and release the mic.
  // Idempotent; shared by stop, cancel and the start-failure path.
  const releaseAudio = useCallback(async () => {
    workletRef.current?.disconnect();
    workletRef.current = null;
    if (audioContextRef.current) {
      await audioContextRef.current.close().catch(() => {});
      audioContextRef.current = null;
    }
    if (streamRef.current) {
      streamRef.current.getTracks().forEach((t) => t.stop());
      streamRef.current = null;
    }
  }, []);

  const startRecording = useCallback(async () => {
    if (state === "recording" || state === "transcribing") return;
    setState("recording");
    setError(null);
    chunksRef.current = [];
    try {
      const audioConstraints: MediaTrackConstraints = {
        channelCount: 1,
        echoCancellation: true,
        noiseSuppression: true,
        autoGainControl: true,
      };
      if (deviceId) {
        audioConstraints.deviceId = { exact: deviceId };
      }
      const stream = await navigator.mediaDevices.getUserMedia({ audio: audioConstraints });
      streamRef.current = stream;
      const audioContext = new AudioContext({ sampleRate: 16000 });
      audioContextRef.current = audioContext;
      await audioContext.audioWorklet.addModule("/audio-capture-processor.js");
      const source = audioContext.createMediaStreamSource(stream);
      const processor = new AudioWorkletNode(audioContext, "audio-capture-processor");
      workletRef.current = processor;
      // The worklet posts ArrayBuffers of Int16 PCM back to the main thread.
      processor.port.onmessage = (event: MessageEvent<ArrayBuffer>) => {
        chunksRef.current.push(new Int16Array(event.data));
      };
      source.connect(processor);
      processor.connect(audioContext.destination);
    } catch (e) {
      // Bug fix: release any partially-acquired resources (mic stream,
      // AudioContext) so the OS recording indicator doesn't stay on after a
      // failed start (e.g. the worklet module fails to load).
      await releaseAudio();
      const msg = e instanceof Error ? e.message : String(e);
      setError(msg);
      setState("error");
    }
  }, [state, deviceId, releaseAudio]);

  const stopRecording = useCallback(async () => {
    if (state !== "recording") return;
    // Stop audio capture
    await releaseAudio();
    // Concatenate PCM chunks
    const chunks = chunksRef.current;
    chunksRef.current = [];
    if (chunks.length === 0) {
      setState("idle");
      return;
    }
    const totalLength = chunks.reduce((sum, c) => sum + c.length, 0);
    const pcm = new Int16Array(totalLength);
    let offset = 0;
    for (const chunk of chunks) {
      pcm.set(chunk, offset);
      offset += chunk.length;
    }
    // Encode to WAV and transcribe
    setState("transcribing");
    try {
      const wavBlob = encodeWav(pcm, 16000);
      const wavBuffer = await wavBlob.arrayBuffer();
      const audioData = Array.from(new Uint8Array(wavBuffer));
      const text = await commands.transcribeAudio(audioData);
      if (text) {
        await sendInput(sessionId, text);
      }
      setState("idle");
    } catch (e) {
      const msg = e instanceof Error ? e.message : String(e);
      setError(msg);
      setState("error");
      // Reset to idle after a brief delay so the UI shows the error
      setTimeout(() => setState("idle"), 3000);
    }
  }, [state, sessionId, sendInput, releaseAudio]);

  // Discard captured audio without transcribing.
  const cancelRecording = useCallback(async () => {
    await releaseAudio();
    chunksRef.current = [];
    setState("idle");
    setError(null);
  }, [releaseAudio]);

  const toggle = useCallback(async () => {
    if (state === "recording") {
      await stopRecording();
    } else if (state === "idle" || state === "error") {
      await startRecording();
    }
  }, [state, startRecording, stopRecording]);

  return { state, error, startRecording, stopRecording, cancelRecording, toggle };
}

View File

@@ -1,5 +1,5 @@
import { invoke } from "@tauri-apps/api/core";
import type { Project, ProjectPath, ContainerInfo, SiblingContainer, AppSettings, UpdateInfo, ImageUpdateInfo, McpServer, FileEntry, WebTerminalInfo } from "./types";
import type { Project, ProjectPath, ContainerInfo, SiblingContainer, AppSettings, UpdateInfo, ImageUpdateInfo, McpServer, FileEntry, WebTerminalInfo, SttStatus } from "./types";
// Docker
export const checkDocker = () => invoke<boolean>("check_docker");
@@ -98,3 +98,12 @@ export const getWebTerminalStatus = () =>
invoke<WebTerminalInfo>("get_web_terminal_status");
export const regenerateWebTerminalToken = () =>
invoke<WebTerminalInfo>("regenerate_web_terminal_token");
// STT
// Typed wrappers around the Rust STT Tauri commands (stt_commands.rs).
// Each rejects with a human-readable string on failure.
export const getSttStatus = () => invoke<SttStatus>("get_stt_status");
export const startStt = () => invoke<SttStatus>("start_stt");
export const stopStt = () => invoke<void>("stop_stt");
export const buildSttImage = () => invoke<void>("build_stt_image");
export const pullSttImage = () => invoke<void>("pull_stt_image");
// audioData: WAV file bytes as a plain number array (serialized over IPC).
export const transcribeAudio = (audioData: number[]) =>
  invoke<string>("transcribe_audio", { audioData });

View File

@@ -119,6 +119,22 @@ export interface AppSettings {
default_microphone: string | null;
dismissed_image_digest: string | null;
web_terminal: WebTerminalSettings;
stt: SttSettings;
}
// Mirrors the Rust `SttSettings` struct (models/app_settings.rs).
export interface SttSettings {
  enabled: boolean;
  model: string;
  port: number;
  // null = auto-detect language.
  language: string | null;
}
// Mirrors the Rust `SttStatus` struct: container/image state for the UI.
export interface SttStatus {
  container_exists: boolean;
  running: boolean;
  port: number;
  model: string;
  image_exists: boolean;
}
export interface WebTerminalSettings {

40
app/src/lib/wav.ts Normal file
View File

@@ -0,0 +1,40 @@
/**
 * Encode mono 16-bit PCM samples as a WAV (RIFF) file blob.
 *
 * @param samples - raw PCM data, one Int16 per sample
 * @param sampleRate - sample rate in Hz (e.g. 16000)
 */
export function encodeWav(samples: Int16Array, sampleRate: number): Blob {
  const HEADER_SIZE = 44;
  const dataBytes = samples.length * 2;
  const buffer = new ArrayBuffer(HEADER_SIZE + dataBytes);
  const view = new DataView(buffer);

  // RIFF container header.
  writeString(view, 0, "RIFF");
  view.setUint32(4, 36 + dataBytes, true); // file size minus the first 8 bytes
  writeString(view, 8, "WAVE");

  // "fmt " sub-chunk: uncompressed PCM, mono, 16-bit.
  writeString(view, 12, "fmt ");
  view.setUint32(16, 16, true); // sub-chunk size
  view.setUint16(20, 1, true); // audio format: PCM
  view.setUint16(22, 1, true); // channel count: mono
  view.setUint32(24, sampleRate, true);
  view.setUint32(28, sampleRate * 2, true); // byte rate = rate * block align
  view.setUint16(32, 2, true); // block align = channels * bytes per sample
  view.setUint16(34, 16, true); // bits per sample

  // "data" sub-chunk followed by the raw little-endian samples.
  writeString(view, 36, "data");
  view.setUint32(40, dataBytes, true);
  new Int16Array(buffer, HEADER_SIZE).set(samples);

  return new Blob([buffer], { type: "audio/wav" });
}

// Write an ASCII tag byte-by-byte at the given offset.
function writeString(view: DataView, offset: number, str: string) {
  for (let i = 0; i < str.length; i++) {
    view.setUint8(offset + i, str.charCodeAt(i));
  }
}

13
stt-container/Dockerfile Normal file
View File

@@ -0,0 +1,13 @@
FROM python:3.11-slim

# CPU-only inference stack; faster-whisper bundles CTranslate2, so no extra
# system packages are required.
RUN pip install --no-cache-dir \
    faster-whisper \
    fastapi \
    uvicorn[standard] \
    python-multipart

# Run from /app so the module imports as plain `server`, instead of relying
# on `/` being on sys.path and `app` resolving as an implicit namespace
# package (the previous `app.server:app` form).
WORKDIR /app
COPY server.py /app/server.py

EXPOSE 9876

CMD ["uvicorn", "server:app", "--host", "0.0.0.0", "--port", "9876"]

41
stt-container/server.py Normal file
View File

@@ -0,0 +1,41 @@
"""Minimal Faster-Whisper transcription server for the Triple-C STT container."""
import os
import tempfile

from faster_whisper import WhisperModel
from fastapi import FastAPI, File, Form, UploadFile
from fastapi.responses import JSONResponse

app = FastAPI()

# Loaded once at startup; None until the startup hook has run.
model: WhisperModel | None = None


@app.on_event("startup")
def load_model():
    """Load the Whisper model named by the WHISPER_MODEL env var (default: tiny)."""
    global model
    model_size = os.environ.get("WHISPER_MODEL", "tiny")
    # int8 on CPU keeps memory usage low at a small accuracy cost.
    model = WhisperModel(model_size, device="cpu", compute_type="int8")


@app.post("/transcribe")
async def transcribe(
    file: UploadFile = File(...),
    # Fix: was `language: str = Form(None)` — None is not a valid str default;
    # declare the form field as optional so validation and docs are correct.
    language: str | None = Form(None),
):
    """Transcribe an uploaded audio file; `language` optionally pins the language."""
    if model is None:
        return JSONResponse(status_code=503, content={"error": "Model not loaded"})
    # faster-whisper wants a file path, so spill the upload to a temp file.
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=True) as tmp:
        tmp.write(await file.read())
        tmp.flush()
        kwargs = {}
        if language:
            kwargs["language"] = language
        # `segments` is lazy; join inside the `with` so the file still exists
        # while decoding runs.
        segments, info = model.transcribe(tmp.name, **kwargs)
        text = " ".join(s.text for s in segments).strip()
    return {"text": text, "language": info.language}


@app.get("/health")
def health():
    """Liveness probe used by the desktop app."""
    return {"status": "ok"}