feat: add voice mode support via mic passthrough to container
Some checks failed
Build App / build-macos (push) Successful in 2m21s
Build App / build-windows (push) Successful in 3m24s
Build App / sync-to-github (push) Has been cancelled
Build App / build-linux (push) Has been cancelled
Build Container / build-container (push) Successful in 54s
Enables Claude Code's /voice command inside Docker containers by
capturing microphone audio in the Tauri webview and streaming it
into the container via a FIFO pipe.
Container: fake rec/arecord shims read PCM from a FIFO instead of
a real mic. Audio bridge exec writes PCM from Tauri into the FIFO.
Frontend: getUserMedia() + AudioWorklet captures 16kHz mono PCM
and streams it to the container via invoke("send_audio_data").
UI: "Mic Off/On" toggle button in the terminal view.
Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -6,6 +6,7 @@ import { WebLinksAddon } from "@xterm/addon-web-links";
|
||||
import { openUrl } from "@tauri-apps/plugin-opener";
|
||||
import "@xterm/xterm/css/xterm.css";
|
||||
import { useTerminal } from "../../hooks/useTerminal";
|
||||
import { useVoice } from "../../hooks/useVoice";
|
||||
import { UrlDetector } from "../../lib/urlDetector";
|
||||
import UrlToast from "./UrlToast";
|
||||
|
||||
@@ -23,6 +24,8 @@ export default function TerminalView({ sessionId, active }: Props) {
|
||||
const detectorRef = useRef<UrlDetector | null>(null);
|
||||
const { sendInput, pasteImage, resize, onOutput, onExit } = useTerminal();
|
||||
|
||||
const voice = useVoice(sessionId);
|
||||
|
||||
const [detectedUrl, setDetectedUrl] = useState<string | null>(null);
|
||||
const [imagePasteMsg, setImagePasteMsg] = useState<string | null>(null);
|
||||
const [isAtBottom, setIsAtBottom] = useState(true);
|
||||
@@ -200,6 +203,7 @@ export default function TerminalView({ sessionId, active }: Props) {
|
||||
try { webglRef.current?.dispose(); } catch { /* may already be disposed */ }
|
||||
webglRef.current = null;
|
||||
term.dispose();
|
||||
voice.stop();
|
||||
};
|
||||
}, [sessionId]); // eslint-disable-line react-hooks/exhaustive-deps
|
||||
|
||||
@@ -284,6 +288,32 @@ export default function TerminalView({ sessionId, active }: Props) {
|
||||
{imagePasteMsg}
|
||||
</div>
|
||||
)}
|
||||
<button
|
||||
onClick={voice.toggle}
|
||||
title={
|
||||
voice.state === "active"
|
||||
? "Voice active — click to stop"
|
||||
: voice.error
|
||||
? `Voice error: ${voice.error}`
|
||||
: "Enable voice input for /voice mode"
|
||||
}
|
||||
className={`absolute bottom-4 left-4 z-50 px-3 py-1.5 rounded-md text-xs font-medium border shadow-lg transition-colors cursor-pointer ${
|
||||
voice.state === "active"
|
||||
? "bg-[#1a3a2a] text-[#3fb950] border-[#238636] hover:bg-[#243b2a]"
|
||||
: voice.state === "starting"
|
||||
? "bg-[#1f2937] text-[#d29922] border-[#30363d] opacity-75"
|
||||
: voice.state === "error"
|
||||
? "bg-[#3a1a1a] text-[#ff7b72] border-[#da3633] hover:bg-[#4a2020]"
|
||||
: "bg-[#1f2937] text-[#b1bac4] border-[#30363d] hover:bg-[#2d3748] hover:text-[#e6edf3]"
|
||||
}`}
|
||||
disabled={voice.state === "starting"}
|
||||
>
|
||||
{voice.state === "active"
|
||||
? "Mic On"
|
||||
: voice.state === "starting"
|
||||
? "Mic..."
|
||||
: "Mic Off"}
|
||||
</button>
|
||||
{!isAtBottom && (
|
||||
<button
|
||||
onClick={handleScrollToBottom}
|
||||
|
||||
98
app/src/hooks/useVoice.ts
Normal file
@@ -0,0 +1,98 @@
|
||||
import { useCallback, useRef, useState } from "react";
|
||||
import * as commands from "../lib/tauri-commands";
|
||||
|
||||
type VoiceState = "inactive" | "starting" | "active" | "error";
|
||||
|
||||
export function useVoice(sessionId: string) {
|
||||
const [state, setState] = useState<VoiceState>("inactive");
|
||||
const [error, setError] = useState<string | null>(null);
|
||||
|
||||
const audioContextRef = useRef<AudioContext | null>(null);
|
||||
const streamRef = useRef<MediaStream | null>(null);
|
||||
const workletRef = useRef<AudioWorkletNode | null>(null);
|
||||
|
||||
const start = useCallback(async () => {
|
||||
if (state === "active" || state === "starting") return;
|
||||
setState("starting");
|
||||
setError(null);
|
||||
|
||||
try {
|
||||
// 1. Start the audio bridge in the container (creates FIFO writer)
|
||||
await commands.startAudioBridge(sessionId);
|
||||
|
||||
// 2. Get microphone access
|
||||
const stream = await navigator.mediaDevices.getUserMedia({
|
||||
audio: {
|
||||
channelCount: 1,
|
||||
echoCancellation: true,
|
||||
noiseSuppression: true,
|
||||
autoGainControl: true,
|
||||
},
|
||||
});
|
||||
streamRef.current = stream;
|
||||
|
||||
// 3. Create AudioContext at 16kHz (browser handles resampling)
|
||||
const audioContext = new AudioContext({ sampleRate: 16000 });
|
||||
audioContextRef.current = audioContext;
|
||||
|
||||
// 4. Load AudioWorklet processor
|
||||
await audioContext.audioWorklet.addModule("/audio-capture-processor.js");
|
||||
|
||||
// 5. Connect: mic → worklet → (silent) destination
|
||||
const source = audioContext.createMediaStreamSource(stream);
|
||||
const processor = new AudioWorkletNode(audioContext, "audio-capture-processor");
|
||||
workletRef.current = processor;
|
||||
|
||||
// 6. Handle PCM chunks from the worklet
|
||||
processor.port.onmessage = (event: MessageEvent<ArrayBuffer>) => {
|
||||
const bytes = Array.from(new Uint8Array(event.data));
|
||||
commands.sendAudioData(sessionId, bytes).catch(() => {
|
||||
// Audio bridge may have been closed — ignore send errors
|
||||
});
|
||||
};
|
||||
|
||||
source.connect(processor);
|
||||
processor.connect(audioContext.destination);
|
||||
|
||||
setState("active");
|
||||
} catch (e) {
|
||||
const msg = e instanceof Error ? e.message : String(e);
|
||||
setError(msg);
|
||||
setState("error");
|
||||
// Clean up on failure
|
||||
await commands.stopAudioBridge(sessionId).catch(() => {});
|
||||
}
|
||||
}, [sessionId, state]);
|
||||
|
||||
const stop = useCallback(async () => {
|
||||
// Tear down audio pipeline
|
||||
workletRef.current?.disconnect();
|
||||
workletRef.current = null;
|
||||
|
||||
if (audioContextRef.current) {
|
||||
await audioContextRef.current.close().catch(() => {});
|
||||
audioContextRef.current = null;
|
||||
}
|
||||
|
||||
if (streamRef.current) {
|
||||
streamRef.current.getTracks().forEach((t) => t.stop());
|
||||
streamRef.current = null;
|
||||
}
|
||||
|
||||
// Stop the container-side audio bridge
|
||||
await commands.stopAudioBridge(sessionId).catch(() => {});
|
||||
|
||||
setState("inactive");
|
||||
setError(null);
|
||||
}, [sessionId]);
|
||||
|
||||
const toggle = useCallback(async () => {
|
||||
if (state === "active") {
|
||||
await stop();
|
||||
} else {
|
||||
await start();
|
||||
}
|
||||
}, [state, start, stop]);
|
||||
|
||||
return { state, error, start, stop, toggle };
|
||||
}
|
||||
@@ -49,6 +49,12 @@ export const closeTerminalSession = (sessionId: string) =>
|
||||
invoke<void>("close_terminal_session", { sessionId });
|
||||
export const pasteImageToTerminal = (sessionId: string, imageData: number[]) =>
|
||||
invoke<string>("paste_image_to_terminal", { sessionId, imageData });
|
||||
export const startAudioBridge = (sessionId: string) =>
|
||||
invoke<void>("start_audio_bridge", { sessionId });
|
||||
export const sendAudioData = (sessionId: string, data: number[]) =>
|
||||
invoke<void>("send_audio_data", { sessionId, data });
|
||||
export const stopAudioBridge = (sessionId: string) =>
|
||||
invoke<void>("stop_audio_bridge", { sessionId });
|
||||
|
||||
// MCP Servers
|
||||
/** Fetch the configured MCP servers from the backend. */
export function listMcpServers(): Promise<McpServer[]> {
  return invoke<McpServer[]>("list_mcp_servers");
}
|
||||
|
||||
Reference in New Issue
Block a user