Phase 1 foundation: Tauri shell, Python sidecar, SQLite database

Tauri v2 + Svelte + TypeScript frontend:
- App shell with workspace layout (waveform, transcript, speakers, AI chat)
- Placeholder components for all major UI areas
- Typed stores (project, transcript, playback, AI)
- TypeScript interfaces matching the database schema
- Tauri bridge service with typed invoke wrappers
- svelte-check passes with 0 errors

Rust backend:
- Tauri v2 app entry point with command registration
- SQLite database layer (rusqlite with bundled SQLite)
  - Full schema: projects, media_files, speakers, segments, words,
    ai_outputs, annotations (with indexes)
  - Model structs with serde serialization
  - CRUD queries for projects, speakers, segments, words
  - Segment text editing preserves original text
  - Schema versioning for future migrations
  - 6 tests passing
- Command stubs for project, transcribe, export, AI, settings, system
- App state management

Python sidecar:
- JSON-line IPC protocol (stdin/stdout)
- Message types: IPCMessage, progress, error, ready
- Handler registry with routing and error handling
- Ping/pong handler for connectivity testing
- Service stubs: transcribe, diarize, pipeline, AI, export
- Provider stubs: local (llama-server), OpenAI, Anthropic, LiteLLM
- Hardware detection stubs
- 14 tests passing, ruff clean

Also adds:
- Testing strategy document (docs/TESTING.md)
- Validation script (scripts/validate.sh)
- Updated .gitignore for Svelte, Rust, Python artifacts

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-02-26 15:16:06 -08:00
parent c450ef3c0c
commit 503cc6c0cf
95 changed files with 9607 additions and 0 deletions

View File

@@ -0,0 +1,3 @@
"""Voice to Notes — Python sidecar for transcription, diarization, and AI services."""
__version__ = "0.1.0"

View File

@@ -0,0 +1 @@
"""Hardware detection and model selection."""

View File

@@ -0,0 +1,9 @@
"""GPU/CPU detection and VRAM estimation."""
from __future__ import annotations
# TODO: Implement hardware detection
# - Check torch.cuda.is_available()
# - Detect VRAM size
# - Detect CPU cores and available RAM
# - Return recommended model configuration

View File

@@ -0,0 +1,7 @@
"""Model selection logic based on available hardware."""
from __future__ import annotations
# TODO: Implement model selection
# - Map hardware capabilities to recommended models
# - Support user overrides from settings

View File

@@ -0,0 +1 @@
"""IPC protocol layer for JSON-line communication with the Rust backend."""

View File

@@ -0,0 +1,39 @@
"""Message handler registry and routing."""
from __future__ import annotations
import sys
from collections.abc import Callable
from voice_to_notes.ipc.messages import IPCMessage, error_message
# Handler function type: takes a message, returns a response message
HandlerFunc = Callable[[IPCMessage], IPCMessage | None]
class HandlerRegistry:
    """Maps IPC message types to the callables that service them."""

    def __init__(self) -> None:
        # message type -> handler function
        self._handlers: dict[str, HandlerFunc] = {}

    def register(self, message_type: str, handler: HandlerFunc) -> None:
        """Associate *handler* with *message_type*, replacing any earlier registration."""
        self._handlers[message_type] = handler

    def handle(self, msg: IPCMessage) -> IPCMessage | None:
        """Dispatch *msg* to its registered handler.

        Returns the handler's response (which may be None), or an error
        message when the type is unknown or the handler raises.
        """
        try:
            handler = self._handlers[msg.type]
        except KeyError:
            print(f"[sidecar] Unknown message type: {msg.type}", file=sys.stderr, flush=True)
            return error_message(msg.id, "unknown_type", f"Unknown message type: {msg.type}")
        try:
            return handler(msg)
        except Exception as e:
            # Never let one bad request kill the dispatch loop; report it instead.
            print(f"[sidecar] Handler error for {msg.type}: {e}", file=sys.stderr, flush=True)
            return error_message(msg.id, "handler_error", str(e))
def ping_handler(msg: IPCMessage) -> IPCMessage:
    """Connectivity check: answer a ping with a pong carrying the request payload."""
    echo_payload = {"echo": msg.payload}
    return IPCMessage(id=msg.id, type="pong", payload=echo_payload)

View File

@@ -0,0 +1,46 @@
"""IPC message type definitions."""
from __future__ import annotations
from dataclasses import dataclass, field
from typing import Any
@dataclass
class IPCMessage:
    """One JSON-line IPC message passed between the Rust host and this sidecar."""

    # Correlation id chosen by the requester; echoed back in responses.
    id: str
    # Message kind, e.g. "ping", "pong", "progress", "error", "ready".
    type: str
    # Free-form JSON-serializable body; defaults to an empty dict.
    payload: dict[str, Any] = field(default_factory=dict)

    def to_dict(self) -> dict[str, Any]:
        """Return the wire-format dict used for JSON-line encoding."""
        return {"id": self.id, "type": self.type, "payload": self.payload}

    @classmethod
    def from_dict(cls, data: dict[str, Any]) -> IPCMessage:
        """Build a message from a decoded JSON object, defaulting any missing keys."""
        get = data.get
        return cls(id=get("id", ""), type=get("type", ""), payload=get("payload", {}))
def progress_message(request_id: str, percent: int, stage: str, message: str) -> IPCMessage:
    """Build a "progress" notification correlated to *request_id*."""
    body = {"percent": percent, "stage": stage, "message": message}
    return IPCMessage(id=request_id, type="progress", payload=body)
def error_message(request_id: str, code: str, message: str) -> IPCMessage:
    """Wrap a machine-readable *code* and human-readable *message* in an "error" reply."""
    details = {"code": code, "message": message}
    return IPCMessage(id=request_id, type="error", payload=details)
def ready_message() -> IPCMessage:
    """Handshake message announcing that the sidecar is up (sent once at startup)."""
    payload = {"version": "0.1.0"}
    return IPCMessage(id="system", type="ready", payload=payload)

View File

@@ -0,0 +1,47 @@
"""JSON-line protocol reader/writer over stdin/stdout."""
from __future__ import annotations
import json
import sys
from typing import Any
from voice_to_notes.ipc.messages import IPCMessage
def read_message() -> IPCMessage | None:
    """Read the next JSON-line message from stdin.

    Returns None ONLY on true EOF (parent closed stdin) or a hard read error.
    Blank lines and lines that fail to parse are logged and skipped instead of
    being reported as None — previously a single stray blank or garbled line
    looked like EOF to the caller's message loop and shut the sidecar down.
    """
    while True:
        try:
            line = sys.stdin.readline()
        except Exception as e:
            # A broken stdin is unrecoverable; treat it like EOF.
            _log(f"Read error: {e}")
            return None
        if not line:
            return None  # EOF
        line = line.strip()
        if not line:
            continue  # ignore empty lines rather than signalling EOF
        try:
            data = json.loads(line)
        except json.JSONDecodeError as e:
            _log(f"Invalid JSON: {e}")
            continue  # skip the bad line and keep serving
        return IPCMessage.from_dict(data)
def write_message(msg: IPCMessage) -> None:
    """Serialize *msg* compactly and emit it as one stdout line, flushing immediately."""
    encoded = json.dumps(msg.to_dict(), separators=(",", ":"))
    sys.stdout.write(f"{encoded}\n")
    sys.stdout.flush()
def write_dict(data: dict[str, Any]) -> None:
    """Emit an already-shaped dict as one compact JSON line on stdout (flushed)."""
    sys.stdout.write(json.dumps(data, separators=(",", ":")) + "\n")
    sys.stdout.flush()
def _log(message: str) -> None:
"""Log to stderr (stdout is reserved for IPC)."""
print(f"[sidecar] {message}", file=sys.stderr, flush=True)

View File

@@ -0,0 +1,52 @@
"""Main entry point for the Voice to Notes Python sidecar."""
from __future__ import annotations
import signal
import sys
from voice_to_notes.ipc.handlers import HandlerRegistry, ping_handler
from voice_to_notes.ipc.messages import ready_message
from voice_to_notes.ipc.protocol import read_message, write_message
def create_registry() -> HandlerRegistry:
    """Assemble the message-handler registry used by the main loop."""
    reg = HandlerRegistry()
    reg.register("ping", ping_handler)
    # TODO: Register transcribe, diarize, pipeline, ai, export handlers
    return reg
def main() -> None:
    """Run the sidecar: announce readiness, then serve messages until EOF."""

    def shutdown(signum: int, frame: object) -> None:
        # Invoked when the Rust host (or the user) terminates the process.
        print("[sidecar] Shutting down...", file=sys.stderr, flush=True)
        sys.exit(0)

    for sig in (signal.SIGTERM, signal.SIGINT):
        signal.signal(sig, shutdown)

    registry = create_registry()

    # Tell the Rust side we are up before entering the read loop.
    write_message(ready_message())
    print("[sidecar] Ready and waiting for messages", file=sys.stderr, flush=True)

    # Message loop: read from stdin, dispatch, reply on stdout.
    while (msg := read_message()) is not None:
        response = registry.handle(msg)
        if response is not None:
            write_message(response)

    # read_message returned None: parent closed stdin, time to exit.
    print("[sidecar] EOF on stdin, exiting", file=sys.stderr, flush=True)


if __name__ == "__main__":
    main()

View File

@@ -0,0 +1 @@
"""AI provider adapters — local (llama-server), LiteLLM, OpenAI, Anthropic."""

View File

@@ -0,0 +1,5 @@
"""Anthropic provider — direct Anthropic SDK integration."""
from __future__ import annotations
# TODO: Implement Anthropic provider

View File

@@ -0,0 +1,23 @@
"""Abstract base class for AI providers."""
from __future__ import annotations
from abc import ABC, abstractmethod
from collections.abc import AsyncIterator
from typing import Any
class AIProvider(ABC):
    """Common contract implemented by every chat backend (local, OpenAI, ...)."""

    @abstractmethod
    async def chat(self, messages: list[dict[str, Any]], config: dict[str, Any]) -> str:
        """Run one non-streaming completion and return the full response text."""
        ...

    @abstractmethod
    async def stream(
        self, messages: list[dict[str, Any]], config: dict[str, Any]
    ) -> AsyncIterator[str]:
        """Run a streaming completion, yielding tokens as they arrive.

        NOTE(review): declared ``async def`` with an ``AsyncIterator`` return —
        implementers presumably override this as an async generator; confirm
        the intended call shape (``async for`` vs ``await``) before implementing.
        """
        ...

View File

@@ -0,0 +1,5 @@
"""LiteLLM provider — multi-provider gateway."""
from __future__ import annotations
# TODO: Implement LiteLLM provider

View File

@@ -0,0 +1,9 @@
"""Local AI provider — bundled llama-server (OpenAI-compatible API)."""
from __future__ import annotations
# TODO: Implement local provider
# - Connect to llama-server on localhost:{port}
# - Use openai SDK with custom base_url
# - Support chat and streaming

View File

@@ -0,0 +1,5 @@
"""OpenAI provider — direct OpenAI SDK integration."""
from __future__ import annotations
# TODO: Implement OpenAI provider

View File

@@ -0,0 +1 @@
"""Service layer — transcription, diarization, AI, and export."""

View File

@@ -0,0 +1,13 @@
"""AI provider service — routes requests to configured provider."""
from __future__ import annotations
class AIProviderService:
    """Routes chat/summarize requests to whichever AI provider is configured (stub)."""

    # TODO: Implement provider routing
    # - Select provider based on config (local, openai, anthropic, litellm)
    # - Forward chat messages
    # - Handle streaming responses

View File

@@ -0,0 +1,13 @@
"""Diarization service — pyannote.audio speaker identification."""
from __future__ import annotations
class DiarizeService:
    """Speaker diarization backed by pyannote.audio (stub, not yet implemented)."""

    # TODO: Implement pyannote.audio integration
    # - Load community-1 model
    # - Run diarization on audio
    # - Return speaker segments with timestamps

View File

@@ -0,0 +1,14 @@
"""Export service — caption and text document generation."""
from __future__ import annotations
class ExportService:
    """Caption/document export to SRT, WebVTT, ASS, plain text, and Markdown (stub)."""

    # TODO: Implement pysubs2 integration
    # - SRT with [Speaker]: prefix
    # - WebVTT with <v Speaker> voice tags
    # - ASS with named styles per speaker
    # - Plain text and Markdown with speaker labels

View File

@@ -0,0 +1,14 @@
"""Combined transcription + diarization pipeline."""
from __future__ import annotations
class PipelineService:
    """Full WhisperX-style pipeline: transcribe -> align -> diarize -> merge (stub)."""

    # TODO: Implement combined pipeline
    # 1. faster-whisper transcription
    # 2. wav2vec2 word-level alignment
    # 3. pyannote diarization
    # 4. Merge words with speaker segments

View File

@@ -0,0 +1,13 @@
"""Transcription service — faster-whisper + wav2vec2 pipeline."""
from __future__ import annotations
class TranscribeService:
    """Audio transcription via faster-whisper (stub, not yet implemented)."""

    # TODO: Implement faster-whisper integration
    # - Load model based on hardware detection
    # - Transcribe audio with word-level timestamps
    # - Report progress via IPC

View File

@@ -0,0 +1 @@
"""Utility modules."""