Phase 1 foundation: Tauri shell, Python sidecar, SQLite database

Tauri v2 + Svelte + TypeScript frontend:
- App shell with workspace layout (waveform, transcript, speakers, AI chat)
- Placeholder components for all major UI areas
- Typed stores (project, transcript, playback, AI)
- TypeScript interfaces matching the database schema
- Tauri bridge service with typed invoke wrappers
- svelte-check passes with 0 errors

Rust backend:
- Tauri v2 app entry point with command registration
- SQLite database layer (rusqlite with bundled SQLite)
  - Full schema: projects, media_files, speakers, segments, words,
    ai_outputs, annotations (with indexes)
  - Model structs with serde serialization
  - CRUD queries for projects, speakers, segments, words
  - Segment text editing preserves original text
  - Schema versioning for future migrations
  - 6 tests passing
- Command stubs for project, transcribe, export, AI, settings, system
- App state management

Python sidecar:
- JSON-line IPC protocol (stdin/stdout)
- Message types: IPCMessage, progress, error, ready
- Handler registry with routing and error handling
- Ping/pong handler for connectivity testing
- Service stubs: transcribe, diarize, pipeline, AI, export
- Provider stubs: local (llama-server), OpenAI, Anthropic, LiteLLM
- Hardware detection stubs
- 14 tests passing, ruff clean

Also adds:
- Testing strategy document (docs/TESTING.md)
- Validation script (scripts/validate.sh)
- Updated .gitignore for Svelte, Rust, Python artifacts

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-02-26 15:16:06 -08:00
parent c450ef3c0c
commit 503cc6c0cf
95 changed files with 9607 additions and 0 deletions

View File

@@ -0,0 +1,3 @@
"""Voice to Notes — Python sidecar for transcription, diarization, and AI services."""
__version__ = "0.1.0"

View File

@@ -0,0 +1 @@
"""Hardware detection and model selection."""

View File

@@ -0,0 +1,9 @@
"""GPU/CPU detection and VRAM estimation."""
from __future__ import annotations
# TODO: Implement hardware detection
# - Check torch.cuda.is_available()
# - Detect VRAM size
# - Detect CPU cores and available RAM
# - Return recommended model configuration

View File

@@ -0,0 +1,7 @@
"""Model selection logic based on available hardware."""
from __future__ import annotations
# TODO: Implement model selection
# - Map hardware capabilities to recommended models
# - Support user overrides from settings

View File

@@ -0,0 +1 @@
"""IPC protocol layer for JSON-line communication with the Rust backend."""

View File

@@ -0,0 +1,39 @@
"""Message handler registry and routing."""
from __future__ import annotations
import sys
from collections.abc import Callable
from voice_to_notes.ipc.messages import IPCMessage, error_message
# Handler function type: takes a message, returns a response message
HandlerFunc = Callable[[IPCMessage], IPCMessage | None]
class HandlerRegistry:
    """Maps IPC message types to the callables that service them."""

    def __init__(self) -> None:
        # message type -> handler function
        self._handlers: dict[str, HandlerFunc] = {}

    def register(self, message_type: str, handler: HandlerFunc) -> None:
        """Associate *handler* with *message_type*, replacing any earlier registration."""
        self._handlers[message_type] = handler

    def handle(self, msg: IPCMessage) -> IPCMessage | None:
        """Dispatch *msg* to its registered handler.

        Returns the handler's response (which may be None), or an error
        message when the type is unknown or the handler raises.
        """
        try:
            handler = self._handlers[msg.type]
        except KeyError:
            print(f"[sidecar] Unknown message type: {msg.type}", file=sys.stderr, flush=True)
            return error_message(msg.id, "unknown_type", f"Unknown message type: {msg.type}")
        try:
            return handler(msg)
        except Exception as e:
            # Never let one bad request kill the dispatch loop; report it instead.
            print(f"[sidecar] Handler error for {msg.type}: {e}", file=sys.stderr, flush=True)
            return error_message(msg.id, "handler_error", str(e))
def ping_handler(msg: IPCMessage) -> IPCMessage:
    """Connectivity check: answer a ping with a pong carrying the request payload."""
    echo_payload = {"echo": msg.payload}
    return IPCMessage(id=msg.id, type="pong", payload=echo_payload)

View File

@@ -0,0 +1,46 @@
"""IPC message type definitions."""
from __future__ import annotations
from dataclasses import dataclass, field
from typing import Any
@dataclass
class IPCMessage:
    """One JSON-line IPC message passed between the Rust host and this sidecar."""

    # Correlation id chosen by the requester; echoed back in responses.
    id: str
    # Message kind, e.g. "ping", "pong", "progress", "error", "ready".
    type: str
    # Free-form JSON-serializable body; defaults to an empty dict.
    payload: dict[str, Any] = field(default_factory=dict)

    def to_dict(self) -> dict[str, Any]:
        """Return the wire-format dict used for JSON-line encoding."""
        return {"id": self.id, "type": self.type, "payload": self.payload}

    @classmethod
    def from_dict(cls, data: dict[str, Any]) -> IPCMessage:
        """Build a message from a decoded JSON object, defaulting any missing keys."""
        get = data.get
        return cls(id=get("id", ""), type=get("type", ""), payload=get("payload", {}))
def progress_message(request_id: str, percent: int, stage: str, message: str) -> IPCMessage:
    """Build a "progress" notification correlated to *request_id*."""
    body = {"percent": percent, "stage": stage, "message": message}
    return IPCMessage(id=request_id, type="progress", payload=body)
def error_message(request_id: str, code: str, message: str) -> IPCMessage:
    """Wrap a machine-readable *code* and human-readable *message* in an "error" reply."""
    details = {"code": code, "message": message}
    return IPCMessage(id=request_id, type="error", payload=details)
def ready_message() -> IPCMessage:
    """Handshake message announcing that the sidecar is up (sent once at startup)."""
    payload = {"version": "0.1.0"}
    return IPCMessage(id="system", type="ready", payload=payload)

View File

@@ -0,0 +1,47 @@
"""JSON-line protocol reader/writer over stdin/stdout."""
from __future__ import annotations
import json
import sys
from typing import Any
from voice_to_notes.ipc.messages import IPCMessage
def read_message() -> IPCMessage | None:
    """Read the next JSON-line message from stdin.

    Returns None ONLY on true EOF (parent closed stdin) or a hard read error.
    Blank lines and lines that fail to parse are logged and skipped instead of
    being reported as None — previously a single stray blank or garbled line
    looked like EOF to the caller's message loop and shut the sidecar down.
    """
    while True:
        try:
            line = sys.stdin.readline()
        except Exception as e:
            # A broken stdin is unrecoverable; treat it like EOF.
            _log(f"Read error: {e}")
            return None
        if not line:
            return None  # EOF
        line = line.strip()
        if not line:
            continue  # ignore empty lines rather than signalling EOF
        try:
            data = json.loads(line)
        except json.JSONDecodeError as e:
            _log(f"Invalid JSON: {e}")
            continue  # skip the bad line and keep serving
        return IPCMessage.from_dict(data)
def write_message(msg: IPCMessage) -> None:
    """Serialize *msg* compactly and emit it as one stdout line, flushing immediately."""
    encoded = json.dumps(msg.to_dict(), separators=(",", ":"))
    sys.stdout.write(f"{encoded}\n")
    sys.stdout.flush()
def write_dict(data: dict[str, Any]) -> None:
    """Emit an already-shaped dict as one compact JSON line on stdout (flushed)."""
    sys.stdout.write(json.dumps(data, separators=(",", ":")) + "\n")
    sys.stdout.flush()
def _log(message: str) -> None:
"""Log to stderr (stdout is reserved for IPC)."""
print(f"[sidecar] {message}", file=sys.stderr, flush=True)

View File

@@ -0,0 +1,52 @@
"""Main entry point for the Voice to Notes Python sidecar."""
from __future__ import annotations
import signal
import sys
from voice_to_notes.ipc.handlers import HandlerRegistry, ping_handler
from voice_to_notes.ipc.messages import ready_message
from voice_to_notes.ipc.protocol import read_message, write_message
def create_registry() -> HandlerRegistry:
    """Assemble the message-handler registry used by the main loop."""
    reg = HandlerRegistry()
    reg.register("ping", ping_handler)
    # TODO: Register transcribe, diarize, pipeline, ai, export handlers
    return reg
def main() -> None:
    """Run the sidecar: announce readiness, then serve messages until EOF."""

    def shutdown(signum: int, frame: object) -> None:
        # Invoked when the Rust host (or the user) terminates the process.
        print("[sidecar] Shutting down...", file=sys.stderr, flush=True)
        sys.exit(0)

    for sig in (signal.SIGTERM, signal.SIGINT):
        signal.signal(sig, shutdown)

    registry = create_registry()

    # Tell the Rust side we are up before entering the read loop.
    write_message(ready_message())
    print("[sidecar] Ready and waiting for messages", file=sys.stderr, flush=True)

    # Message loop: read from stdin, dispatch, reply on stdout.
    while (msg := read_message()) is not None:
        response = registry.handle(msg)
        if response is not None:
            write_message(response)

    # read_message returned None: parent closed stdin, time to exit.
    print("[sidecar] EOF on stdin, exiting", file=sys.stderr, flush=True)


if __name__ == "__main__":
    main()

View File

@@ -0,0 +1 @@
"""AI provider adapters — local (llama-server), LiteLLM, OpenAI, Anthropic."""

View File

@@ -0,0 +1,5 @@
"""Anthropic provider — direct Anthropic SDK integration."""
from __future__ import annotations
# TODO: Implement Anthropic provider

View File

@@ -0,0 +1,23 @@
"""Abstract base class for AI providers."""
from __future__ import annotations
from abc import ABC, abstractmethod
from collections.abc import AsyncIterator
from typing import Any
class AIProvider(ABC):
    """Common contract implemented by every chat backend (local, OpenAI, ...)."""

    @abstractmethod
    async def chat(self, messages: list[dict[str, Any]], config: dict[str, Any]) -> str:
        """Run one non-streaming completion and return the full response text."""
        ...

    @abstractmethod
    async def stream(
        self, messages: list[dict[str, Any]], config: dict[str, Any]
    ) -> AsyncIterator[str]:
        """Run a streaming completion, yielding tokens as they arrive.

        NOTE(review): declared ``async def`` with an ``AsyncIterator`` return —
        implementers presumably override this as an async generator; confirm
        the intended call shape (``async for`` vs ``await``) before implementing.
        """
        ...

View File

@@ -0,0 +1,5 @@
"""LiteLLM provider — multi-provider gateway."""
from __future__ import annotations
# TODO: Implement LiteLLM provider

View File

@@ -0,0 +1,9 @@
"""Local AI provider — bundled llama-server (OpenAI-compatible API)."""
from __future__ import annotations
# TODO: Implement local provider
# - Connect to llama-server on localhost:{port}
# - Use openai SDK with custom base_url
# - Support chat and streaming

View File

@@ -0,0 +1,5 @@
"""OpenAI provider — direct OpenAI SDK integration."""
from __future__ import annotations
# TODO: Implement OpenAI provider

View File

@@ -0,0 +1 @@
"""Service layer — transcription, diarization, AI, and export."""

View File

@@ -0,0 +1,13 @@
"""AI provider service — routes requests to configured provider."""
from __future__ import annotations
class AIProviderService:
    """Routes chat/summarize requests to whichever AI provider is configured (stub)."""

    # TODO: Implement provider routing
    # - Select provider based on config (local, openai, anthropic, litellm)
    # - Forward chat messages
    # - Handle streaming responses

View File

@@ -0,0 +1,13 @@
"""Diarization service — pyannote.audio speaker identification."""
from __future__ import annotations
class DiarizeService:
    """Speaker diarization backed by pyannote.audio (stub, not yet implemented)."""

    # TODO: Implement pyannote.audio integration
    # - Load community-1 model
    # - Run diarization on audio
    # - Return speaker segments with timestamps

View File

@@ -0,0 +1,14 @@
"""Export service — caption and text document generation."""
from __future__ import annotations
class ExportService:
    """Caption/document export to SRT, WebVTT, ASS, plain text, and Markdown (stub)."""

    # TODO: Implement pysubs2 integration
    # - SRT with [Speaker]: prefix
    # - WebVTT with <v Speaker> voice tags
    # - ASS with named styles per speaker
    # - Plain text and Markdown with speaker labels

View File

@@ -0,0 +1,14 @@
"""Combined transcription + diarization pipeline."""
from __future__ import annotations
class PipelineService:
    """Full WhisperX-style pipeline: transcribe -> align -> diarize -> merge (stub)."""

    # TODO: Implement combined pipeline
    # 1. faster-whisper transcription
    # 2. wav2vec2 word-level alignment
    # 3. pyannote diarization
    # 4. Merge words with speaker segments

View File

@@ -0,0 +1,13 @@
"""Transcription service — faster-whisper + wav2vec2 pipeline."""
from __future__ import annotations
class TranscribeService:
    """Audio transcription via faster-whisper (stub, not yet implemented)."""

    # TODO: Implement faster-whisper integration
    # - Load model based on hardware detection
    # - Transcribe audio with word-level timestamps
    # - Report progress via IPC

View File

@@ -0,0 +1 @@
"""Utility modules."""