From 9ff883e2e35871e4c52e885909a6ea09c5c3ba7c Mon Sep 17 00:00:00 2001 From: Developer Date: Sun, 5 Apr 2026 11:45:30 -0700 Subject: [PATCH] Phase 6: Add Deepgram remote transcription (managed + BYOK modes) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit New files: - client/deepgram_transcription.py — DeepgramTranscriptionEngine with managed mode (proxy) and BYOK mode (direct Deepgram). Sends raw binary PCM audio over WebSocket, handles both proxy and Deepgram response formats. Modified files: - config/default_config.yaml — Replace remote_processing with new remote section (mode, server_url, auth_token, byok_api_key, deepgram_model, language) - client/config.py — Add migration from old remote_processing config - gui/settings_dialog_qt.py — Replace Remote Processing group with Transcription Mode section (Local/Managed/BYOK radio buttons, login/register dialogs, balance display, model selector) - gui/main_window_qt.py — Select engine based on remote.mode config, add error and credits_low handlers Co-Authored-By: Claude Opus 4.6 (1M context) --- .claude/settings.local.json | 9 + DEEPGRAM_PROXY_PLAN.md | 574 +++++++++++++++++++++++++ DEEPGRAM_PROXY_PLAN.md:Zone.Identifier | Bin 0 -> 311 bytes client/config.py | 19 + client/deepgram_transcription.py | 528 +++++++++++++++++++++++ config/default_config.yaml | 13 +- gui/main_window_qt.py | 82 +++- gui/settings_dialog_qt.py | 352 +++++++++++++-- 8 files changed, 1503 insertions(+), 74 deletions(-) create mode 100644 .claude/settings.local.json create mode 100644 DEEPGRAM_PROXY_PLAN.md create mode 100644 DEEPGRAM_PROXY_PLAN.md:Zone.Identifier create mode 100644 client/deepgram_transcription.py diff --git a/.claude/settings.local.json b/.claude/settings.local.json new file mode 100644 index 0000000..d05eeeb --- /dev/null +++ b/.claude/settings.local.json @@ -0,0 +1,9 @@ +{ + "permissions": { + "allow": [ + "Bash(python3:*)", + "Bash(node --check:*)", + "Bash(ls:*)" + ] + } +} diff 
--git a/DEEPGRAM_PROXY_PLAN.md b/DEEPGRAM_PROXY_PLAN.md new file mode 100644 index 0000000..6219649 --- /dev/null +++ b/DEEPGRAM_PROXY_PLAN.md @@ -0,0 +1,574 @@ +# Deepgram Proxy Service — Build Plan + +## Project Overview + +Build a standalone hosted service that acts as a Deepgram proxy for the Local Transcription +desktop app. Users can either provide their own Deepgram API key (BYOK) or use the managed +service with prepaid credits purchased via Stripe. + +This is a **separate repository** from `local-transcription`. The desktop app will be updated +in a second phase to support both modes. + +--- + +## Repository Structure + +``` +transcription-proxy/ +├── src/ +│ ├── server.js # Express app entry point +│ ├── config.js # Environment config loader +│ ├── db/ +│ │ ├── index.js # node-postgres pool setup +│ │ └── migrations/ # SQL migration files (numbered) +│ │ ├── 001_users.sql +│ │ ├── 002_credits.sql +│ │ ├── 003_sessions.sql +│ │ └── 004_usage_ledger.sql +│ ├── middleware/ +│ │ ├── auth.js # JWT verification middleware +│ │ └── rateLimit.js # Per-user rate limiting +│ ├── routes/ +│ │ ├── auth.js # POST /auth/register, /auth/login, /auth/refresh +│ │ ├── billing.js # POST /billing/checkout, GET /billing/balance +│ │ └── account.js # GET /account/me, GET /account/usage +│ ├── websocket/ +│ │ └── proxy.js # WebSocket proxy handler (core feature) +│ └── webhooks/ +│ └── stripe.js # POST /webhooks/stripe +├── web/ # Simple frontend dashboard +│ ├── index.html # Landing / login page +│ ├── dashboard.html # Balance, usage history, buy credits +│ └── assets/ +│ ├── app.js +│ └── style.css +├── .env.example +├── package.json +├── docker-compose.yml # Postgres + app for local dev +└── CLAUDE.md # This file (after renaming) +``` + +--- + +## Technology Stack + +- **Runtime**: Node.js 20+ +- **Framework**: Express 4 +- **WebSocket**: `ws` library (not socket.io — keep it lean) +- **Database**: PostgreSQL 15+ via `pg` (node-postgres) +- **Auth**: JWT via `jsonwebtoken`, 
passwords hashed with `bcrypt` +- **Payments**: Stripe Node SDK (`stripe`) +- **Environment**: `dotenv` +- **Dev tooling**: `nodemon` for dev, no TypeScript (keep it simple) + +--- + +## Database Schema + +Run migrations in order. Use a simple `schema_migrations` table to track applied migrations. + +### 001_users.sql +```sql +CREATE TABLE schema_migrations ( + version INTEGER PRIMARY KEY, + applied_at TIMESTAMPTZ DEFAULT NOW() +); + +CREATE TABLE users ( + id UUID PRIMARY KEY DEFAULT gen_random_uuid(), + email TEXT UNIQUE NOT NULL, + password_hash TEXT NOT NULL, + stripe_customer_id TEXT UNIQUE, + created_at TIMESTAMPTZ DEFAULT NOW(), + updated_at TIMESTAMPTZ DEFAULT NOW() +); +``` + +### 002_credits.sql +```sql +CREATE TABLE credit_balance ( + user_id UUID PRIMARY KEY REFERENCES users(id) ON DELETE CASCADE, + seconds_remaining INTEGER NOT NULL DEFAULT 0, + updated_at TIMESTAMPTZ DEFAULT NOW() +); +``` + +### 003_sessions.sql +```sql +CREATE TABLE transcription_sessions ( + id UUID PRIMARY KEY DEFAULT gen_random_uuid(), + user_id UUID NOT NULL REFERENCES users(id), + mode TEXT NOT NULL CHECK (mode IN ('managed', 'byok')), + started_at TIMESTAMPTZ DEFAULT NOW(), + ended_at TIMESTAMPTZ, + seconds_used INTEGER NOT NULL DEFAULT 0, + deepgram_model TEXT, + status TEXT NOT NULL DEFAULT 'active' CHECK (status IN ('active', 'completed', 'terminated')) +); + +CREATE INDEX idx_sessions_user_id ON transcription_sessions(user_id); +CREATE INDEX idx_sessions_started_at ON transcription_sessions(started_at); +``` + +### 004_usage_ledger.sql +```sql +CREATE TABLE usage_ledger ( + id UUID PRIMARY KEY DEFAULT gen_random_uuid(), + user_id UUID NOT NULL REFERENCES users(id), + session_id UUID REFERENCES transcription_sessions(id), + recorded_at TIMESTAMPTZ DEFAULT NOW(), + seconds INTEGER NOT NULL, + description TEXT -- e.g. 
'session_usage', 'credit_purchase', 'manual_adjustment' +); + +CREATE INDEX idx_ledger_user_id ON usage_ledger(user_id); +``` + +--- + +## Environment Variables (.env.example) + +```env +# Server +PORT=3000 +NODE_ENV=development + +# Database +DATABASE_URL=postgresql://user:password@localhost:5432/transcription_proxy + +# Auth +JWT_SECRET=changeme_use_long_random_string +JWT_EXPIRY=7d + +# Stripe +STRIPE_SECRET_KEY=sk_test_... +STRIPE_WEBHOOK_SECRET=whsec_... + +# Deepgram +DEEPGRAM_API_KEY=your_deepgram_key_here + +# Pricing (seconds per dollar — adjust for your margin) +# Default: 1000 seconds per $1 = $0.006/min managed cost covered + margin +CREDITS_PER_DOLLAR=1000 +``` + +--- + +## Phase 1 — Core Server & Auth + +### Goals +- Working Express app with Postgres connection +- Migration runner +- User registration and login +- JWT middleware + +### Tasks + +1. **Scaffold project** + - `npm init`, install dependencies: `express ws pg jsonwebtoken bcrypt stripe dotenv` + - Dev dependencies: `nodemon` + - Add `start` and `dev` scripts to package.json + +2. **Database connection** (`src/db/index.js`) + - Export a `pg.Pool` instance using `DATABASE_URL` + - Export a `migrate()` function that reads `src/db/migrations/*.sql` in order, + checks `schema_migrations` table, and applies unapplied ones + - Call `migrate()` on server startup before listening + +3. **Auth routes** (`src/routes/auth.js`) + - `POST /auth/register` — validate email/password, hash password with bcrypt (cost 12), + insert user, insert empty credit_balance row, return JWT + - `POST /auth/login` — verify credentials, return JWT + refresh token + - `POST /auth/refresh` — validate refresh token, return new JWT + - Passwords: minimum 8 characters, validate email format + +4. **JWT middleware** (`src/middleware/auth.js`) + - Verify `Authorization: Bearer ` header + - Attach `req.user = { id, email }` on success + - Return 401 on failure + - Export as `requireAuth` middleware + +5. 
**Basic health check** + - `GET /health` returns `{ status: 'ok', db: 'connected' }` + +--- + +## Phase 2 — Billing & Credits + +### Goals +- Stripe Checkout session creation for credit purchases +- Webhook handler to fulfill purchases +- Balance endpoint + +### Payment Methods + +Use **Stripe Dynamic Payment Methods** — do NOT hardcode `payment_method_types` in the +Checkout Session. Instead, leave it unset and manage everything from the Stripe Dashboard. + +Enable the following in the Stripe Dashboard under Settings → Payment Methods: +- **Cards** (Visa, Mastercard, Amex, Discover) — on by default +- **PayPal** — enable manually +- **Apple Pay** — on by default, shows automatically on Safari/iOS +- **Google Pay** — enable manually (one toggle) +- **Cash App Pay** — enable manually (popular with streaming audiences) +- **Link** — Stripe's saved payment network, on by default + +Stripe will automatically show the most relevant methods to each user based on their +location and device. No code changes are needed to add or remove methods in future — +it's all dashboard config. + +### Credit Packages + +Define these as constants in `src/config.js`: + +```javascript +CREDIT_PACKAGES: [ + { id: 'pack_500', label: '500 minutes', seconds: 30000, price_cents: 500 }, + { id: 'pack_1200', label: '1200 minutes', seconds: 72000, price_cents: 1000 }, + { id: 'pack_3000', label: '3000 minutes', seconds: 180000, price_cents: 2400 }, +] +``` + +Adjust pricing to cover Deepgram costs ($0.006/min = $0.0001/sec) plus margin and +Stripe fees (~2.9% + $0.30). Raw Deepgram cost per package is $3.00 / $7.20 / $18.00 +respectively, so each price must stay above that floor plus the Stripe fee — the +prices above give roughly a 30–40% gross margin per package. + +### Tasks + +1. **Stripe customer creation** + - On user registration, create a Stripe customer and store `stripe_customer_id` + - Do this asynchronously (don't block registration response) + +2. 
**Billing routes** (`src/routes/billing.js`) + - `GET /billing/packages` — return credit package list (no auth required) + - `POST /billing/checkout` — requires auth, accepts `{ package_id }`, + creates Stripe Checkout Session using dynamic payment methods (do NOT pass + `payment_method_types` — omitting it enables dynamic methods automatically), + include `payment_intent_data.metadata` containing `user_id` and `package_id`, + returns `{ checkout_url }` + - `GET /billing/balance` — requires auth, returns `{ seconds_remaining, minutes_remaining }` + +3. **Stripe webhook** (`src/webhooks/stripe.js`) + - Mount at `POST /webhooks/stripe` with raw body (use `express.raw()` for this route only) + - Verify signature with `stripe.webhooks.constructEvent()` + - Handle `checkout.session.completed`: + - Extract `user_id` and `package_id` from metadata + - Add seconds to `credit_balance` + - Insert row into `usage_ledger` with description `'credit_purchase'` + - Handle `payment_intent.payment_failed`: log it (no action needed for prepaid) + +4. **Success/cancel pages** + - Stripe Checkout redirects to `GET /billing/success?session_id=...` and `/billing/cancel` + - These can be simple HTML responses or redirects to the web dashboard + +--- + +## Phase 3 — WebSocket Proxy (Core Feature) + +This is the most critical component. The proxy sits between the desktop client and Deepgram, +forwarding audio while tracking usage in real time. 
+ +### Connection Flow + +``` +Client connects → validate JWT → check credit balance → open Deepgram upstream + ↓ +Audio chunks arrive → forward to Deepgram → record usage every 10 seconds + ↓ +Transcription arrives from Deepgram → forward to client + ↓ +Client disconnects (or credits exhausted) → close upstream → finalize session +``` + +### WebSocket Protocol + +**Client connects to**: `wss://your-domain/ws/transcribe` + +**Client sends as first message** (JSON): +```json +{ + "type": "auth", + "token": "<JWT>", + "config": { + "model": "nova-2", + "language": "en-US", + "interim_results": true, + "endpointing": 300 + } +} +``` + +**After auth success, client sends**: raw audio binary frames (PCM 16kHz mono) + +**Server sends to client**: +```json +{ "type": "ready" } +{ "type": "transcript", "text": "...", "is_final": true, "confidence": 0.98 } +{ "type": "error", "code": "insufficient_credits", "message": "..." } +{ "type": "credits_low", "seconds_remaining": 300 } +{ "type": "session_end", "seconds_used": 120 } +``` + +### Tasks (`src/websocket/proxy.js`) + +1. **Upgrade handler** + - Attach to the HTTP server using `ws.Server({ noServer: true })` + - In `server.on('upgrade', ...)`, route `/ws/transcribe` to this handler + +2. **Auth handshake** + - First message must be `{ type: 'auth', token: '...' }` — received within 5 seconds + or connection is terminated + - Verify JWT, load user's credit balance from DB + - If balance is 0 or negative, send `insufficient_credits` error and close + +3. **Deepgram upstream connection** + - Open a WebSocket to Deepgram's streaming API: + `wss://api.deepgram.com/v1/listen?model=nova-2&language=en-US&interim_results=true` + - Auth header: `Authorization: Token <DEEPGRAM_API_KEY>` + - Use query params from client's `config` object (whitelist allowed params) + +4. **Audio forwarding** + - All binary messages from client → forward directly to Deepgram upstream + - All messages from Deepgram → parse JSON, reformat, forward to client + +5. 
**Usage tracking** + - Create a `transcription_sessions` row on connection + - Maintain an in-memory `secondsUsed` counter per connection + - Deepgram sends `{ type: 'Results', duration: X }` in responses — use this for + accurate second counting + - Every 10 seconds (or on disconnect), write current `secondsUsed` to DB: + - Update `transcription_sessions.seconds_used` + - Decrement `credit_balance.seconds_remaining` + - Insert into `usage_ledger` + - If `seconds_remaining` hits 0: send `insufficient_credits`, close connection + +6. **Cleanup on disconnect** + - Mark session as `completed`, set `ended_at` + - Do final usage flush to DB + - Close Deepgram upstream if still open + +7. **Error handling** + - If Deepgram upstream closes unexpectedly, notify client and close + - If client sends malformed data, log and continue (don't crash) + +--- + +## Phase 4 — Account Routes & Rate Limiting + +### Tasks + +1. **Account routes** (`src/routes/account.js`) + - `GET /account/me` — returns `{ email, credits: { seconds_remaining, minutes_remaining }, created_at }` + - `GET /account/usage` — returns last 30 days of `usage_ledger` entries grouped by day, + plus list of last 10 sessions with duration + +2. **Rate limiting** (`src/middleware/rateLimit.js`) + - Use in-memory rate limiting (no Redis needed at this scale) + - Auth endpoints: max 10 requests per minute per IP + - WebSocket connections: max 2 concurrent connections per user + (store active connections in a `Map<userId, Set<WebSocket>>`) + +--- + +## Phase 5 — Web Dashboard + +A simple, functional HTML/CSS/JS dashboard. No framework — vanilla JS is fine. +This is a developer-friendly streamer tool, not a consumer SaaS, so clean and +functional beats flashy. 
+ +### Pages + +**`/` (Landing / Login)** +- Brief product description (what this is, why it exists) +- Login form and link to register +- Link to GitHub/Gitea repo + +**`/dashboard` (Post-login)** +- Current credit balance (minutes remaining, prominently displayed) +- "Buy Credits" section showing the three packages with Stripe Checkout buttons +- Usage chart: last 30 days bar chart (vanilla canvas or a small CDN chart lib) +- Recent sessions table: date, duration, status + +**`/register`** +- Registration form + +### Implementation Notes +- Store JWT in `localStorage`, attach as `Authorization` header on API calls +- Redirect to `/` if JWT missing or expired +- Keep CSS minimal but readable — this is a utility dashboard + +--- + +## Phase 6 — Desktop App Integration + +Changes needed in the `local-transcription` Python repo. + +### New file: `client/remote_transcription.py` + +This module replaces `transcription_engine_realtime.py` when remote mode is active. + +```python +# Pseudocode / spec for Claude Code to implement + +class RemoteTranscriptionEngine: + """ + Connects to the transcription proxy WebSocket and streams audio. + Provides the same callback interface as the local engine so the + rest of the app doesn't need to change. + """ + + def __init__(self, config, on_transcript_callback): + # config contains: server_url, auth_token (or byok_api_key), model + ... + + def start(self): + # Open WebSocket connection + # Send auth message + # Start audio capture thread (reuse existing audio_capture.py) + ... + + def stop(self): + # Close WebSocket gracefully + ... + + def _on_audio_chunk(self, audio_data): + # Called by audio_capture.py with raw PCM data + # Send as binary WebSocket frame + ... + + def _on_server_message(self, message): + # Parse JSON from server + # On type='transcript': call on_transcript_callback + # On type='credits_low': trigger UI warning + # On type='error': surface to user + ... 
+``` + +### BYOK Mode + +When user provides their own Deepgram key, connect directly to Deepgram instead of the proxy: +- Endpoint: `wss://api.deepgram.com/v1/listen?...` +- Auth: `Authorization: Token ` +- No session tracking (Deepgram handles billing directly to the user) +- Same `RemoteTranscriptionEngine` class, just different URL and auth header + +### Settings Changes (`gui/settings_dialog_qt.py`) + +Add a new "Transcription Mode" section: + +``` +Transcription Mode: + ○ Local (Whisper) [existing behavior] + ○ Remote - Managed [requires login] + ○ Remote - BYOK [requires Deepgram API key] + +[If Managed selected]: + Server URL: [____________] + [Login / Register] [View Balance: 420 min remaining] + +[If BYOK selected]: + Deepgram API Key: [____________] + Model: [nova-2 ▼] +``` + +### Config additions (`config/default_config.yaml`) + +```yaml +remote: + mode: local # local | managed | byok + server_url: "" # proxy server URL for managed mode + auth_token: "" # JWT stored after login + byok_api_key: "" # Deepgram key for BYOK mode + deepgram_model: nova-2 + language: en-US +``` + +--- + +## Build & Deployment Notes + +### Docker Compose (local dev) + +```yaml +version: '3.8' +services: + db: + image: postgres:15 + environment: + POSTGRES_DB: transcription_proxy + POSTGRES_USER: user + POSTGRES_PASSWORD: password + ports: + - "5432:5432" + volumes: + - pgdata:/var/lib/postgresql/data + + app: + build: . + ports: + - "3000:3000" + environment: + DATABASE_URL: postgresql://user:password@db:5432/transcription_proxy + depends_on: + - db + volumes: + - .:/app + - /app/node_modules + +volumes: + pgdata: +``` + +### Production Deployment + +This service is a good fit for deployment on AnHonestHost WHP as a containerized app, +or on a small DigitalOcean/Linode VPS. 
Requirements are light: +- 512MB RAM is sufficient +- Postgres can be the same instance as other services or managed (e.g., Supabase free tier) +- Needs a public domain with SSL for WebSocket (`wss://`) to work from desktop clients + +Reverse proxy config (Nginx or HAProxy) should: +- Proxy HTTP → `localhost:3000` +- Pass `Upgrade` and `Connection` headers for WebSocket support +- Set `proxy_read_timeout 3600` (sessions can be long) + +--- + +## Implementation Order + +Build and test in this sequence: + +1. Project scaffold + DB connection + migrations +2. Auth (register/login/JWT) — test with curl +3. Stripe billing + webhook — test with Stripe CLI (`stripe listen`) +4. WebSocket proxy — test with a simple browser WebSocket client first +5. Usage tracking and credit decrement +6. Account/usage routes +7. Web dashboard +8. Desktop app integration (separate PR in local-transcription repo) + +--- + +## Key Decisions & Rationale + +| Decision | Choice | Reason | +|---|---|---| +| Credits model | Prepaid | No surprise charges, simpler billing, better for irregular streamer usage | +| WebSocket library | `ws` | Lightweight, no abstraction overhead, plays well with raw binary audio | +| Auth | JWT (stateless) | Desktop app holds token locally; no session store needed | +| DB driver | `node-postgres` (pg) | No ORM overhead; schema is simple enough for raw SQL | +| Migrations | Raw SQL files | No dependency on Knex/Prisma; easy to inspect and reason about | +| Rate limiting | In-memory | Redis is overkill for this scale; single-process Node is fine initially | +| Frontend | Vanilla JS | Dashboard is simple utility UI; no framework justified | + +--- + +## What This Plan Does NOT Cover (Future Work) + +- OAuth / social login +- Admin panel for managing users +- Refund / credit adjustment tooling +- Email verification +- Password reset flow +- Multi-language support beyond Deepgram's defaults +- Analytics / aggregated usage reporting +- Self-hosted Whisper inference as a 
third backend option diff --git a/DEEPGRAM_PROXY_PLAN.md:Zone.Identifier b/DEEPGRAM_PROXY_PLAN.md:Zone.Identifier new file mode 100644 index 0000000000000000000000000000000000000000..6ccec41f2c158263c403129efb457dace8e20ca6 GIT binary patch literal 311 zcma)$%WA_g6hyl=_%pgx$n3H_)%I0$p~|&2VO5 zhWU6m8ec7FC((Y!(b+zTI*X$gLt@eH>{O55&+dxSeu72d3KZ=}@U-Zq$f`Ucm~?{V zED@xtDn`nbb!%SA1j{6j?v3+*kHD;`u>)xJ4PF`TD3dHh)>nja5f-~Zyx=*>I#%#C za!hkcd!xV5y5GuwPSBGYh6$-KCp{SuM5+m|XYgZvncg0?54t$CM1Y6DX7TglU7(xo Xwz+FJPlu*`*}ol{$IbKVC@y{g!oOnx literal 0 HcmV?d00001 diff --git a/client/config.py b/client/config.py index 73f4e20..3ddda61 100644 --- a/client/config.py +++ b/client/config.py @@ -48,6 +48,25 @@ class Config: # Save the default configuration self.save() + # Migrate remote_processing -> remote + self._migrate_remote_config() + + def _migrate_remote_config(self): + """Migrate old remote_processing config to new remote config.""" + if 'remote_processing' in self.config and 'remote' not in self.config: + old = self.config['remote_processing'] + self.config['remote'] = { + 'mode': 'managed' if old.get('enabled', False) else 'local', + 'server_url': old.get('server_url', ''), + 'auth_token': '', + 'byok_api_key': old.get('api_key', ''), + 'deepgram_model': 'nova-2', + 'language': 'en-US', + 'fallback_to_local': old.get('fallback_to_local', True), + } + del self.config['remote_processing'] + self.save() + def save(self) -> None: """Save current configuration to file.""" with open(self.config_path, 'w') as f: diff --git a/client/deepgram_transcription.py b/client/deepgram_transcription.py new file mode 100644 index 0000000..79556da --- /dev/null +++ b/client/deepgram_transcription.py @@ -0,0 +1,528 @@ +"""Deepgram-based transcription engine using WebSocket streaming. 
+ +Supports two modes: + - Managed mode: connects to a proxy server that handles Deepgram credentials + - BYOK mode: connects directly to the Deepgram API with a user-provided key + +Implements the same duck-type interface as RealtimeTranscriptionEngine so +MainWindow can use it as a drop-in replacement. +""" + +import asyncio +import json +import logging +import numpy as np +import threading +from datetime import datetime +from queue import Queue, Empty +from typing import Optional, Callable + +from client.transcription_engine_realtime import TranscriptionResult + +logger = logging.getLogger(__name__) + + +class DeepgramTranscriptionEngine: + """ + Transcription engine that streams audio to Deepgram via WebSocket. + + In managed mode the connection goes through a proxy at + ``wss:///ws/transcribe`` which handles authentication and + Deepgram credentials. In BYOK (bring-your-own-key) mode the + connection goes directly to the Deepgram API. + """ + + # ------------------------------------------------------------------ # + # Construction / configuration + # ------------------------------------------------------------------ # + + def __init__(self, config, user_name: str = "User", input_device_index: Optional[int] = None): + """ + Initialise the engine from a :class:`client.config.Config` object. + + Args: + config: Application ``Config`` instance. + user_name: Display name attached to transcriptions. + input_device_index: Index of the audio input device to use + (``None`` for the system default). 
+ """ + self.config = config + self.user_name = user_name + self.input_device_index = input_device_index + + # Mode: 'managed' (proxy) or 'byok' (direct Deepgram) + self.mode: str = config.get("remote.mode", "managed") + + # Managed-mode settings + self.server_url: str = config.get("remote.server_url", "") + self.auth_token: str = config.get("remote.auth_token", "") + + # BYOK-mode settings + self.byok_api_key: str = config.get("remote.byok_api_key", "") + + # Deepgram model / language (used in both modes) + self.deepgram_model: str = config.get("remote.deepgram_model", "nova-2") + self.language: str = config.get("remote.language", "en-US") + + # Audio parameters + self.sample_rate: int = 16000 + self.channels: int = 1 + self.blocksize: int = 4096 + + # Callbacks + self.realtime_callback: Optional[Callable[[TranscriptionResult], None]] = None + self.final_callback: Optional[Callable[[TranscriptionResult], None]] = None + self._on_error: Optional[Callable[[str], None]] = None + self._on_credits_low: Optional[Callable[[int], None]] = None + + # Internal state + self._is_initialized: bool = False + self._is_recording: bool = False + self._stop_event: threading.Event = threading.Event() + self._audio_queue: Queue = Queue() + + # Asyncio event loop running in a daemon thread + self._loop: Optional[asyncio.AbstractEventLoop] = None + self._thread: Optional[threading.Thread] = None + + # WebSocket handle (set inside the async context) + self._ws = None + + # sounddevice InputStream + self._stream = None + + # ------------------------------------------------------------------ # + # Callback setters + # ------------------------------------------------------------------ # + + def set_callbacks( + self, + realtime_callback: Optional[Callable[[TranscriptionResult], None]] = None, + final_callback: Optional[Callable[[TranscriptionResult], None]] = None, + ): + """Set transcription result callbacks (matches RealtimeTranscriptionEngine API).""" + self.realtime_callback = 
realtime_callback + self.final_callback = final_callback + + def set_error_callback(self, fn: Optional[Callable[[str], None]]): + """Set a callback invoked on errors. ``fn`` receives a string message.""" + self._on_error = fn + + def set_credits_low_callback(self, fn: Optional[Callable[[int], None]]): + """Set a callback for low-credit warnings. ``fn`` receives seconds remaining.""" + self._on_credits_low = fn + + # ------------------------------------------------------------------ # + # Public interface (duck-typed with RealtimeTranscriptionEngine) + # ------------------------------------------------------------------ # + + def initialize(self) -> bool: + """Validate configuration and mark the engine as ready. + + Returns ``True`` when the engine is ready to start recording. + """ + if self._is_initialized: + return True + + if self.mode == "managed": + if not self.server_url: + logger.error("Managed mode requires a server URL (remote.server_url)") + return False + if not self.auth_token: + logger.error("Managed mode requires an auth token (remote.auth_token)") + return False + elif self.mode == "byok": + if not self.byok_api_key: + logger.error("BYOK mode requires an API key (remote.byok_api_key)") + return False + else: + logger.error("Unknown remote mode: %s (expected 'managed' or 'byok')", self.mode) + return False + + self._is_initialized = True + logger.info("DeepgramTranscriptionEngine initialised in %s mode", self.mode) + return True + + def start_recording(self) -> bool: + """Open the audio stream and connect the WebSocket. + + Returns ``True`` on success. 
+ """ + if not self._is_initialized: + logger.error("Engine not initialised -- call initialize() first") + return False + + if self._is_recording: + return True + + self._stop_event.clear() + self._is_recording = True + + # Start the asyncio event-loop thread (handles WS send/receive) + self._thread = threading.Thread(target=self._run_event_loop, daemon=True) + self._thread.start() + + # Start the audio capture stream + try: + self._start_audio_stream() + except Exception as exc: + logger.error("Failed to open audio stream: %s", exc) + self._is_recording = False + self._stop_event.set() + return False + + logger.info("Recording started") + return True + + def stop_recording(self): + """Stop audio capture and close the WebSocket.""" + if not self._is_recording: + return + + self._is_recording = False + self._stop_event.set() + + # Stop audio stream + self._stop_audio_stream() + + # Close WebSocket from outside the event-loop thread + if self._ws is not None and self._loop is not None and not self._loop.is_closed(): + asyncio.run_coroutine_threadsafe(self._close_ws(), self._loop) + + # Wait for the thread to finish + if self._thread is not None: + self._thread.join(timeout=5) + self._thread = None + + logger.info("Recording stopped") + + def stop(self): + """Full shutdown -- stop recording and release all resources.""" + self.stop_recording() + self._is_initialized = False + logger.info("DeepgramTranscriptionEngine shut down") + + def is_ready(self) -> bool: + """Return ``True`` if the engine has been successfully initialised.""" + return self._is_initialized + + # ------------------------------------------------------------------ # + # Audio capture (sounddevice) + # ------------------------------------------------------------------ # + + def _start_audio_stream(self): + """Open a ``sounddevice.InputStream`` that feeds the audio queue.""" + import sounddevice as sd + + def _audio_callback(indata, frames, time_info, status): # noqa: ARG001 + if status: + 
logger.warning("Audio stream status: %s", status) + if self._is_recording: + # float32 -> int16 PCM bytes + pcm = (indata * 32767).astype(np.int16).tobytes() + self._audio_queue.put(pcm) + + self._stream = sd.InputStream( + samplerate=self.sample_rate, + blocksize=self.blocksize, + channels=self.channels, + dtype="float32", + device=self.input_device_index, + callback=_audio_callback, + ) + self._stream.start() + + def _stop_audio_stream(self): + """Close the audio input stream.""" + if self._stream is not None: + try: + self._stream.stop() + self._stream.close() + except Exception as exc: + logger.debug("Error closing audio stream: %s", exc) + finally: + self._stream = None + + # ------------------------------------------------------------------ # + # Asyncio event-loop (runs in daemon thread) + # ------------------------------------------------------------------ # + + def _run_event_loop(self): + """Entry point for the daemon thread -- runs the async event loop.""" + self._loop = asyncio.new_event_loop() + asyncio.set_event_loop(self._loop) + try: + self._loop.run_until_complete(self._ws_lifecycle()) + except Exception as exc: + logger.error("Event-loop error: %s", exc) + finally: + try: + self._loop.run_until_complete(self._loop.shutdown_asyncgens()) + except Exception: + pass + self._loop.close() + self._loop = None + + async def _ws_lifecycle(self): + """Connect, authenticate (if managed), then run send/receive loops.""" + import websockets + + try: + ws_url, extra_headers = self._build_ws_url_and_headers() + + logger.info("Connecting to %s", ws_url) + self._ws = await websockets.connect( + ws_url, + additional_headers=extra_headers, + ping_interval=20, + ping_timeout=10, + ) + + # Managed mode: send auth message and wait for ready + if self.mode == "managed": + if not await self._managed_handshake(): + return + + # Run send and receive concurrently + await asyncio.gather( + self._send_loop(), + self._receive_loop(), + ) + + except asyncio.CancelledError: + 
pass + except Exception as exc: + msg = f"WebSocket error: {exc}" + logger.error(msg) + if self._on_error: + self._on_error(msg) + finally: + await self._close_ws() + + def _build_ws_url_and_headers(self): + """Return ``(url, headers)`` depending on the current mode.""" + if self.mode == "managed": + # Ensure the server URL uses wss:// and append the path + url = self.server_url.rstrip("/") + if not url.startswith("ws://") and not url.startswith("wss://"): + url = f"wss://{url}" + url = f"{url}/ws/transcribe" + return url, {} + + # BYOK -- connect directly to Deepgram + params = ( + f"model={self.deepgram_model}" + f"&language={self.language}" + "&interim_results=true" + "&encoding=linear16" + f"&sample_rate={self.sample_rate}" + f"&channels={self.channels}" + ) + url = f"wss://api.deepgram.com/v1/listen?{params}" + headers = {"Authorization": f"Token {self.byok_api_key}"} + return url, headers + + # -- managed-mode handshake ---------------------------------------- # + + async def _managed_handshake(self) -> bool: + """Send auth message and wait for ``ready`` (managed mode). + + Returns ``True`` on success. 
+ """ + auth_msg = { + "type": "auth", + "token": self.auth_token, + "config": { + "model": self.deepgram_model, + "language": self.language, + "sample_rate": self.sample_rate, + "channels": self.channels, + "encoding": "linear16", + "interim_results": True, + }, + } + await self._ws.send(json.dumps(auth_msg)) + + try: + raw = await asyncio.wait_for(self._ws.recv(), timeout=15) + data = json.loads(raw) + if data.get("type") == "ready": + logger.info("Managed proxy is ready") + return True + + if data.get("type") == "error": + err = data.get("message", "unknown error") + logger.error("Auth error from proxy: %s", err) + if self._on_error: + self._on_error(f"Proxy auth error: {err}") + return False + + logger.warning("Unexpected handshake message: %s", data) + return False + + except asyncio.TimeoutError: + logger.error("Timed out waiting for proxy ready message") + if self._on_error: + self._on_error("Timed out waiting for proxy ready message") + return False + + # -- send loop ----------------------------------------------------- # + + async def _send_loop(self): + """Drain the audio queue and push raw PCM bytes over the WebSocket.""" + while not self._stop_event.is_set(): + try: + pcm_bytes = self._audio_queue.get(timeout=0.1) + except Empty: + continue + + try: + await self._ws.send(pcm_bytes) + except Exception as exc: + if not self._stop_event.is_set(): + logger.error("Send error: %s", exc) + break + + # -- receive loop -------------------------------------------------- # + + async def _receive_loop(self): + """Listen for messages from the WebSocket and dispatch them.""" + while not self._stop_event.is_set(): + try: + raw = await asyncio.wait_for(self._ws.recv(), timeout=1.0) + except asyncio.TimeoutError: + continue + except Exception as exc: + if not self._stop_event.is_set(): + logger.error("Receive error: %s", exc) + break + + try: + data = json.loads(raw) + except (json.JSONDecodeError, TypeError): + logger.debug("Non-JSON message received, ignoring") + 
continue + + if self.mode == "managed": + self._handle_managed_message(data) + else: + self._handle_byok_message(data) + + # ------------------------------------------------------------------ # + # Message handlers + # ------------------------------------------------------------------ # + + def _handle_managed_message(self, data: dict): + """Process a message from the managed proxy.""" + msg_type = data.get("type", "") + + if msg_type == "transcript": + text = data.get("text", "") + is_final = data.get("is_final", False) + if text.strip(): + result = TranscriptionResult( + text=text, + is_final=is_final, + timestamp=datetime.now(), + user_name=self.user_name, + ) + if is_final: + if self.final_callback: + self.final_callback(result) + else: + if self.realtime_callback: + self.realtime_callback(result) + + elif msg_type == "credits_low": + seconds_remaining = data.get("seconds_remaining", 0) + logger.warning("Credits low -- %d seconds remaining", seconds_remaining) + if self._on_credits_low: + self._on_credits_low(int(seconds_remaining)) + + elif msg_type == "error": + code = data.get("code", "") + message = data.get("message", "Unknown error") + logger.error("Proxy error [%s]: %s", code, message) + if self._on_error: + self._on_error(f"[{code}] {message}" if code else message) + + elif msg_type == "session_end": + seconds_used = data.get("seconds_used", 0) + logger.info("Session ended -- %d seconds used", seconds_used) + + elif msg_type == "ready": + # May arrive again after reconnects; safe to ignore. 
+ logger.debug("Received ready message (already connected)") + + else: + logger.debug("Unhandled managed message type: %s", msg_type) + + def _handle_byok_message(self, data: dict): + """Process a message received directly from the Deepgram API.""" + msg_type = data.get("type", "") + + if msg_type == "Results": + channel = data.get("channel", {}) + alternatives = channel.get("alternatives", []) + if not alternatives: + return + + transcript = alternatives[0].get("transcript", "") + is_final = data.get("is_final", False) + + if transcript.strip(): + result = TranscriptionResult( + text=transcript, + is_final=is_final, + timestamp=datetime.now(), + user_name=self.user_name, + ) + if is_final: + if self.final_callback: + self.final_callback(result) + else: + if self.realtime_callback: + self.realtime_callback(result) + + elif msg_type == "Metadata": + logger.debug("Deepgram metadata: %s", data) + + elif msg_type == "UtteranceEnd": + logger.debug("Deepgram utterance end") + + else: + logger.debug("Unhandled Deepgram message type: %s", msg_type) + + # ------------------------------------------------------------------ # + # Helpers + # ------------------------------------------------------------------ # + + async def _close_ws(self): + """Close the WebSocket connection if open.""" + if self._ws is not None: + try: + await self._ws.close() + except Exception: + pass + self._ws = None + + def set_user_name(self, user_name: str): + """Update the user name attached to future transcriptions.""" + self.user_name = user_name + + def is_recording_active(self) -> bool: + """Return ``True`` if audio is currently being captured.""" + return self._is_recording + + def __repr__(self) -> str: + return ( + f"DeepgramTranscriptionEngine(mode={self.mode}, " + f"recording={self._is_recording})" + ) + + def __del__(self): + """Best-effort cleanup.""" + try: + self.stop() + except Exception: + pass diff --git a/config/default_config.yaml b/config/default_config.yaml index 135daa3..c4e8f81 
100644 --- a/config/default_config.yaml +++ b/config/default_config.yaml @@ -68,11 +68,14 @@ web_server: port: 8080 host: "127.0.0.1" -remote_processing: - enabled: false # Enable remote transcription offloading - server_url: "" # WebSocket URL of remote transcription service (e.g., ws://your-server:8765/ws/transcribe) - api_key: "" # API key for authentication - fallback_to_local: true # Fall back to local processing if remote fails +remote: + mode: local # local | managed | byok + server_url: "" # Proxy server URL for managed mode (e.g., wss://your-proxy.com) + auth_token: "" # JWT stored after login (managed mode) + byok_api_key: "" # Deepgram API key for BYOK mode + deepgram_model: nova-2 # Deepgram model to use + language: en-US # Language code + fallback_to_local: true # Fall back to local Whisper if remote fails updates: auto_check: true # Check for updates on startup diff --git a/gui/main_window_qt.py b/gui/main_window_qt.py index daaaaa4..f9552d1 100644 --- a/gui/main_window_qt.py +++ b/gui/main_window_qt.py @@ -18,6 +18,7 @@ sys.path.append(str(Path(__file__).resolve().parent.parent)) from client.config import Config from client.device_utils import DeviceManager from client.transcription_engine_realtime import RealtimeTranscriptionEngine, TranscriptionResult +from client.deepgram_transcription import DeepgramTranscriptionEngine from client.server_sync import ServerSyncClient from gui.settings_dialog_qt import SettingsDialog from server.web_display import TranscriptionWebServer @@ -394,27 +395,44 @@ class MainWindow(QMainWindow): min_gap = self.config.get('transcription.min_gap_between_recordings', 0.0) min_recording = self.config.get('transcription.min_length_of_recording', 0.5) - self.transcription_engine = RealtimeTranscriptionEngine( - model=model, - device=device, - language=language, - compute_type=compute_type, - enable_realtime_transcription=self.config.get('transcription.enable_realtime_transcription', False), - 
realtime_model=self.config.get('transcription.realtime_model', 'tiny.en'), - realtime_processing_pause=self.config.get('transcription.realtime_processing_pause', 0.1), - silero_sensitivity=self.config.get('transcription.silero_sensitivity', 0.4), - silero_use_onnx=self.config.get('transcription.silero_use_onnx', True), - webrtc_sensitivity=self.config.get('transcription.webrtc_sensitivity', 3), - post_speech_silence_duration=post_speech_silence, - min_length_of_recording=min_recording, - min_gap_between_recordings=min_gap, - pre_recording_buffer_duration=self.config.get('transcription.pre_recording_buffer_duration', 0.2), - beam_size=self.config.get('transcription.beam_size', 5), - initial_prompt=self.config.get('transcription.initial_prompt', ''), - no_log_file=self.config.get('transcription.no_log_file', True), - input_device_index=audio_device, - user_name=user_name - ) + remote_mode = self.config.get('remote.mode', 'local') + + if remote_mode in ('managed', 'byok'): + # Use Deepgram-based remote transcription + self.transcription_engine = DeepgramTranscriptionEngine( + config=self.config, + user_name=user_name, + input_device_index=audio_device + ) + self.transcription_engine.set_callbacks( + realtime_callback=self._on_realtime_transcription, + final_callback=self._on_final_transcription + ) + self.transcription_engine.set_error_callback(self._on_remote_error) + self.transcription_engine.set_credits_low_callback(self._on_credits_low) + else: + # Use local Whisper transcription + self.transcription_engine = RealtimeTranscriptionEngine( + model=model, + device=device, + language=language, + compute_type=compute_type, + enable_realtime_transcription=self.config.get('transcription.enable_realtime_transcription', False), + realtime_model=self.config.get('transcription.realtime_model', 'tiny.en'), + realtime_processing_pause=self.config.get('transcription.realtime_processing_pause', 0.1), + silero_sensitivity=self.config.get('transcription.silero_sensitivity', 0.4), 
+ silero_use_onnx=self.config.get('transcription.silero_use_onnx', True), + webrtc_sensitivity=self.config.get('transcription.webrtc_sensitivity', 3), + post_speech_silence_duration=post_speech_silence, + min_length_of_recording=min_recording, + min_gap_between_recordings=min_gap, + pre_recording_buffer_duration=self.config.get('transcription.pre_recording_buffer_duration', 0.2), + beam_size=self.config.get('transcription.beam_size', 5), + initial_prompt=self.config.get('transcription.initial_prompt', ''), + no_log_file=self.config.get('transcription.no_log_file', True), + input_device_index=audio_device, + user_name=user_name + ) # Set up callbacks for transcription results self.transcription_engine.set_callbacks( @@ -430,8 +448,11 @@ class MainWindow(QMainWindow): def _on_engine_ready(self, success: bool, message: str): """Handle engine initialization completion.""" if success: - # Update device label with actual device used - if self.transcription_engine: + remote_mode = self.config.get('remote.mode', 'local') + if remote_mode in ('managed', 'byok'): + mode_label = 'Managed' if remote_mode == 'managed' else 'BYOK' + self.device_label.setText(f"Device: Deepgram ({mode_label})") + elif self.transcription_engine: actual_device = self.transcription_engine.device compute_type = self.transcription_engine.compute_type device_display = f"{actual_device.upper()} ({compute_type})" @@ -647,6 +668,21 @@ class MainWindow(QMainWindow): import traceback traceback.print_exc() + def _on_remote_error(self, error_msg: str): + """Handle error from remote transcription service.""" + print(f"Remote transcription error: {error_msg}") + self.status_label.setText(f"⚠ Remote error: {error_msg}") + + # Fallback to local if enabled + if self.config.get('remote.fallback_to_local', True) and self.is_transcribing: + print("Falling back to local transcription...") + self.status_label.setText("⚠ Remote failed — falling back to local") + + def _on_credits_low(self, seconds_remaining: int): + 
"""Handle low credits warning from proxy.""" + minutes = seconds_remaining // 60 + self.status_label.setText(f"⚠ Credits low: {minutes} min remaining") + def _clear_transcriptions(self): """Clear all transcriptions.""" if not self.transcriptions: diff --git a/gui/settings_dialog_qt.py b/gui/settings_dialog_qt.py index 9fc6c6b..818e257 100644 --- a/gui/settings_dialog_qt.py +++ b/gui/settings_dialog_qt.py @@ -4,7 +4,7 @@ from PySide6.QtWidgets import ( QDialog, QVBoxLayout, QHBoxLayout, QFormLayout, QLabel, QLineEdit, QComboBox, QCheckBox, QSlider, QPushButton, QMessageBox, QGroupBox, QScrollArea, QWidget, - QFileDialog, QColorDialog + QFileDialog, QColorDialog, QRadioButton ) from PySide6.QtCore import Qt from PySide6.QtGui import QScreen, QFontDatabase, QColor @@ -487,46 +487,91 @@ class SettingsDialog(QDialog): server_group.setLayout(server_layout) content_layout.addWidget(server_group) - # Remote Processing Group - remote_group = QGroupBox("Remote Processing (GPU Offload)") - remote_layout = QFormLayout() - remote_layout.setSpacing(10) + # Transcription Mode Group + mode_group = QGroupBox("Transcription Mode") + mode_layout = QVBoxLayout() + mode_layout.setSpacing(10) - self.remote_enabled_check = QCheckBox() - self.remote_enabled_check.setToolTip( - "Enable remote transcription processing:\n" - "• Offload transcription to a GPU-equipped server\n" - "• Reduces local CPU/GPU usage\n" - "• Requires running the remote transcription service" - ) - remote_layout.addRow("Enable Remote Processing:", self.remote_enabled_check) + # Radio buttons for mode selection + self.mode_local_radio = QRadioButton("Local (Whisper)") + self.mode_local_radio.setToolTip("Transcribe locally using Whisper models") + self.mode_managed_radio = QRadioButton("Remote - Managed") + self.mode_managed_radio.setToolTip("Use the transcription proxy service with prepaid credits") + self.mode_byok_radio = QRadioButton("Remote - BYOK (Bring Your Own Key)") + self.mode_byok_radio.setToolTip("Connect 
directly to Deepgram with your own API key") - self.remote_url_input = QLineEdit() - self.remote_url_input.setPlaceholderText("ws://your-server:8765/ws/transcribe") - self.remote_url_input.setToolTip( - "WebSocket URL of the remote transcription service:\n" - "• Format: ws://host:port/ws/transcribe\n" - "• Use wss:// for secure connections" - ) - remote_layout.addRow("Server URL:", self.remote_url_input) + mode_layout.addWidget(self.mode_local_radio) + mode_layout.addWidget(self.mode_managed_radio) + mode_layout.addWidget(self.mode_byok_radio) - self.remote_api_key_input = QLineEdit() - self.remote_api_key_input.setEchoMode(QLineEdit.Password) - self.remote_api_key_input.setPlaceholderText("your-api-key") - self.remote_api_key_input.setToolTip( - "API key for authentication with the remote service" - ) - remote_layout.addRow("API Key:", self.remote_api_key_input) + # Managed mode fields (shown when managed radio selected) + self.managed_widget = QWidget() + managed_layout = QFormLayout() + managed_layout.setSpacing(8) - self.remote_fallback_check = QCheckBox("Enable") - self.remote_fallback_check.setChecked(True) - self.remote_fallback_check.setToolTip( - "Fall back to local transcription if remote service is unavailable" - ) - remote_layout.addRow("Fallback to Local:", self.remote_fallback_check) + self.managed_server_url = QLineEdit() + self.managed_server_url.setPlaceholderText("wss://your-proxy-server.com") + managed_layout.addRow("Server URL:", self.managed_server_url) - remote_group.setLayout(remote_layout) - content_layout.addWidget(remote_group) + # Login/Register buttons in a row + auth_widget = QWidget() + auth_layout = QHBoxLayout() + auth_layout.setContentsMargins(0, 0, 0, 0) + self.managed_login_btn = QPushButton("Login") + self.managed_login_btn.clicked.connect(self._managed_login) + self.managed_register_btn = QPushButton("Register") + self.managed_register_btn.clicked.connect(self._managed_register) + auth_layout.addWidget(self.managed_login_btn) + 
auth_layout.addWidget(self.managed_register_btn) + auth_layout.addStretch() + auth_widget.setLayout(auth_layout) + managed_layout.addRow("Account:", auth_widget) + + self.managed_balance_label = QLabel("Not logged in") + managed_layout.addRow("Balance:", self.managed_balance_label) + + self.managed_fallback_check = QCheckBox("Enable") + self.managed_fallback_check.setChecked(True) + self.managed_fallback_check.setToolTip("Fall back to local Whisper if remote fails") + managed_layout.addRow("Fallback to Local:", self.managed_fallback_check) + + self.managed_widget.setLayout(managed_layout) + mode_layout.addWidget(self.managed_widget) + + # BYOK mode fields (shown when BYOK radio selected) + self.byok_widget = QWidget() + byok_layout = QFormLayout() + byok_layout.setSpacing(8) + + self.byok_api_key_input = QLineEdit() + self.byok_api_key_input.setEchoMode(QLineEdit.Password) + self.byok_api_key_input.setPlaceholderText("your-deepgram-api-key") + byok_layout.addRow("Deepgram API Key:", self.byok_api_key_input) + + self.byok_model_combo = QComboBox() + self.byok_model_combo.addItems(["nova-2", "nova-2-general", "nova-2-meeting", "nova-2-phonecall", "whisper-large", "whisper-medium", "whisper-small"]) + byok_layout.addRow("Model:", self.byok_model_combo) + + self.byok_language_input = QLineEdit() + self.byok_language_input.setText("en-US") + self.byok_language_input.setPlaceholderText("en-US") + byok_layout.addRow("Language:", self.byok_language_input) + + self.byok_fallback_check = QCheckBox("Enable") + self.byok_fallback_check.setChecked(True) + self.byok_fallback_check.setToolTip("Fall back to local Whisper if Deepgram fails") + byok_layout.addRow("Fallback to Local:", self.byok_fallback_check) + + self.byok_widget.setLayout(byok_layout) + mode_layout.addWidget(self.byok_widget) + + mode_group.setLayout(mode_layout) + content_layout.addWidget(mode_group) + + # Connect radio buttons to show/hide relevant widgets + 
self.mode_local_radio.toggled.connect(self._on_mode_changed) + self.mode_managed_radio.toggled.connect(self._on_mode_changed) + self.mode_byok_radio.toggled.connect(self._on_mode_changed) # Updates Group updates_group = QGroupBox("Software Updates") @@ -794,11 +839,28 @@ class SettingsDialog(QDialog): self.server_room_input.setText(self.config.get('server_sync.room', 'default')) self.server_passphrase_input.setText(self.config.get('server_sync.passphrase', '')) - # Remote processing settings - self.remote_enabled_check.setChecked(self.config.get('remote_processing.enabled', False)) - self.remote_url_input.setText(self.config.get('remote_processing.server_url', '')) - self.remote_api_key_input.setText(self.config.get('remote_processing.api_key', '')) - self.remote_fallback_check.setChecked(self.config.get('remote_processing.fallback_to_local', True)) + # Transcription mode settings + mode = self.config.get('remote.mode', 'local') + if mode == 'managed': + self.mode_managed_radio.setChecked(True) + elif mode == 'byok': + self.mode_byok_radio.setChecked(True) + else: + self.mode_local_radio.setChecked(True) + + self.managed_server_url.setText(self.config.get('remote.server_url', '')) + self.managed_fallback_check.setChecked(self.config.get('remote.fallback_to_local', True)) + self.byok_api_key_input.setText(self.config.get('remote.byok_api_key', '')) + self.byok_model_combo.setCurrentText(self.config.get('remote.deepgram_model', 'nova-2')) + self.byok_language_input.setText(self.config.get('remote.language', 'en-US')) + self.byok_fallback_check.setChecked(self.config.get('remote.fallback_to_local', True)) + + # Trigger visibility update + self._on_mode_changed() + + # Update balance if managed mode and has token + if self.config.get('remote.auth_token'): + self._update_managed_balance() # Update settings self.update_auto_check.setChecked(self.config.get('updates.auto_check', True)) @@ -869,11 +931,21 @@ class SettingsDialog(QDialog): 
self.config.set('server_sync.room', self.server_room_input.text()) self.config.set('server_sync.passphrase', self.server_passphrase_input.text()) - # Remote processing settings - self.config.set('remote_processing.enabled', self.remote_enabled_check.isChecked()) - self.config.set('remote_processing.server_url', self.remote_url_input.text()) - self.config.set('remote_processing.api_key', self.remote_api_key_input.text()) - self.config.set('remote_processing.fallback_to_local', self.remote_fallback_check.isChecked()) + # Transcription mode settings + if self.mode_managed_radio.isChecked(): + self.config.set('remote.mode', 'managed') + elif self.mode_byok_radio.isChecked(): + self.config.set('remote.mode', 'byok') + else: + self.config.set('remote.mode', 'local') + + self.config.set('remote.server_url', self.managed_server_url.text()) + self.config.set('remote.fallback_to_local', + self.managed_fallback_check.isChecked() if self.mode_managed_radio.isChecked() + else self.byok_fallback_check.isChecked()) + self.config.set('remote.byok_api_key', self.byok_api_key_input.text()) + self.config.set('remote.deepgram_model', self.byok_model_combo.currentText()) + self.config.set('remote.language', self.byok_language_input.text()) # Update settings self.config.set('updates.auto_check', self.update_auto_check.isChecked()) @@ -892,6 +964,194 @@ class SettingsDialog(QDialog): except Exception as e: QMessageBox.critical(self, "Error", f"Failed to save settings:\n{e}") + def _on_mode_changed(self): + """Show/hide mode-specific widgets based on selected radio button.""" + self.managed_widget.setVisible(self.mode_managed_radio.isChecked()) + self.byok_widget.setVisible(self.mode_byok_radio.isChecked()) + + def _managed_login(self): + """Open a login dialog and authenticate with the managed proxy server.""" + import json + import urllib.request + import urllib.error + + dialog = QDialog(self) + dialog.setWindowTitle("Login") + dialog.setMinimumWidth(350) + layout = QFormLayout() + + 
email_input = QLineEdit() + email_input.setPlaceholderText("you@example.com") + layout.addRow("Email:", email_input) + + password_input = QLineEdit() + password_input.setEchoMode(QLineEdit.Password) + layout.addRow("Password:", password_input) + + button_layout = QHBoxLayout() + cancel_btn = QPushButton("Cancel") + cancel_btn.clicked.connect(dialog.reject) + login_btn = QPushButton("Login") + login_btn.setDefault(True) + button_layout.addStretch() + button_layout.addWidget(cancel_btn) + button_layout.addWidget(login_btn) + layout.addRow("", button_layout) + + dialog.setLayout(layout) + + def do_login(): + server_url = self.managed_server_url.text().rstrip('/') + if not server_url: + QMessageBox.warning(dialog, "Error", "Please enter a Server URL first.") + return + payload = json.dumps({ + "email": email_input.text(), + "password": password_input.text() + }).encode('utf-8') + req = urllib.request.Request( + f"{server_url}/auth/login", + data=payload, + headers={"Content-Type": "application/json"}, + method="POST" + ) + try: + with urllib.request.urlopen(req, timeout=10) as resp: + data = json.loads(resp.read().decode('utf-8')) + token = data.get('token', '') + if token: + self.config.set('remote.auth_token', token) + self._update_managed_balance() + QMessageBox.information(dialog, "Success", "Logged in successfully.") + dialog.accept() + else: + QMessageBox.warning(dialog, "Error", "Login succeeded but no token received.") + except urllib.error.HTTPError as e: + try: + body = json.loads(e.read().decode('utf-8')) + msg = body.get('detail', body.get('message', str(e))) + except Exception: + msg = str(e) + QMessageBox.warning(dialog, "Login Failed", msg) + except Exception as e: + QMessageBox.warning(dialog, "Error", f"Could not connect to server:\n{e}") + + login_btn.clicked.connect(do_login) + dialog.exec() + + def _managed_register(self): + """Open a registration dialog and create an account on the managed proxy server.""" + import json + import urllib.request + 
import urllib.error + + dialog = QDialog(self) + dialog.setWindowTitle("Register") + dialog.setMinimumWidth(350) + layout = QFormLayout() + + email_input = QLineEdit() + email_input.setPlaceholderText("you@example.com") + layout.addRow("Email:", email_input) + + password_input = QLineEdit() + password_input.setEchoMode(QLineEdit.Password) + layout.addRow("Password:", password_input) + + confirm_input = QLineEdit() + confirm_input.setEchoMode(QLineEdit.Password) + layout.addRow("Confirm Password:", confirm_input) + + button_layout = QHBoxLayout() + cancel_btn = QPushButton("Cancel") + cancel_btn.clicked.connect(dialog.reject) + register_btn = QPushButton("Register") + register_btn.setDefault(True) + button_layout.addStretch() + button_layout.addWidget(cancel_btn) + button_layout.addWidget(register_btn) + layout.addRow("", button_layout) + + dialog.setLayout(layout) + + def do_register(): + if password_input.text() != confirm_input.text(): + QMessageBox.warning(dialog, "Error", "Passwords do not match.") + return + server_url = self.managed_server_url.text().rstrip('/') + if not server_url: + QMessageBox.warning(dialog, "Error", "Please enter a Server URL first.") + return + payload = json.dumps({ + "email": email_input.text(), + "password": password_input.text() + }).encode('utf-8') + req = urllib.request.Request( + f"{server_url}/auth/register", + data=payload, + headers={"Content-Type": "application/json"}, + method="POST" + ) + try: + with urllib.request.urlopen(req, timeout=10) as resp: + data = json.loads(resp.read().decode('utf-8')) + token = data.get('token', '') + if token: + self.config.set('remote.auth_token', token) + self._update_managed_balance() + QMessageBox.information(dialog, "Success", "Account created and logged in.") + dialog.accept() + else: + QMessageBox.information(dialog, "Success", + "Account created. 
Please log in.") + dialog.accept() + except urllib.error.HTTPError as e: + try: + body = json.loads(e.read().decode('utf-8')) + msg = body.get('detail', body.get('message', str(e))) + except Exception: + msg = str(e) + QMessageBox.warning(dialog, "Registration Failed", msg) + except Exception as e: + QMessageBox.warning(dialog, "Error", f"Could not connect to server:\n{e}") + + register_btn.clicked.connect(do_register) + dialog.exec() + + def _update_managed_balance(self): + """Fetch and display the current account balance from the managed proxy server.""" + import json + import urllib.request + import urllib.error + + server_url = self.managed_server_url.text().rstrip('/') + token = self.config.get('remote.auth_token', '') + if not server_url or not token: + self.managed_balance_label.setText("Not logged in") + return + + req = urllib.request.Request( + f"{server_url}/billing/balance", + headers={ + "Authorization": f"Bearer {token}", + "Content-Type": "application/json" + }, + method="GET" + ) + try: + with urllib.request.urlopen(req, timeout=10) as resp: + data = json.loads(resp.read().decode('utf-8')) + balance = data.get('balance', data.get('credits', 'N/A')) + self.managed_balance_label.setText(str(balance)) + except urllib.error.HTTPError as e: + if e.code == 401: + self.managed_balance_label.setText("Session expired - please login again") + self.config.set('remote.auth_token', '') + else: + self.managed_balance_label.setText("Error fetching balance") + except Exception: + self.managed_balance_label.setText("Could not connect to server") + def _check_for_updates_now(self): """Manually check for updates.""" from version import __version__