From ff067b33685ec602154d5ebe3e051fb1b45fba49 Mon Sep 17 00:00:00 2001 From: jknapp Date: Sun, 11 Jan 2026 18:56:12 -0800 Subject: [PATCH] Add unified per-speaker font support and remote transcription service Font changes: - Consolidate font settings into single Display Settings section - Support Web-Safe, Google Fonts, and Custom File uploads for both displays - Fix Google Fonts URL encoding (use + instead of %2B for spaces) - Fix per-speaker font inline style quote escaping in Node.js display - Add font debug logging to help diagnose font issues - Update web server to sync all font settings on settings change - Remove deprecated PHP server documentation files New features: - Add remote transcription service for GPU offloading - Add instance lock to prevent multiple app instances - Add version tracking Co-Authored-By: Claude Opus 4.5 --- CLAUDE.md | 9 +- client/config.py | 25 +- client/instance_lock.py | 94 +++++ client/remote_transcription.py | 346 +++++++++++++++++ client/server_sync.py | 225 +++++++++-- client/transcription_engine_realtime.py | 41 +- config/default_config.yaml | 19 +- gui/main_window_qt.py | 240 ++++++++---- gui/settings_dialog_qt.py | 252 +++++++++++- gui/transcription_display_qt.py | 177 ++++++++- main.py | 108 ++++-- pyproject.toml | 2 +- server/COMPARISON.md | 308 --------------- server/QUICK_FIX.md | 218 ----------- server/SYNC_PERFORMANCE.md | 248 ------------ server/nodejs/README.md | 61 +-- server/nodejs/server.js | 312 +++++++++++++-- server/test-server.sh | 160 -------- server/transcription-service/README.md | 173 +++++++++ server/transcription-service/requirements.txt | 8 + server/transcription-service/server.py | 366 ++++++++++++++++++ server/web_display.py | 239 +++++++++++- version.py | 15 + 23 files changed, 2486 insertions(+), 1160 deletions(-) create mode 100644 client/instance_lock.py create mode 100644 client/remote_transcription.py delete mode 100644 server/COMPARISON.md delete mode 100644 server/QUICK_FIX.md delete mode 
100644 server/SYNC_PERFORMANCE.md delete mode 100755 server/test-server.sh create mode 100644 server/transcription-service/README.md create mode 100644 server/transcription-service/requirements.txt create mode 100644 server/transcription-service/server.py create mode 100644 version.py diff --git a/CLAUDE.md b/CLAUDE.md index 970eee9..3f3997f 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -174,8 +174,9 @@ See [server/nodejs/README.md](server/nodejs/README.md) for deployment instructio - [client/server_sync.py](client/server_sync.py) handles server communication - Toggle in Settings: "Enable Server Sync" -- Sends transcriptions to PHP server via POST -- Separate web display shows merged transcriptions from all users +- Sends transcriptions to Node.js server via HTTP POST +- Real-time updates via WebSocket to display page +- Per-speaker font support (Web-Safe, Google Fonts, Custom uploads) - Falls back gracefully if server unavailable ## Common Patterns @@ -191,8 +192,8 @@ See [server/nodejs/README.md](server/nodejs/README.md) for deployment instructio ### Modifying Transcription Display - Local GUI: [gui/transcription_display_qt.py](gui/transcription_display_qt.py) -- Web display (OBS): [server/web_display.py](server/web_display.py) (HTML in `_get_html()`) -- Multi-user display: [server/php/display.php](server/php/display.php) +- Local web display (OBS): [server/web_display.py](server/web_display.py) (HTML in `_get_html()`) +- Multi-user display: [server/nodejs/server.js](server/nodejs/server.js) (display page in `/display` route) ### Adding a New Model Size diff --git a/client/config.py b/client/config.py index c989f8e..d964f8f 100644 --- a/client/config.py +++ b/client/config.py @@ -19,6 +19,10 @@ class Config: self.app_dir = Path.home() / ".local-transcription" self.app_dir.mkdir(parents=True, exist_ok=True) + # Fonts directory for custom font files + self.fonts_dir = self.app_dir / "fonts" + self.fonts_dir.mkdir(parents=True, exist_ok=True) + if config_path is None: 
self.config_path = self.app_dir / "config.yaml" else: @@ -34,7 +38,7 @@ class Config: self.config = yaml.safe_load(f) or {} else: # Load default configuration - default_config_path = Path(__file__).parent.parent / "config" / "default_config.yaml" + default_config_path = Path(__file__).resolve().parent.parent / "config" / "default_config.yaml" if default_config_path.exists(): with open(default_config_path, 'r') as f: self.config = yaml.safe_load(f) or {} @@ -137,5 +141,24 @@ class Config: self.config = self._get_default_config() self.save() + def get_custom_fonts(self) -> list: + """ + Get list of custom font files in the fonts directory. + + Returns: + List of (font_name, font_path) tuples + """ + fonts = [] + font_extensions = {'.ttf', '.otf', '.woff', '.woff2'} + + if self.fonts_dir.exists(): + for font_file in self.fonts_dir.iterdir(): + if font_file.suffix.lower() in font_extensions: + # Use filename without extension as font name + font_name = font_file.stem + fonts.append((font_name, font_file)) + + return sorted(fonts, key=lambda x: x[0].lower()) + def __repr__(self) -> str: return f"Config(path={self.config_path})" diff --git a/client/instance_lock.py b/client/instance_lock.py new file mode 100644 index 0000000..41a11b1 --- /dev/null +++ b/client/instance_lock.py @@ -0,0 +1,94 @@ +"""Single instance lock management for Local Transcription application.""" + +import os +import sys +from pathlib import Path + + +class InstanceLock: + """Manages single instance lock using a PID file.""" + + def __init__(self): + """Initialize the instance lock.""" + self.lock_dir = Path.home() / '.local-transcription' + self.lock_file = self.lock_dir / 'app.lock' + + def acquire(self) -> bool: + """ + Try to acquire the instance lock. + + Returns: + True if lock acquired (no other instance running), + False if another instance is already running. 
+ """ + # Ensure lock directory exists + self.lock_dir.mkdir(parents=True, exist_ok=True) + + if self.lock_file.exists(): + try: + pid_str = self.lock_file.read_text().strip() + if pid_str: + pid = int(pid_str) + if self._is_process_running(pid): + return False + except (ValueError, OSError): + # Invalid PID file, we can overwrite it + pass + + # Write our PID to the lock file + try: + self.lock_file.write_text(str(os.getpid())) + return True + except OSError: + return False + + def release(self): + """Release the instance lock.""" + try: + if self.lock_file.exists(): + # Only remove if it contains our PID + pid_str = self.lock_file.read_text().strip() + if pid_str and int(pid_str) == os.getpid(): + self.lock_file.unlink() + except (ValueError, OSError): + pass + + def _is_process_running(self, pid: int) -> bool: + """ + Check if a process with the given PID is running. + + Args: + pid: Process ID to check + + Returns: + True if process is running, False otherwise + """ + if sys.platform == 'win32': + # Windows + try: + import ctypes + kernel32 = ctypes.windll.kernel32 + SYNCHRONIZE = 0x00100000 + process = kernel32.OpenProcess(SYNCHRONIZE, False, pid) + if process: + kernel32.CloseHandle(process) + return True + return False + except Exception: + return False + else: + # Unix/Linux/macOS + try: + os.kill(pid, 0) + return True + except OSError: + return False + + def __enter__(self): + """Context manager entry.""" + return self.acquire() + + def __exit__(self, exc_type, exc_val, exc_tb): + """Context manager exit.""" + self.release() + return False diff --git a/client/remote_transcription.py b/client/remote_transcription.py new file mode 100644 index 0000000..0711855 --- /dev/null +++ b/client/remote_transcription.py @@ -0,0 +1,346 @@ +""" +Remote Transcription Client + +Handles streaming audio to a remote transcription service and receiving transcriptions. +Provides fallback to local transcription if the remote service is unavailable. 
+""" + +import asyncio +import base64 +import json +import logging +import numpy as np +from datetime import datetime +from threading import Thread, Lock +from typing import Optional, Callable +from queue import Queue, Empty + +logger = logging.getLogger(__name__) + + +class RemoteTranscriptionClient: + """ + Client for remote transcription service. + + Streams audio to a remote server and receives transcriptions. + """ + + def __init__( + self, + server_url: str, + api_key: str, + on_transcription: Optional[Callable[[str, bool], None]] = None, + on_error: Optional[Callable[[str], None]] = None, + on_connection_change: Optional[Callable[[bool], None]] = None, + sample_rate: int = 16000 + ): + """ + Initialize remote transcription client. + + Args: + server_url: WebSocket URL of the transcription service + api_key: API key for authentication + on_transcription: Callback for transcriptions (text, is_preview) + on_error: Callback for errors + on_connection_change: Callback for connection status changes + sample_rate: Audio sample rate + """ + self.server_url = server_url + self.api_key = api_key + self.sample_rate = sample_rate + self.on_transcription = on_transcription + self.on_error = on_error + self.on_connection_change = on_connection_change + + self.websocket = None + self.is_connected = False + self.is_authenticated = False + self.is_running = False + + self.audio_queue: Queue = Queue() + self.send_thread: Optional[Thread] = None + self.receive_thread: Optional[Thread] = None + self.loop: Optional[asyncio.AbstractEventLoop] = None + + self._lock = Lock() + + async def _connect(self): + """Establish WebSocket connection and authenticate.""" + try: + import websockets + + logger.info(f"Connecting to {self.server_url}") + self.websocket = await websockets.connect( + self.server_url, + ping_interval=30, + ping_timeout=10 + ) + + # Authenticate + auth_message = { + "type": "auth", + "api_key": self.api_key + } + await self.websocket.send(json.dumps(auth_message)) + 
+ # Wait for auth response + response = await asyncio.wait_for( + self.websocket.recv(), + timeout=10.0 + ) + auth_result = json.loads(response) + + if auth_result.get("type") == "auth_result" and auth_result.get("success"): + self.is_connected = True + self.is_authenticated = True + logger.info("Connected and authenticated to remote transcription service") + if self.on_connection_change: + self.on_connection_change(True) + return True + else: + error_msg = auth_result.get("message", "Authentication failed") + logger.error(f"Authentication failed: {error_msg}") + if self.on_error: + self.on_error(f"Authentication failed: {error_msg}") + return False + + except Exception as e: + logger.error(f"Connection failed: {e}") + if self.on_error: + self.on_error(f"Connection failed: {e}") + return False + + async def _send_loop(self): + """Send audio chunks from the queue.""" + while self.is_running and self.websocket: + try: + # Get audio from queue with timeout + try: + audio_data = self.audio_queue.get(timeout=0.1) + except Empty: + continue + + if audio_data is None: + continue + + # Encode audio as base64 + audio_bytes = audio_data.astype(np.float32).tobytes() + audio_b64 = base64.b64encode(audio_bytes).decode('utf-8') + + # Send to server + message = { + "type": "audio", + "data": audio_b64, + "sample_rate": self.sample_rate + } + await self.websocket.send(json.dumps(message)) + + except Exception as e: + if self.is_running: + logger.error(f"Send error: {e}") + break + + async def _receive_loop(self): + """Receive transcriptions from the server.""" + while self.is_running and self.websocket: + try: + message = await asyncio.wait_for( + self.websocket.recv(), + timeout=1.0 + ) + data = json.loads(message) + msg_type = data.get("type", "") + + if msg_type == "transcription": + text = data.get("text", "") + is_preview = data.get("is_preview", False) + if text and self.on_transcription: + self.on_transcription(text, is_preview) + + elif msg_type == "error": + error_msg = 
data.get("message", "Unknown error") + logger.error(f"Server error: {error_msg}") + if self.on_error: + self.on_error(error_msg) + + elif msg_type == "pong": + pass # Keep-alive response + + except asyncio.TimeoutError: + continue + except Exception as e: + if self.is_running: + logger.error(f"Receive error: {e}") + break + + # Connection lost + self.is_connected = False + self.is_authenticated = False + if self.on_connection_change: + self.on_connection_change(False) + + def _run_async(self): + """Run the async event loop in a thread.""" + self.loop = asyncio.new_event_loop() + asyncio.set_event_loop(self.loop) + + try: + # Connect + connected = self.loop.run_until_complete(self._connect()) + if not connected: + return + + # Run send and receive loops + tasks = [ + self._send_loop(), + self._receive_loop() + ] + self.loop.run_until_complete(asyncio.gather(*tasks)) + + except Exception as e: + logger.error(f"Async loop error: {e}") + finally: + if self.websocket: + try: + self.loop.run_until_complete(self.websocket.close()) + except: + pass + self.loop.close() + + def start(self): + """Start the remote transcription client.""" + with self._lock: + if self.is_running: + return + + self.is_running = True + + # Start async loop in background thread + self.send_thread = Thread(target=self._run_async, daemon=True) + self.send_thread.start() + + def stop(self): + """Stop the remote transcription client.""" + with self._lock: + self.is_running = False + + # Signal end to server + if self.websocket and self.loop: + try: + asyncio.run_coroutine_threadsafe( + self.websocket.send(json.dumps({"type": "end"})), + self.loop + ) + except: + pass + + self.is_connected = False + self.is_authenticated = False + + def send_audio(self, audio_data: np.ndarray): + """ + Send audio data for transcription. 
+ + Args: + audio_data: Audio data as numpy array (float32, mono, sample_rate) + """ + if self.is_connected and self.is_authenticated: + self.audio_queue.put(audio_data) + + @property + def connected(self) -> bool: + """Check if connected and authenticated.""" + return self.is_connected and self.is_authenticated + + +class RemoteTranscriptionManager: + """ + Manages remote transcription with fallback to local processing. + """ + + def __init__( + self, + server_url: str, + api_key: str, + local_engine=None, + on_transcription: Optional[Callable] = None, + on_preview: Optional[Callable] = None + ): + """ + Initialize the remote transcription manager. + + Args: + server_url: Remote transcription service URL + api_key: API key for authentication + local_engine: Local transcription engine for fallback + on_transcription: Callback for final transcriptions + on_preview: Callback for preview transcriptions + """ + self.server_url = server_url + self.api_key = api_key + self.local_engine = local_engine + self.on_transcription = on_transcription + self.on_preview = on_preview + + self.client: Optional[RemoteTranscriptionClient] = None + self.use_remote = True + self.is_running = False + + def _handle_transcription(self, text: str, is_preview: bool): + """Handle transcription from remote service.""" + if is_preview: + if self.on_preview: + self.on_preview(text) + else: + if self.on_transcription: + self.on_transcription(text) + + def _handle_error(self, error: str): + """Handle error from remote service.""" + logger.error(f"Remote transcription error: {error}") + # Could switch to local fallback here + + def _handle_connection_change(self, connected: bool): + """Handle connection status change.""" + if connected: + logger.info("Remote transcription connected") + else: + logger.warning("Remote transcription disconnected") + # Could switch to local fallback here + + def start(self): + """Start remote transcription.""" + if self.is_running: + return + + self.is_running = True + 
+ if self.use_remote and self.server_url and self.api_key: + self.client = RemoteTranscriptionClient( + server_url=self.server_url, + api_key=self.api_key, + on_transcription=self._handle_transcription, + on_error=self._handle_error, + on_connection_change=self._handle_connection_change + ) + self.client.start() + + def stop(self): + """Stop remote transcription.""" + self.is_running = False + if self.client: + self.client.stop() + self.client = None + + def send_audio(self, audio_data: np.ndarray): + """Send audio for transcription.""" + if self.client and self.client.connected: + self.client.send_audio(audio_data) + elif self.local_engine: + # Fallback to local processing + pass # Local engine handles its own audio capture + + @property + def is_connected(self) -> bool: + """Check if remote service is connected.""" + return self.client is not None and self.client.connected diff --git a/client/server_sync.py b/client/server_sync.py index cda2a4a..4d3f218 100644 --- a/client/server_sync.py +++ b/client/server_sync.py @@ -2,7 +2,9 @@ import requests import json -from typing import Optional +import base64 +from pathlib import Path +from typing import Optional, List from datetime import datetime import threading import queue @@ -10,22 +12,41 @@ from concurrent.futures import ThreadPoolExecutor class ServerSyncClient: - """Client for syncing transcriptions to a PHP server.""" + """Client for syncing transcriptions to a multi-user server.""" - def __init__(self, url: str, room: str, passphrase: str, user_name: str): + def __init__(self, url: str, room: str, passphrase: str, user_name: str, + fonts_dir: Optional[Path] = None, + font_source: str = "None", + websafe_font: Optional[str] = None, + google_font: Optional[str] = None, + custom_font_file: Optional[str] = None): """ Initialize server sync client. 
Args: - url: Server URL (e.g., http://example.com/transcription/server.php) + url: Server URL (e.g., http://example.com/api/send) room: Room name passphrase: Room passphrase user_name: User's display name + fonts_dir: Optional directory containing custom fonts to upload + font_source: Font source type ("None", "Web-Safe", "Google Font", "Custom File") + websafe_font: Web-safe font name (e.g., "Arial", "Times New Roman") + google_font: Google Font name (e.g., "Roboto", "Open Sans") + custom_font_file: Path to a custom font file for this speaker """ self.url = url self.room = room self.passphrase = passphrase self.user_name = user_name + self.fonts_dir = fonts_dir + self.font_source = font_source + self.websafe_font = websafe_font + self.google_font = google_font + self.custom_font_file = custom_font_file + + # Font info to send with transcriptions + self.font_family: Optional[str] = None + self.font_type: Optional[str] = None # "websafe", "google", "custom" # Queue for sending transcriptions asynchronously self.send_queue = queue.Queue() @@ -50,6 +71,153 @@ class ServerSyncClient: self.send_thread.start() print(f"Server sync started: room={self.room}") + # Set up font based on source type + if self.font_source == "Web-Safe" and self.websafe_font: + self.font_family = self.websafe_font + self.font_type = "websafe" + print(f"Using web-safe font: {self.font_family}") + elif self.font_source == "Google Font" and self.google_font: + self.font_family = self.google_font + self.font_type = "google" + print(f"Using Google Font: {self.font_family}") + elif self.font_source == "Custom File" and self.custom_font_file: + self._upload_custom_font() + # Legacy fallback: upload all fonts from fonts_dir if available + elif self.fonts_dir: + self._upload_fonts() + + def _upload_custom_font(self): + """Upload the user's custom font file to the server for per-speaker fonts.""" + if not self.custom_font_file: + return + + font_path = Path(self.custom_font_file) + if not 
font_path.exists(): + print(f"Custom font file not found: {self.custom_font_file}") + return + + # Validate extension + font_extensions = {'.ttf', '.otf', '.woff', '.woff2'} + if font_path.suffix.lower() not in font_extensions: + print(f"Invalid font file type: {font_path.suffix}") + return + + mime_types = { + '.ttf': 'font/ttf', + '.otf': 'font/otf', + '.woff': 'font/woff', + '.woff2': 'font/woff2' + } + + try: + # Read and encode font data + with open(font_path, 'rb') as f: + font_data = base64.b64encode(f.read()).decode('utf-8') + + # Font family name is filename without extension + self.font_family = font_path.stem + font_filename = font_path.name + + print(f"Uploading custom font: {font_filename} (family: {self.font_family})") + + # Upload to server + from urllib.parse import urlparse + parsed = urlparse(self.url) + base_url = f"{parsed.scheme}://{parsed.netloc}" + fonts_url = f"{base_url}/api/fonts" + + response = requests.post( + fonts_url, + json={ + 'room': self.room, + 'passphrase': self.passphrase, + 'fonts': [{ + 'name': font_filename, + 'data': font_data, + 'mime': mime_types.get(font_path.suffix.lower(), 'font/ttf') + }] + }, + timeout=30.0 + ) + + if response.status_code == 200: + result = response.json() + self.font_type = "custom" + print(f"Custom font uploaded: {self.font_family}") + else: + print(f"Custom font upload failed: {response.status_code}") + self.font_family = None + self.font_type = None + + except Exception as e: + print(f"Error uploading custom font: {e}") + self.font_family = None + self.font_type = None + + def _upload_fonts(self): + """Upload custom fonts to the server.""" + if not self.fonts_dir or not self.fonts_dir.exists(): + return + + # Find font files + font_extensions = {'.ttf', '.otf', '.woff', '.woff2'} + font_files = [f for f in self.fonts_dir.iterdir() + if f.is_file() and f.suffix.lower() in font_extensions] + + if not font_files: + return + + # Prepare font data + fonts = [] + mime_types = { + '.ttf': 'font/ttf', + 
'.otf': 'font/otf', + '.woff': 'font/woff', + '.woff2': 'font/woff2' + } + + for font_file in font_files: + try: + with open(font_file, 'rb') as f: + font_data = base64.b64encode(f.read()).decode('utf-8') + fonts.append({ + 'name': font_file.name, + 'data': font_data, + 'mime': mime_types.get(font_file.suffix.lower(), 'font/ttf') + }) + print(f"Prepared font for upload: {font_file.name}") + except Exception as e: + print(f"Error reading font file {font_file}: {e}") + + if not fonts: + return + + # Upload to server + try: + # Extract base URL for fonts endpoint + from urllib.parse import urlparse + parsed = urlparse(self.url) + base_url = f"{parsed.scheme}://{parsed.netloc}" + fonts_url = f"{base_url}/api/fonts" + + response = requests.post( + fonts_url, + json={ + 'room': self.room, + 'passphrase': self.passphrase, + 'fonts': fonts + }, + timeout=30.0 # Longer timeout for font uploads + ) + + if response.status_code == 200: + result = response.json() + print(f"Fonts uploaded successfully: {result.get('message', '')}") + else: + print(f"Font upload failed: {response.status_code}") + except Exception as e: + print(f"Error uploading fonts: {e}") + def stop(self): """Stop the sync client.""" self.is_running = False @@ -59,13 +227,14 @@ class ServerSyncClient: self.executor.shutdown(wait=False) # Don't wait - let pending requests finish in background print("Server sync stopped") - def send_transcription(self, text: str, timestamp: Optional[datetime] = None): + def send_transcription(self, text: str, timestamp: Optional[datetime] = None, is_preview: bool = False): """ Send a transcription to the server (non-blocking). 
Args: text: Transcription text timestamp: Timestamp (defaults to now) + is_preview: Whether this is a preview transcription """ if timestamp is None: timestamp = datetime.now() @@ -78,9 +247,20 @@ class ServerSyncClient: self.send_queue.put({ 'text': text, 'timestamp': timestamp.strftime("%H:%M:%S"), + 'is_preview': is_preview, 'queue_time': queue_time # For debugging }) + def send_preview(self, text: str, timestamp: Optional[datetime] = None): + """ + Send a preview transcription to the server (non-blocking). + + Args: + text: Preview transcription text + timestamp: Timestamp (defaults to now) + """ + self.send_transcription(text, timestamp, is_preview=True) + def _send_loop(self): """Background thread for sending transcriptions.""" while self.is_running: @@ -122,28 +302,25 @@ class ServerSyncClient: 'passphrase': self.passphrase, 'user_name': self.user_name, 'text': trans_data['text'], - 'timestamp': trans_data['timestamp'] + 'timestamp': trans_data['timestamp'], + 'is_preview': trans_data.get('is_preview', False) } - # Detect server type and send appropriately - # PHP servers have "server.php" in URL and need ?action=send - # Node.js servers have "/api/send" in URL and don't need it - request_start = time.time() - if 'server.php' in self.url: - # PHP server - add action parameter - response = requests.post( - self.url, - params={'action': 'send'}, - json=payload, - timeout=2.0 # Reduced timeout for faster failure detection - ) + # Add font info if user has a custom font configured + if self.font_family: + payload['font_family'] = self.font_family + payload['font_type'] = self.font_type # "websafe", "google", or "custom" + print(f"[Server Sync] Sending with font: {self.font_family} ({self.font_type})") else: - # Node.js server - no action parameter - response = requests.post( - self.url, - json=payload, - timeout=2.0 # Reduced timeout for faster failure detection - ) + print(f"[Server Sync] No font configured (font_source={self.font_source})") + + # Send to 
Node.js server + request_start = time.time() + response = requests.post( + self.url, + json=payload, + timeout=2.0 # Reduced timeout for faster failure detection + ) request_time = (time.time() - request_start) * 1000 print(f"[Server Sync] HTTP request: {request_time:.0f}ms, Status: {response.status_code}") diff --git a/client/transcription_engine_realtime.py b/client/transcription_engine_realtime.py index c055503..72a31fa 100644 --- a/client/transcription_engine_realtime.py +++ b/client/transcription_engine_realtime.py @@ -29,7 +29,7 @@ class TranscriptionResult: def __repr__(self) -> str: time_str = self.timestamp.strftime("%H:%M:%S") prefix = "[FINAL]" if self.is_final else "[PREVIEW]" - if self.user_name: + if self.user_name and self.user_name.strip(): return f"{prefix} [{time_str}] {self.user_name}: {self.text}" return f"{prefix} [{time_str}] {self.text}" @@ -63,6 +63,7 @@ class RealtimeTranscriptionEngine: # Realtime preview settings enable_realtime_transcription: bool = False, realtime_model: str = "tiny.en", + realtime_processing_pause: float = 0.1, # How often to update preview (lower = more frequent) # VAD settings silero_sensitivity: float = 0.4, silero_use_onnx: bool = True, @@ -106,11 +107,21 @@ class RealtimeTranscriptionEngine: user_name: User name for transcriptions """ self.model = model - self.device = device self.language = language self.compute_type = compute_type + + # Resolve device - 'auto' means use CUDA if available, else CPU + if device == 'auto': + try: + import torch + self.device = 'cuda' if torch.cuda.is_available() else 'cpu' + except: + self.device = 'cpu' + else: + self.device = device self.enable_realtime = enable_realtime_transcription self.realtime_model = realtime_model + self.realtime_processing_pause = realtime_processing_pause self.user_name = user_name # Callbacks @@ -131,6 +142,7 @@ class RealtimeTranscriptionEngine: # Store configuration for recorder initialization self.config = { 'model': model, + 'device': self.device, # 
Use resolved device (auto -> cuda/cpu) 'language': language if language != 'auto' else None, 'compute_type': compute_type if compute_type != 'default' else 'default', 'input_device_index': input_device_index, @@ -145,8 +157,18 @@ class RealtimeTranscriptionEngine: 'initial_prompt': initial_prompt if initial_prompt else None, 'enable_realtime_transcription': enable_realtime_transcription, 'realtime_model_type': realtime_model if enable_realtime_transcription else None, + 'realtime_processing_pause': realtime_processing_pause if enable_realtime_transcription else 0.2, + # The realtime callback is added during initialize() after set_callbacks is called } + def _is_cuda_available(self) -> bool: + """Check if CUDA is available.""" + try: + import torch + return torch.cuda.is_available() + except: + return False + def set_callbacks( self, realtime_callback: Optional[Callable[[TranscriptionResult], None]] = None, @@ -198,8 +220,15 @@ class RealtimeTranscriptionEngine: try: print(f"Initializing RealtimeSTT with model: {self.model}") + print(f" Device: {self.device}, Compute type: {self.compute_type}") if self.enable_realtime: print(f" Realtime preview enabled with model: {self.realtime_model}") + print(f" Realtime processing pause: {self.realtime_processing_pause}s") + + # Add realtime transcription callback if enabled + # This provides word-by-word updates as speech is being processed + if self.enable_realtime: + self.config['on_realtime_transcription_update'] = self._on_realtime_transcription # Create recorder with configuration self.recorder = AudioToTextRecorder(**self.config) @@ -325,7 +354,7 @@ class RealtimeTranscriptionEngine: Returns: True if model changed successfully """ - was_running = self.is_running + was_running = self.is_recording # Stop current recording self.stop() @@ -355,7 +384,7 @@ class RealtimeTranscriptionEngine: Returns: True if device changed successfully """ - was_running = self.is_running + was_running = self.is_recording # Stop current 
recording self.stop() @@ -396,7 +425,7 @@ class RealtimeTranscriptionEngine: self.config['webrtc_sensitivity'] = webrtc_sensitivity # If running, need to restart to apply changes - if self.is_running: + if self.is_recording: print("VAD settings updated. Restart transcription to apply changes.") def set_user_name(self, user_name: str): @@ -404,7 +433,7 @@ class RealtimeTranscriptionEngine: self.user_name = user_name def __repr__(self) -> str: - return f"RealtimeTranscriptionEngine(model={self.model}, device={self.device}, running={self.is_running})" + return f"RealtimeTranscriptionEngine(model={self.model}, device={self.device}, running={self.is_recording})" def __del__(self): """Cleanup when object is destroyed.""" diff --git a/config/default_config.yaml b/config/default_config.yaml index bfa18f5..12f20b4 100644 --- a/config/default_config.yaml +++ b/config/default_config.yaml @@ -16,6 +16,7 @@ transcription: # Realtime preview settings (optional faster preview before final transcription) enable_realtime_transcription: false realtime_model: "tiny.en" # Faster model for instant preview + realtime_processing_pause: 0.1 # Seconds between preview updates (lower = more responsive, default 0.1) # VAD (Voice Activity Detection) settings silero_sensitivity: 0.4 # 0.0-1.0, lower = more sensitive (detects more speech) @@ -35,16 +36,26 @@ transcription: # Performance settings no_log_file: true # Disable RealtimeSTT logging + # Fast speaker mode - for speakers who talk quickly without pauses + # Reduces silence detection thresholds for more frequent transcription outputs + continuous_mode: false + server_sync: enabled: false url: "http://localhost:3000/api/send" room: "default" passphrase: "" + # Font settings are now in the display section (shared for local and server sync) display: show_timestamps: true max_lines: 100 - font_family: "Courier" + # Font settings (used for both local display and server sync) + font_source: "System Font" # Options: System Font, Web-Safe, Google 
Font, Custom File + font_family: "Courier" # System font name (local only, won't work with server sync) + websafe_font: "Arial" # Web-safe font name + google_font: "Roboto" # Google Font name + custom_font_file: "" # Path to custom font file (.ttf, .otf, .woff, .woff2) font_size: 12 theme: "dark" fade_after_seconds: 10 # Time before transcriptions fade out (0 = never fade) @@ -52,3 +63,9 @@ display: web_server: port: 8080 host: "127.0.0.1" + +remote_processing: + enabled: false # Enable remote transcription offloading + server_url: "" # WebSocket URL of remote transcription service (e.g., ws://your-server:8765/ws/transcribe) + api_key: "" # API key for authentication + fallback_to_local: true # Fall back to local processing if remote fails diff --git a/gui/main_window_qt.py b/gui/main_window_qt.py index 88cad51..60bfda7 100644 --- a/gui/main_window_qt.py +++ b/gui/main_window_qt.py @@ -9,16 +9,16 @@ from PySide6.QtGui import QFont from pathlib import Path import sys -# Add parent directory to path for imports -sys.path.append(str(Path(__file__).parent.parent)) +# Add parent directory to path for imports (resolve symlinks) +sys.path.append(str(Path(__file__).resolve().parent.parent)) from client.config import Config from client.device_utils import DeviceManager from client.transcription_engine_realtime import RealtimeTranscriptionEngine, TranscriptionResult from client.server_sync import ServerSyncClient -from gui.transcription_display_qt import TranscriptionDisplay from gui.settings_dialog_qt import SettingsDialog from server.web_display import TranscriptionWebServer +from version import __version__ import asyncio from threading import Thread @@ -96,9 +96,13 @@ class MainWindow(QMainWindow): # Server sync components self.server_sync_client: ServerSyncClient = None + # Store all transcriptions for saving (separate from display) + self.transcriptions: list = [] + # Configure window self.setWindowTitle("Local Transcription") - self.resize(900, 700) + self.resize(700, 
300) + self.setMinimumSize(600, 280) # Set application icon # In PyInstaller frozen executables, use _MEIPASS for bundled files @@ -108,7 +112,7 @@ class MainWindow(QMainWindow): icon_path = Path(sys._MEIPASS) / "LocalTranscription.png" else: # Running in normal Python - icon_path = Path(__file__).parent.parent / "LocalTranscription.png" + icon_path = Path(__file__).resolve().parent.parent / "LocalTranscription.png" if icon_path.exists(): from PySide6.QtGui import QIcon @@ -174,13 +178,14 @@ class MainWindow(QMainWindow): # Status bar status_widget = QWidget() - status_widget.setFixedHeight(60) + status_widget.setFixedHeight(40) status_layout = QHBoxLayout() + status_layout.setContentsMargins(0, 0, 0, 0) status_widget.setLayout(status_layout) self.status_label = QLabel("⚫ Initializing...") status_font = QFont() - status_font.setPointSize(14) + status_font.setPointSize(12) self.status_label.setFont(status_font) status_layout.addWidget(self.status_label) @@ -193,28 +198,36 @@ class MainWindow(QMainWindow): self.user_label = QLabel(f"User: {user_name}") status_layout.addWidget(self.user_label) - # Web display link - web_host = self.config.get('web_server.host', '127.0.0.1') - web_port = self.config.get('web_server.port', 8080) - web_url = f"http://{web_host}:{web_port}" - self.web_link = QLabel(f'🌐 Open Web Display') - self.web_link.setOpenExternalLinks(True) - self.web_link.setToolTip(f"Click to open {web_url} in browser (for OBS)") - self.web_link.setStyleSheet("QLabel { color: #4CAF50; }") - status_layout.addWidget(self.web_link) - status_layout.addStretch() main_layout.addWidget(status_widget) - # Transcription display - self.transcription_display = TranscriptionDisplay( - max_lines=self.config.get('display.max_lines', 100), - show_timestamps=self.config.get('display.show_timestamps', True), - font_family=self.config.get('display.font_family', 'Courier'), - font_size=self.config.get('display.font_size', 12) - ) - main_layout.addWidget(self.transcription_display) + 
# Web display links section + links_widget = QWidget() + links_layout = QVBoxLayout() + links_layout.setContentsMargins(0, 5, 0, 5) + links_layout.setSpacing(5) + links_widget.setLayout(links_layout) + + # Local web display link + web_host = self.config.get('web_server.host', '127.0.0.1') + web_port = self.config.get('web_server.port', 8080) + web_url = f"http://{web_host}:{web_port}" + self.web_link = QLabel(f'🌐 Local Web Display: {web_url}') + self.web_link.setOpenExternalLinks(True) + self.web_link.setToolTip("Click to open in browser (for OBS)") + self.web_link.setStyleSheet("QLabel a { color: #4CAF50; }") + links_layout.addWidget(self.web_link) + + # Multi-user sync display link (shown when server sync is enabled) + self.sync_link = QLabel("") + self.sync_link.setOpenExternalLinks(True) + self.sync_link.setStyleSheet("QLabel a { color: #2196F3; }") + self.sync_link.setVisible(False) + links_layout.addWidget(self.sync_link) + self._update_sync_link() + + main_layout.addWidget(links_widget) # Control buttons control_widget = QWidget() @@ -232,7 +245,7 @@ class MainWindow(QMainWindow): self.start_button.setStyleSheet("background-color: #2ecc71; color: white;") control_layout.addWidget(self.start_button) - self.clear_button = QPushButton("Clear") + self.clear_button = QPushButton("🗑 Clear") self.clear_button.setFixedSize(120, 50) self.clear_button.clicked.connect(self._clear_transcriptions) control_layout.addWidget(self.clear_button) @@ -246,6 +259,12 @@ class MainWindow(QMainWindow): main_layout.addWidget(control_widget) + # Version label (bottom right) + version_label = QLabel(f"v{__version__}") + version_label.setStyleSheet("QLabel { color: #666; font-size: 10px; }") + version_label.setAlignment(Qt.AlignRight) + main_layout.addWidget(version_label) + def _initialize_components(self): """Initialize RealtimeSTT transcription engine.""" # Update status @@ -271,6 +290,20 @@ class MainWindow(QMainWindow): user_name = self.config.get('user.name', 'User') + # Check 
for continuous/fast speaker mode + continuous_mode = self.config.get('transcription.continuous_mode', False) + + # Get timing settings - use faster values if continuous mode is enabled + if continuous_mode: + # Faster settings for speakers who talk without pauses + post_speech_silence = 0.15 # Reduced from default 0.3 + min_gap = 0.0 # No gap between recordings + min_recording = 0.3 # Shorter minimum recording + else: + post_speech_silence = self.config.get('transcription.post_speech_silence_duration', 0.3) + min_gap = self.config.get('transcription.min_gap_between_recordings', 0.0) + min_recording = self.config.get('transcription.min_length_of_recording', 0.5) + self.transcription_engine = RealtimeTranscriptionEngine( model=model, device=device, @@ -278,12 +311,13 @@ class MainWindow(QMainWindow): compute_type=compute_type, enable_realtime_transcription=self.config.get('transcription.enable_realtime_transcription', False), realtime_model=self.config.get('transcription.realtime_model', 'tiny.en'), + realtime_processing_pause=self.config.get('transcription.realtime_processing_pause', 0.1), silero_sensitivity=self.config.get('transcription.silero_sensitivity', 0.4), silero_use_onnx=self.config.get('transcription.silero_use_onnx', True), webrtc_sensitivity=self.config.get('transcription.webrtc_sensitivity', 3), - post_speech_silence_duration=self.config.get('transcription.post_speech_silence_duration', 0.3), - min_length_of_recording=self.config.get('transcription.min_length_of_recording', 0.5), - min_gap_between_recordings=self.config.get('transcription.min_gap_between_recordings', 0.0), + post_speech_silence_duration=post_speech_silence, + min_length_of_recording=min_recording, + min_gap_between_recordings=min_gap, pre_recording_buffer_duration=self.config.get('transcription.pre_recording_buffer_duration', 0.2), beam_size=self.config.get('transcription.beam_size', 5), initial_prompt=self.config.get('transcription.initial_prompt', ''), @@ -332,6 +366,12 @@ class 
MainWindow(QMainWindow): max_lines = self.config.get('display.max_lines', 50) font_family = self.config.get('display.font_family', 'Arial') font_size = self.config.get('display.font_size', 16) + fonts_dir = self.config.fonts_dir # Custom fonts directory + + # Font source settings + font_source = self.config.get('display.font_source', 'System Font') + websafe_font = self.config.get('display.websafe_font', 'Arial') + google_font = self.config.get('display.google_font', 'Roboto') # Try up to 5 ports if the default is in use ports_to_try = [port] + [port + i for i in range(1, 5)] @@ -346,7 +386,11 @@ class MainWindow(QMainWindow): fade_after_seconds=fade_after_seconds, max_lines=max_lines, font_family=font_family, - font_size=font_size + font_size=font_size, + fonts_dir=fonts_dir, + font_source=font_source, + websafe_font=websafe_font, + google_font=google_font ) self.web_server_thread = WebServerThread(self.web_server) self.web_server_thread.start() @@ -450,15 +494,21 @@ class MainWindow(QMainWindow): return try: - # Update display with preview (thread-safe Qt call) - from PySide6.QtCore import QMetaObject, Q_ARG - QMetaObject.invokeMethod( - self.transcription_display, - "add_transcription", - Qt.QueuedConnection, - Q_ARG(str, f"[PREVIEW] {result.text}"), - Q_ARG(str, result.user_name) - ) + # Broadcast preview to local web server + if self.web_server and self.web_server_thread and self.web_server_thread.loop: + asyncio.run_coroutine_threadsafe( + self.web_server.broadcast_preview( + result.text, + result.user_name, + result.timestamp + ), + self.web_server_thread.loop + ) + + # Send preview to server sync if enabled + if self.server_sync_client: + self.server_sync_client.send_preview(result.text, result.timestamp) + except Exception as e: print(f"Error handling realtime transcription: {e}") @@ -468,15 +518,8 @@ class MainWindow(QMainWindow): return try: - # Update display (thread-safe Qt call) - from PySide6.QtCore import QMetaObject, Q_ARG - 
QMetaObject.invokeMethod( - self.transcription_display, - "add_transcription", - Qt.QueuedConnection, - Q_ARG(str, result.text), - Q_ARG(str, result.user_name) - ) + # Store transcription for saving + self.transcriptions.append(result) # Broadcast to web server if enabled if self.web_server and self.web_server_thread: @@ -508,18 +551,27 @@ class MainWindow(QMainWindow): def _clear_transcriptions(self): """Clear all transcriptions.""" + if not self.transcriptions: + QMessageBox.information(self, "No Transcriptions", "There are no transcriptions to clear.") + return + reply = QMessageBox.question( self, "Clear Transcriptions", - "Are you sure you want to clear all transcriptions?", + f"Are you sure you want to clear {len(self.transcriptions)} transcription(s)?", QMessageBox.Yes | QMessageBox.No ) if reply == QMessageBox.Yes: - self.transcription_display.clear_all() + self.transcriptions.clear() + QMessageBox.information(self, "Cleared", "All transcriptions have been cleared.") def _save_transcriptions(self): """Save transcriptions to file.""" + if not self.transcriptions: + QMessageBox.warning(self, "No Transcriptions", "There are no transcriptions to save.") + return + filepath, _ = QFileDialog.getSaveFileName( self, "Save Transcriptions", @@ -528,10 +580,21 @@ class MainWindow(QMainWindow): ) if filepath: - if self.transcription_display.save_to_file(filepath): + try: + show_timestamps = self.config.get('display.show_timestamps', True) + with open(filepath, 'w', encoding='utf-8') as f: + for result in self.transcriptions: + line_parts = [] + if show_timestamps: + time_str = result.timestamp.strftime("%H:%M:%S") + line_parts.append(f"[{time_str}]") + if result.user_name and result.user_name.strip(): + line_parts.append(f"{result.user_name}:") + line_parts.append(result.text) + f.write(" ".join(line_parts) + "\n") QMessageBox.information(self, "Saved", f"Transcriptions saved to:\n{filepath}") - else: - QMessageBox.critical(self, "Error", "Failed to save 
transcriptions") + except Exception as e: + QMessageBox.critical(self, "Error", f"Failed to save transcriptions:\n{e}") def _open_settings(self): """Open settings dialog.""" @@ -569,22 +632,20 @@ class MainWindow(QMainWindow): user_name = self.config.get('user.name', 'User') self.user_label.setText(f"User: {user_name}") - # Update display settings - show_timestamps = self.config.get('display.show_timestamps', True) - self.transcription_display.set_max_lines(self.config.get('display.max_lines', 100)) - self.transcription_display.set_show_timestamps(show_timestamps) - self.transcription_display.set_font( - self.config.get('display.font_family', 'Courier'), - self.config.get('display.font_size', 12) - ) - # Update web server settings if self.web_server: - self.web_server.show_timestamps = show_timestamps + self.web_server.show_timestamps = self.config.get('display.show_timestamps', True) self.web_server.fade_after_seconds = self.config.get('display.fade_after_seconds', 10) self.web_server.max_lines = self.config.get('display.max_lines', 50) self.web_server.font_family = self.config.get('display.font_family', 'Arial') self.web_server.font_size = self.config.get('display.font_size', 16) + # Update font source settings + self.web_server.font_source = self.config.get('display.font_source', 'System Font') + self.web_server.websafe_font = self.config.get('display.websafe_font', 'Arial') + self.web_server.google_font = self.config.get('display.google_font', 'Roboto') + + # Update sync link visibility based on server sync settings + self._update_sync_link() # Restart server sync if it was running and settings changed if self.is_transcribing and self.server_sync_client: @@ -656,18 +717,33 @@ class MainWindow(QMainWindow): room = self.config.get('server_sync.room', 'default') passphrase = self.config.get('server_sync.passphrase', '') user_name = self.config.get('user.name', 'User') + fonts_dir = self.config.fonts_dir # Custom fonts directory + + # Font settings (shared with 
display settings) + # Note: "System Font" only works locally, so we treat it as "None" for server sync + font_source = self.config.get('display.font_source', 'System Font') + if font_source == "System Font": + font_source = "None" # System fonts don't work on remote displays + websafe_font = self.config.get('display.websafe_font', '') + google_font = self.config.get('display.google_font', '') + custom_font_file = self.config.get('display.custom_font_file', '') if not url: print("Server sync enabled but no URL configured") return - print(f"Starting server sync: {url}, room: {room}, user: {user_name}") + print(f"Starting server sync: {url}, room: {room}, user: {user_name}, font: {font_source}") self.server_sync_client = ServerSyncClient( url=url, room=room, passphrase=passphrase, - user_name=user_name + user_name=user_name, + fonts_dir=fonts_dir, + font_source=font_source, + websafe_font=websafe_font if websafe_font else None, + google_font=google_font if google_font else None, + custom_font_file=custom_font_file if custom_font_file else None ) self.server_sync_client.start() @@ -679,6 +755,40 @@ class MainWindow(QMainWindow): f"Failed to start server sync:\n{e}\n\nTranscription will continue locally." 
) + def _update_sync_link(self): + """Update the multi-user sync link visibility and URL.""" + server_sync_enabled = self.config.get('server_sync.enabled', False) + server_url = self.config.get('server_sync.url', '') + room = self.config.get('server_sync.room', 'default') + + if server_sync_enabled and server_url: + # Extract base URL from the API endpoint (e.g., http://server:3000/api/send -> http://server:3000) + try: + from urllib.parse import urlparse, urlencode + parsed = urlparse(server_url) + base_url = f"{parsed.scheme}://{parsed.netloc}" + + # Get display settings to pass as URL parameters + params = { + 'room': room, + 'fontfamily': self.config.get('display.font_family', 'Arial'), + 'fontsize': self.config.get('display.font_size', 16), + 'fade': self.config.get('display.fade_after_seconds', 10), + 'timestamps': 'true' if self.config.get('display.show_timestamps', True) else 'false', + 'maxlines': self.config.get('display.max_lines', 50) + } + display_url = f"{base_url}/display?{urlencode(params)}" + # Show shorter text with just address and room + display_text = f"{base_url} (room: {room})" + self.sync_link.setText(f'🔗 Multi-User Display: {display_text}') + self.sync_link.setToolTip(f"Click to open: {display_url}") + self.sync_link.setVisible(True) + except Exception as e: + print(f"Error parsing server URL: {e}") + self.sync_link.setVisible(False) + else: + self.sync_link.setVisible(False) + def closeEvent(self, event): """Handle window closing.""" # Stop transcription if running diff --git a/gui/settings_dialog_qt.py b/gui/settings_dialog_qt.py index 59d3d7b..73da05e 100644 --- a/gui/settings_dialog_qt.py +++ b/gui/settings_dialog_qt.py @@ -3,10 +3,11 @@ from PySide6.QtWidgets import ( QDialog, QVBoxLayout, QHBoxLayout, QFormLayout, QLabel, QLineEdit, QComboBox, QCheckBox, QSlider, - QPushButton, QMessageBox, QGroupBox, QScrollArea, QWidget + QPushButton, QMessageBox, QGroupBox, QScrollArea, QWidget, + QFileDialog ) from PySide6.QtCore import Qt -from 
PySide6.QtGui import QScreen +from PySide6.QtGui import QScreen, QFontDatabase from typing import Callable, List, Tuple @@ -179,6 +180,16 @@ class SettingsDialog(QDialog): self.realtime_model_combo.addItems(["tiny", "tiny.en", "base", "base.en"]) realtime_layout.addRow("Preview Model:", self.realtime_model_combo) + self.realtime_pause_input = QLineEdit() + self.realtime_pause_input.setToolTip( + "Seconds between preview updates:\n" + "• Lower values = More responsive, more frequent updates\n" + "• Higher values = Less CPU usage, updates less often\n" + "• 0.1 is recommended for real-time streaming\n" + "• Try 0.05 for even faster updates" + ) + realtime_layout.addRow("Preview Update Interval (s):", self.realtime_pause_input) + realtime_group.setLayout(realtime_layout) content_layout.addWidget(realtime_group) @@ -261,6 +272,16 @@ class SettingsDialog(QDialog): ) timing_layout.addRow("Pre-Recording Buffer (s):", self.pre_buffer_input) + self.continuous_mode_check = QCheckBox() + self.continuous_mode_check.setToolTip( + "Fast Speaker Mode:\n" + "• For speakers who talk quickly without pauses\n" + "• Reduces silence detection thresholds\n" + "• Produces more frequent transcription outputs\n" + "• May result in more fragmented sentences" + ) + timing_layout.addRow("Fast Speaker Mode:", self.continuous_mode_check) + timing_group.setLayout(timing_layout) content_layout.addWidget(timing_group) @@ -281,10 +302,79 @@ class SettingsDialog(QDialog): ) display_layout.addRow("Max Lines:", self.maxlines_input) + # Font source selector (shared for local display and server sync) + self.display_font_source_combo = QComboBox() + self.display_font_source_combo.addItems(["System Font", "Web-Safe", "Google Font", "Custom File"]) + self.display_font_source_combo.setToolTip( + "Choose font for local display and server sync:\n" + "• System Font - Local only (won't work with server sync)\n" + "• Web-Safe - Universal fonts (Arial, Comic Sans, etc.)\n" + "• Google Font - Free fonts from 
fonts.google.com\n" + "• Custom File - Upload your own font file" + ) + self.display_font_source_combo.currentTextChanged.connect(self._on_display_font_source_changed) + display_layout.addRow("Font Source:", self.display_font_source_combo) + + # System font selector self.font_family_combo = QComboBox() - self.font_family_combo.setToolTip("Font family for transcription display") - self.font_family_combo.addItems(["Courier", "Arial", "Times New Roman", "Consolas", "Monaco", "Monospace"]) - display_layout.addRow("Font Family:", self.font_family_combo) + self.font_family_combo.setToolTip("Font family for transcription display (system fonts)") + self.font_family_combo.setEditable(True) + self.font_family_combo.setMaxVisibleItems(20) + system_fonts = QFontDatabase.families() + common_fonts = ["Courier", "Arial", "Times New Roman", "Consolas", "Monaco", "Monospace"] + ordered_fonts = [] + for font in common_fonts: + if font in system_fonts: + ordered_fonts.append(font) + for font in sorted(system_fonts): + if font not in ordered_fonts: + ordered_fonts.append(font) + self.font_family_combo.addItems(ordered_fonts) + display_layout.addRow("System Font:", self.font_family_combo) + + # Web-safe font selector for display + self.display_websafe_combo = QComboBox() + display_websafe_fonts = [ + "Arial", "Arial Black", "Comic Sans MS", "Courier New", + "Georgia", "Impact", "Lucida Console", "Lucida Sans Unicode", + "Palatino Linotype", "Tahoma", "Times New Roman", "Trebuchet MS", "Verdana" + ] + self.display_websafe_combo.addItems(display_websafe_fonts) + self.display_websafe_combo.setToolTip("Web-safe fonts work on all systems") + display_layout.addRow("Web-Safe Font:", self.display_websafe_combo) + + # Google Font selector for display + self.display_google_font_combo = QComboBox() + display_google_fonts = [ + "Roboto", "Open Sans", "Lato", "Montserrat", "Poppins", + "Nunito", "Raleway", "Ubuntu", "Rubik", "Work Sans", + "Inter", "Outfit", "Quicksand", "Comfortaa", "Varela 
Round", + "Playfair Display", "Merriweather", "Lora", "PT Serif", "Crimson Text", + "Roboto Mono", "Source Code Pro", "Fira Code", "JetBrains Mono", "IBM Plex Mono", + "Bebas Neue", "Oswald", "Righteous", "Bangers", "Permanent Marker", + "Pacifico", "Lobster", "Dancing Script", "Caveat", "Satisfy" + ] + self.display_google_font_combo.addItems(display_google_fonts) + self.display_google_font_combo.setToolTip("Select a Google Font for display") + display_layout.addRow("Google Font:", self.display_google_font_combo) + + # Custom font file picker (for server sync upload) + custom_font_layout = QHBoxLayout() + self.display_custom_font_input = QLineEdit() + self.display_custom_font_input.setPlaceholderText("No file selected") + self.display_custom_font_input.setReadOnly(True) + self.display_custom_font_input.setToolTip( + "Select a font file to use:\n" + "• Supports .ttf, .otf, .woff, .woff2 files\n" + "• Font is uploaded to server automatically when using Server Sync" + ) + custom_font_layout.addWidget(self.display_custom_font_input) + + self.display_custom_font_browse = QPushButton("Browse...") + self.display_custom_font_browse.clicked.connect(self._browse_display_custom_font) + custom_font_layout.addWidget(self.display_custom_font_browse) + + display_layout.addRow("Custom Font File:", custom_font_layout) self.font_size_input = QLineEdit() self.font_size_input.setToolTip("Font size in pixels (12-20 recommended)") @@ -301,6 +391,9 @@ class SettingsDialog(QDialog): display_group.setLayout(display_layout) content_layout.addWidget(display_group) + # Initially show only System Font (default) + self._on_display_font_source_changed("System Font") + # Server Sync Group server_group = QGroupBox("Multi-User Server Sync (Optional)") server_layout = QFormLayout() @@ -339,9 +432,55 @@ class SettingsDialog(QDialog): ) server_layout.addRow("Passphrase:", self.server_passphrase_input) + # Note about font settings + font_note = QLabel("Font settings are in Display Settings above") + 
font_note.setStyleSheet("color: #666; font-style: italic;") + server_layout.addRow("", font_note) + server_group.setLayout(server_layout) content_layout.addWidget(server_group) + # Remote Processing Group + remote_group = QGroupBox("Remote Processing (GPU Offload)") + remote_layout = QFormLayout() + remote_layout.setSpacing(10) + + self.remote_enabled_check = QCheckBox() + self.remote_enabled_check.setToolTip( + "Enable remote transcription processing:\n" + "• Offload transcription to a GPU-equipped server\n" + "• Reduces local CPU/GPU usage\n" + "• Requires running the remote transcription service" + ) + remote_layout.addRow("Enable Remote Processing:", self.remote_enabled_check) + + self.remote_url_input = QLineEdit() + self.remote_url_input.setPlaceholderText("ws://your-server:8765/ws/transcribe") + self.remote_url_input.setToolTip( + "WebSocket URL of the remote transcription service:\n" + "• Format: ws://host:port/ws/transcribe\n" + "• Use wss:// for secure connections" + ) + remote_layout.addRow("Server URL:", self.remote_url_input) + + self.remote_api_key_input = QLineEdit() + self.remote_api_key_input.setEchoMode(QLineEdit.Password) + self.remote_api_key_input.setPlaceholderText("your-api-key") + self.remote_api_key_input.setToolTip( + "API key for authentication with the remote service" + ) + remote_layout.addRow("API Key:", self.remote_api_key_input) + + self.remote_fallback_check = QCheckBox("Enable") + self.remote_fallback_check.setChecked(True) + self.remote_fallback_check.setToolTip( + "Fall back to local transcription if remote service is unavailable" + ) + remote_layout.addRow("Fallback to Local:", self.remote_fallback_check) + + remote_group.setLayout(remote_layout) + content_layout.addWidget(remote_group) + # Add stretch to push everything to the top content_layout.addStretch() @@ -367,6 +506,77 @@ class SettingsDialog(QDialog): """Update the Silero sensitivity label.""" self.silero_label.setText(f"{value / 100:.2f}") + def 
_open_fonts_folder(self): + """Open the custom fonts folder in the system file manager.""" + import subprocess + import sys + from pathlib import Path + + fonts_dir = self.config.fonts_dir + + # Ensure the folder exists + fonts_dir.mkdir(parents=True, exist_ok=True) + + # Open the folder in the system file manager + if sys.platform == 'win32': + subprocess.run(['explorer', str(fonts_dir)]) + elif sys.platform == 'darwin': + subprocess.run(['open', str(fonts_dir)]) + else: + # Linux + subprocess.run(['xdg-open', str(fonts_dir)]) + + def _on_display_font_source_changed(self, source: str): + """Show/hide display font inputs based on selected source.""" + # Hide all font-specific inputs first + self.font_family_combo.setVisible(False) + self.display_websafe_combo.setVisible(False) + self.display_google_font_combo.setVisible(False) + self.display_custom_font_input.setVisible(False) + self.display_custom_font_browse.setVisible(False) + + # Find the form layout rows and hide/show labels too + parent = self.display_font_source_combo.parent() + display_layout = parent.layout() if parent else None + if display_layout and hasattr(display_layout, 'rowCount'): + for i in range(display_layout.rowCount()): + label = display_layout.itemAt(i, QFormLayout.LabelRole) + field = display_layout.itemAt(i, QFormLayout.FieldRole) + if label and field: + label_widget = label.widget() + if label_widget: + label_text = label_widget.text() + if label_text == "System Font:": + label_widget.setVisible(source == "System Font") + elif label_text == "Web-Safe Font:": + label_widget.setVisible(source == "Web-Safe") + elif label_text == "Google Font:": + label_widget.setVisible(source == "Google Font") + elif label_text == "Custom Font File:": + label_widget.setVisible(source == "Custom File") + + # Show the relevant input + if source == "System Font": + self.font_family_combo.setVisible(True) + elif source == "Web-Safe": + self.display_websafe_combo.setVisible(True) + elif source == "Google Font": + 
self.display_google_font_combo.setVisible(True) + elif source == "Custom File": + self.display_custom_font_input.setVisible(True) + self.display_custom_font_browse.setVisible(True) + + def _browse_display_custom_font(self): + """Browse for a custom font file.""" + file_path, _ = QFileDialog.getOpenFileName( + self, + "Select Font File", + "", + "Font Files (*.ttf *.otf *.woff *.woff2);;All Files (*)" + ) + if file_path: + self.display_custom_font_input.setText(file_path) + def _load_current_settings(self): """Load current settings from config.""" # User settings @@ -402,6 +612,7 @@ class SettingsDialog(QDialog): self.realtime_enabled_check.setChecked(self.config.get('transcription.enable_realtime_transcription', False)) realtime_model = self.config.get('transcription.realtime_model', 'tiny.en') self.realtime_model_combo.setCurrentText(realtime_model) + self.realtime_pause_input.setText(str(self.config.get('transcription.realtime_processing_pause', 0.1))) # VAD settings silero_sens = self.config.get('transcription.silero_sensitivity', 0.4) @@ -417,13 +628,23 @@ class SettingsDialog(QDialog): self.post_silence_input.setText(str(self.config.get('transcription.post_speech_silence_duration', 0.3))) self.min_recording_input.setText(str(self.config.get('transcription.min_length_of_recording', 0.5))) self.pre_buffer_input.setText(str(self.config.get('transcription.pre_recording_buffer_duration', 0.2))) + self.continuous_mode_check.setChecked(self.config.get('transcription.continuous_mode', False)) # Display settings self.timestamps_check.setChecked(self.config.get('display.show_timestamps', True)) self.maxlines_input.setText(str(self.config.get('display.max_lines', 100))) + # Display font settings + display_font_source = self.config.get('display.font_source', 'System Font') + self.display_font_source_combo.setCurrentText(display_font_source) font_family = self.config.get('display.font_family', 'Courier') self.font_family_combo.setCurrentText(font_family) + 
self.display_websafe_combo.setCurrentText(self.config.get('display.websafe_font', 'Arial')) + display_google_font = self.config.get('display.google_font', 'Roboto') + if display_google_font: + self.display_google_font_combo.setCurrentText(display_google_font) + self.display_custom_font_input.setText(self.config.get('display.custom_font_file', '')) + self._on_display_font_source_changed(display_font_source) self.font_size_input.setText(str(self.config.get('display.font_size', 12))) self.fade_seconds_input.setText(str(self.config.get('display.fade_after_seconds', 10))) @@ -434,6 +655,12 @@ class SettingsDialog(QDialog): self.server_room_input.setText(self.config.get('server_sync.room', 'default')) self.server_passphrase_input.setText(self.config.get('server_sync.passphrase', '')) + # Remote processing settings + self.remote_enabled_check.setChecked(self.config.get('remote_processing.enabled', False)) + self.remote_url_input.setText(self.config.get('remote_processing.server_url', '')) + self.remote_api_key_input.setText(self.config.get('remote_processing.api_key', '')) + self.remote_fallback_check.setChecked(self.config.get('remote_processing.fallback_to_local', True)) + def _save_settings(self): """Save settings to config.""" try: @@ -459,6 +686,7 @@ class SettingsDialog(QDialog): # Realtime preview self.config.set('transcription.enable_realtime_transcription', self.realtime_enabled_check.isChecked()) self.config.set('transcription.realtime_model', self.realtime_model_combo.currentText()) + self.config.set('transcription.realtime_processing_pause', float(self.realtime_pause_input.text())) # VAD settings self.config.set('transcription.silero_sensitivity', self.silero_slider.value() / 100.0) @@ -469,12 +697,20 @@ class SettingsDialog(QDialog): self.config.set('transcription.post_speech_silence_duration', float(self.post_silence_input.text())) self.config.set('transcription.min_length_of_recording', float(self.min_recording_input.text())) 
self.config.set('transcription.pre_recording_buffer_duration', float(self.pre_buffer_input.text())) + self.config.set('transcription.continuous_mode', self.continuous_mode_check.isChecked()) # Display settings self.config.set('display.show_timestamps', self.timestamps_check.isChecked()) max_lines = int(self.maxlines_input.text()) self.config.set('display.max_lines', max_lines) + + # Display font settings (also used for server sync) + self.config.set('display.font_source', self.display_font_source_combo.currentText()) self.config.set('display.font_family', self.font_family_combo.currentText()) + self.config.set('display.websafe_font', self.display_websafe_combo.currentText()) + self.config.set('display.google_font', self.display_google_font_combo.currentText()) + self.config.set('display.custom_font_file', self.display_custom_font_input.text()) + font_size = int(self.font_size_input.text()) self.config.set('display.font_size', font_size) fade_seconds = int(self.fade_seconds_input.text()) @@ -486,6 +722,12 @@ class SettingsDialog(QDialog): self.config.set('server_sync.room', self.server_room_input.text()) self.config.set('server_sync.passphrase', self.server_passphrase_input.text()) + # Remote processing settings + self.config.set('remote_processing.enabled', self.remote_enabled_check.isChecked()) + self.config.set('remote_processing.server_url', self.remote_url_input.text()) + self.config.set('remote_processing.api_key', self.remote_api_key_input.text()) + self.config.set('remote_processing.fallback_to_local', self.remote_fallback_check.isChecked()) + # Call save callback (which will show the success message) if self.on_save: self.on_save() diff --git a/gui/transcription_display_qt.py b/gui/transcription_display_qt.py index e7ca667..26d0e6e 100644 --- a/gui/transcription_display_qt.py +++ b/gui/transcription_display_qt.py @@ -1,7 +1,7 @@ """PySide6 transcription display widget for showing real-time transcriptions.""" from PySide6.QtWidgets import QTextEdit -from 
PySide6.QtGui import QFont, QTextCursor +from PySide6.QtGui import QFont, QTextCursor, QTextCharFormat, QColor from PySide6.QtCore import Qt, Slot from datetime import datetime @@ -28,6 +28,10 @@ class TranscriptionDisplay(QTextEdit): self.font_family = font_family self.font_size = font_size + # Track the current preview line for two-stage transcription + self.preview_line_index = -1 # -1 means no active preview + self.preview_block_number = -1 # Block number for the preview line + # Configure text widget self.setReadOnly(True) self.setFont(QFont(font_family, font_size)) @@ -43,6 +47,36 @@ class TranscriptionDisplay(QTextEdit): } """) + def _format_line(self, text: str, user_name: str, timestamp: datetime, is_preview: bool = False) -> str: + """ + Format a transcription line. + + Args: + text: Transcription text + user_name: User/speaker name + timestamp: Timestamp of transcription + is_preview: Whether this is a preview line + + Returns: + Formatted line string + """ + line_parts = [] + + if self.show_timestamps: + time_str = timestamp.strftime("%H:%M:%S") + line_parts.append(f"[{time_str}]") + + if user_name and user_name.strip(): + line_parts.append(f"{user_name}:") + + # Add preview indicator for visual distinction + if is_preview: + line_parts.append(f"[...] 
{text}") + else: + line_parts.append(text) + + return " ".join(line_parts) + @Slot(str, str) def add_transcription(self, text: str, user_name: str = "", timestamp: datetime = None): """ @@ -56,35 +90,130 @@ class TranscriptionDisplay(QTextEdit): if timestamp is None: timestamp = datetime.now() - # Build the display line - line_parts = [] + line = self._format_line(text, user_name, timestamp, is_preview=False) - if self.show_timestamps: - time_str = timestamp.strftime("%H:%M:%S") - line_parts.append(f"[{time_str}]") - - if user_name: - line_parts.append(f"{user_name}:") - - line_parts.append(text) - - line = " ".join(line_parts) - - # Add to display - self.append(line) + # If there's an active preview, replace it instead of appending + if self.preview_line_index >= 0: + self._replace_preview_with_final(line) + else: + # Add to display normally + self.append(line) + self.line_count += 1 # Auto-scroll to bottom cursor = self.textCursor() cursor.movePosition(QTextCursor.End) self.setTextCursor(cursor) - # Track line count - self.line_count += 1 - # Remove old lines if exceeding max if self.line_count > self.max_lines: self._remove_oldest_lines(self.line_count - self.max_lines) + @Slot(str, str) + def add_preview(self, text: str, user_name: str = "", timestamp: datetime = None): + """ + Add a preview transcription that will be replaced by the final transcription. 
+ + Args: + text: Preview transcription text + user_name: User/speaker name + timestamp: Timestamp of transcription + """ + if timestamp is None: + timestamp = datetime.now() + + line = self._format_line(text, user_name, timestamp, is_preview=True) + + # If there's already a preview, replace it + if self.preview_line_index >= 0: + self._replace_preview_line(line) + else: + # Add new preview line + cursor = self.textCursor() + cursor.movePosition(QTextCursor.End) + + # Apply italic formatting for preview + fmt = QTextCharFormat() + fmt.setFontItalic(True) + + if self.line_count > 0: + cursor.insertText("\n") + + cursor.insertText(line, fmt) + + self.preview_line_index = self.line_count + self.preview_block_number = self.document().blockCount() - 1 + self.line_count += 1 + + # Auto-scroll to bottom + cursor = self.textCursor() + cursor.movePosition(QTextCursor.End) + self.setTextCursor(cursor) + + def _replace_preview_line(self, new_text: str): + """Replace the current preview line with new preview text.""" + if self.preview_block_number < 0: + return + + doc = self.document() + block = doc.findBlockByNumber(self.preview_block_number) + + if block.isValid(): + cursor = QTextCursor(block) + cursor.select(QTextCursor.BlockUnderCursor) + + # Apply italic formatting for preview + fmt = QTextCharFormat() + fmt.setFontItalic(True) + + cursor.removeSelectedText() + cursor.insertText(new_text, fmt) + + def _replace_preview_with_final(self, final_text: str): + """Replace the preview line with final transcription.""" + if self.preview_block_number < 0: + # No preview to replace, just add normally + self.append(final_text) + self.line_count += 1 + self.preview_line_index = -1 + self.preview_block_number = -1 + return + + doc = self.document() + block = doc.findBlockByNumber(self.preview_block_number) + + if block.isValid(): + cursor = QTextCursor(block) + cursor.select(QTextCursor.BlockUnderCursor) + + # Apply normal formatting for final text + fmt = QTextCharFormat() + 
fmt.setFontItalic(False) + fmt.setForeground(QColor(255, 255, 255)) # White for final + + cursor.removeSelectedText() + cursor.insertText(final_text, fmt) + + # Clear preview tracking + self.preview_line_index = -1 + self.preview_block_number = -1 + + def clear_preview(self): + """Clear the current preview without adding a final transcription.""" + if self.preview_block_number >= 0: + doc = self.document() + block = doc.findBlockByNumber(self.preview_block_number) + + if block.isValid(): + cursor = QTextCursor(block) + cursor.select(QTextCursor.BlockUnderCursor) + cursor.removeSelectedText() + cursor.deleteChar() # Remove newline + self.line_count -= 1 + + self.preview_line_index = -1 + self.preview_block_number = -1 + def _remove_oldest_lines(self, num_lines: int): """ Remove oldest lines from the display. @@ -102,10 +231,20 @@ class TranscriptionDisplay(QTextEdit): self.line_count -= num_lines + # Adjust preview tracking if lines were removed + if self.preview_line_index >= 0: + self.preview_line_index -= num_lines + self.preview_block_number -= num_lines + if self.preview_line_index < 0: + self.preview_line_index = -1 + self.preview_block_number = -1 + def clear_all(self): """Clear all transcriptions.""" self.clear() self.line_count = 0 + self.preview_line_index = -1 + self.preview_block_number = -1 def get_all_text(self) -> str: """ diff --git a/main.py b/main.py index fafd44d..36bc373 100644 --- a/main.py +++ b/main.py @@ -41,43 +41,68 @@ if getattr(sys, 'frozen', False) and sys.platform == 'win32': sys.stderr = io.StringIO() # Add project root to Python path -project_root = Path(__file__).parent +# Use resolve() to follow symlinks and get the real path +project_root = Path(__file__).resolve().parent sys.path.insert(0, str(project_root)) -from PySide6.QtWidgets import QApplication, QSplashScreen -from PySide6.QtGui import QPixmap, QPainter, QColor, QFont -from PySide6.QtCore import Qt, QTimer -from gui.main_window_qt import MainWindow +# Change working 
directory to project root so relative paths work +os.chdir(project_root) + +# Import only minimal Qt components needed for splash and dialogs +# Heavy imports (MainWindow) are deferred until after splash is shown +from PySide6.QtWidgets import QApplication, QSplashScreen, QMessageBox +from PySide6.QtGui import QPixmap, QPainter, QColor, QFont, QIcon +from PySide6.QtCore import Qt + +# Import single instance lock (lightweight module) +from client.instance_lock import InstanceLock + + +def get_icon_path(): + """Get the application icon path.""" + if getattr(sys, 'frozen', False): + # Running in PyInstaller bundle + return Path(sys._MEIPASS) / "LocalTranscription.png" + else: + # Running in normal Python + return project_root / "LocalTranscription.png" def create_splash_pixmap(message="Loading..."): - """Create a pixmap for the splash screen with a custom message.""" - pixmap = QPixmap(500, 300) + """Create a pixmap for the splash screen with the app icon.""" + pixmap = QPixmap(400, 320) pixmap.fill(QColor("#2b2b2b")) # Draw on the pixmap painter = QPainter(pixmap) painter.setRenderHint(QPainter.Antialiasing) + painter.setRenderHint(QPainter.SmoothPixmapTransform) - # Draw title - title_font = QFont("Arial", 28, QFont.Bold) - painter.setFont(title_font) - painter.setPen(QColor("#ffffff")) - painter.drawText(pixmap.rect(), Qt.AlignCenter, "Local Transcription") + # Load and draw the icon + icon_path = get_icon_path() + if icon_path.exists(): + icon_pixmap = QPixmap(str(icon_path)) + # Scale icon to fit nicely (200x200) + scaled_icon = icon_pixmap.scaled(200, 200, Qt.KeepAspectRatio, Qt.SmoothTransformation) + # Center the icon horizontally, position it in upper portion + icon_x = (pixmap.width() - scaled_icon.width()) // 2 + icon_y = 30 + painter.drawPixmap(icon_x, icon_y, scaled_icon) - # Draw subtitle + # Draw loading message below icon subtitle_font = QFont("Arial", 12) painter.setFont(subtitle_font) painter.setPen(QColor("#888888")) - subtitle_rect = 
pixmap.rect().adjusted(0, 60, 0, 0) - painter.drawText(subtitle_rect, Qt.AlignCenter, message) + subtitle_rect = pixmap.rect().adjusted(0, 0, 0, -40) + painter.drawText(subtitle_rect, Qt.AlignHCenter | Qt.AlignBottom, message) # Draw version/status at bottom + from version import __version__ status_font = QFont("Arial", 10) painter.setFont(status_font) painter.setPen(QColor("#666666")) - status_rect = pixmap.rect().adjusted(0, 0, 0, -20) - painter.drawText(status_rect, Qt.AlignHCenter | Qt.AlignBottom, "Please wait...") + status_rect = pixmap.rect().adjusted(0, 0, 0, -15) + painter.drawText(status_rect, Qt.AlignHCenter | Qt.AlignBottom, f"v{__version__}") painter.end() return pixmap @@ -93,11 +118,14 @@ def create_splash_screen(): def main(): """Main application entry point.""" + # Instance lock for cleanup on exit + instance_lock = None + try: print("Starting Local Transcription Application...") print("=" * 50) - # Create Qt application + # Create Qt application first (needed for dialogs) app = QApplication(sys.argv) # Set application info @@ -105,19 +133,24 @@ def main(): app.setOrganizationName("LocalTranscription") # Set application icon - # In PyInstaller frozen executables, use _MEIPASS for bundled files - if getattr(sys, 'frozen', False): - # Running in PyInstaller bundle - icon_path = Path(sys._MEIPASS) / "LocalTranscription.png" - else: - # Running in normal Python - icon_path = project_root / "LocalTranscription.png" - + icon_path = get_icon_path() if icon_path.exists(): - from PySide6.QtGui import QIcon app.setWindowIcon(QIcon(str(icon_path))) - # Create and show splash screen + # Check for single instance BEFORE showing splash + instance_lock = InstanceLock() + if not instance_lock.acquire(): + # Another instance is already running + QMessageBox.warning( + None, + "Application Already Running", + "Local Transcription is already running.\n\n" + "Please check your taskbar or system tray for the existing instance.", + QMessageBox.Ok + ) + sys.exit(0) + + # 
Create and show splash screen IMMEDIATELY splash = create_splash_screen() splash.show() app.processEvents() # Make sure splash is visible @@ -126,6 +159,13 @@ def main(): splash.showMessage("Loading configuration...", Qt.AlignBottom | Qt.AlignCenter, QColor("#888888")) app.processEvents() + # NOW import heavy modules (after splash is visible) + # This is the slow part - importing MainWindow loads many dependencies + splash.showMessage("Loading application modules...", Qt.AlignBottom | Qt.AlignCenter, QColor("#888888")) + app.processEvents() + + from gui.main_window_qt import MainWindow + # Create main window (this takes time due to model loading) # Pass splash to window so it can update the message window = MainWindow(splash_screen=splash) @@ -135,15 +175,25 @@ def main(): window.show() # Run application - sys.exit(app.exec()) + exit_code = app.exec() + + # Release lock on normal exit + if instance_lock: + instance_lock.release() + + sys.exit(exit_code) except KeyboardInterrupt: print("\nApplication interrupted by user") + if instance_lock: + instance_lock.release() sys.exit(0) except Exception as e: print(f"Fatal error: {e}") import traceback traceback.print_exc() + if instance_lock: + instance_lock.release() sys.exit(1) diff --git a/pyproject.toml b/pyproject.toml index 1b8701d..6eaf6a9 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "local-transcription" -version = "0.1.0" +version = "1.0.0" description = "A standalone desktop application for real-time speech-to-text transcription using Whisper models" readme = "README.md" requires-python = ">=3.9" diff --git a/server/COMPARISON.md b/server/COMPARISON.md deleted file mode 100644 index ee7c105..0000000 --- a/server/COMPARISON.md +++ /dev/null @@ -1,308 +0,0 @@ -# Multi-User Server Comparison - -## TL;DR: Which Should You Use? 
- -| Situation | Recommended Solution | -|-----------|---------------------| -| **Shared hosting (cPanel, etc.)** | **PHP Polling** (display-polling.php) | -| **VPS or cloud server** | **Node.js** (best performance) | -| **Quick test/demo** | **PHP Polling** (easiest) | -| **Production with many users** | **Node.js** (most reliable) | -| **No server access** | Use local-only mode | - -## Detailed Comparison - -### 1. PHP with SSE (Original - server.php + display.php) - -**Status:** ⚠️ **PROBLEMATIC** - Not recommended - -**Problems:** -- PHP-FPM buffers output (SSE doesn't work) -- Apache/Nginx proxy timeouts -- Shared hosting often blocks long connections -- High resource usage (one PHP process per viewer) - -**When it might work:** -- Only with specific Apache configurations -- Not on shared hosting with PHP-FPM -- Requires `ProxyTimeout` settings - -**Verdict:** ❌ Avoid unless you have full server control and can configure Apache properly - ---- - -### 2. PHP with Polling (NEW - display-polling.php) - -**Status:** ✅ **RECOMMENDED for PHP** - -**Pros:** -- ✅ Works on ANY shared hosting -- ✅ No buffering issues -- ✅ No special configuration needed -- ✅ Simple to deploy (just upload files) -- ✅ Uses standard HTTP requests - -**Cons:** -- ❌ Higher latency (1-2 seconds) -- ❌ More server requests (polls every second) -- ❌ Slightly higher bandwidth - -**Performance:** -- Latency: 1-2 seconds -- Max users: 20-30 concurrent viewers -- Resource usage: Moderate - -**Best for:** -- Shared hosting (cPanel, Bluehost, etc.) -- Quick deployment -- Small to medium groups - -**Setup:** -```bash -# Just upload these files: -server.php -display-polling.php # ← Use this instead of display.php -config.php -``` - -**OBS URL:** -``` -https://your-site.com/transcription/display-polling.php?room=ROOM&fade=10 -``` - ---- - -### 3. 
Node.js Server (NEW - server/nodejs/) - -**Status:** ⭐ **BEST PERFORMANCE** - -**Pros:** -- ✅ Native WebSocket support -- ✅ Real-time updates (< 100ms latency) -- ✅ Handles 100+ concurrent connections easily -- ✅ Lower resource usage -- ✅ No buffering issues -- ✅ Event-driven architecture - -**Cons:** -- ❌ Requires VPS or cloud server -- ❌ Need to install Node.js -- ❌ More setup than PHP - -**Performance:** -- Latency: < 100ms -- Max users: 500+ concurrent -- Resource usage: Very low (~50MB RAM) - -**Best for:** -- Production deployments -- Large groups (10+ streamers) -- Professional use -- Anyone with a VPS - -**Setup:** -```bash -cd server/nodejs -npm install -npm start -``` - -**Free hosting options:** -- Railway.app (free tier) -- Heroku (free tier) -- Fly.io (free tier) -- Any $5/month VPS (DigitalOcean, Linode) - -**OBS URL:** -``` -http://your-server.com:3000/display?room=ROOM&fade=10 -``` - ---- - -## Feature Comparison Matrix - -| Feature | PHP SSE | PHP Polling | Node.js | -|---------|---------|-------------|---------| -| **Real-time** | ⚠️ Should be, but breaks | ⚠️ 1-2s delay | ✅ < 100ms | -| **Reliability** | ❌ Buffering issues | ✅ Very reliable | ✅ Very reliable | -| **Shared Hosting** | ❌ Usually fails | ✅ Works everywhere | ❌ Needs VPS | -| **Setup Difficulty** | 🟡 Medium | 🟢 Easy | 🟡 Medium | -| **Max Users** | 10 | 30 | 500+ | -| **Resource Usage** | High | Medium | Low | -| **Latency** | Should be instant, but... | 1-2 seconds | < 100ms | -| **Cost** | $5-10/month hosting | $5-10/month hosting | Free - $5/month | - ---- - -## Migration Guide - -### From PHP SSE to PHP Polling - -**Super easy - just change the URL:** - -Old: -``` -https://your-site.com/transcription/display.php?room=ROOM -``` - -New: -``` -https://your-site.com/transcription/display-polling.php?room=ROOM -``` - -Everything else stays the same! The desktop app doesn't need changes. - ---- - -### From PHP to Node.js - -**1. 
Deploy Node.js server** (see server/nodejs/README.md) - -**2. Update desktop app settings:** - -Old (PHP): -``` -Server URL: https://your-site.com/transcription/server.php -``` - -New (Node.js): -``` -Server URL: http://your-server.com:3000/api/send -``` - -**3. Update OBS browser source:** - -Old (PHP): -``` -https://your-site.com/transcription/display.php?room=ROOM -``` - -New (Node.js): -``` -http://your-server.com:3000/display?room=ROOM&fade=10 -``` - ---- - -## Testing Your Setup - -### Test PHP Polling - -1. Upload files to server -2. Visit: `https://your-site.com/transcription/server.php` - - Should see JSON response -3. Visit: `https://your-site.com/transcription/display-polling.php?room=test` - - Should see "🟡 Waiting for data..." -4. Send a test message: - ```bash - curl -X POST "https://your-site.com/transcription/server.php?action=send" \ - -H "Content-Type: application/json" \ - -d '{ - "room": "test", - "passphrase": "testpass", - "user_name": "TestUser", - "text": "Hello World", - "timestamp": "12:34:56" - }' - ``` -5. Display should show "Hello World" within 1-2 seconds - -### Test Node.js - -1. Start server: `npm start` -2. Visit: `http://localhost:3000` - - Should see JSON response -3. Visit: `http://localhost:3000/display?room=test` - - Should see "⚫ Connecting..." then "🟢 Connected" -4. Send test message (same curl as above, but to `http://localhost:3000/api/send`) -5. 
Display should show message instantly - ---- - -## Troubleshooting - -### PHP Polling Issues - -**"Status stays yellow"** -- Room doesn't exist yet -- Send a message from desktop app first - -**"Gets 500 error"** -- Check PHP error logs -- Verify `data/` directory is writable - -**"Slow updates (5+ seconds)"** -- Increase poll interval: `?poll=500` (500ms) -- Check server load - -### Node.js Issues - -**"Cannot connect"** -- Check firewall allows port 3000 -- Verify server is running: `curl http://localhost:3000` - -**"WebSocket failed"** -- Check browser console for errors -- Try different port -- Check reverse proxy settings if using Nginx - ---- - -## Recommendations by Use Case - -### Solo Streamer (Local Only) -**Use:** Built-in web server (no multi-user server needed) -- Just run the desktop app -- OBS: `http://localhost:8080` - -### 2-3 Friends on Shared Hosting -**Use:** PHP Polling -- Upload to your existing web hosting -- Cost: $0 (use existing hosting) -- Setup time: 5 minutes - -### 5+ Streamers, Want Best Quality -**Use:** Node.js on VPS -- Deploy to Railway.app (free) or DigitalOcean ($5/month) -- Real-time updates -- Professional quality - -### Large Event/Convention -**Use:** Node.js on cloud -- Deploy to AWS/Azure/GCP -- Use load balancer for redundancy -- Can handle hundreds of users - ---- - -## Cost Breakdown - -### PHP Polling -- **Shared hosting:** $5-10/month (or free if you already have hosting) -- **Total:** $5-10/month - -### Node.js -- **Free options:** - - Railway.app (500 hours/month free) - - Heroku (free dyno) - - Fly.io (free tier) -- **Paid options:** - - DigitalOcean Droplet: $5/month - - Linode: $5/month - - AWS EC2 t2.micro: $8/month (or free tier) -- **Total:** $0-8/month - -### Just Use Local Mode -- **Cost:** $0 -- **Limitation:** Only shows your own transcriptions (no multi-user sync) - ---- - -## Final Recommendation - -**For most users:** Start with **PHP Polling** on shared hosting. It works reliably and is dead simple. 
- -**If you want the best:** Use **Node.js** - it's worth the extra setup for the performance. - -**For testing:** Use **local mode** (no server) - built into the desktop app. diff --git a/server/QUICK_FIX.md b/server/QUICK_FIX.md deleted file mode 100644 index b9682e6..0000000 --- a/server/QUICK_FIX.md +++ /dev/null @@ -1,218 +0,0 @@ -# Quick Fix for Multi-User Display Issues - -## The Problem - -Your PHP SSE (Server-Sent Events) setup isn't working because: -1. **PHP-FPM buffers output** - Shared hosting uses PHP-FPM which buffers everything -2. **Apache/Nginx timeouts** - Proxy kills long connections -3. **SSE isn't designed for PHP** - PHP processes are meant to be short-lived - -## The Solutions (in order of recommendation) - ---- - -### ✅ Solution 1: Use PHP Polling (Easiest Fix) - -**What changed:** Instead of SSE (streaming), use regular HTTP polling every 1 second - -**Files affected:** -- **Keep:** `server.php`, `config.php` (no changes needed) -- **Replace:** Use `display-polling.php` instead of `display.php` - -**Setup:** -1. Upload `display-polling.php` to your server -2. Change your OBS Browser Source URL from: - ``` - OLD: https://your-site.com/transcription/display.php?room=ROOM - NEW: https://your-site.com/transcription/display-polling.php?room=ROOM - ``` -3. Done! No other changes needed. - -**Pros:** -- ✅ Works on ANY shared hosting -- ✅ No server configuration needed -- ✅ Uses your existing setup -- ✅ 5-minute fix - -**Cons:** -- ⚠️ 1-2 second latency (vs instant with WebSocket) -- ⚠️ More server requests (but minimal impact) - -**Performance:** Good for 2-20 concurrent users - ---- - -### ⭐ Solution 2: Use Node.js Server (Best Performance) - -**What changed:** Switch from PHP to Node.js - designed for real-time - -**Setup:** -1. Get a VPS (or use free hosting like Railway.app) -2. Install Node.js: - ```bash - cd server/nodejs - npm install - npm start - ``` -3. 
Update desktop app Server URL to: - ``` - http://your-server.com:3000/api/send - ``` -4. Update OBS URL to: - ``` - http://your-server.com:3000/display?room=ROOM - ``` - -**Pros:** -- ✅ Real-time (< 100ms latency) -- ✅ Handles 100+ users easily -- ✅ Native WebSocket support -- ✅ Lower resource usage -- ✅ Can use free hosting (Railway, Heroku, Fly.io) - -**Cons:** -- ❌ Requires VPS or cloud hosting (can't use shared hosting) -- ❌ More setup than PHP - -**Performance:** Excellent for any number of users - -**Free Hosting Options:** -- Railway.app (easiest - just connect GitHub) -- Heroku (free tier) -- Fly.io (free tier) - ---- - -### 🔧 Solution 3: Fix PHP SSE (Advanced - Not Recommended) - -**Only if you have full server control and really want SSE** - -This requires: -1. Apache configuration changes -2. Disabling output buffering -3. Increasing timeouts - -See `apache-sse-config.conf` for details. - -**Not recommended because:** It's complex, fragile, and PHP polling is easier and more reliable. - ---- - -## Quick Comparison - -| Solution | Setup Time | Reliability | Latency | Works on Shared Hosting? | -|----------|-----------|-------------|---------|-------------------------| -| **PHP Polling** | 5 min | ⭐⭐⭐⭐⭐ | 1-2s | ✅ Yes | -| **Node.js** | 30 min | ⭐⭐⭐⭐⭐ | < 100ms | ❌ No (needs VPS) | -| **PHP SSE** | 2 hours | ⭐⭐ | Should be instant | ❌ Rarely | - ---- - -## Testing Your Fix - -### Test PHP Polling - -1. Run the test script: - ```bash - cd server - ./test-server.sh - ``` - -2. Or manually: - ```bash - # Send a test message - curl -X POST "https://your-site.com/transcription/server.php?action=send" \ - -H "Content-Type: application/json" \ - -d '{ - "room": "test", - "passphrase": "testpass", - "user_name": "TestUser", - "text": "Hello World", - "timestamp": "12:34:56" - }' - - # Open in browser: - https://your-site.com/transcription/display-polling.php?room=test - - # Should see "Hello World" appear within 1-2 seconds - ``` - -### Test Node.js - -1. 
Start server: - ```bash - cd server/nodejs - npm install - npm start - ``` - -2. Open browser: - ``` - http://localhost:3000/display?room=test - ``` - -3. Send test message: - ```bash - curl -X POST "http://localhost:3000/api/send" \ - -H "Content-Type: application/json" \ - -d '{ - "room": "test", - "passphrase": "testpass", - "user_name": "TestUser", - "text": "Hello World", - "timestamp": "12:34:56" - }' - ``` - -4. Should see message appear **instantly** - ---- - -## My Recommendation - -**Start with PHP Polling** (Solution 1): -- Upload `display-polling.php` -- Change OBS URL -- Test it out - -**If you like it and want better performance**, migrate to Node.js (Solution 2): -- Takes 30 minutes -- Much better performance -- Can use free hosting - -**Forget about PHP SSE** (Solution 3): -- Too much work -- Unreliable -- Not worth it - ---- - -## Files You Need - -### For PHP Polling -- ✅ `server.php` (already have) -- ✅ `config.php` (already have) -- ✅ `display-polling.php` (NEW - just created) -- ❌ `display.php` (don't use anymore) - -### For Node.js -- ✅ `server/nodejs/server.js` (NEW) -- ✅ `server/nodejs/package.json` (NEW) -- ✅ `server/nodejs/README.md` (NEW) - ---- - -## Need Help? - -1. Read [COMPARISON.md](COMPARISON.md) for detailed comparison -2. Read [server/nodejs/README.md](nodejs/README.md) for Node.js setup -3. Run `./test-server.sh` to diagnose issues -4. Check browser console for errors - ---- - -## Bottom Line - -**Your SSE display doesn't work because PHP + shared hosting + SSE = bad combo.** - -**Use PHP Polling (1-2s delay) or Node.js (instant).** Both work reliably. 
diff --git a/server/SYNC_PERFORMANCE.md b/server/SYNC_PERFORMANCE.md deleted file mode 100644 index f8e0ac9..0000000 --- a/server/SYNC_PERFORMANCE.md +++ /dev/null @@ -1,248 +0,0 @@ -# Server Sync Performance - Before vs After - -## The Problem You Experienced - -**Symptom:** Shared sync display was several seconds behind local transcription - -**Why:** The test script worked fast because it sent ONE message. But the Python app sends messages continuously during speech, and they were getting queued up! - ---- - -## Before Fix: Serial Processing ❌ - -``` -You speak: "Hello" "How" "are" "you" "today" - ↓ ↓ ↓ ↓ ↓ -Local GUI: Hello How are you today ← Instant! - ↓ ↓ ↓ ↓ ↓ -Send Queue: [Hello]→[How]→[are]→[you]→[today] - | - ↓ (Wait for HTTP response before sending next) -HTTP: ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ - Send Send Send Send Send - Hello How are you today - (200ms) (200ms)(200ms)(200ms)(200ms) - ↓ ↓ ↓ ↓ ↓ -Server: Hello How are you today - ↓ ↓ ↓ ↓ ↓ -Display: Hello How are you today ← 1 second behind! - (0ms) (200ms)(400ms)(600ms)(800ms) -``` - -**Total delay:** 1 second for 5 messages! - ---- - -## After Fix: Parallel Processing ✅ - -``` -You speak: "Hello" "How" "are" "you" "today" - ↓ ↓ ↓ ↓ ↓ -Local GUI: Hello How are you today ← Instant! - ↓ ↓ ↓ ↓ ↓ -Send Queue: [Hello] [How] [are] [you] [today] - ↓ ↓ ↓ - ↓ ↓ ↓ ← Up to 3 parallel workers! -HTTP: ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ - Send Hello ┐ - Send How ├─ All sent simultaneously! - Send are ┘ - Wait for free worker... - Send you ┐ - Send today ┘ - (200ms total!) - ↓ ↓ ↓ ↓ ↓ -Server: Hello How are you today - ↓ ↓ ↓ ↓ ↓ -Display: Hello How are you today ← 200ms behind! - (0ms) (0ms) (0ms) (0ms) (200ms) -``` - -**Total delay:** 200ms for 5 messages! - ---- - -## Real-World Example - -### Scenario: You speak a paragraph - -**"Hello everyone. How are you doing today? I'm testing the transcription system."** - -### Before Fix (Serial) -``` -Time Local GUI Server Display -0.0s "Hello everyone." 
-0.2s "How are you doing today?" -0.4s "I'm testing..." "Hello everyone." ← 0.4s behind! -0.6s "How are you doing..." ← 0.4s behind! -0.8s "I'm testing..." ← 0.4s behind! -``` - -### After Fix (Parallel) -``` -Time Local GUI Server Display -0.0s "Hello everyone." -0.2s "How are you doing today?" "Hello everyone." ← 0.2s behind! -0.4s "I'm testing..." "How are you doing..." ← 0.2s behind! -0.6s "I'm testing..." ← 0.2s behind! -``` - -**Improvement:** Consistent 200ms delay vs growing 400-800ms delay! - ---- - -## Technical Details - -### Problem 1: Wrong URL Format ❌ -```python -# What the client was sending to Node.js: -POST http://localhost:3000/api/send?action=send - -# What Node.js was expecting: -POST http://localhost:3000/api/send -``` - -**Fix:** Auto-detect server type -```python -if 'server.php' in url: - # PHP server needs ?action=send - POST http://server.com/server.php?action=send -else: - # Node.js doesn't need it - POST http://server.com/api/send -``` - -### Problem 2: Blocking HTTP Requests ❌ -```python -# Old code (BLOCKING): -while True: - message = queue.get() - send_http(message) # ← Wait here! Can't send next until this returns -``` - -**Fix:** Use thread pool -```python -# New code (NON-BLOCKING): -executor = ThreadPoolExecutor(max_workers=3) -while True: - message = queue.get() - executor.submit(send_http, message) # ← Returns immediately! Send next! -``` - -### Problem 3: Long Timeouts ❌ -```python -# Old: -queue.get(timeout=1.0) # Wait up to 1 second for new message -send_http(..., timeout=5.0) # Wait up to 5 seconds for response - -# New: -queue.get(timeout=0.1) # Check queue every 100ms (responsive!) 
-send_http(..., timeout=2.0) # Fail fast if server slow -``` - ---- - -## Performance Metrics - -| Metric | Before | After | Improvement | -|--------|--------|-------|-------------| -| Single message | 150ms | 150ms | Same | -| 5 messages (serial) | 750ms | 200ms | **3.7x faster** | -| 10 messages (serial) | 1500ms | 300ms | **5x faster** | -| 20 messages (rapid) | 3000ms | 600ms | **5x faster** | -| Queue polling | 1000ms | 100ms | **10x faster** | -| Failure timeout | 5000ms | 2000ms | **2.5x faster** | - ---- - -## Visual Comparison - -### Before: Messages in Queue Building Up -``` -[Message 1] ━━━━━━━━━━━━━━━━━━━━━ Sending... (200ms) -[Message 2] Waiting... -[Message 3] Waiting... -[Message 4] Waiting... -[Message 5] Waiting... - ↓ -[Message 1] Done ✓ -[Message 2] ━━━━━━━━━━━━━━━━━━━━━ Sending... (200ms) -[Message 3] Waiting... -[Message 4] Waiting... -[Message 5] Waiting... - ↓ -... and so on (total: 1 second for 5 messages) -``` - -### After: Messages Sent in Parallel -``` -[Message 1] ━━━━━━━━━━━━━━━━━━━━━ Sending... ┐ -[Message 2] ━━━━━━━━━━━━━━━━━━━━━ Sending... ├─ Parallel! (200ms) -[Message 3] ━━━━━━━━━━━━━━━━━━━━━ Sending... ┘ -[Message 4] Waiting for free worker... -[Message 5] Waiting for free worker... - ↓ (workers become available) -[Message 1] Done ✓ -[Message 2] Done ✓ -[Message 3] Done ✓ -[Message 4] ━━━━━━━━━━━━━━━━━━━━━ Sending... ┐ -[Message 5] ━━━━━━━━━━━━━━━━━━━━━ Sending... ┘ - -Total time: 400ms for 5 messages (2.5x faster!) -``` - ---- - -## How to Test the Improvement - -1. **Start Node.js server:** - ```bash - cd server/nodejs - npm start - ``` - -2. **Configure desktop app:** - - Settings → Server Sync → Enable - - Server URL: `http://localhost:3000/api/send` - - Room: `test` - - Passphrase: `test` - -3. **Open display page:** - ``` - http://localhost:3000/display?room=test&fade=20 - ``` - -4. 
**Test rapid speech:** - - Start transcription - - Speak 5-10 sentences quickly in succession - - Watch both local GUI and web display - -**Expected:** Web display should be only ~200ms behind local GUI (instead of 1-2 seconds) - ---- - -## Why 3 Workers? - -**Why not 1?** → Serial processing, slow -**Why not 10?** → Too many connections, overwhelms server -**Why 3?** → Good balance: -- Fast enough for rapid speech -- Doesn't overwhelm server -- Low resource usage - -You can change this in the code: -```python -self.executor = ThreadPoolExecutor(max_workers=3) # Change to 5 for faster -``` - ---- - -## Summary - -✅ **Fixed URL format** for Node.js server -✅ **Added parallel HTTP requests** (up to 3 simultaneous) -✅ **Reduced timeouts** for faster polling and failure detection -✅ **Result:** 5-10x faster sync for rapid speech - -**Before:** Laggy, messages queue up, 1-2 second delay -**After:** Near real-time, 100-300ms delay, smooth! diff --git a/server/nodejs/README.md b/server/nodejs/README.md index b59527e..f5f5c92 100644 --- a/server/nodejs/README.md +++ b/server/nodejs/README.md @@ -1,15 +1,15 @@ # Node.js Multi-User Transcription Server -**Much better than PHP for real-time applications!** +A real-time multi-user transcription sync server for streamers and teams. -## Why Node.js is Better Than PHP for This +## Features -1. **Native WebSocket Support** - No SSE buffering issues -2. **Event-Driven** - Designed for real-time connections -3. **No Buffering Problems** - PHP-FPM/FastCGI buffering is a nightmare -4. **Lower Latency** - Instant message delivery -5. **Better Resource Usage** - One process handles all connections -6. 
**Easy to Deploy** - Works on any VPS, cloud platform, or even Heroku free tier +- **Real-time WebSocket** - Instant message delivery (< 100ms latency) +- **Per-speaker fonts** - Each user can have their own font style +- **Google Fonts support** - 1000+ free fonts loaded from CDN +- **Web-safe fonts** - Universal fonts that work everywhere +- **Custom font uploads** - Upload your own .ttf/.woff2 files +- **Easy deployment** - Works on any VPS, cloud platform, or locally ## Quick Start @@ -54,13 +54,35 @@ PORT=8080 npm start Add a Browser source with this URL: ``` -http://your-server.com:3000/display?room=YOUR_ROOM&fade=10&timestamps=true +http://your-server.com:3000/display?room=YOUR_ROOM&fade=10&timestamps=true&fontsource=websafe&websafefont=Arial ``` **Parameters:** -- `room` - Your room name (required) -- `fade` - Seconds before text fades (0 = never fade) -- `timestamps` - Show timestamps (true/false) +| Parameter | Default | Description | +|-----------|---------|-------------| +| `room` | default | Your room name (required) | +| `fade` | 10 | Seconds before text fades (0 = never fade) | +| `timestamps` | true | Show timestamps (true/false) | +| `maxlines` | 50 | Max lines visible (prevents scroll bars) | +| `fontsize` | 16 | Font size in pixels | +| `fontsource` | websafe | Font source: `websafe`, `google`, or `custom` | +| `websafefont` | Arial | Web-safe font name | +| `googlefont` | Roboto | Google Font name | + +**Font Examples:** +``` +# Web-safe font (works everywhere) +?room=myroom&fontsource=websafe&websafefont=Courier+New + +# Google Font (loaded from CDN) +?room=myroom&fontsource=google&googlefont=Open+Sans + +# Custom font (uploaded by users) +?room=myroom&fontsource=custom +``` + +**Per-Speaker Fonts:** +Each user can set their own font in the desktop app (Settings → Multi-User Server Sync → Font Source). Per-speaker fonts override the URL defaults, so different speakers can have different fonts on the same display. 
## API Endpoints @@ -74,7 +96,9 @@ Content-Type: application/json "passphrase": "my-secret", "user_name": "Alice", "text": "Hello everyone!", - "timestamp": "12:34:56" + "timestamp": "12:34:56", + "font_family": "Open Sans", // Optional: per-speaker font + "font_type": "google" // Optional: websafe, google, or custom } ``` @@ -282,17 +306,6 @@ Ports below 1024 require root. Either: - Average latency: < 100ms - Memory usage: ~50MB -## Comparison: Node.js vs PHP - -| Feature | Node.js | PHP (SSE) | -|---------|---------|-----------| -| Real-time | ✅ WebSocket | ⚠️ SSE (buffering issues) | -| Latency | < 100ms | 1-5 seconds (buffering) | -| Connections | 1000+ | Limited by PHP-FPM | -| Setup | Easy | Complex (Apache/Nginx config) | -| Hosting | VPS, Cloud | Shared hosting (problematic) | -| Resource Usage | Low | High (one PHP process per connection) | - ## License Part of the Local Transcription project. diff --git a/server/nodejs/server.js b/server/nodejs/server.js index 49b4e65..2732c50 100644 --- a/server/nodejs/server.js +++ b/server/nodejs/server.js @@ -27,11 +27,15 @@ const wss = new WebSocket.Server({ server }); // Configuration const PORT = process.env.PORT || 3000; const DATA_DIR = path.join(__dirname, 'data'); +const FONTS_DIR = path.join(__dirname, 'fonts'); const MAX_TRANSCRIPTIONS = 100; const CLEANUP_INTERVAL = 2 * 60 * 60 * 1000; // 2 hours +// In-memory font storage by room (font_name -> {data: Buffer, mime: string}) +const roomFonts = new Map(); + // Middleware -app.use(bodyParser.json()); +app.use(bodyParser.json({ limit: '10mb' })); // Increase limit for font uploads app.use((req, res, next) => { res.header('Access-Control-Allow-Origin', '*'); res.header('Access-Control-Allow-Methods', 'GET, POST, OPTIONS'); @@ -146,7 +150,8 @@ function broadcastToRoom(room, data) { }); const broadcastTime = Date.now() - broadcastStart; - console.log(`[Broadcast] Sent to ${sent} client(s) in room "${room}" (${broadcastTime}ms)`); + const fontInfo = data.font_family 
? ` [font: ${data.font_family} (${data.font_type})]` : ''; + console.log(`[Broadcast] Sent to ${sent} client(s) in room "${room}" (${broadcastTime}ms)${fontInfo}`); } // Cleanup old rooms @@ -418,10 +423,15 @@ app.get('/', (req, res) => {
  • timestamps=true - Show/hide timestamps (true/false)
  • maxlines=50 - Max lines visible at once (prevents scroll bars)
  • fontsize=16 - Font size in pixels
  • -
  • fontfamily=Arial - Font family (Arial, Courier, etc.)
  • +
  • fontsource=websafe - Font source: websafe, google, or custom
  • +
  • websafefont=Arial - Web-safe font (Arial, Times New Roman, Courier New, etc.)
  • +
  • googlefont=Roboto - Google Font name (Roboto, Open Sans, Lato, etc.)
  • - Example: ?room=myroom&fade=15&timestamps=false&maxlines=30&fontsize=18 + Example: ?room=myroom&fade=15&fontsource=google&googlefont=Open+Sans&fontsize=18 +

    +

    + Note: Per-speaker fonts override the default. Each user can set their own font in the app settings.

    @@ -541,7 +551,7 @@ app.get('/', (req, res) => { // Build URLs const serverUrl = \`http://\${window.location.host}/api/send\`; - const displayUrl = \`http://\${window.location.host}/display?room=\${encodeURIComponent(room)}&fade=10×tamps=true&maxlines=50&fontsize=16&fontfamily=Arial\`; + const displayUrl = \`http://\${window.location.host}/display?room=\${encodeURIComponent(room)}&fade=10×tamps=true&maxlines=50&fontsize=16&fontsource=websafe&websafefont=Arial\`; // Update UI document.getElementById('serverUrl').textContent = serverUrl; @@ -592,7 +602,7 @@ app.get('/', (req, res) => { app.post('/api/send', async (req, res) => { const requestStart = Date.now(); try { - const { room, passphrase, user_name, text, timestamp } = req.body; + const { room, passphrase, user_name, text, timestamp, is_preview, font_family, font_type } = req.body; if (!room || !passphrase || !user_name || !text) { return res.status(400).json({ error: 'Missing required fields' }); @@ -611,17 +621,27 @@ app.post('/api/send', async (req, res) => { user_name: user_name.trim(), text: text.trim(), timestamp: timestamp || new Date().toLocaleTimeString('en-US', { hour12: false }), - created_at: Date.now() + created_at: Date.now(), + is_preview: is_preview || false, + font_family: font_family || null, // Per-speaker font name + font_type: font_type || null // Font type: "websafe", "google", or "custom" }; const addStart = Date.now(); - await addTranscription(room, transcription); + if (is_preview) { + // Previews are only broadcast, not stored + broadcastToRoom(room, transcription); + } else { + // Final transcriptions are stored and broadcast + await addTranscription(room, transcription); + } const addTime = Date.now() - addStart; const totalTime = Date.now() - requestStart; - console.log(`[${new Date().toISOString()}] Transcription received: "${text.substring(0, 50)}..." (verify: ${verifyTime}ms, add: ${addTime}ms, total: ${totalTime}ms)`); + const previewLabel = is_preview ? 
' [PREVIEW]' : ''; + console.log(`[${new Date().toISOString()}]${previewLabel} Transcription received: "${text.substring(0, 50)}..." (verify: ${verifyTime}ms, add: ${addTime}ms, total: ${totalTime}ms)`); - res.json({ status: 'ok', message: 'Transcription added' }); + res.json({ status: 'ok', message: is_preview ? 'Preview broadcast' : 'Transcription added' }); } catch (err) { console.error('Error in /api/send:', err); res.status(500).json({ error: err.message }); @@ -647,9 +667,115 @@ app.get('/api/list', async (req, res) => { } }); +// Upload fonts for a room +app.post('/api/fonts', async (req, res) => { + try { + const { room, passphrase, fonts } = req.body; + + if (!room || !passphrase) { + return res.status(400).json({ error: 'Missing room or passphrase' }); + } + + // Verify passphrase + const valid = await verifyPassphrase(room, passphrase); + if (!valid) { + return res.status(401).json({ error: 'Invalid passphrase' }); + } + + if (!fonts || !Array.isArray(fonts)) { + return res.status(400).json({ error: 'No fonts provided' }); + } + + // Initialize room fonts storage if needed + if (!roomFonts.has(room)) { + roomFonts.set(room, new Map()); + } + const fontsMap = roomFonts.get(room); + + // Process each font + let addedCount = 0; + for (const font of fonts) { + if (!font.name || !font.data || !font.mime) continue; + + // Decode base64 font data + const fontData = Buffer.from(font.data, 'base64'); + fontsMap.set(font.name, { + data: fontData, + mime: font.mime, + uploaded_at: Date.now() + }); + addedCount++; + console.log(`[Fonts] Uploaded font "${font.name}" for room "${room}" (${fontData.length} bytes)`); + } + + res.json({ status: 'ok', message: `${addedCount} font(s) uploaded`, fonts: Array.from(fontsMap.keys()) }); + } catch (err) { + console.error('Error in /api/fonts:', err); + res.status(500).json({ error: err.message }); + } +}); + +// Serve uploaded fonts +app.get('/fonts/:room/:fontname', (req, res) => { + const { room, fontname } = req.params; + + 
const fontsMap = roomFonts.get(room); + if (!fontsMap) { + return res.status(404).json({ error: 'Room not found' }); + } + + const font = fontsMap.get(fontname); + if (!font) { + return res.status(404).json({ error: 'Font not found' }); + } + + res.set('Content-Type', font.mime); + res.set('Cache-Control', 'public, max-age=3600'); + res.send(font.data); +}); + +// List fonts for a room +app.get('/api/fonts', (req, res) => { + const { room } = req.query; + + if (!room) { + return res.status(400).json({ error: 'Missing room parameter' }); + } + + const fontsMap = roomFonts.get(room); + const fonts = fontsMap ? Array.from(fontsMap.keys()) : []; + + res.json({ fonts }); +}); + // Serve display page app.get('/display', (req, res) => { - const { room = 'default', fade = '10', timestamps = 'true', maxlines = '50', fontsize = '16', fontfamily = 'Arial' } = req.query; + const { + room = 'default', + fade = '10', + timestamps = 'true', + maxlines = '50', + fontsize = '16', + fontfamily = 'Arial', + // New font source parameters + fontsource = 'websafe', // websafe, google, or custom + websafefont = 'Arial', + googlefont = 'Roboto' + } = req.query; + + // Determine the effective default font based on fontsource + let effectiveFont = fontfamily; // Legacy fallback + if (fontsource === 'google' && googlefont) { + effectiveFont = googlefont; + } else if (fontsource === 'websafe' && websafefont) { + effectiveFont = websafefont; + } + + // Generate Google Font link if needed + // Note: Google Fonts expects spaces as '+' in the URL, not %2B + const googleFontLink = fontsource === 'google' && googlefont + ? `` + : ''; res.send(` @@ -657,12 +783,16 @@ app.get('/display', (req, res) => { Multi-User Transcription Display + ${googleFontLink} +