Compare commits
88 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
1c586738f3 | ||
|
|
fb02a24334 | ||
|
|
ce64cacc5e | ||
|
|
14a7ca3b30 | ||
|
|
5b7387f9c6 | ||
|
|
293362baa1 | ||
|
|
41f50dedec | ||
|
|
d8b7811153 | ||
|
|
ec8922672c | ||
|
|
375669f657 | ||
|
|
c8b11fb0ad | ||
|
|
273a926f03 | ||
|
|
5bbbc38875 | ||
|
|
d50be6654d | ||
|
|
68abf49018 | ||
|
|
8cc2a3ec7a | ||
|
|
8aa9dfc644 | ||
|
|
3f16aa838d | ||
|
|
3d3d7ec3c5 | ||
|
|
bb039399fc | ||
|
|
9dcb14e92c | ||
|
|
8db9b8298b | ||
|
|
411779f578 | ||
|
|
bc6055a707 | ||
|
|
e42a922507 | ||
|
|
8fc2d11c5f | ||
|
|
11832e911b | ||
|
|
18e6b974c0 | ||
|
|
08e464daaf | ||
|
|
5d22adcaa4 | ||
|
|
36b4f7dad5 | ||
|
|
1ecb23b83f | ||
|
|
4b88871a9b | ||
|
|
0ae48a67d5 | ||
|
|
924cae6c75 | ||
|
|
5139936e18 | ||
|
|
47724f1ac0 | ||
|
|
3b204be37e | ||
|
|
4c02a48135 | ||
|
|
997e97c19a | ||
|
|
6ca8fc41b2 | ||
|
|
d9d90563cc | ||
|
|
5a674ed199 | ||
|
|
9d78fce3f0 | ||
|
|
a8de39de84 | ||
|
|
bc82584dff | ||
|
|
4d0b4ee1c5 | ||
|
|
c73e9de0ac | ||
|
|
288c6ad6a3 | ||
|
|
af8046f9b1 | ||
|
|
6003885519 | ||
|
|
8829846b53 | ||
|
|
cf449d9338 | ||
|
|
5a6910834c | ||
|
|
a6c7eb5d5e | ||
|
|
135d5d534b | ||
|
|
76f34fe17d | ||
|
|
68ad31b6a7 | ||
|
|
fcbe405e23 | ||
|
|
4adfd2adc6 | ||
|
|
f3843d59f1 | ||
|
|
ad68251e04 | ||
|
|
9468d01a88 | ||
|
|
a3151ad55e | ||
|
|
5bff40e9b4 | ||
|
|
0ccb02ba27 | ||
|
|
aa4033b412 | ||
|
|
b4b9435317 | ||
|
|
ee1d4f8643 | ||
|
|
4a186d1de6 | ||
|
|
fff37992b1 | ||
|
|
8afe3230d3 | ||
|
|
04e7fb1a99 | ||
|
|
9a282215c9 | ||
|
|
cc2d17a627 | ||
|
|
61c5ffa4fa | ||
|
|
289b9dabe1 | ||
|
|
9522f28c57 | ||
|
|
a8e2e7dca8 | ||
|
|
3bcf4f09a3 | ||
|
|
ef5734ef15 | ||
| c9db43d56c | |||
|
|
4c519a109a | ||
|
|
47ca74e75d | ||
|
|
25d2a55efb | ||
|
|
af534bf768 | ||
|
|
9ff883e2e3 | ||
| bb8a8c251d |
9
.claude/settings.local.json
Normal file
9
.claude/settings.local.json
Normal file
@@ -0,0 +1,9 @@
|
||||
{
|
||||
"permissions": {
|
||||
"allow": [
|
||||
"Bash(python3:*)",
|
||||
"Bash(node --check:*)",
|
||||
"Bash(ls:*)"
|
||||
]
|
||||
}
|
||||
}
|
||||
95
.gitea/workflows/build-app-linux.yml
Normal file
95
.gitea/workflows/build-app-linux.yml
Normal file
@@ -0,0 +1,95 @@
|
||||
# Manually dispatched workflow: builds the Tauri desktop app on Linux from an
# existing release tag and attaches the .deb / .rpm / .AppImage bundles to the
# matching release through the repository REST API.
name: Build App (Linux)

on:
  workflow_dispatch:
    inputs:
      tag:
        description: 'Release tag to build (e.g. v1.4.5)'
        required: true

jobs:
  build-linux:
    name: Build App (Linux)
    runs-on: ubuntu-latest
    env:
      NODE_VERSION: "20"
      # Passed through the environment so the tag is never spliced into a script.
      RELEASE_TAG: "${{ inputs.tag }}"
    steps:
      - name: Show tag
        run: |
          echo "Building for tag: ${RELEASE_TAG}"

      - uses: actions/checkout@v4
        with:
          ref: ${{ inputs.tag }}

      - name: Set up Node.js
        uses: actions/setup-node@v4
        with:
          node-version: ${{ env.NODE_VERSION }}

      - name: Install Rust stable
        run: |
          curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y --default-toolchain stable
          echo "$HOME/.cargo/bin" >> "$GITHUB_PATH"

      - name: Install system dependencies
        run: |
          sudo apt-get update
          sudo apt-get install -y libgtk-3-dev libwebkit2gtk-4.1-dev libappindicator3-dev librsvg2-dev patchelf xdg-utils rpm

      - name: Install npm dependencies
        run: npm ci

      - name: Build Tauri app
        run: npm run tauri build

      - name: Upload to release
        env:
          BUILD_TOKEN: ${{ secrets.BUILD_TOKEN }}
        run: |
          sudo apt-get install -y jq
          REPO_API="${GITHUB_SERVER_URL}/api/v1/repos/${GITHUB_REPOSITORY}"
          AUTH_HEADER="Authorization: token ${BUILD_TOKEN}"
          TAG="${RELEASE_TAG}"
          echo "Release tag: ${TAG}"

          # Fail fast instead of polling a meaningless URL for five minutes.
          if [ -z "${TAG}" ]; then
            echo "ERROR: RELEASE_TAG is empty"
            exit 1
          fi

          echo "Waiting for release ${TAG} to be available..."
          RELEASE_ID=""
          for i in $(seq 1 30); do
            # "|| true": a non-JSON reply or a network error must lead to a
            # retry, not abort the script when the shell runs with errexit.
            RELEASE_ID=$(curl -s -H "${AUTH_HEADER}" "${REPO_API}/releases/tags/${TAG}" \
              | jq -r '.id // empty' 2>/dev/null || true)

            if [ -n "${RELEASE_ID}" ]; then
              echo "Found release: ${TAG} (ID: ${RELEASE_ID})"
              break
            fi

            echo "Attempt ${i}/30: Release not ready yet, retrying in 10s..."
            sleep 10
          done

          if [ -z "${RELEASE_ID}" ]; then
            echo "ERROR: Failed to find release for tag ${TAG} after 30 attempts."
            exit 1
          fi

          # The loop runs in the pipeline's subshell, so the counters and the
          # final check live in the same brace group. It is the last command
          # of the script, so its exit status becomes the step's status.
          find src-tauri/target/release/bundle -type f \( -name "*.deb" -o -name "*.rpm" -o -name "*.AppImage" \) | {
            uploaded=0
            failed=0
            while IFS= read -r file; do
              filename=$(basename "$file")
              # Percent-encode the whole name, not only spaces.
              encoded_name=$(jq -rn --arg v "$filename" '$v | @uri')
              echo "Uploading ${filename} ($(du -h "$file" | cut -f1))..."

              # Remove every existing asset with the same name so a re-run
              # replaces it. The name is passed with --arg, never interpolated
              # into the jq program.
              ASSET_IDS=$(curl -s -H "${AUTH_HEADER}" "${REPO_API}/releases/${RELEASE_ID}/assets" \
                | jq -r --arg name "$filename" '.[] | select(.name == $name) | .id' 2>/dev/null || true)
              for asset_id in ${ASSET_IDS}; do
                curl -s -X DELETE -H "${AUTH_HEADER}" \
                  "${REPO_API}/releases/${RELEASE_ID}/assets/${asset_id}"
              done

              HTTP_CODE=$(curl -s -o /dev/null -w "%{http_code}" -X POST \
                -H "${AUTH_HEADER}" \
                -H "Content-Type: application/octet-stream" \
                -T "$file" \
                "${REPO_API}/releases/${RELEASE_ID}/assets?name=${encoded_name}" || true)
              echo "Upload response: HTTP ${HTTP_CODE}"

              case "${HTTP_CODE}" in
                2??) uploaded=$((uploaded + 1)) ;;
                *)
                  echo "ERROR: Upload failed for ${filename}"
                  failed=$((failed + 1))
                  ;;
              esac
            done

            # A release job that uploaded nothing, or lost a file, must not pass.
            if [ "${failed}" -ne 0 ] || [ "${uploaded}" -eq 0 ]; then
              echo "ERROR: ${failed} upload(s) failed, ${uploaded} succeeded."
              exit 1
            fi
          }
|
||||
93
.gitea/workflows/build-app-macos.yml
Normal file
93
.gitea/workflows/build-app-macos.yml
Normal file
@@ -0,0 +1,93 @@
|
||||
# Manually dispatched workflow: builds the Tauri desktop app on macOS from an
# existing release tag and attaches the resulting .dmg bundle(s) to the
# matching release through the repository REST API.
name: Build App (macOS)

on:
  workflow_dispatch:
    inputs:
      tag:
        description: 'Release tag to build (e.g. v1.4.5)'
        required: true

jobs:
  build-macos:
    name: Build App (macOS)
    runs-on: macos-latest
    env:
      NODE_VERSION: "20"
      # Passed through the environment so the tag is never spliced into a script.
      RELEASE_TAG: "${{ inputs.tag }}"
    steps:
      - name: Show tag
        run: |
          echo "Building for tag: ${RELEASE_TAG}"

      - uses: actions/checkout@v4
        with:
          ref: ${{ inputs.tag }}

      - name: Set up Node.js
        uses: actions/setup-node@v4
        with:
          node-version: ${{ env.NODE_VERSION }}

      - name: Install Rust stable
        run: |
          curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y --default-toolchain stable
          echo "$HOME/.cargo/bin" >> "$GITHUB_PATH"

      - name: Install system dependencies
        # Best effort: a failed create-dmg install does not stop the job.
        run: brew install --quiet create-dmg || true

      - name: Install npm dependencies
        run: npm ci

      - name: Build Tauri app
        run: npm run tauri build

      - name: Upload to release
        env:
          BUILD_TOKEN: ${{ secrets.BUILD_TOKEN }}
        run: |
          which jq || brew install jq
          REPO_API="${GITHUB_SERVER_URL}/api/v1/repos/${GITHUB_REPOSITORY}"
          AUTH_HEADER="Authorization: token ${BUILD_TOKEN}"
          TAG="${RELEASE_TAG}"
          echo "Release tag: ${TAG}"

          # Fail fast instead of polling a meaningless URL for five minutes.
          if [ -z "${TAG}" ]; then
            echo "ERROR: RELEASE_TAG is empty"
            exit 1
          fi

          echo "Waiting for release ${TAG} to be available..."
          RELEASE_ID=""
          for i in $(seq 1 30); do
            # "|| true": a non-JSON reply or a network error must lead to a
            # retry, not abort the script when the shell runs with errexit.
            RELEASE_ID=$(curl -s -H "${AUTH_HEADER}" "${REPO_API}/releases/tags/${TAG}" \
              | jq -r '.id // empty' 2>/dev/null || true)

            if [ -n "${RELEASE_ID}" ]; then
              echo "Found release: ${TAG} (ID: ${RELEASE_ID})"
              break
            fi

            echo "Attempt ${i}/30: Release not ready yet, retrying in 10s..."
            sleep 10
          done

          if [ -z "${RELEASE_ID}" ]; then
            echo "ERROR: Failed to find release for tag ${TAG} after 30 attempts."
            exit 1
          fi

          # The loop runs in the pipeline's subshell, so the counters and the
          # final check live in the same brace group. It is the last command
          # of the script, so its exit status becomes the step's status.
          find src-tauri/target/release/bundle -type f -name "*.dmg" | {
            uploaded=0
            failed=0
            while IFS= read -r file; do
              filename=$(basename "$file")
              # Percent-encode the whole name, not only spaces.
              encoded_name=$(jq -rn --arg v "$filename" '$v | @uri')
              echo "Uploading ${filename} ($(du -h "$file" | cut -f1))..."

              # Remove every existing asset with the same name so a re-run
              # replaces it. The name is passed with --arg, never interpolated
              # into the jq program.
              ASSET_IDS=$(curl -s -H "${AUTH_HEADER}" "${REPO_API}/releases/${RELEASE_ID}/assets" \
                | jq -r --arg name "$filename" '.[] | select(.name == $name) | .id' 2>/dev/null || true)
              for asset_id in ${ASSET_IDS}; do
                curl -s -X DELETE -H "${AUTH_HEADER}" \
                  "${REPO_API}/releases/${RELEASE_ID}/assets/${asset_id}"
              done

              HTTP_CODE=$(curl -s -o /dev/null -w "%{http_code}" -X POST \
                -H "${AUTH_HEADER}" \
                -H "Content-Type: application/octet-stream" \
                -T "$file" \
                "${REPO_API}/releases/${RELEASE_ID}/assets?name=${encoded_name}" || true)
              echo "Upload response: HTTP ${HTTP_CODE}"

              case "${HTTP_CODE}" in
                2??) uploaded=$((uploaded + 1)) ;;
                *)
                  echo "ERROR: Upload failed for ${filename}"
                  failed=$((failed + 1))
                  ;;
              esac
            done

            # A release job that uploaded nothing, or lost a file, must not pass.
            if [ "${failed}" -ne 0 ] || [ "${uploaded}" -eq 0 ]; then
              echo "ERROR: ${failed} upload(s) failed, ${uploaded} succeeded."
              exit 1
            fi
          }
|
||||
117
.gitea/workflows/build-app-windows.yml
Normal file
117
.gitea/workflows/build-app-windows.yml
Normal file
@@ -0,0 +1,117 @@
|
||||
# Manually dispatched workflow: builds the Tauri desktop app on Windows from an
# existing release tag and attaches the .msi / *-setup.exe installers to the
# matching release through the repository REST API.
name: Build App (Windows)

on:
  workflow_dispatch:
    inputs:
      tag:
        description: 'Release tag to build (e.g. v1.4.5)'
        required: true

env:
  NODE_VERSION: "20"

jobs:
  build-windows:
    name: Build App (Windows)
    runs-on: windows-latest
    env:
      # Passed through the environment so the tag is never spliced into a script.
      RELEASE_TAG: "${{ inputs.tag }}"
    steps:
      - name: Show tag
        shell: powershell
        run: |
          Write-Host "Building for tag: $env:RELEASE_TAG"

      - uses: actions/checkout@v4
        with:
          ref: ${{ inputs.tag }}

      - name: Set up Node.js
        uses: actions/setup-node@v4
        with:
          node-version: ${{ env.NODE_VERSION }}

      - name: Install Rust stable
        shell: powershell
        run: |
          # Reuse an existing rustup when the runner has one; otherwise
          # bootstrap it and expose cargo's bin directory to later steps.
          if (Get-Command rustup -ErrorAction SilentlyContinue) {
            rustup default stable
          } else {
            Invoke-WebRequest -Uri https://win.rustup.rs/x86_64 -OutFile rustup-init.exe
            .\rustup-init.exe -y --default-toolchain stable
            echo "$env:USERPROFILE\.cargo\bin" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append
          }

      - name: Install npm dependencies
        shell: powershell
        run: npm ci

      - name: Build Tauri app
        shell: powershell
        run: npm run tauri build

      - name: Upload to release
        shell: powershell
        env:
          BUILD_TOKEN: ${{ secrets.BUILD_TOKEN }}
        run: |
          $REPO_API = "${{ github.server_url }}/api/v1/repos/${{ github.repository }}"
          $Headers = @{ "Authorization" = "token $env:BUILD_TOKEN" }
          $TAG = $env:RELEASE_TAG
          Write-Host "Release tag: $TAG"

          if (-not $TAG) {
            Write-Host "ERROR: RELEASE_TAG is empty"
            exit 1
          }

          Write-Host "Waiting for release $TAG to be available..."
          $RELEASE_ID = $null

          for ($i = 1; $i -le 30; $i++) {
            try {
              $release = Invoke-RestMethod -Uri "$REPO_API/releases/tags/$TAG" -Headers $Headers -ErrorAction Stop
              $RELEASE_ID = $release.id

              if ($RELEASE_ID) {
                Write-Host "Found release: $TAG (ID: $RELEASE_ID)"
                break
              }
            } catch {
              # A failed lookup is expected while the release does not exist
              # yet; fall through to the retry below.
            }

            Write-Host "Attempt ${i}/30: Release not ready yet, retrying in 10s..."
            Start-Sleep -Seconds 10
          }

          if (-not $RELEASE_ID) {
            Write-Host "ERROR: Failed to find release for tag $TAG after 30 attempts."
            exit 1
          }

          $uploaded = 0
          $failed = 0
          $bundles = Get-ChildItem -Path src-tauri\target\release\bundle -Recurse -Include *.msi,*-setup.exe

          foreach ($bundle in $bundles) {
            $filename = $bundle.Name
            $encodedName = [System.Uri]::EscapeDataString($filename)
            $size = [math]::Round($bundle.Length / 1MB, 1)
            Write-Host "Uploading $filename ($size MB)..."

            # Delete every existing asset with the same name so a re-run
            # replaces it. A failure here is reported but not fatal; the
            # upload result below decides the outcome.
            try {
              $assets = Invoke-RestMethod -Uri "$REPO_API/releases/$RELEASE_ID/assets" -Headers $Headers
              foreach ($asset in @($assets | Where-Object { $_.name -eq $filename })) {
                Invoke-RestMethod -Uri "$REPO_API/releases/$RELEASE_ID/assets/$($asset.id)" -Method Delete -Headers $Headers
              }
            } catch {
              Write-Host "WARNING: Could not remove existing asset ${filename}: $($_.Exception.Message)"
            }

            $uploadUrl = "$REPO_API/releases/$RELEASE_ID/assets?name=$encodedName"
            $result = curl.exe --fail --silent --show-error `
              -X POST `
              -H "Authorization: token $env:BUILD_TOKEN" `
              -H "Content-Type: application/octet-stream" `
              -T "$($bundle.FullName)" `
              "$uploadUrl" 2>&1
            if ($LASTEXITCODE -eq 0) {
              Write-Host "Upload successful: $filename"
              $uploaded++
            } else {
              Write-Host "ERROR: Upload failed for ${filename}: $result"
              $failed++
            }
          }

          # A release job that uploaded nothing, or lost a file, must not pass.
          if ($failed -gt 0 -or $uploaded -eq 0) {
            Write-Host "ERROR: $failed upload(s) failed, $uploaded succeeded."
            exit 1
          }
|
||||
229
.gitea/workflows/build-sidecar-cloud.yml
Normal file
229
.gitea/workflows/build-sidecar-cloud.yml
Normal file
@@ -0,0 +1,229 @@
|
||||
# Manually dispatched workflow: builds the "cloud" sidecar with PyInstaller on
# Linux, Windows and macOS from an existing sidecar release tag, zips each
# build and attaches the archive to the matching release through the
# repository REST API. The three jobs are independent of each other.
name: Build Sidecar (Cloud)

on:
  workflow_dispatch:
    inputs:
      tag:
        description: 'Sidecar release tag to build (e.g. sidecar-v1.0.5)'
        required: true

jobs:
  build-cloud-linux:
    name: Build Cloud Sidecar (Linux)
    runs-on: ubuntu-latest
    env:
      PYTHON_VERSION: "3.11"
      # Passed through the environment so the tag is never spliced into a script.
      RELEASE_TAG: "${{ inputs.tag }}"
    steps:
      - name: Show tag
        run: |
          echo "Building cloud sidecar for tag ${RELEASE_TAG}"

      - uses: actions/checkout@v4
        with:
          ref: ${{ inputs.tag }}

      - name: Install uv
        run: |
          curl -LsSf https://astral.sh/uv/install.sh | sh
          echo "$HOME/.local/bin" >> "$GITHUB_PATH"

      - name: Set up Python
        run: uv python install ${{ env.PYTHON_VERSION }}

      - name: Install system dependencies
        run: |
          sudo apt-get update
          sudo apt-get install -y portaudio19-dev

      - name: Build cloud sidecar
        env:
          UV_NO_SOURCES: "1"
        run: |
          uv venv
          uv pip install pyinstaller numpy sounddevice fastapi uvicorn websockets pydantic requests pyyaml packaging
          .venv/bin/pyinstaller local-transcription-cloud.spec

      - name: Package
        run: |
          cd dist/local-transcription-backend && zip -r ../../sidecar-linux-x86_64-cloud.zip .

      - name: Upload to release
        env:
          BUILD_TOKEN: ${{ secrets.BUILD_TOKEN }}
        run: |
          sudo apt-get install -y jq
          REPO_API="${GITHUB_SERVER_URL}/api/v1/repos/${GITHUB_REPOSITORY}"
          AUTH_HEADER="Authorization: token ${BUILD_TOKEN}"
          TAG="${RELEASE_TAG}"

          # Fail fast instead of polling a meaningless URL for five minutes.
          if [ -z "${TAG}" ]; then
            echo "ERROR: RELEASE_TAG is empty"
            exit 1
          fi

          RELEASE_ID=""
          for i in $(seq 1 30); do
            # "|| true": a non-JSON reply or a network error must lead to a
            # retry, not abort the script when the shell runs with errexit.
            RELEASE_ID=$(curl -s -H "${AUTH_HEADER}" "${REPO_API}/releases/tags/${TAG}" \
              | jq -r '.id // empty' 2>/dev/null || true)
            if [ -n "${RELEASE_ID}" ]; then
              echo "Found release ${TAG} (ID: ${RELEASE_ID})"
              break
            fi
            echo "Attempt ${i}/30: waiting for release..."
            sleep 10
          done

          if [ -z "${RELEASE_ID}" ]; then
            echo "ERROR: Release not found"
            exit 1
          fi

          uploaded=0
          failed=0
          for file in sidecar-*-cloud.zip; do
            # An unmatched glob stays literal; skip it instead of uploading it.
            [ -e "$file" ] || continue
            filename=$(basename "$file")
            encoded_name=$(jq -rn --arg v "$filename" '$v | @uri')

            # Remove every existing asset with the same name so a re-run
            # replaces it. The name is passed with --arg, never interpolated
            # into the jq program.
            ASSET_IDS=$(curl -s -H "${AUTH_HEADER}" "${REPO_API}/releases/${RELEASE_ID}/assets" \
              | jq -r --arg name "$filename" '.[] | select(.name == $name) | .id' 2>/dev/null || true)
            for asset_id in ${ASSET_IDS}; do
              curl -s -X DELETE -H "${AUTH_HEADER}" \
                "${REPO_API}/releases/${RELEASE_ID}/assets/${asset_id}"
            done

            HTTP_CODE=$(curl -s -o /dev/null -w "%{http_code}" -X POST \
              -H "${AUTH_HEADER}" -H "Content-Type: application/octet-stream" \
              -T "$file" "${REPO_API}/releases/${RELEASE_ID}/assets?name=${encoded_name}" || true)
            echo "Upload ${filename}: HTTP ${HTTP_CODE}"

            case "${HTTP_CODE}" in
              2??) uploaded=$((uploaded + 1)) ;;
              *)
                echo "ERROR: Upload failed for ${filename}"
                failed=$((failed + 1))
                ;;
            esac
          done

          # A release job that uploaded nothing, or lost a file, must not pass.
          if [ "${failed}" -ne 0 ] || [ "${uploaded}" -eq 0 ]; then
            echo "ERROR: ${failed} upload(s) failed, ${uploaded} succeeded."
            exit 1
          fi

  build-cloud-windows:
    name: Build Cloud Sidecar (Windows)
    runs-on: windows-latest
    env:
      PYTHON_VERSION: "3.11"
      RELEASE_TAG: "${{ inputs.tag }}"
    steps:
      - name: Show tag
        shell: powershell
        run: Write-Host "Building cloud sidecar for tag $env:RELEASE_TAG"

      - uses: actions/checkout@v4
        with:
          ref: ${{ inputs.tag }}

      - name: Install uv
        shell: powershell
        run: |
          if (Get-Command uv -ErrorAction SilentlyContinue) {
            Write-Host "uv already installed"
          } else {
            irm https://astral.sh/uv/install.ps1 | iex
            # The install location is not known up front: register whichever
            # of the candidate directories exists after installation.
            $uvPaths = @("$env:USERPROFILE\.local\bin", "$env:USERPROFILE\.cargo\bin", "$env:LOCALAPPDATA\uv\bin")
            foreach ($p in $uvPaths) {
              if (Test-Path $p) {
                echo $p | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append
              }
            }
          }

      - name: Set up Python
        shell: powershell
        run: uv python install ${{ env.PYTHON_VERSION }}

      - name: Build cloud sidecar
        shell: powershell
        env:
          UV_NO_SOURCES: "1"
        run: |
          uv venv
          uv pip install pyinstaller numpy sounddevice fastapi uvicorn websockets pydantic requests pyyaml packaging
          .venv\Scripts\pyinstaller.exe local-transcription-cloud.spec

      - name: Package
        shell: powershell
        run: |
          if (-not (Get-Command 7z -ErrorAction SilentlyContinue)) { choco install 7zip -y }
          7z a -tzip -mx=5 sidecar-windows-x86_64-cloud.zip .\dist\local-transcription-backend\*

      - name: Upload to release
        shell: powershell
        env:
          BUILD_TOKEN: ${{ secrets.BUILD_TOKEN }}
        run: |
          $REPO_API = "${{ github.server_url }}/api/v1/repos/${{ github.repository }}"
          $Headers = @{ "Authorization" = "token $env:BUILD_TOKEN" }
          $TAG = $env:RELEASE_TAG

          if (-not $TAG) {
            Write-Host "ERROR: RELEASE_TAG is empty"
            exit 1
          }

          $RELEASE_ID = $null
          for ($i = 1; $i -le 30; $i++) {
            try {
              $release = Invoke-RestMethod -Uri "$REPO_API/releases/tags/$TAG" -Headers $Headers -ErrorAction Stop
              $RELEASE_ID = $release.id
              if ($RELEASE_ID) {
                Write-Host "Found release $TAG (ID: $RELEASE_ID)"
                break
              }
            } catch {
              # A failed lookup is expected while the release does not exist
              # yet; fall through to the retry below.
            }
            Write-Host "Attempt ${i}/30: waiting..."
            Start-Sleep -Seconds 10
          }
          if (-not $RELEASE_ID) {
            Write-Host "ERROR: Release not found"
            exit 1
          }

          $uploaded = 0
          $failed = 0
          foreach ($archive in (Get-ChildItem -Path . -Filter "sidecar-*-cloud.zip")) {
            $fn = $archive.Name
            $enc = [System.Uri]::EscapeDataString($fn)

            # Delete every existing asset with the same name so a re-run
            # replaces it. A failure here is reported but not fatal; the
            # upload result below decides the outcome.
            try {
              $assets = Invoke-RestMethod -Uri "$REPO_API/releases/$RELEASE_ID/assets" -Headers $Headers
              foreach ($asset in @($assets | Where-Object { $_.name -eq $fn })) {
                Invoke-RestMethod -Uri "$REPO_API/releases/$RELEASE_ID/assets/$($asset.id)" -Method Delete -Headers $Headers
              }
            } catch {
              Write-Host "WARNING: Could not remove existing asset ${fn}: $($_.Exception.Message)"
            }

            curl.exe --fail -s -X POST -H "Authorization: token $env:BUILD_TOKEN" -H "Content-Type: application/octet-stream" -T "$($archive.FullName)" "$REPO_API/releases/$RELEASE_ID/assets?name=$enc"
            # Report success only when curl actually succeeded.
            if ($LASTEXITCODE -eq 0) {
              Write-Host "Uploaded $fn"
              $uploaded++
            } else {
              Write-Host "ERROR: Upload failed for ${fn} (curl exit code $LASTEXITCODE)"
              $failed++
            }
          }

          # A release job that uploaded nothing, or lost a file, must not pass.
          if ($failed -gt 0 -or $uploaded -eq 0) {
            Write-Host "ERROR: $failed upload(s) failed, $uploaded succeeded."
            exit 1
          }

  build-cloud-macos:
    name: Build Cloud Sidecar (macOS)
    runs-on: macos-latest
    env:
      PYTHON_VERSION: "3.11"
      RELEASE_TAG: "${{ inputs.tag }}"
    steps:
      - name: Show tag
        run: |
          echo "Building cloud sidecar for tag ${RELEASE_TAG}"

      - uses: actions/checkout@v4
        with:
          ref: ${{ inputs.tag }}

      - name: Install uv
        run: |
          curl -LsSf https://astral.sh/uv/install.sh | sh
          echo "$HOME/.local/bin" >> "$GITHUB_PATH"

      - name: Set up Python
        run: uv python install ${{ env.PYTHON_VERSION }}

      - name: Install system dependencies
        run: brew install portaudio

      - name: Build cloud sidecar
        env:
          UV_NO_SOURCES: "1"
        run: |
          uv venv
          uv pip install pyinstaller numpy sounddevice fastapi uvicorn websockets pydantic requests pyyaml packaging
          .venv/bin/pyinstaller local-transcription-cloud.spec

      - name: Package
        run: |
          cd dist/local-transcription-backend && zip -r ../../sidecar-macos-aarch64-cloud.zip .

      - name: Upload to release
        env:
          BUILD_TOKEN: ${{ secrets.BUILD_TOKEN }}
        run: |
          which jq || brew install jq
          REPO_API="${GITHUB_SERVER_URL}/api/v1/repos/${GITHUB_REPOSITORY}"
          AUTH_HEADER="Authorization: token ${BUILD_TOKEN}"
          TAG="${RELEASE_TAG}"

          # Fail fast instead of polling a meaningless URL for five minutes.
          if [ -z "${TAG}" ]; then
            echo "ERROR: RELEASE_TAG is empty"
            exit 1
          fi

          RELEASE_ID=""
          for i in $(seq 1 30); do
            # "|| true": a non-JSON reply or a network error must lead to a
            # retry, not abort the script when the shell runs with errexit.
            RELEASE_ID=$(curl -s -H "${AUTH_HEADER}" "${REPO_API}/releases/tags/${TAG}" \
              | jq -r '.id // empty' 2>/dev/null || true)
            if [ -n "${RELEASE_ID}" ]; then
              echo "Found release ${TAG} (ID: ${RELEASE_ID})"
              break
            fi
            echo "Attempt ${i}/30: waiting for release..."
            sleep 10
          done

          if [ -z "${RELEASE_ID}" ]; then
            echo "ERROR: Release not found"
            exit 1
          fi

          uploaded=0
          failed=0
          for file in sidecar-*-cloud.zip; do
            # An unmatched glob stays literal; skip it instead of uploading it.
            [ -e "$file" ] || continue
            filename=$(basename "$file")
            encoded_name=$(jq -rn --arg v "$filename" '$v | @uri')

            # Remove every existing asset with the same name so a re-run
            # replaces it. The name is passed with --arg, never interpolated
            # into the jq program.
            ASSET_IDS=$(curl -s -H "${AUTH_HEADER}" "${REPO_API}/releases/${RELEASE_ID}/assets" \
              | jq -r --arg name "$filename" '.[] | select(.name == $name) | .id' 2>/dev/null || true)
            for asset_id in ${ASSET_IDS}; do
              curl -s -X DELETE -H "${AUTH_HEADER}" \
                "${REPO_API}/releases/${RELEASE_ID}/assets/${asset_id}"
            done

            HTTP_CODE=$(curl -s -o /dev/null -w "%{http_code}" -X POST \
              -H "${AUTH_HEADER}" -H "Content-Type: application/octet-stream" \
              -T "$file" "${REPO_API}/releases/${RELEASE_ID}/assets?name=${encoded_name}" || true)
            echo "Upload ${filename}: HTTP ${HTTP_CODE}"

            case "${HTTP_CODE}" in
              2??) uploaded=$((uploaded + 1)) ;;
              *)
                echo "ERROR: Upload failed for ${filename}"
                failed=$((failed + 1))
                ;;
            esac
          done

          # A release job that uploaded nothing, or lost a file, must not pass.
          if [ "${failed}" -ne 0 ] || [ "${uploaded}" -eq 0 ]; then
            echo "ERROR: ${failed} upload(s) failed, ${uploaded} succeeded."
            exit 1
          fi
|
||||
100
.gitea/workflows/build-sidecar-linux.yml
Normal file
100
.gitea/workflows/build-sidecar-linux.yml
Normal file
@@ -0,0 +1,100 @@
|
||||
# Manually dispatched workflow: builds the headless sidecar with PyInstaller on
# Linux from an existing sidecar release tag, zips it and attaches the archive
# to the matching release through the repository REST API.
name: Build Sidecar (Linux)

on:
  workflow_dispatch:
    inputs:
      tag:
        description: 'Sidecar release tag to build (e.g. sidecar-v1.0.3)'
        required: true

jobs:
  build-sidecar-linux:
    name: Build Sidecar (Linux)
    runs-on: ubuntu-latest
    env:
      PYTHON_VERSION: "3.11"
      # Passed through the environment so the tag is never spliced into a script.
      RELEASE_TAG: "${{ inputs.tag }}"
    steps:
      - name: Show tag
        run: |
          echo "Building for tag: ${RELEASE_TAG}"

      - uses: actions/checkout@v4
        with:
          ref: ${{ inputs.tag }}

      - name: Install uv
        run: |
          if command -v uv &> /dev/null; then
            echo "uv already installed: $(uv --version)"
          else
            curl -LsSf https://astral.sh/uv/install.sh | sh
            echo "$HOME/.local/bin" >> "$GITHUB_PATH"
          fi

      - name: Set up Python
        run: uv python install ${{ env.PYTHON_VERSION }}

      - name: Install system dependencies
        run: |
          sudo apt-get update
          sudo apt-get install -y portaudio19-dev

      - name: Build sidecar (CPU)
        env:
          UV_NO_SOURCES: "1"
        run: |
          uv sync
          .venv/bin/pyinstaller local-transcription-headless.spec

      - name: Package sidecar (CPU)
        run: |
          cd dist/local-transcription-backend && zip -9 -r ../../sidecar-linux-x86_64-cpu.zip .

      - name: Upload to sidecar release
        env:
          BUILD_TOKEN: ${{ secrets.BUILD_TOKEN }}
        run: |
          sudo apt-get install -y jq
          REPO_API="${GITHUB_SERVER_URL}/api/v1/repos/${GITHUB_REPOSITORY}"
          AUTH_HEADER="Authorization: token ${BUILD_TOKEN}"
          TAG="${RELEASE_TAG}"

          # Fail fast instead of polling a meaningless URL for five minutes.
          if [ -z "${TAG}" ]; then
            echo "ERROR: RELEASE_TAG is empty"
            exit 1
          fi

          echo "Waiting for sidecar release ${TAG} to be available..."
          RELEASE_ID=""
          for i in $(seq 1 30); do
            # "|| true": a non-JSON reply or a network error must lead to a
            # retry, not abort the script when the shell runs with errexit.
            RELEASE_ID=$(curl -s -H "${AUTH_HEADER}" "${REPO_API}/releases/tags/${TAG}" \
              | jq -r '.id // empty' 2>/dev/null || true)

            if [ -n "${RELEASE_ID}" ]; then
              echo "Found sidecar release: ${TAG} (ID: ${RELEASE_ID})"
              break
            fi

            echo "Attempt ${i}/30: Release not ready yet, retrying in 10s..."
            sleep 10
          done

          if [ -z "${RELEASE_ID}" ]; then
            echo "ERROR: Failed to find sidecar release for tag ${TAG} after 30 attempts."
            exit 1
          fi

          uploaded=0
          failed=0
          for file in sidecar-*.zip; do
            # An unmatched glob stays literal; skip it instead of uploading it.
            [ -e "$file" ] || continue
            filename=$(basename "$file")
            # Percent-encode the whole name, not only spaces.
            encoded_name=$(jq -rn --arg v "$filename" '$v | @uri')
            echo "Uploading ${filename} ($(du -h "$file" | cut -f1))..."

            # Remove every existing asset with the same name so a re-run
            # replaces it. The name is passed with --arg, never interpolated
            # into the jq program.
            ASSET_IDS=$(curl -s -H "${AUTH_HEADER}" "${REPO_API}/releases/${RELEASE_ID}/assets" \
              | jq -r --arg name "$filename" '.[] | select(.name == $name) | .id' 2>/dev/null || true)
            for asset_id in ${ASSET_IDS}; do
              curl -s -X DELETE -H "${AUTH_HEADER}" \
                "${REPO_API}/releases/${RELEASE_ID}/assets/${asset_id}"
            done

            HTTP_CODE=$(curl -s -o /dev/null -w "%{http_code}" -X POST \
              -H "${AUTH_HEADER}" \
              -H "Content-Type: application/octet-stream" \
              -T "$file" \
              "${REPO_API}/releases/${RELEASE_ID}/assets?name=${encoded_name}" || true)
            echo "Upload response: HTTP ${HTTP_CODE}"

            case "${HTTP_CODE}" in
              2??) uploaded=$((uploaded + 1)) ;;
              *)
                echo "ERROR: Upload failed for ${filename}"
                failed=$((failed + 1))
                ;;
            esac
          done

          # A release job that uploaded nothing, or lost a file, must not pass.
          if [ "${failed}" -ne 0 ] || [ "${uploaded}" -eq 0 ]; then
            echo "ERROR: ${failed} upload(s) failed, ${uploaded} succeeded."
            exit 1
          fi
|
||||
101
.gitea/workflows/build-sidecar-macos.yml
Normal file
101
.gitea/workflows/build-sidecar-macos.yml
Normal file
@@ -0,0 +1,101 @@
|
||||
# Manually dispatched workflow: builds the headless sidecar with PyInstaller on
# macOS from an existing sidecar release tag, zips it and attaches the archive
# to the matching release through the repository REST API.
name: Build Sidecar (macOS)

on:
  workflow_dispatch:
    inputs:
      tag:
        description: 'Sidecar release tag to build (e.g. sidecar-v1.0.3)'
        required: true

jobs:
  build-sidecar-macos:
    name: Build Sidecar (macOS)
    runs-on: macos-latest
    env:
      PYTHON_VERSION: "3.11"
      # Passed through the environment so the tag is never spliced into a script.
      RELEASE_TAG: "${{ inputs.tag }}"
    steps:
      - name: Show tag
        run: |
          echo "Building for tag: ${RELEASE_TAG}"

      - uses: actions/checkout@v4
        with:
          ref: ${{ inputs.tag }}

      - name: Install uv
        run: |
          if command -v uv &> /dev/null; then
            echo "uv already installed: $(uv --version)"
          else
            curl -LsSf https://astral.sh/uv/install.sh | sh
            echo "$HOME/.local/bin" >> "$GITHUB_PATH"
          fi

      - name: Set up Python
        run: uv python install ${{ env.PYTHON_VERSION }}

      - name: Install system dependencies
        run: brew install portaudio

      - name: Build sidecar (CPU)
        env:
          UV_NO_SOURCES: "1"
        run: |
          # UV_NO_SOURCES bypasses pyproject.toml's [tool.uv.sources] which forces
          # torch from the CUDA index (no macOS ARM wheels there).
          # Default PyPI torch includes MPS (Apple Silicon GPU) support.
          uv sync
          .venv/bin/pyinstaller local-transcription-headless.spec

      - name: Package sidecar (CPU)
        run: |
          cd dist/local-transcription-backend && zip -r ../../sidecar-macos-aarch64-cpu.zip .

      - name: Upload to sidecar release
        env:
          BUILD_TOKEN: ${{ secrets.BUILD_TOKEN }}
        run: |
          which jq || brew install jq
          REPO_API="${GITHUB_SERVER_URL}/api/v1/repos/${GITHUB_REPOSITORY}"
          AUTH_HEADER="Authorization: token ${BUILD_TOKEN}"
          TAG="${RELEASE_TAG}"

          # Fail fast instead of polling a meaningless URL for five minutes.
          if [ -z "${TAG}" ]; then
            echo "ERROR: RELEASE_TAG is empty"
            exit 1
          fi

          echo "Waiting for sidecar release ${TAG} to be available..."
          RELEASE_ID=""
          for i in $(seq 1 30); do
            # "|| true": a non-JSON reply or a network error must lead to a
            # retry, not abort the script when the shell runs with errexit.
            RELEASE_ID=$(curl -s -H "${AUTH_HEADER}" "${REPO_API}/releases/tags/${TAG}" \
              | jq -r '.id // empty' 2>/dev/null || true)

            if [ -n "${RELEASE_ID}" ]; then
              echo "Found sidecar release: ${TAG} (ID: ${RELEASE_ID})"
              break
            fi

            echo "Attempt ${i}/30: Release not ready yet, retrying in 10s..."
            sleep 10
          done

          if [ -z "${RELEASE_ID}" ]; then
            echo "ERROR: Failed to find sidecar release for tag ${TAG} after 30 attempts."
            exit 1
          fi

          uploaded=0
          failed=0
          for file in sidecar-*.zip; do
            # An unmatched glob stays literal; skip it instead of uploading it.
            [ -e "$file" ] || continue
            filename=$(basename "$file")
            # Percent-encode the whole name, not only spaces.
            encoded_name=$(jq -rn --arg v "$filename" '$v | @uri')
            echo "Uploading ${filename} ($(du -h "$file" | cut -f1))..."

            # Remove every existing asset with the same name so a re-run
            # replaces it. The name is passed with --arg, never interpolated
            # into the jq program.
            ASSET_IDS=$(curl -s -H "${AUTH_HEADER}" "${REPO_API}/releases/${RELEASE_ID}/assets" \
              | jq -r --arg name "$filename" '.[] | select(.name == $name) | .id' 2>/dev/null || true)
            for asset_id in ${ASSET_IDS}; do
              curl -s -X DELETE -H "${AUTH_HEADER}" \
                "${REPO_API}/releases/${RELEASE_ID}/assets/${asset_id}"
            done

            HTTP_CODE=$(curl -s -o /dev/null -w "%{http_code}" -X POST \
              -H "${AUTH_HEADER}" \
              -H "Content-Type: application/octet-stream" \
              -T "$file" \
              "${REPO_API}/releases/${RELEASE_ID}/assets?name=${encoded_name}" || true)
            echo "Upload response: HTTP ${HTTP_CODE}"

            case "${HTTP_CODE}" in
              2??) uploaded=$((uploaded + 1)) ;;
              *)
                echo "ERROR: Upload failed for ${filename}"
                failed=$((failed + 1))
                ;;
            esac
          done

          # A release job that uploaded nothing, or lost a file, must not pass.
          if [ "${failed}" -ne 0 ] || [ "${uploaded}" -eq 0 ]; then
            echo "ERROR: ${failed} upload(s) failed, ${uploaded} succeeded."
            exit 1
          fi
|
||||
134
.gitea/workflows/build-sidecar-windows.yml
Normal file
134
.gitea/workflows/build-sidecar-windows.yml
Normal file
@@ -0,0 +1,134 @@
|
||||
# Manually dispatched workflow: builds the headless sidecar with PyInstaller on
# Windows from an existing sidecar release tag, zips it with 7-Zip and attaches
# the archive to the matching release through the repository REST API.
name: Build Sidecar (Windows)

on:
  workflow_dispatch:
    inputs:
      tag:
        description: 'Sidecar release tag to build (e.g. sidecar-v1.0.3)'
        required: true

jobs:
  build-sidecar-windows:
    name: Build Sidecar (Windows)
    runs-on: windows-latest
    env:
      PYTHON_VERSION: "3.11"
      # Passed through the environment so the tag is never spliced into a script.
      RELEASE_TAG: "${{ inputs.tag }}"
    steps:
      - name: Show tag
        shell: powershell
        run: |
          Write-Host "Building for tag: $env:RELEASE_TAG"

      - uses: actions/checkout@v4
        with:
          ref: ${{ inputs.tag }}

      - name: Install uv
        shell: powershell
        run: |
          if (Get-Command uv -ErrorAction SilentlyContinue) {
            Write-Host "uv already installed: $(uv --version)"
          } else {
            irm https://astral.sh/uv/install.ps1 | iex
            # The install location is not known up front: register whichever
            # of the candidate directories exists after installation.
            $uvPaths = @(
              "$env:USERPROFILE\.local\bin",
              "$env:USERPROFILE\.cargo\bin",
              "$env:LOCALAPPDATA\uv\bin"
            )
            foreach ($p in $uvPaths) {
              if (Test-Path $p) {
                echo $p | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append
              }
            }
          }

      - name: Set up Python
        shell: powershell
        run: uv python install ${{ env.PYTHON_VERSION }}

      - name: Install 7-Zip
        shell: powershell
        run: |
          if (-not (Get-Command 7z -ErrorAction SilentlyContinue)) {
            choco install 7zip -y
          }

      - name: Build sidecar (CPU)
        shell: powershell
        env:
          UV_NO_SOURCES: "1"
        run: |
          uv sync
          .venv\Scripts\pyinstaller.exe local-transcription-headless.spec

      - name: Package sidecar (CPU)
        shell: powershell
        run: |
          7z a -tzip -mx=9 sidecar-windows-x86_64-cpu.zip .\dist\local-transcription-backend\*

      - name: Upload to sidecar release
        shell: powershell
        env:
          BUILD_TOKEN: ${{ secrets.BUILD_TOKEN }}
        run: |
          $REPO_API = "${{ github.server_url }}/api/v1/repos/${{ github.repository }}"
          $Headers = @{ "Authorization" = "token $env:BUILD_TOKEN" }
          $TAG = $env:RELEASE_TAG
          Write-Host "Release tag: $TAG"

          if (-not $TAG) {
            Write-Host "ERROR: RELEASE_TAG is empty"
            exit 1
          }

          Write-Host "Waiting for sidecar release $TAG to be available..."
          $RELEASE_ID = $null

          for ($i = 1; $i -le 30; $i++) {
            try {
              $release = Invoke-RestMethod -Uri "$REPO_API/releases/tags/$TAG" -Headers $Headers -ErrorAction Stop
              $RELEASE_ID = $release.id

              if ($RELEASE_ID) {
                Write-Host "Found sidecar release: $TAG (ID: $RELEASE_ID)"
                break
              }
            } catch {
              # A failed lookup is expected while the release does not exist
              # yet; fall through to the retry below.
            }

            Write-Host "Attempt ${i}/30: Release not ready yet, retrying in 10s..."
            Start-Sleep -Seconds 10
          }

          if (-not $RELEASE_ID) {
            Write-Host "ERROR: Failed to find sidecar release for tag $TAG after 30 attempts."
            exit 1
          }

          $uploaded = 0
          $failed = 0

          foreach ($archive in (Get-ChildItem -Path . -Filter "sidecar-*.zip")) {
            $filename = $archive.Name
            $encodedName = [System.Uri]::EscapeDataString($filename)
            $size = [math]::Round($archive.Length / 1MB, 1)
            Write-Host "Uploading $filename ($size MB)..."

            # Delete every existing asset with the same name so a re-run
            # replaces it. A failure here is reported but not fatal; the
            # upload result below decides the outcome.
            try {
              $assets = Invoke-RestMethod -Uri "$REPO_API/releases/$RELEASE_ID/assets" -Headers $Headers
              foreach ($asset in @($assets | Where-Object { $_.name -eq $filename })) {
                Invoke-RestMethod -Uri "$REPO_API/releases/$RELEASE_ID/assets/$($asset.id)" -Method Delete -Headers $Headers
              }
            } catch {
              Write-Host "WARNING: Could not remove existing asset ${filename}: $($_.Exception.Message)"
            }

            $uploadUrl = "$REPO_API/releases/$RELEASE_ID/assets?name=$encodedName"
            $result = curl.exe --fail --silent --show-error `
              -X POST `
              -H "Authorization: token $env:BUILD_TOKEN" `
              -H "Content-Type: application/octet-stream" `
              -T "$($archive.FullName)" `
              "$uploadUrl" 2>&1
            if ($LASTEXITCODE -eq 0) {
              Write-Host "Upload successful: $filename"
              $uploaded++
            } else {
              Write-Host "ERROR: Upload failed for ${filename}: $result"
              $failed++
            }
          }

          # A release job that uploaded nothing, or lost a file, must not pass.
          if ($failed -gt 0 -or $uploaded -eq 0) {
            Write-Host "ERROR: $failed upload(s) failed, $uploaded succeeded."
            exit 1
          }
|
||||
122
.gitea/workflows/release.yml
Normal file
122
.gitea/workflows/release.yml
Normal file
@@ -0,0 +1,122 @@
|
||||
---
# Release coordinator for the desktop app.
#
# Manually triggered. Runs the frontend and Python test suites, bumps the
# patch version in package.json, src-tauri/tauri.conf.json,
# src-tauri/Cargo.toml and version.py, commits and tags v<version>, creates
# the Gitea release, then dispatches the per-OS app build workflows.
#
# Requires the BUILD_TOKEN secret (used for git push and the Gitea API).
name: Release

on:
  workflow_dispatch:

jobs:
  test:
    name: Run Tests
    # NOTE(review): head_commit is normally only present on push events, so
    # with the workflow_dispatch-only trigger this guard is presumably always
    # true. Kept so behavior is unchanged if a push trigger is added back.
    if: "!contains(github.event.head_commit.message, '[skip ci]')"
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4

      - uses: actions/setup-node@v4
        with:
          node-version: "20"

      - name: Install npm deps
        run: npm ci

      - name: Frontend tests
        run: npx vitest run

      - name: Install uv
        run: |
          curl -LsSf https://astral.sh/uv/install.sh | sh
          echo "$HOME/.local/bin" >> "$GITHUB_PATH"

      - name: Python tests
        run: |
          uv venv .testvenv
          VIRTUAL_ENV=.testvenv uv pip install pytest httpx pytest-asyncio anyio fastapi pydantic pyyaml uvicorn requests
          .testvenv/bin/python -m pytest backend/tests/ client/tests/ -v --tb=short

  bump-version:
    name: Bump version and tag
    needs: test
    runs-on: ubuntu-latest
    outputs:
      new_version: ${{ steps.bump.outputs.new_version }}
      tag: ${{ steps.bump.outputs.tag }}
    steps:
      - uses: actions/checkout@v4
        with:
          fetch-depth: 0

      - name: Configure git
        run: |
          git config user.name "Gitea Actions"
          git config user.email "actions@gitea.local"

      - name: Bump patch version
        id: bump
        run: |
          CURRENT=$(grep '"version"' package.json | head -1 | sed 's/.*"version": *"\([^"]*\)".*/\1/')
          echo "Current version: ${CURRENT}"

          # Refuse to continue on anything that is not MAJOR.MINOR.PATCH;
          # otherwise the arithmetic below silently produces a bogus version.
          if ! printf '%s\n' "${CURRENT}" | grep -Eq '^[0-9]+\.[0-9]+\.[0-9]+$'; then
            echo "ERROR: could not parse a MAJOR.MINOR.PATCH version from package.json (got '${CURRENT}')"
            exit 1
          fi

          MAJOR=$(echo "${CURRENT}" | cut -d. -f1)
          MINOR=$(echo "${CURRENT}" | cut -d. -f2)
          PATCH=$(echo "${CURRENT}" | cut -d. -f3)
          NEW_PATCH=$((PATCH + 1))
          NEW_VERSION="${MAJOR}.${MINOR}.${NEW_PATCH}"
          echo "New version: ${NEW_VERSION}"

          # Escape the dots so the version is matched literally by sed.
          CURRENT_RE=$(printf '%s' "${CURRENT}" | sed 's/\./\\./g')

          sed -i "s/\"version\": \"${CURRENT_RE}\"/\"version\": \"${NEW_VERSION}\"/" package.json
          sed -i "s/\"version\": \"${CURRENT_RE}\"/\"version\": \"${NEW_VERSION}\"/" src-tauri/tauri.conf.json
          sed -i "s/^version = \"${CURRENT_RE}\"/version = \"${NEW_VERSION}\"/" src-tauri/Cargo.toml
          sed -i "s/__version__ = \"${CURRENT_RE}\"/__version__ = \"${NEW_VERSION}\"/" version.py
          sed -i "s/__version_info__ = .*/__version_info__ = (${MAJOR}, ${MINOR}, ${NEW_PATCH})/" version.py

          echo "new_version=${NEW_VERSION}" >> "$GITHUB_OUTPUT"
          echo "tag=v${NEW_VERSION}" >> "$GITHUB_OUTPUT"

      - name: Commit and tag
        env:
          BUILD_TOKEN: ${{ secrets.BUILD_TOKEN }}
        run: |
          NEW_VERSION="${{ steps.bump.outputs.new_version }}"
          git add package.json src-tauri/tauri.conf.json src-tauri/Cargo.toml version.py
          git commit -m "chore: bump version to ${NEW_VERSION} [skip ci]"

          REMOTE_URL=$(git remote get-url origin | sed "s|://|://gitea-actions:${BUILD_TOKEN}@|")

          # Best-effort sync with main. If the rebase cannot complete, abort
          # it so HEAD is the bump commit again instead of a half-rebased state.
          if ! git pull --rebase "${REMOTE_URL}" main; then
            echo "WARNING: rebase onto main failed; pushing the bump commit as-is"
            git rebase --abort 2>/dev/null || true
          fi

          # Tag AFTER the rebase: rebasing rewrites the bump commit, and a tag
          # created earlier would point at a commit that never lands on main.
          git tag "v${NEW_VERSION}"

          git push "${REMOTE_URL}" HEAD:main
          git push "${REMOTE_URL}" "v${NEW_VERSION}"

      - name: Create Gitea release
        env:
          BUILD_TOKEN: ${{ secrets.BUILD_TOKEN }}
        run: |
          REPO_API="${GITHUB_SERVER_URL}/api/v1/repos/${GITHUB_REPOSITORY}"
          TAG="${{ steps.bump.outputs.tag }}"
          RELEASE_NAME="Local Transcription ${TAG}"

          # Capture the status code so an API error fails the step instead of
          # being reported as a created release.
          HTTP_CODE=$(curl -s -w "%{http_code}" -o /tmp/release_resp.txt -X POST \
            -H "Authorization: token ${BUILD_TOKEN}" \
            -H "Content-Type: application/json" \
            -d "{\"tag_name\": \"${TAG}\", \"name\": \"${RELEASE_NAME}\", \"body\": \"Automated build.\", \"draft\": false, \"prerelease\": false}" \
            "${REPO_API}/releases") || HTTP_CODE="000"

          cat /tmp/release_resp.txt 2>/dev/null || true
          echo ""

          case "${HTTP_CODE}" in
            2??)
              echo "Created release: ${RELEASE_NAME}"
              ;;
            *)
              echo "ERROR: creating release ${TAG} failed (HTTP ${HTTP_CODE})"
              exit 1
              ;;
          esac

      - name: Trigger per-OS app builds
        env:
          BUILD_TOKEN: ${{ secrets.BUILD_TOKEN }}
        run: |
          REPO_API="${GITHUB_SERVER_URL}/api/v1/repos/${GITHUB_REPOSITORY}"
          TAG="${{ steps.bump.outputs.tag }}"
          FAILED=0

          for workflow in build-app-linux.yml build-app-windows.yml build-app-macos.yml; do
            echo "Dispatching ${workflow} for ${TAG}..."
            # Start from an empty file so a failed request never shows a stale body.
            : > /tmp/dispatch_resp.txt
            HTTP_CODE=$(curl -s -w "%{http_code}" -o /tmp/dispatch_resp.txt -X POST \
              -H "Authorization: token ${BUILD_TOKEN}" \
              -H "Content-Type: application/json" \
              -d "{\"ref\": \"main\", \"inputs\": {\"tag\": \"${TAG}\"}}" \
              "${REPO_API}/actions/workflows/${workflow}/dispatches") || HTTP_CODE="000"
            echo " -> HTTP ${HTTP_CODE}"

            # Use if/fi rather than a trailing "[ ... ] && ..." chain: as the
            # last command of the loop, a false test would make the whole
            # script exit non-zero even when every dispatch succeeded.
            if [ "${HTTP_CODE}" != "204" ]; then
              cat /tmp/dispatch_resp.txt 2>/dev/null || true
              echo ""
            fi

            case "${HTTP_CODE}" in
              2??) ;;
              *)
                echo "ERROR: dispatching ${workflow} failed (HTTP ${HTTP_CODE})"
                FAILED=1
                ;;
            esac
          done

          # Try every workflow first, then fail once if any dispatch was rejected.
          if [ "${FAILED}" -ne 0 ]; then
            exit 1
          fi

      # NOTE: Automatic cleanup disabled -- it races with async builds.
      # Clean up old releases manually from the Gitea UI when needed.
|
||||
132
.gitea/workflows/sidecar-release.yml
Normal file
132
.gitea/workflows/sidecar-release.yml
Normal file
@@ -0,0 +1,132 @@
|
||||
---
# Release coordinator for the Python sidecar.
#
# Manually triggered. Runs the Python tests, bumps the patch version in
# pyproject.toml, commits and tags sidecar-v<version>, creates the Gitea
# release, then dispatches the per-OS sidecar build workflows.
#
# Requires the BUILD_TOKEN secret (used for git push and the Gitea API).
name: Sidecar Release

on:
  workflow_dispatch:

jobs:
  test:
    name: Run Tests
    # NOTE(review): head_commit is normally only present on push events, so
    # with the workflow_dispatch-only trigger this guard is presumably always
    # true. Kept so behavior is unchanged if a push trigger is added back.
    if: "!contains(github.event.head_commit.message, '[skip ci]')"
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4

      - name: Install uv
        run: |
          curl -LsSf https://astral.sh/uv/install.sh | sh
          echo "$HOME/.local/bin" >> "$GITHUB_PATH"

      - name: Python tests
        run: |
          uv venv .testvenv
          VIRTUAL_ENV=.testvenv uv pip install pytest httpx pytest-asyncio anyio fastapi pydantic pyyaml uvicorn requests
          .testvenv/bin/python -m pytest backend/tests/ client/tests/ -v --tb=short

  bump-sidecar-version:
    name: Bump sidecar version and tag
    needs: test
    if: "!contains(github.event.head_commit.message, '[skip ci]')"
    runs-on: ubuntu-latest
    outputs:
      version: ${{ steps.bump.outputs.version }}
      tag: ${{ steps.bump.outputs.tag }}
      has_changes: ${{ steps.check_changes.outputs.has_changes }}
    steps:
      - uses: actions/checkout@v4
        with:
          # Two commits are needed for the HEAD~1..HEAD diff below.
          fetch-depth: 2

      - name: Check for backend changes
        id: check_changes
        run: |
          # A manual run always builds; the diff check only applies to other events.
          if [ "${{ github.event_name }}" = "workflow_dispatch" ]; then
            echo "has_changes=true" >> "$GITHUB_OUTPUT"
            exit 0
          fi
          CHANGED=$(git diff --name-only HEAD~1 HEAD -- client/ server/ backend/ pyproject.toml local-transcription-headless.spec 2>/dev/null || echo "")
          if [ -n "$CHANGED" ]; then
            echo "has_changes=true" >> "$GITHUB_OUTPUT"
            echo "Backend changes detected: $CHANGED"
          else
            echo "has_changes=false" >> "$GITHUB_OUTPUT"
            echo "No backend changes detected, skipping sidecar build"
          fi

      - name: Configure git
        if: steps.check_changes.outputs.has_changes == 'true'
        run: |
          git config user.name "Gitea Actions"
          git config user.email "actions@gitea.local"

      - name: Bump sidecar patch version
        if: steps.check_changes.outputs.has_changes == 'true'
        id: bump
        run: |
          CURRENT=$(grep '^version = ' pyproject.toml | head -1 | sed 's/version = "\(.*\)"/\1/')
          echo "Current sidecar version: ${CURRENT}"

          # Refuse to continue on anything that is not MAJOR.MINOR.PATCH;
          # otherwise the arithmetic below silently produces a bogus version.
          if ! printf '%s\n' "${CURRENT}" | grep -Eq '^[0-9]+\.[0-9]+\.[0-9]+$'; then
            echo "ERROR: could not parse a MAJOR.MINOR.PATCH version from pyproject.toml (got '${CURRENT}')"
            exit 1
          fi

          MAJOR=$(echo "${CURRENT}" | cut -d. -f1)
          MINOR=$(echo "${CURRENT}" | cut -d. -f2)
          PATCH=$(echo "${CURRENT}" | cut -d. -f3)
          NEW_PATCH=$((PATCH + 1))
          NEW_VERSION="${MAJOR}.${MINOR}.${NEW_PATCH}"
          echo "New sidecar version: ${NEW_VERSION}"

          # Escape the dots so the version is matched literally by sed.
          CURRENT_RE=$(printf '%s' "${CURRENT}" | sed 's/\./\\./g')

          sed -i "s/^version = \"${CURRENT_RE}\"/version = \"${NEW_VERSION}\"/" pyproject.toml

          echo "version=${NEW_VERSION}" >> "$GITHUB_OUTPUT"
          echo "tag=sidecar-v${NEW_VERSION}" >> "$GITHUB_OUTPUT"

      - name: Commit and tag
        if: steps.check_changes.outputs.has_changes == 'true'
        env:
          BUILD_TOKEN: ${{ secrets.BUILD_TOKEN }}
        run: |
          NEW_VERSION="${{ steps.bump.outputs.version }}"
          TAG="${{ steps.bump.outputs.tag }}"
          git add pyproject.toml
          git commit -m "chore: bump sidecar version to ${NEW_VERSION} [skip ci]"

          REMOTE_URL=$(git remote get-url origin | sed "s|://|://gitea-actions:${BUILD_TOKEN}@|")

          # Best-effort sync with main. If the rebase cannot complete, abort
          # it so HEAD is the bump commit again instead of a half-rebased state.
          if ! git pull --rebase "${REMOTE_URL}" main; then
            echo "WARNING: rebase onto main failed; pushing the bump commit as-is"
            git rebase --abort 2>/dev/null || true
          fi

          # Tag AFTER the rebase: rebasing rewrites the bump commit, and a tag
          # created earlier would point at a commit that never lands on main.
          git tag "${TAG}"

          git push "${REMOTE_URL}" HEAD:main
          git push "${REMOTE_URL}" "${TAG}"

      - name: Create Gitea release
        if: steps.check_changes.outputs.has_changes == 'true'
        env:
          BUILD_TOKEN: ${{ secrets.BUILD_TOKEN }}
        run: |
          REPO_API="${GITHUB_SERVER_URL}/api/v1/repos/${GITHUB_REPOSITORY}"
          TAG="${{ steps.bump.outputs.tag }}"
          VERSION="${{ steps.bump.outputs.version }}"
          RELEASE_NAME="Sidecar v${VERSION}"

          # Capture the status code so an API error fails the step instead of
          # being reported as a created release.
          HTTP_CODE=$(curl -s -w "%{http_code}" -o /tmp/release_resp.txt -X POST \
            -H "Authorization: token ${BUILD_TOKEN}" \
            -H "Content-Type: application/json" \
            -d "{\"tag_name\": \"${TAG}\", \"name\": \"${RELEASE_NAME}\", \"body\": \"Automated sidecar build.\", \"draft\": false, \"prerelease\": false}" \
            "${REPO_API}/releases") || HTTP_CODE="000"

          cat /tmp/release_resp.txt 2>/dev/null || true
          echo ""

          case "${HTTP_CODE}" in
            2??)
              echo "Created release: ${RELEASE_NAME}"
              ;;
            *)
              echo "ERROR: creating release ${TAG} failed (HTTP ${HTTP_CODE})"
              exit 1
              ;;
          esac

      - name: Trigger per-OS sidecar builds
        if: steps.check_changes.outputs.has_changes == 'true'
        env:
          BUILD_TOKEN: ${{ secrets.BUILD_TOKEN }}
        run: |
          REPO_API="${GITHUB_SERVER_URL}/api/v1/repos/${GITHUB_REPOSITORY}"
          TAG="${{ steps.bump.outputs.tag }}"
          FAILED=0

          for workflow in build-sidecar-linux.yml build-sidecar-windows.yml build-sidecar-macos.yml build-sidecar-cloud.yml; do
            echo "Dispatching ${workflow} for ${TAG}..."
            # Keep the response body so a rejected dispatch can be diagnosed
            # from the log; start from an empty file to avoid stale output.
            : > /tmp/dispatch_resp.txt
            HTTP_CODE=$(curl -s -w "%{http_code}" -o /tmp/dispatch_resp.txt -X POST \
              -H "Authorization: token ${BUILD_TOKEN}" \
              -H "Content-Type: application/json" \
              -d "{\"ref\": \"main\", \"inputs\": {\"tag\": \"${TAG}\"}}" \
              "${REPO_API}/actions/workflows/${workflow}/dispatches") || HTTP_CODE="000"
            echo " -> HTTP ${HTTP_CODE}"

            if [ "${HTTP_CODE}" != "204" ]; then
              cat /tmp/dispatch_resp.txt 2>/dev/null || true
              echo ""
            fi

            case "${HTTP_CODE}" in
              2??) ;;
              *)
                echo "ERROR: dispatching ${workflow} failed (HTTP ${HTTP_CODE})"
                FAILED=1
                ;;
            esac
          done

          # Try every workflow first, then fail once if any dispatch was rejected.
          if [ "${FAILED}" -ne 0 ]; then
            exit 1
          fi

      # NOTE: Automatic cleanup disabled -- it races with async builds.
      # Clean up old releases manually from the Gitea UI when needed.
|
||||
66
.gitea/workflows/test.yml
Normal file
66
.gitea/workflows/test.yml
Normal file
@@ -0,0 +1,66 @@
|
||||
# CI test suite: three independent jobs (Python, frontend, Rust) that run on
# pushes and pull requests targeting main, and on manual dispatch.
name: Tests

on:
  push:
    branches:
      - main
  pull_request:
    branches:
      - main
  workflow_dispatch:

jobs:
  # pytest over backend/tests and client/tests in a throwaway uv virtualenv.
  python-tests:
    name: Python Backend Tests
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4

      - name: Install uv
        run: |
          # Install uv only when it is not already on PATH.
          if ! command -v uv >/dev/null 2>&1; then
            curl -LsSf https://astral.sh/uv/install.sh | sh
            echo "$HOME/.local/bin" >> $GITHUB_PATH
          else
            echo "uv already installed: $(uv --version)"
          fi

      - name: Run pytest
        run: |
          uv venv .testvenv
          VIRTUAL_ENV=.testvenv uv pip install pytest httpx pytest-asyncio anyio fastapi pydantic pyyaml uvicorn requests
          .testvenv/bin/python -m pytest backend/tests/ client/tests/ -v --tb=short

  # Vitest over the frontend, using the lockfile-exact npm install.
  frontend-tests:
    name: Frontend Tests
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4

      - uses: actions/setup-node@v4
        with:
          node-version: 20

      - name: Install dependencies
        run: npm ci

      - name: Run Vitest
        run: npx vitest run

  # cargo test inside src-tauri, after installing the stable toolchain and
  # the system libraries listed below.
  rust-tests:
    name: Rust Sidecar Tests
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4

      - name: Install Rust
        run: |
          curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y --default-toolchain stable
          echo "$HOME/.cargo/bin" >> $GITHUB_PATH

      - name: Install Tauri system dependencies
        run: |
          sudo apt-get update
          sudo apt-get install -y libgtk-3-dev libwebkit2gtk-4.1-dev libappindicator3-dev librsvg2-dev patchelf

      - name: Run cargo test
        working-directory: src-tauri
        run: cargo test
|
||||
16
.gitignore
vendored
16
.gitignore
vendored
@@ -10,8 +10,8 @@ dist/
|
||||
downloads/
|
||||
eggs/
|
||||
.eggs/
|
||||
lib/
|
||||
lib64/
|
||||
/lib/
|
||||
/lib64/
|
||||
parts/
|
||||
sdist/
|
||||
var/
|
||||
@@ -54,3 +54,15 @@ models/
|
||||
|
||||
# PyInstaller
|
||||
*.spec.lock
|
||||
|
||||
# Node.js
|
||||
node_modules/
|
||||
|
||||
# Vite / Svelte build output
|
||||
dist/
|
||||
|
||||
# Tauri
|
||||
src-tauri/target/
|
||||
|
||||
# Windows NTFS alternate data streams
|
||||
*:Zone.Identifier
|
||||
|
||||
Binary file not shown.
420
CLAUDE.md
420
CLAUDE.md
@@ -4,52 +4,114 @@ This file provides guidance to Claude Code (claude.ai/code) when working with co
|
||||
|
||||
## Project Overview
|
||||
|
||||
Local Transcription is a desktop application for real-time speech-to-text transcription designed for streamers. It uses Whisper models (via faster-whisper) to transcribe audio locally with optional multi-user server synchronization.
|
||||
Local Transcription is a cross-platform desktop application for real-time speech-to-text transcription designed for streamers. It supports local Whisper models and cloud-based Deepgram transcription, with OBS browser source integration and optional multi-user sync.
|
||||
|
||||
**Architecture:** Two-process model — a Tauri v2 shell (Svelte 5 frontend) communicates with a headless Python backend (sidecar) via REST API and WebSocket.
|
||||
|
||||
**Key Features:**
|
||||
- Standalone desktop GUI (PySide6/Qt)
|
||||
- Local transcription with CPU/GPU support
|
||||
- Built-in web server for OBS browser source integration
|
||||
- Optional Node.js-based multi-user server for syncing transcriptions across users
|
||||
- Noise suppression and Voice Activity Detection (VAD)
|
||||
- Cross-platform builds (Linux/Windows) with PyInstaller
|
||||
- Cross-platform desktop app (Windows, macOS, Linux) via Tauri v2 + Svelte 5
|
||||
- Headless Python backend with FastAPI control API
|
||||
- Dual transcription modes: local Whisper or cloud Deepgram (managed/BYOK)
|
||||
- Built-in web server for OBS browser source at `http://localhost:8080`
|
||||
- Optional multi-user sync via Node.js server
|
||||
- CUDA, MPS (Apple Silicon), and CPU support
|
||||
- Auto-updates, custom fonts, configurable colors
|
||||
|
||||
> **Legacy GUI:** The original PySide6/Qt GUI (`main.py`, `gui/`) still works during the transition. New features should target the Tauri frontend and headless backend.
|
||||
|
||||
## Project Structure
|
||||
|
||||
```
|
||||
local-transcription/
|
||||
├── client/ # Core transcription logic
|
||||
│ ├── audio_capture.py # Audio input and buffering
|
||||
│ ├── transcription_engine.py # Whisper model integration
|
||||
├── src/ # Svelte 5 frontend (Tauri UI)
|
||||
│ ├── App.svelte # Main app shell
|
||||
│ ├── app.css # Global dark theme styles
|
||||
│ ├── main.ts # Svelte mount point
|
||||
│ ├── lib/components/ # UI components
|
||||
│ │ ├── Header.svelte # Title bar + settings button
|
||||
│ │ ├── StatusBar.svelte # State indicator, device, user info
|
||||
│ │ ├── Controls.svelte # Start/Stop, Clear, Save buttons
|
||||
│ │ ├── TranscriptionDisplay.svelte # Scrolling transcript view
|
||||
│ │ └── Settings.svelte # Full settings modal (all sections)
|
||||
│ └── lib/stores/ # Svelte 5 reactive stores ($state/$derived)
|
||||
│ ├── backend.ts # WebSocket + REST API client
|
||||
│ ├── config.ts # App configuration fetch/update
|
||||
│ └── transcriptions.ts # Transcript data management
|
||||
├── src-tauri/ # Tauri v2 Rust shell
|
||||
│ ├── src/lib.rs # Plugin registration (shell, dialog, process)
|
||||
│ ├── src/main.rs # Entry point
|
||||
│ ├── tauri.conf.json # Window, bundle, plugin config
|
||||
│ └── Cargo.toml # Rust dependencies
|
||||
├── backend/ # Headless Python backend (the sidecar)
|
||||
│ ├── app_controller.py # Core orchestration (engine, sync, config)
|
||||
│ ├── api_server.py # FastAPI REST endpoints + /ws/control
|
||||
│ └── main_headless.py # Headless entry point (prints JSON to stdout)
|
||||
├── client/ # Core transcription modules (used by backend)
|
||||
│ ├── audio_capture.py # Audio input handling
|
||||
│ ├── transcription_engine_realtime.py # RealtimeSTT / Whisper engine
|
||||
│ ├── deepgram_transcription.py # Deepgram WebSocket cloud transcription
|
||||
│ ├── noise_suppression.py # VAD and noise reduction
|
||||
│ ├── device_utils.py # CPU/GPU device management
|
||||
│ ├── config.py # Configuration management
|
||||
│ └── server_sync.py # Multi-user server sync client
|
||||
├── gui/ # Desktop application UI
|
||||
│ ├── main_window_qt.py # Main application window (PySide6)
|
||||
│ ├── settings_dialog_qt.py # Settings dialog (PySide6)
|
||||
│ ├── device_utils.py # CPU/GPU/MPS detection
|
||||
│ ├── config.py # YAML config management (~/.local-transcription/)
|
||||
│ ├── server_sync.py # Multi-user server sync client
|
||||
│ ├── instance_lock.py # Single-instance PID lock
|
||||
│ └── update_checker.py # Gitea release update checker
|
||||
├── gui/ # Legacy PySide6/Qt GUI (still functional)
|
||||
│ ├── main_window_qt.py # Main window (orchestration lives here in legacy)
|
||||
│ ├── settings_dialog_qt.py # Settings dialog
|
||||
│ └── transcription_display_qt.py # Display widget
|
||||
├── server/ # Web display servers
|
||||
│ ├── web_display.py # FastAPI server for OBS browser source (local)
|
||||
│ └── nodejs/ # Optional multi-user Node.js server
|
||||
│ ├── server.js # Multi-user sync server with WebSocket
|
||||
│ ├── package.json # Node.js dependencies
|
||||
│ └── README.md # Server deployment documentation
|
||||
├── config/ # Example configuration files
|
||||
│ └── default_config.yaml # Default settings template
|
||||
├── main.py # GUI application entry point
|
||||
├── server/
|
||||
│ ├── web_display.py # FastAPI OBS display server (WebSocket + HTML)
|
||||
│ └── nodejs/ # Optional multi-user sync server
|
||||
├── .gitea/workflows/ # CI/CD
|
||||
│ ├── release.yml # Coordinator: version bump, tag, release creation
|
||||
│ ├── build-app-linux.yml # Linux Tauri app build (triggered by v* tag)
|
||||
│ ├── build-app-windows.yml # Windows Tauri app build (triggered by v* tag)
|
||||
│ ├── build-app-macos.yml # macOS Tauri app build (triggered by v* tag)
|
||||
│ ├── sidecar-release.yml # Sidecar coordinator: version bump, tag, release
|
||||
│ ├── build-sidecar-linux.yml # Linux sidecar build (triggered by sidecar-v* tag)
|
||||
│ ├── build-sidecar-windows.yml # Windows sidecar build (triggered by sidecar-v* tag)
|
||||
│ └── build-sidecar-macos.yml # macOS sidecar build (triggered by sidecar-v* tag)
|
||||
├── config/default_config.yaml # Default settings template
|
||||
├── main.py # Legacy PySide6 GUI entry point
|
||||
├── main_cli.py # CLI version for testing
|
||||
└── pyproject.toml # Dependencies and build config
|
||||
├── version.py # Version string (__version__)
|
||||
├── local-transcription.spec # PyInstaller config (legacy, includes PySide6)
|
||||
├── local-transcription-headless.spec # PyInstaller config (headless sidecar, no Qt)
|
||||
├── pyproject.toml # Python deps (uv, CUDA PyTorch index)
|
||||
├── package.json # Node/Tauri deps
|
||||
└── vite.config.ts # Vite build config ($lib alias)
|
||||
```
|
||||
|
||||
## Development Commands
|
||||
|
||||
### Installation and Setup
|
||||
### Frontend (Tauri + Svelte)
|
||||
```bash
|
||||
# Install dependencies (creates .venv automatically)
|
||||
# Install npm dependencies
|
||||
npm install
|
||||
|
||||
# Run Tauri in development mode (hot-reload)
|
||||
npm run tauri dev
|
||||
|
||||
# Build frontend only (for testing)
|
||||
npx vite build
|
||||
|
||||
# Type-check Svelte
|
||||
npx svelte-check
|
||||
|
||||
# Check Rust compiles
|
||||
cd src-tauri && cargo check
|
||||
```
|
||||
|
||||
### Backend (Python)
|
||||
```bash
|
||||
# Install Python dependencies
|
||||
uv sync
|
||||
|
||||
# Run the GUI application
|
||||
# Run the headless backend standalone (for development)
|
||||
uv run python -m backend.main_headless --port 8080
|
||||
|
||||
# Run the legacy PySide6 GUI
|
||||
uv run python main.py
|
||||
|
||||
# Run CLI version (headless, for testing)
|
||||
@@ -57,257 +119,163 @@ uv run python main_cli.py
|
||||
|
||||
# List available audio devices
|
||||
uv run python main_cli.py --list-devices
|
||||
|
||||
# Install with CUDA support (if needed)
|
||||
uv pip install torch --index-url https://download.pytorch.org/whl/cu121
|
||||
```
|
||||
|
||||
### Building Executables
|
||||
### Building
|
||||
```bash
|
||||
# Linux (includes CUDA support - works on both GPU and CPU systems)
|
||||
./build.sh
|
||||
# Build Tauri app (produces platform installer)
|
||||
npm run tauri build
|
||||
|
||||
# Windows (includes CUDA support - works on both GPU and CPU systems)
|
||||
build.bat
|
||||
# Build headless Python sidecar (no PySide6)
|
||||
uv run pyinstaller local-transcription-headless.spec
|
||||
# Output: dist/local-transcription-backend/
|
||||
|
||||
# Manual build with PyInstaller
|
||||
uv sync # Install dependencies (includes CUDA PyTorch)
|
||||
uv pip uninstall -q enum34 # Remove incompatible enum34 package
|
||||
# Build legacy PySide6 app
|
||||
uv run pyinstaller local-transcription.spec
|
||||
# Or use: ./build.sh (Linux) / build.bat (Windows)
|
||||
```
|
||||
|
||||
**Important:** All builds include CUDA support via `pyproject.toml` configuration. CUDA builds can be created on systems without NVIDIA GPUs. The PyTorch CUDA runtime is bundled, and the app automatically falls back to CPU if no GPU is available.
|
||||
|
||||
### Testing
|
||||
```bash
|
||||
# Run component tests
|
||||
uv run python test_components.py
|
||||
|
||||
# Check CUDA availability
|
||||
uv run python check_cuda.py
|
||||
|
||||
# Test web server manually
|
||||
uv run python -m uvicorn server.web_display:app --reload
|
||||
```
|
||||
|
||||
## Architecture
|
||||
## Architecture Details
|
||||
|
||||
### Audio Processing Pipeline
|
||||
### Communication: Tauri <-> Python Backend
|
||||
|
||||
1. **Audio Capture** ([client/audio_capture.py](client/audio_capture.py))
|
||||
- Captures audio from microphone/system using sounddevice
|
||||
- Handles automatic sample rate detection and resampling
|
||||
- Uses chunking with overlap for better transcription quality
|
||||
- Default: 3-second chunks with 0.5s overlap
|
||||
The Svelte frontend connects to the Python backend via two channels:
|
||||
|
||||
2. **Noise Suppression** ([client/noise_suppression.py](client/noise_suppression.py))
|
||||
- Applies noisereduce for background noise reduction
|
||||
- Voice Activity Detection (VAD) using webrtcvad
|
||||
- Skips silent segments to improve performance
|
||||
**REST API** (on port 8081 by default):
|
||||
- `GET /api/status` — app state, device info, version
|
||||
- `POST /api/start` / `POST /api/stop` — transcription control
|
||||
- `GET /api/config` / `PUT /api/config` — read/write settings (dot-notation keys)
|
||||
- `GET /api/audio-devices` / `GET /api/compute-devices` — device enumeration
|
||||
- `POST /api/reload-engine` — reload with new model/device
|
||||
- `GET /api/transcriptions` / `POST /api/clear` — transcript management
|
||||
- `POST /api/save-file` — write text to a file path
|
||||
- `GET /api/check-update` / `POST /api/skip-version` — update management
|
||||
- `POST /api/login` / `POST /api/register` / `GET /api/balance` — managed mode proxy
|
||||
|
||||
3. **Transcription** ([client/transcription_engine.py](client/transcription_engine.py))
|
||||
- Uses faster-whisper for efficient inference
|
||||
- Supports CPU, CUDA, and Apple MPS (Mac)
|
||||
- Models: tiny, base, small, medium, large
|
||||
- Thread-safe model loading with locks
|
||||
**WebSocket** `/ws/control`:
|
||||
- Pushes real-time events: `state_changed`, `transcription`, `preview`, `error`, `credits_low`
|
||||
- Client sends keepalive pings
|
||||
|
||||
4. **Display** ([gui/main_window_qt.py](gui/main_window_qt.py))
|
||||
- PySide6/Qt-based desktop GUI
|
||||
- Real-time transcription display with scrolling
|
||||
- Settings panel with live updates (no restart needed)
|
||||
The OBS display server runs separately on port 8080 (`GET /` for HTML, `WebSocket /ws` for transcriptions).
|
||||
|
||||
### Web Server Architecture
|
||||
### Backend Process Lifecycle
|
||||
|
||||
**Local Web Server** ([server/web_display.py](server/web_display.py))
|
||||
- Always runs when GUI starts (port 8080 by default)
|
||||
- FastAPI with WebSocket for real-time updates
|
||||
- Used for OBS browser source integration
|
||||
- Single-user (displays only local transcriptions)
|
||||
1. `main_headless.py` starts, acquires instance lock, creates `AppController`
|
||||
2. `AppController.initialize()` starts the OBS web server (port 8080) and engine init thread
|
||||
3. `APIServer` wraps the controller with FastAPI routes, runs on port 8081
|
||||
4. Backend prints `{"event": "ready", "port": 8080}` to stdout for Tauri to discover
|
||||
5. On shutdown: engine stopped, web server stopped, lock released
|
||||
|
||||
**Multi-User Server** (Optional - for syncing across multiple users)
|
||||
### Headless Backend vs Legacy GUI
|
||||
|
||||
**Node.js WebSocket Server** ([server/nodejs/](server/nodejs/)) - **RECOMMENDED**
|
||||
- Real-time WebSocket support (< 100ms latency)
|
||||
- Handles 100+ concurrent users
|
||||
- Easy deployment to VPS/cloud hosting (Railway, Heroku, DigitalOcean, or any VPS)
|
||||
- Configurable display options via URL parameters:
|
||||
- `timestamps=true/false` - Show/hide timestamps
|
||||
- `maxlines=50` - Maximum visible lines (prevents scroll bars in OBS)
|
||||
- `fontsize=16` - Font size in pixels
|
||||
- `fontfamily=Arial` - Font family
|
||||
- `fade=10` - Seconds before text fades (0 = never)
|
||||
The `AppController` class (`backend/app_controller.py`) extracts all orchestration logic from `gui/main_window_qt.py` into a Qt-free class. The mapping:
|
||||
|
||||
See [server/nodejs/README.md](server/nodejs/README.md) for deployment instructions
|
||||
| Legacy (MainWindow) | Headless (AppController) |
|
||||
|---------------------|--------------------------|
|
||||
| `_initialize_components()` | `_initialize_engine()` |
|
||||
| `_start_transcription()` | `start_transcription()` |
|
||||
| `_stop_transcription()` | `stop_transcription()` |
|
||||
| `_on_settings_saved()` | `apply_settings()` |
|
||||
| `_reload_engine()` | `reload_engine()` |
|
||||
| `_start_web_server_if_enabled()` | `_start_web_server()` |
|
||||
| `_start_server_sync()` | `_start_server_sync()` |
|
||||
| Qt signals | Callbacks (`on_state_changed`, `on_transcription`, etc.) |
|
||||
|
||||
### Configuration System
|
||||
### Threading Model (Headless)
|
||||
|
||||
- Config stored at `~/.local-transcription/config.yaml`
|
||||
- Managed by [client/config.py](client/config.py)
|
||||
- Settings apply immediately without restart (except model changes)
|
||||
- YAML format with nested keys (e.g., `transcription.model`)
|
||||
- Main thread: Uvicorn (FastAPI) event loop
|
||||
- Engine init thread: Downloads models, initializes VAD
|
||||
- Web server thread: Separate asyncio loop for OBS display
|
||||
- Audio capture: Runs in engine callback threads
|
||||
- All results flow through `AppController` callbacks -> `APIServer` WebSocket broadcast
|
||||
|
||||
### Device Management
|
||||
### Svelte Frontend
|
||||
|
||||
- [client/device_utils.py](client/device_utils.py) handles CPU/GPU detection
|
||||
- Auto-detects CUDA, MPS (Mac), or falls back to CPU
|
||||
- Compute types: float32 (best quality), float16 (GPU), int8 (fastest)
|
||||
- Thread-safe device selection
|
||||
Uses Svelte 5 runes throughout (`$state`, `$derived`, `$effect`, `$props`). No Svelte 4 patterns.
|
||||
|
||||
## Key Implementation Details
|
||||
**Stores** (`src/lib/stores/`):
|
||||
- `backend.ts` — WebSocket connection + REST helpers (`apiGet`, `apiPost`, `apiPut`), auto-reconnect
|
||||
- `config.ts` — fetches/updates config from backend API
|
||||
- `transcriptions.ts` — manages transcript list, listens for `CustomEvent`s from backend store
|
||||
|
||||
### PyInstaller Build Configuration
|
||||
**Key patterns:**
|
||||
- Backend store dispatches `CustomEvent`s on `window` for cross-store communication
|
||||
- Settings component collects all changed values into a `Record<string, any>` with dot-notation keys, sends via `PUT /api/config`
|
||||
- Controls use Tauri dialog plugin for native file save, falls back to blob download
|
||||
|
||||
- [local-transcription.spec](local-transcription.spec) controls build
|
||||
- UPX compression enabled for smaller executables
|
||||
- Hidden imports required for PySide6, faster-whisper, torch
|
||||
- Console mode enabled by default (set `console=False` to hide)
|
||||
## CI/CD
|
||||
|
||||
### Threading Model
|
||||
Eight Gitea Actions workflows in `.gitea/workflows/`, split into coordinators and per-OS builders:
|
||||
|
||||
- Main thread: Qt GUI event loop
|
||||
- Audio thread: Captures and processes audio chunks
|
||||
- Web server thread: Runs FastAPI server
|
||||
- Transcription: Runs in callback thread from audio capture
|
||||
- All transcription results communicated via Qt signals
|
||||
**App release (Tauri):**
|
||||
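- **`release.yml`**: Coordinator. Triggered manually via `workflow_dispatch`. Runs the frontend and Python tests, auto-bumps the patch version in package.json/tauri.conf.json/Cargo.toml/version.py, commits, tags `v{VERSION}`, creates the Gitea release, and dispatches the three per-OS app build workflows.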
|
||||
- **`build-app-linux.yml`**: Triggers on `v*` tag push or `workflow_dispatch`. Builds Tauri app, uploads `.deb`/`.rpm`/`.AppImage`.
|
||||
- **`build-app-windows.yml`**: Triggers on `v*` tag push or `workflow_dispatch`. Builds Tauri app, uploads `.msi`/`*-setup.exe`.
|
||||
- **`build-app-macos.yml`**: Triggers on `v*` tag push or `workflow_dispatch`. Builds Tauri app, uploads `.dmg`.
|
||||
|
||||
### Server Sync (Optional Multi-User Feature)
|
||||
**Sidecar release (Python backend):**
|
||||
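- **`sidecar-release.yml`**: Coordinator. Triggered manually via `workflow_dispatch`; a manual run is always treated as having backend changes (for any other event the workflow only proceeds when the last commit touched `client/`, `server/`, `backend/`, `pyproject.toml`, or `local-transcription-headless.spec`). Runs the Python tests, bumps the patch version in pyproject.toml, commits, tags `sidecar-v{VERSION}`, creates the Gitea release, and dispatches the per-OS sidecar build workflows.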
|
||||
- **`build-sidecar-linux.yml`**: Triggers on `sidecar-v*` tag push or `workflow_dispatch`. Builds CUDA + CPU sidecars via PyInstaller.
|
||||
- **`build-sidecar-windows.yml`**: Triggers on `sidecar-v*` tag push or `workflow_dispatch`. Builds CUDA + CPU sidecars via PyInstaller.
|
||||
- **`build-sidecar-macos.yml`**: Triggers on `sidecar-v*` tag push or `workflow_dispatch`. Builds CPU-only sidecar via PyInstaller.
|
||||
|
||||
- [client/server_sync.py](client/server_sync.py) handles server communication
|
||||
- Toggle in Settings: "Enable Server Sync"
|
||||
- Sends transcriptions to Node.js server via HTTP POST
|
||||
- Real-time updates via WebSocket to display page
|
||||
- Per-speaker font support (Web-Safe, Google Fonts, Custom uploads)
|
||||
- Falls back gracefully if server unavailable
|
||||
All per-OS build workflows can be re-run independently via `workflow_dispatch` with an optional `tag` input. All require a `BUILD_TOKEN` secret (Gitea API token with release write access).
|
||||
|
||||
## Common Patterns
|
||||
|
||||
### Adding a New Setting
|
||||
|
||||
1. Add to [config/default_config.yaml](config/default_config.yaml)
|
||||
2. Update [client/config.py](client/config.py) if validation needed
|
||||
3. Add UI control in [gui/settings_dialog_qt.py](gui/settings_dialog_qt.py)
|
||||
4. Apply setting in relevant component (no restart if possible)
|
||||
5. Emit signal to update display if needed
|
||||
1. Add default to [config/default_config.yaml](config/default_config.yaml)
|
||||
2. Add UI control in [src/lib/components/Settings.svelte](src/lib/components/Settings.svelte)
|
||||
3. Ensure the setting is included in the save handler's config update
|
||||
4. Apply in `AppController.apply_settings()` or the relevant component
|
||||
5. For legacy GUI: also update [gui/settings_dialog_qt.py](gui/settings_dialog_qt.py)
|
||||
|
||||
### Adding a New API Endpoint
|
||||
|
||||
1. Add route in [backend/api_server.py](backend/api_server.py) `_setup_routes()`
|
||||
2. Add supporting logic in [backend/app_controller.py](backend/app_controller.py) if needed
|
||||
3. Call from Svelte via `backendStore.apiGet/apiPost/apiPut`
|
||||
|
||||
### Modifying Transcription Display
|
||||
|
||||
- Local GUI: [gui/transcription_display_qt.py](gui/transcription_display_qt.py)
|
||||
- Local web display (OBS): [server/web_display.py](server/web_display.py) (HTML in `_get_html()`)
|
||||
- Tauri UI: [src/lib/components/TranscriptionDisplay.svelte](src/lib/components/TranscriptionDisplay.svelte)
|
||||
- OBS display: [server/web_display.py](server/web_display.py) (HTML in `_get_html()`)
|
||||
- Multi-user display: [server/nodejs/server.js](server/nodejs/server.js) (display page in `/display` route)
|
||||
|
||||
### Adding a New Model Size
|
||||
|
||||
- Update [client/transcription_engine.py](client/transcription_engine.py)
|
||||
- Add to model selector in [gui/settings_dialog_qt.py](gui/settings_dialog_qt.py)
|
||||
- Update CLI argument choices in [main_cli.py](main_cli.py)
|
||||
|
||||
## Dependencies
|
||||
|
||||
**Core:**
|
||||
- `faster-whisper`: Optimized Whisper inference
|
||||
- `torch`: ML framework (CUDA-enabled via special index)
|
||||
- `PySide6`: Qt6 bindings for GUI
|
||||
- `sounddevice`: Cross-platform audio I/O
|
||||
- `noisereduce`, `webrtcvad`: Audio preprocessing
|
||||
|
||||
**Web Server:**
|
||||
- `fastapi`, `uvicorn`: Web server and ASGI
|
||||
- `websockets`: Real-time communication
|
||||
|
||||
**Build:**
|
||||
- `pyinstaller`: Create standalone executables
|
||||
- `uv`: Fast package manager
|
||||
|
||||
**PyTorch CUDA Index:**
|
||||
- Configured in [pyproject.toml](pyproject.toml) under `[[tool.uv.index]]`
|
||||
- Uses PyTorch's custom wheel repository for CUDA builds
|
||||
- Automatically installed with `uv sync` when using CUDA build scripts
|
||||
**Frontend:** Tauri v2, Svelte 5, Vite, TypeScript
|
||||
**Backend:** Python 3.9+, FastAPI, Uvicorn, RealtimeSTT, faster-whisper, PyTorch (CUDA), sounddevice
|
||||
**Build:** PyInstaller (sidecar), Tauri CLI (app), uv (Python packages)
|
||||
**CI:** Gitea Actions with platform-specific runners
|
||||
|
||||
## Platform-Specific Notes
|
||||
|
||||
### Linux
|
||||
- Uses PulseAudio/ALSA for audio
|
||||
- Build scripts use bash (`.sh` files)
|
||||
- Executable: `dist/LocalTranscription/LocalTranscription`
|
||||
- Tauri needs: `libgtk-3-dev`, `libwebkit2gtk-4.1-dev`, `libappindicator3-dev`, `librsvg2-dev`, `patchelf`
|
||||
- Audio: PulseAudio/ALSA via sounddevice
|
||||
|
||||
### Windows
|
||||
- Uses Windows Audio/WASAPI
|
||||
- Build scripts use batch (`.bat` files)
|
||||
- Executable: `dist\LocalTranscription\LocalTranscription.exe`
|
||||
- Requires Visual C++ Redistributable on target systems
|
||||
- Tauri needs: WebView2 (usually pre-installed on Windows 10+)
|
||||
- Audio: WASAPI via sounddevice
|
||||
|
||||
### Cross-Building
|
||||
- **Cannot cross-compile** - must build on target platform
|
||||
- CI/CD should use platform-specific runners
|
||||
|
||||
## Troubleshooting
|
||||
|
||||
### Model Loading Issues
|
||||
- Models download to `~/.cache/huggingface/`
|
||||
- First run requires internet connection
|
||||
- Check disk space (models: 75MB-3GB depending on size)
|
||||
|
||||
### Audio Device Issues
|
||||
- Run `uv run python main_cli.py --list-devices`
|
||||
- Check permissions (microphone access)
|
||||
- Try different device indices in settings
|
||||
|
||||
### GPU Not Detected
|
||||
- Run `uv run python check_cuda.py`
|
||||
- Install CUDA drivers (not CUDA toolkit - bundled in build)
|
||||
- Verify PyTorch sees GPU: `python -c "import torch; print(torch.cuda.is_available())"`
|
||||
|
||||
### Web Server Port Conflicts
|
||||
- Default port: 8080
|
||||
- Change in [gui/main_window_qt.py](gui/main_window_qt.py) or config
|
||||
- Use `lsof -i :8080` (Linux) or `netstat -ano | findstr :8080` (Windows)
|
||||
|
||||
## OBS Integration
|
||||
|
||||
### Local Display (Single User)
|
||||
1. Start Local Transcription app
|
||||
2. In OBS: Add "Browser" source
|
||||
3. URL: `http://localhost:8080`
|
||||
4. Set dimensions (e.g., 1920x300)
|
||||
|
||||
### Multi-User Display (Node.js Server)
|
||||
1. Deploy Node.js server (see [server/nodejs/README.md](server/nodejs/README.md))
|
||||
2. Each user configures Server URL: `http://your-server:3000/api/send`
|
||||
3. Enter same room name and passphrase
|
||||
4. In OBS: Add "Browser" source
|
||||
5. URL: `http://your-server:3000/display?room=ROOM&fade=10&timestamps=true&maxlines=50&fontsize=16`
|
||||
6. Customize URL parameters as needed:
|
||||
- `timestamps=false` - Hide timestamps
|
||||
- `maxlines=30` - Show max 30 lines (prevents scroll bars)
|
||||
- `fontsize=18` - Larger font
|
||||
- `fontfamily=Courier` - Different font
|
||||
|
||||
## Performance Optimization
|
||||
|
||||
**For Real-Time Transcription:**
|
||||
- Use `tiny` or `base` model (faster)
|
||||
- Enable GPU if available (5-10x faster)
|
||||
- Increase chunk_duration for better accuracy (higher latency)
|
||||
- Decrease chunk_duration for lower latency (less context)
|
||||
- Enable VAD to skip silent audio
|
||||
|
||||
**For Build Size Reduction:**
|
||||
- Don't bundle models (download on demand)
|
||||
- Use CPU-only build if no GPU users
|
||||
- Enable UPX compression (already in spec)
|
||||
|
||||
## Phase Status
|
||||
|
||||
- ✅ **Phase 1**: Standalone desktop application (complete)
|
||||
- ✅ **Web Server**: Local OBS integration (complete)
|
||||
- ✅ **Builds**: PyInstaller executables (complete)
|
||||
- ✅ **Phase 2**: Multi-user Node.js server (complete, optional)
|
||||
- ⏸️ **Phase 3+**: Advanced features (see [NEXT_STEPS.md](NEXT_STEPS.md))
|
||||
### macOS
|
||||
- Tauri needs: Xcode Command Line Tools
|
||||
- Audio: CoreAudio via sounddevice
|
||||
- GPU: MPS (Apple Silicon) detected by `device_utils.py`
|
||||
- `Info.plist` must include `NSMicrophoneUsageDescription` for mic access
|
||||
- No CUDA builds — CPU/MPS only
|
||||
|
||||
## Related Documentation
|
||||
|
||||
- [README.md](README.md) - User-facing documentation
|
||||
- [BUILD.md](BUILD.md) - Detailed build instructions
|
||||
- [INSTALL.md](INSTALL.md) - Installation guide
|
||||
- [NEXT_STEPS.md](NEXT_STEPS.md) - Future enhancements
|
||||
- [server/nodejs/README.md](server/nodejs/README.md) - Node.js server setup and deployment
|
||||
- [README.md](README.md) — User-facing documentation
|
||||
- [BUILD.md](BUILD.md) — Detailed build instructions
|
||||
- [INSTALL.md](INSTALL.md) — Installation guide
|
||||
- [server/nodejs/README.md](server/nodejs/README.md) — Node.js server setup
|
||||
|
||||
574
DEEPGRAM_PROXY_PLAN.md
Normal file
574
DEEPGRAM_PROXY_PLAN.md
Normal file
@@ -0,0 +1,574 @@
|
||||
# Deepgram Proxy Service — Build Plan
|
||||
|
||||
## Project Overview
|
||||
|
||||
Build a standalone hosted service that acts as a Deepgram proxy for the Local Transcription
|
||||
desktop app. Users can either provide their own Deepgram API key (BYOK) or use the managed
|
||||
service with prepaid credits purchased via Stripe.
|
||||
|
||||
This is a **separate repository** from `local-transcription`. The desktop app will be updated
|
||||
in a second phase to support both modes.
|
||||
|
||||
---
|
||||
|
||||
## Repository Structure
|
||||
|
||||
```
|
||||
transcription-proxy/
|
||||
├── src/
|
||||
│ ├── server.js # Express app entry point
|
||||
│ ├── config.js # Environment config loader
|
||||
│ ├── db/
|
||||
│ │ ├── index.js # node-postgres pool setup
|
||||
│ │ └── migrations/ # SQL migration files (numbered)
|
||||
│ │ ├── 001_users.sql
|
||||
│ │ ├── 002_credits.sql
|
||||
│ │ ├── 003_sessions.sql
|
||||
│ │ └── 004_usage_ledger.sql
|
||||
│ ├── middleware/
|
||||
│ │ ├── auth.js # JWT verification middleware
|
||||
│ │ └── rateLimit.js # Per-user rate limiting
|
||||
│ ├── routes/
|
||||
│ │ ├── auth.js # POST /auth/register, /auth/login, /auth/refresh
|
||||
│ │ ├── billing.js # POST /billing/checkout, GET /billing/balance
|
||||
│ │ └── account.js # GET /account/me, GET /account/usage
|
||||
│ ├── websocket/
|
||||
│ │ └── proxy.js # WebSocket proxy handler (core feature)
|
||||
│ └── webhooks/
|
||||
│ └── stripe.js # POST /webhooks/stripe
|
||||
├── web/ # Simple frontend dashboard
|
||||
│ ├── index.html # Landing / login page
|
||||
│ ├── dashboard.html # Balance, usage history, buy credits
|
||||
│ └── assets/
|
||||
│ ├── app.js
|
||||
│ └── style.css
|
||||
├── .env.example
|
||||
├── package.json
|
||||
├── docker-compose.yml # Postgres + app for local dev
|
||||
└── CLAUDE.md # This file (after renaming)
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Technology Stack
|
||||
|
||||
- **Runtime**: Node.js 20+
|
||||
- **Framework**: Express 4
|
||||
- **WebSocket**: `ws` library (not socket.io — keep it lean)
|
||||
- **Database**: PostgreSQL 15+ via `pg` (node-postgres)
|
||||
- **Auth**: JWT via `jsonwebtoken`, passwords hashed with `bcrypt`
|
||||
- **Payments**: Stripe Node SDK (`stripe`)
|
||||
- **Environment**: `dotenv`
|
||||
- **Dev tooling**: `nodemon` for dev, no TypeScript (keep it simple)
|
||||
|
||||
---
|
||||
|
||||
## Database Schema
|
||||
|
||||
Run migrations in order. Use a simple `schema_migrations` table to track applied migrations.
|
||||
|
||||
### 001_users.sql
|
||||
```sql
|
||||
CREATE TABLE schema_migrations (
|
||||
version INTEGER PRIMARY KEY,
|
||||
applied_at TIMESTAMPTZ DEFAULT NOW()
|
||||
);
|
||||
|
||||
CREATE TABLE users (
|
||||
id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
|
||||
email TEXT UNIQUE NOT NULL,
|
||||
password_hash TEXT NOT NULL,
|
||||
stripe_customer_id TEXT UNIQUE,
|
||||
created_at TIMESTAMPTZ DEFAULT NOW(),
|
||||
updated_at TIMESTAMPTZ DEFAULT NOW()
|
||||
);
|
||||
```
|
||||
|
||||
### 002_credits.sql
|
||||
```sql
|
||||
CREATE TABLE credit_balance (
|
||||
user_id UUID PRIMARY KEY REFERENCES users(id) ON DELETE CASCADE,
|
||||
seconds_remaining INTEGER NOT NULL DEFAULT 0,
|
||||
updated_at TIMESTAMPTZ DEFAULT NOW()
|
||||
);
|
||||
```
|
||||
|
||||
### 003_sessions.sql
|
||||
```sql
|
||||
CREATE TABLE transcription_sessions (
|
||||
id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
|
||||
user_id UUID NOT NULL REFERENCES users(id),
|
||||
mode TEXT NOT NULL CHECK (mode IN ('managed', 'byok')),
|
||||
started_at TIMESTAMPTZ DEFAULT NOW(),
|
||||
ended_at TIMESTAMPTZ,
|
||||
seconds_used INTEGER NOT NULL DEFAULT 0,
|
||||
deepgram_model TEXT,
|
||||
status TEXT NOT NULL DEFAULT 'active' CHECK (status IN ('active', 'completed', 'terminated'))
|
||||
);
|
||||
|
||||
CREATE INDEX idx_sessions_user_id ON transcription_sessions(user_id);
|
||||
CREATE INDEX idx_sessions_started_at ON transcription_sessions(started_at);
|
||||
```
|
||||
|
||||
### 004_usage_ledger.sql
|
||||
```sql
|
||||
CREATE TABLE usage_ledger (
|
||||
id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
|
||||
user_id UUID NOT NULL REFERENCES users(id),
|
||||
session_id UUID REFERENCES transcription_sessions(id),
|
||||
recorded_at TIMESTAMPTZ DEFAULT NOW(),
|
||||
seconds INTEGER NOT NULL,
|
||||
description TEXT -- e.g. 'session_usage', 'credit_purchase', 'manual_adjustment'
|
||||
);
|
||||
|
||||
CREATE INDEX idx_ledger_user_id ON usage_ledger(user_id);
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Environment Variables (.env.example)
|
||||
|
||||
```env
|
||||
# Server
|
||||
PORT=3000
|
||||
NODE_ENV=development
|
||||
|
||||
# Database
|
||||
DATABASE_URL=postgresql://user:password@localhost:5432/transcription_proxy
|
||||
|
||||
# Auth
|
||||
JWT_SECRET=changeme_use_long_random_string
|
||||
JWT_EXPIRY=7d
|
||||
|
||||
# Stripe
|
||||
STRIPE_SECRET_KEY=sk_test_...
|
||||
STRIPE_WEBHOOK_SECRET=whsec_...
|
||||
|
||||
# Deepgram
|
||||
DEEPGRAM_API_KEY=your_deepgram_key_here
|
||||
|
||||
# Pricing (seconds per dollar — adjust for your margin)
|
||||
# Default: 1000 seconds per $1 (= $0.06/min charged), which covers Deepgram's ~$0.006/min cost plus margin
|
||||
CREDITS_PER_DOLLAR=1000
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Phase 1 — Core Server & Auth
|
||||
|
||||
### Goals
|
||||
- Working Express app with Postgres connection
|
||||
- Migration runner
|
||||
- User registration and login
|
||||
- JWT middleware
|
||||
|
||||
### Tasks
|
||||
|
||||
1. **Scaffold project**
|
||||
- `npm init`, install dependencies: `express ws pg jsonwebtoken bcrypt stripe dotenv`
|
||||
- Dev dependencies: `nodemon`
|
||||
- Add `start` and `dev` scripts to package.json
|
||||
|
||||
2. **Database connection** (`src/db/index.js`)
|
||||
- Export a `pg.Pool` instance using `DATABASE_URL`
|
||||
- Export a `migrate()` function that reads `src/db/migrations/*.sql` in order,
|
||||
checks `schema_migrations` table, and applies unapplied ones
|
||||
- Call `migrate()` on server startup before listening
|
||||
|
||||
3. **Auth routes** (`src/routes/auth.js`)
|
||||
- `POST /auth/register` — validate email/password, hash password with bcrypt (cost 12),
|
||||
insert user, insert empty credit_balance row, return JWT
|
||||
- `POST /auth/login` — verify credentials, return JWT + refresh token
|
||||
- `POST /auth/refresh` — validate refresh token, return new JWT
|
||||
- Passwords: minimum 8 characters, validate email format
|
||||
|
||||
4. **JWT middleware** (`src/middleware/auth.js`)
|
||||
- Verify `Authorization: Bearer <token>` header
|
||||
- Attach `req.user = { id, email }` on success
|
||||
- Return 401 on failure
|
||||
- Export as `requireAuth` middleware
|
||||
|
||||
5. **Basic health check**
|
||||
- `GET /health` returns `{ status: 'ok', db: 'connected' }`
|
||||
|
||||
---
|
||||
|
||||
## Phase 2 — Billing & Credits
|
||||
|
||||
### Goals
|
||||
- Stripe Checkout session creation for credit purchases
|
||||
- Webhook handler to fulfill purchases
|
||||
- Balance endpoint
|
||||
|
||||
### Payment Methods
|
||||
|
||||
Use **Stripe Dynamic Payment Methods** — do NOT hardcode `payment_method_types` in the
|
||||
Checkout Session. Instead, leave it unset and manage everything from the Stripe Dashboard.
|
||||
|
||||
Enable the following in the Stripe Dashboard under Settings → Payment Methods:
|
||||
- **Cards** (Visa, Mastercard, Amex, Discover) — on by default
|
||||
- **PayPal** — enable manually
|
||||
- **Apple Pay** — on by default, shows automatically on Safari/iOS
|
||||
- **Google Pay** — enable manually (one toggle)
|
||||
- **Cash App Pay** — enable manually (popular with streaming audiences)
|
||||
- **Link** — Stripe's saved payment network, on by default
|
||||
|
||||
Stripe will automatically show the most relevant methods to each user based on their
|
||||
location and device. No code changes are needed to add or remove methods in future —
|
||||
it's all dashboard config.
|
||||
|
||||
### Credit Packages
|
||||
|
||||
Define these as constants in `src/config.js`:
|
||||
|
||||
```javascript
|
||||
CREDIT_PACKAGES: [
|
||||
{ id: 'pack_500', label: '500 minutes', seconds: 30000, price_cents: 300 },
|
||||
{ id: 'pack_1200', label: '1200 minutes', seconds: 72000, price_cents: 600 },
|
||||
{ id: 'pack_3000', label: '3000 minutes', seconds: 180000, price_cents: 1200 },
|
||||
]
|
||||
```
|
||||
|
||||
Adjust pricing to cover Deepgram costs ($0.006/min = $0.0001/sec) plus margin and
|
||||
Stripe fees (~2.9% + $0.30).
|
||||
|
||||
### Tasks
|
||||
|
||||
1. **Stripe customer creation**
|
||||
- On user registration, create a Stripe customer and store `stripe_customer_id`
|
||||
- Do this asynchronously (don't block registration response)
|
||||
|
||||
2. **Billing routes** (`src/routes/billing.js`)
|
||||
- `GET /billing/packages` — return credit package list (no auth required)
|
||||
- `POST /billing/checkout` — requires auth, accepts `{ package_id }`,
|
||||
creates Stripe Checkout Session using dynamic payment methods (do NOT pass
|
||||
`payment_method_types` — omitting it enables dynamic methods automatically),
|
||||
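include session-level `metadata` (not just `payment_intent_data.metadata`, which lives on the PaymentIntent and is not present on the `checkout.session.completed` event) containing `user_id` and `package_id`,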
|
||||
returns `{ checkout_url }`
|
||||
- `GET /billing/balance` — requires auth, returns `{ seconds_remaining, minutes_remaining }`
|
||||
|
||||
3. **Stripe webhook** (`src/webhooks/stripe.js`)
|
||||
- Mount at `POST /webhooks/stripe` with raw body (use `express.raw()` for this route only)
|
||||
- Verify signature with `stripe.webhooks.constructEvent()`
|
||||
- Handle `checkout.session.completed`:
|
||||
- Extract `user_id` and `package_id` from metadata
|
||||
- Add seconds to `credit_balance`
|
||||
- Insert row into `usage_ledger` with description `'credit_purchase'`
|
||||
- Handle `payment_intent.payment_failed`: log it (no action needed for prepaid)
|
||||
|
||||
4. **Success/cancel pages**
|
||||
- Stripe Checkout redirects to `GET /billing/success?session_id=...` and `/billing/cancel`
|
||||
- These can be simple HTML responses or redirects to the web dashboard
|
||||
|
||||
---
|
||||
|
||||
## Phase 3 — WebSocket Proxy (Core Feature)
|
||||
|
||||
This is the most critical component. The proxy sits between the desktop client and Deepgram,
|
||||
forwarding audio while tracking usage in real time.
|
||||
|
||||
### Connection Flow
|
||||
|
||||
```
|
||||
Client connects → validate JWT → check credit balance → open Deepgram upstream
|
||||
↓
|
||||
Audio chunks arrive → forward to Deepgram → record usage every 10 seconds
|
||||
↓
|
||||
Transcription arrives from Deepgram → forward to client
|
||||
↓
|
||||
Client disconnects (or credits exhausted) → close upstream → finalize session
|
||||
```
|
||||
|
||||
### WebSocket Protocol
|
||||
|
||||
**Client connects to**: `wss://your-domain/ws/transcribe`
|
||||
|
||||
**Client sends as first message** (JSON):
|
||||
```json
|
||||
{
|
||||
"type": "auth",
|
||||
"token": "<JWT>",
|
||||
"config": {
|
||||
"model": "nova-2",
|
||||
"language": "en-US",
|
||||
"interim_results": true,
|
||||
"endpointing": 300
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
**After auth success, client sends**: raw audio binary frames (PCM 16kHz mono)
|
||||
|
||||
**Server sends to client**:
|
||||
```json
|
||||
{ "type": "ready" }
|
||||
{ "type": "transcript", "text": "...", "is_final": true, "confidence": 0.98 }
|
||||
{ "type": "error", "code": "insufficient_credits", "message": "..." }
|
||||
{ "type": "credits_low", "seconds_remaining": 300 }
|
||||
{ "type": "session_end", "seconds_used": 120 }
|
||||
```
|
||||
|
||||
### Tasks (`src/websocket/proxy.js`)
|
||||
|
||||
1. **Upgrade handler**
|
||||
- Attach to the HTTP server using `ws.Server({ noServer: true })`
|
||||
- In `server.on('upgrade', ...)`, route `/ws/transcribe` to this handler
|
||||
|
||||
2. **Auth handshake**
|
||||
- First message must be `{ type: 'auth', token: '...' }` — received within 5 seconds
|
||||
or connection is terminated
|
||||
- Verify JWT, load user's credit balance from DB
|
||||
- If balance is 0 or negative, send `insufficient_credits` error and close
|
||||
|
||||
3. **Deepgram upstream connection**
|
||||
- Open a WebSocket to Deepgram's streaming API:
|
||||
`wss://api.deepgram.com/v1/listen?model=nova-2&language=en-US&interim_results=true`
|
||||
- Auth header: `Authorization: Token <DEEPGRAM_API_KEY>`
|
||||
- Use query params from client's `config` object (whitelist allowed params)
|
||||
|
||||
4. **Audio forwarding**
|
||||
- All binary messages from client → forward directly to Deepgram upstream
|
||||
- All messages from Deepgram → parse JSON, reformat, forward to client
|
||||
|
||||
5. **Usage tracking**
|
||||
- Create a `transcription_sessions` row on connection
|
||||
- Maintain an in-memory `secondsUsed` counter per connection
|
||||
- Deepgram sends `{ type: 'Results', duration: X }` in responses — use this for
|
||||
accurate second counting
|
||||
- Every 10 seconds (or on disconnect), write current `secondsUsed` to DB:
|
||||
- Update `transcription_sessions.seconds_used`
|
||||
- Decrement `credit_balance.seconds_remaining`
|
||||
- Insert into `usage_ledger`
|
||||
- If `seconds_remaining` hits 0: send `insufficient_credits`, close connection
|
||||
|
||||
6. **Cleanup on disconnect**
|
||||
- Mark session as `completed`, set `ended_at`
|
||||
- Do final usage flush to DB
|
||||
- Close Deepgram upstream if still open
|
||||
|
||||
7. **Error handling**
|
||||
- If Deepgram upstream closes unexpectedly, notify client and close
|
||||
- If client sends malformed data, log and continue (don't crash)
|
||||
|
||||
---
|
||||
|
||||
## Phase 4 — Account Routes & Rate Limiting
|
||||
|
||||
### Tasks
|
||||
|
||||
1. **Account routes** (`src/routes/account.js`)
|
||||
- `GET /account/me` — returns `{ email, credits: { seconds_remaining, minutes_remaining }, created_at }`
|
||||
- `GET /account/usage` — returns last 30 days of `usage_ledger` entries grouped by day,
|
||||
plus list of last 10 sessions with duration
|
||||
|
||||
2. **Rate limiting** (`src/middleware/rateLimit.js`)
|
||||
- Use in-memory rate limiting (no Redis needed at this scale)
|
||||
- Auth endpoints: max 10 requests per minute per IP
|
||||
- WebSocket connections: max 2 concurrent connections per user
|
||||
(store active connections in a `Map<userId, Set<ws>>`)
|
||||
|
||||
---
|
||||
|
||||
## Phase 5 — Web Dashboard
|
||||
|
||||
A simple, functional HTML/CSS/JS dashboard. No framework — vanilla JS is fine.
|
||||
This is a developer-friendly streamer tool, not a consumer SaaS, so clean and
|
||||
functional beats flashy.
|
||||
|
||||
### Pages
|
||||
|
||||
**`/` (Landing / Login)**
|
||||
- Brief product description (what this is, why it exists)
|
||||
- Login form and link to register
|
||||
- Link to GitHub/Gitea repo
|
||||
|
||||
**`/dashboard` (Post-login)**
|
||||
- Current credit balance (minutes remaining, prominently displayed)
|
||||
- "Buy Credits" section showing the three packages with Stripe Checkout buttons
|
||||
- Usage chart: last 30 days bar chart (vanilla canvas or a small CDN chart lib)
|
||||
- Recent sessions table: date, duration, status
|
||||
|
||||
**`/register`**
|
||||
- Registration form
|
||||
|
||||
### Implementation Notes
|
||||
- Store JWT in `localStorage`, attach as `Authorization` header on API calls
|
||||
- Redirect to `/` if JWT missing or expired
|
||||
- Keep CSS minimal but readable — this is a utility dashboard
|
||||
|
||||
---
|
||||
|
||||
## Phase 6 — Desktop App Integration
|
||||
|
||||
Changes needed in the `local-transcription` Python repo.
|
||||
|
||||
### New file: `client/remote_transcription.py`
|
||||
|
||||
This module replaces `transcription_engine_realtime.py` when remote mode is active.
|
||||
|
||||
```python
|
||||
# Pseudocode / spec for Claude Code to implement
|
||||
|
||||
class RemoteTranscriptionEngine:
|
||||
"""
|
||||
Connects to the transcription proxy WebSocket and streams audio.
|
||||
Provides the same callback interface as the local engine so the
|
||||
rest of the app doesn't need to change.
|
||||
"""
|
||||
|
||||
def __init__(self, config, on_transcript_callback):
|
||||
# config contains: server_url, auth_token (or byok_api_key), model
|
||||
...
|
||||
|
||||
def start(self):
|
||||
# Open WebSocket connection
|
||||
# Send auth message
|
||||
# Start audio capture thread (reuse existing audio_capture.py)
|
||||
...
|
||||
|
||||
def stop(self):
|
||||
# Close WebSocket gracefully
|
||||
...
|
||||
|
||||
def _on_audio_chunk(self, audio_data):
|
||||
# Called by audio_capture.py with raw PCM data
|
||||
# Send as binary WebSocket frame
|
||||
...
|
||||
|
||||
def _on_server_message(self, message):
|
||||
# Parse JSON from server
|
||||
# On type='transcript': call on_transcript_callback
|
||||
# On type='credits_low': trigger UI warning
|
||||
# On type='error': surface to user
|
||||
...
|
||||
```
|
||||
|
||||
### BYOK Mode
|
||||
|
||||
When user provides their own Deepgram key, connect directly to Deepgram instead of the proxy:
|
||||
- Endpoint: `wss://api.deepgram.com/v1/listen?...`
|
||||
- Auth: `Authorization: Token <user_key>`
|
||||
- No session tracking (Deepgram handles billing directly to the user)
|
||||
- Same `RemoteTranscriptionEngine` class, just different URL and auth header
|
||||
|
||||
### Settings Changes (`gui/settings_dialog_qt.py`)
|
||||
|
||||
Add a new "Transcription Mode" section:
|
||||
|
||||
```
|
||||
Transcription Mode:
|
||||
○ Local (Whisper) [existing behavior]
|
||||
○ Remote - Managed [requires login]
|
||||
○ Remote - BYOK [requires Deepgram API key]
|
||||
|
||||
[If Managed selected]:
|
||||
Server URL: [____________]
|
||||
[Login / Register] [View Balance: 420 min remaining]
|
||||
|
||||
[If BYOK selected]:
|
||||
Deepgram API Key: [____________]
|
||||
Model: [nova-2 ▼]
|
||||
```
|
||||
|
||||
### Config additions (`config/default_config.yaml`)
|
||||
|
||||
```yaml
|
||||
remote:
|
||||
mode: local # local | managed | byok
|
||||
server_url: "" # proxy server URL for managed mode
|
||||
auth_token: "" # JWT stored after login
|
||||
byok_api_key: "" # Deepgram key for BYOK mode
|
||||
deepgram_model: nova-2
|
||||
language: en-US
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Build & Deployment Notes
|
||||
|
||||
### Docker Compose (local dev)
|
||||
|
||||
```yaml
|
||||
version: '3.8'
|
||||
services:
|
||||
db:
|
||||
image: postgres:15
|
||||
environment:
|
||||
POSTGRES_DB: transcription_proxy
|
||||
POSTGRES_USER: user
|
||||
POSTGRES_PASSWORD: password
|
||||
ports:
|
||||
- "5432:5432"
|
||||
volumes:
|
||||
- pgdata:/var/lib/postgresql/data
|
||||
|
||||
app:
|
||||
build: .
|
||||
ports:
|
||||
- "3000:3000"
|
||||
environment:
|
||||
DATABASE_URL: postgresql://user:password@db:5432/transcription_proxy
|
||||
depends_on:
|
||||
- db
|
||||
volumes:
|
||||
- .:/app
|
||||
- /app/node_modules
|
||||
|
||||
volumes:
|
||||
pgdata:
|
||||
```
|
||||
|
||||
### Production Deployment
|
||||
|
||||
This service is a good fit for deployment on AnHonestHost WHP as a containerized app,
|
||||
or on a small DigitalOcean/Linode VPS. Requirements are light:
|
||||
- 512MB RAM is sufficient
|
||||
- Postgres can be the same instance as other services or managed (e.g., Supabase free tier)
|
||||
- Needs a public domain with SSL for WebSocket (`wss://`) to work from desktop clients
|
||||
|
||||
Reverse proxy config (Nginx or HAProxy) should:
|
||||
- Proxy HTTP → `localhost:3000`
|
||||
- Pass `Upgrade` and `Connection` headers for WebSocket support
|
||||
- Set `proxy_read_timeout 3600` (sessions can be long)
|
||||
|
||||
---
|
||||
|
||||
## Implementation Order
|
||||
|
||||
Build and test in this sequence:
|
||||
|
||||
1. Project scaffold + DB connection + migrations
|
||||
2. Auth (register/login/JWT) — test with curl
|
||||
3. Stripe billing + webhook — test with Stripe CLI (`stripe listen`)
|
||||
4. WebSocket proxy — test with a simple browser WebSocket client first
|
||||
5. Usage tracking and credit decrement
|
||||
6. Account/usage routes
|
||||
7. Web dashboard
|
||||
8. Desktop app integration (separate PR in local-transcription repo)
|
||||
|
||||
---
|
||||
|
||||
## Key Decisions & Rationale
|
||||
|
||||
| Decision | Choice | Reason |
|
||||
|---|---|---|
|
||||
| Credits model | Prepaid | No surprise charges, simpler billing, better for irregular streamer usage |
|
||||
| WebSocket library | `ws` | Lightweight, no abstraction overhead, plays well with raw binary audio |
|
||||
| Auth | JWT (stateless) | Desktop app holds token locally; no session store needed |
|
||||
| DB driver | `node-postgres` (pg) | No ORM overhead; schema is simple enough for raw SQL |
|
||||
| Migrations | Raw SQL files | No dependency on Knex/Prisma; easy to inspect and reason about |
|
||||
| Rate limiting | In-memory | Redis is overkill for this scale; single-process Node is fine initially |
|
||||
| Frontend | Vanilla JS | Dashboard is simple utility UI; no framework justified |
|
||||
|
||||
---
|
||||
|
||||
## What This Plan Does NOT Cover (Future Work)
|
||||
|
||||
- OAuth / social login
|
||||
- Admin panel for managing users
|
||||
- Refund / credit adjustment tooling
|
||||
- Email verification
|
||||
- Password reset flow
|
||||
- Multi-language support beyond Deepgram's defaults
|
||||
- Analytics / aggregated usage reporting
|
||||
- Self-hosted Whisper inference as a third backend option
|
||||
652
README.md
652
README.md
@@ -1,494 +1,318 @@
|
||||
# Local Transcription for Streamers
|
||||
# Local Transcription
|
||||
|
||||
A local speech-to-text application designed for streamers that provides real-time transcription using Whisper or similar models. Multiple users can run the application locally and sync their transcriptions to a centralized web stream that can be easily captured in OBS or other streaming software.
|
||||
A real-time speech-to-text desktop application for streamers. Runs locally on your machine with GPU or CPU, displays transcriptions via OBS browser source, and optionally syncs with other users through a multi-user server.
|
||||
|
||||
**Version 1.4.0**
|
||||
|
||||
## Features
|
||||
|
||||
- **Standalone Desktop Application**: Use locally with built-in GUI display - no server required
|
||||
- **Local Transcription**: Run Whisper (or compatible models) locally on your machine
|
||||
- **CPU/GPU Support**: Choose between CPU or GPU processing based on your hardware
|
||||
- **Real-time Processing**: Live audio transcription with minimal latency
|
||||
- **Real-Time Transcription**: Live speech-to-text using Whisper models with minimal latency
|
||||
- **Cross-Platform**: Native desktop app for Windows, macOS, and Linux via [Tauri](https://tauri.app/)
|
||||
- **Dual Transcription Modes**: Local (Whisper) or cloud (Deepgram) with managed billing or BYOK
|
||||
- **CPU & GPU Support**: Automatic detection of CUDA (NVIDIA), MPS (Apple Silicon), or CPU fallback
|
||||
- **Advanced Voice Detection**: Dual-layer VAD (WebRTC + Silero) for accurate speech detection
|
||||
- **OBS Integration**: Built-in web server for browser source capture at `http://localhost:8080`
|
||||
- **Multi-User Sync**: Optional Node.js server to sync transcriptions across multiple users
|
||||
- **Custom Fonts**: Support for system fonts, web-safe fonts, Google Fonts, and custom font files
|
||||
- **Customizable Colors**: User-configurable colors for name, text, and background
|
||||
- **Noise Suppression**: Built-in audio preprocessing to reduce background noise
|
||||
- **User Configuration**: Set your display name and preferences through the GUI
|
||||
- **Optional Multi-user Sync**: Connect to a server to sync transcriptions with other users
|
||||
- **OBS Integration**: Web-based output designed for easy browser source capture
|
||||
- **Privacy-First**: All processing happens locally; only transcription text is shared
|
||||
- **Customizable**: Configure model size, language, and streaming settings
|
||||
- **Auto-Updates**: Automatic update checking with release notes display
|
||||
|
||||
## Architecture
|
||||
|
||||
The application uses a two-process architecture:
|
||||
|
||||
1. **Tauri Shell** (Svelte 5 frontend) — lightweight native window (~50MB) rendering the UI
|
||||
2. **Python Backend** (sidecar) — headless process running transcription, audio capture, and the OBS web server
|
||||
|
||||
The Tauri frontend communicates with the Python backend via REST API and WebSocket, following the same pattern as [voice-to-notes](https://repo.anhonesthost.net/MacroPad/voice-to-notes).
|
||||
|
||||
```
|
||||
Tauri App (user launches this)
|
||||
└─ Spawns Python backend as sidecar
|
||||
├─ FastAPI REST API (control endpoints)
|
||||
├─ WebSocket /ws/control (real-time state + transcriptions)
|
||||
├─ OBS web display at http://localhost:8080
|
||||
└─ Transcription engine (Whisper or Deepgram)
|
||||
```
|
||||
|
||||
> **Legacy GUI**: The original PySide6/Qt desktop GUI (`main.py`) still works alongside the new Tauri frontend during the transition period.
|
||||
|
||||
## Quick Start
|
||||
|
||||
### Running from Source
|
||||
|
||||
```bash
|
||||
# Install dependencies
|
||||
# Install Python dependencies
|
||||
uv sync
|
||||
|
||||
# Run the application
|
||||
# Run the Tauri app (frontend + backend)
|
||||
npm install
|
||||
npm run tauri dev
|
||||
|
||||
# Or run just the headless backend (for development)
|
||||
uv run python -m backend.main_headless
|
||||
|
||||
# Or run the legacy PySide6 GUI
|
||||
uv run python main.py
|
||||
```
|
||||
|
||||
### Building Standalone Executables
|
||||
### Using Pre-Built Executables
|
||||
|
||||
To create standalone executables for distribution:
|
||||
Download the latest release from the [releases page](https://repo.anhonesthost.net/streamer-tools/local-transcription/releases):
|
||||
|
||||
- **App installer** (Tauri shell): `.msi` (Windows), `.dmg` (macOS), `.deb`/`.rpm`/`.AppImage` (Linux)
|
||||
- **Sidecar** (Python backend): Download the matching `sidecar-*` zip for your platform (CUDA or CPU)
|
||||
|
||||
### Building from Source
|
||||
|
||||
**Linux:**
|
||||
```bash
|
||||
./build.sh
|
||||
```
|
||||
# Build the Tauri app
|
||||
npm install
|
||||
npm run tauri build
|
||||
# Output: src-tauri/target/release/bundle/
|
||||
|
||||
**Windows:**
|
||||
```cmd
|
||||
# Build the Python sidecar (headless, no Qt)
|
||||
uv sync
|
||||
uv run pyinstaller local-transcription-headless.spec
|
||||
# Output: dist/local-transcription-backend/
|
||||
|
||||
# Build the legacy PySide6 app (Linux)
|
||||
./build.sh
|
||||
# Build the legacy PySide6 app (Windows)
|
||||
build.bat
|
||||
```
|
||||
|
||||
For detailed build instructions, see [BUILD.md](BUILD.md).
|
||||
|
||||
## Architecture Overview
|
||||
## Usage
|
||||
|
||||
The application can run in two modes:
|
||||
### Standalone Mode
|
||||
|
||||
### Standalone Mode (No Server Required):
|
||||
1. **Desktop Application**: Captures audio, performs speech-to-text, and displays transcriptions locally in a GUI window
|
||||
1. Launch the application
|
||||
2. Select your microphone from the audio device dropdown
|
||||
3. Choose a Whisper model (smaller = faster, larger = more accurate):
|
||||
- `tiny.en` / `tiny` — Fastest, good for quick captions
|
||||
- `base.en` / `base` — Balanced speed and accuracy
|
||||
- `small.en` / `small` — Better accuracy
|
||||
- `medium.en` / `medium` — High accuracy
|
||||
- `large-v3` — Best accuracy (requires more resources)
|
||||
4. Click **Start** to begin transcription
|
||||
5. Transcriptions appear in the main window and at `http://localhost:8080`
|
||||
|
||||
### Multi-user Sync Mode (Optional):
|
||||
1. **Local Transcription Client**: Captures audio, performs speech-to-text, and sends results to the web server
|
||||
2. **Centralized Web Server**: Aggregates transcriptions from multiple clients and serves a web stream
|
||||
3. **Web Stream Interface**: Browser-accessible page displaying synchronized transcriptions (for OBS capture)
|
||||
### Remote Transcription (Deepgram)
|
||||
|
||||
## Use Cases
|
||||
Instead of local Whisper models, you can use cloud-based transcription:
|
||||
|
||||
- **Multi-language Streams**: Multiple translators transcribing in different languages
|
||||
- **Accessibility**: Provide real-time captions for viewers
|
||||
- **Collaborative Podcasts**: Multiple hosts with separate transcriptions
|
||||
- **Gaming Commentary**: Track who said what in multiplayer sessions
|
||||
- **Managed mode**: Sign up via the transcription proxy for metered billing
|
||||
- **BYOK mode**: Bring your own Deepgram API key for direct access
|
||||
|
||||
---
|
||||
Configure in Settings > Remote Transcription.
|
||||
|
||||
## Implementation Plan
|
||||
### OBS Browser Source Setup
|
||||
|
||||
### Phase 1: Standalone Desktop Application
|
||||
1. Start the Local Transcription app
|
||||
2. In OBS, add a **Browser** source
|
||||
3. Set URL to `http://localhost:8080`
|
||||
4. Set dimensions (e.g., 1920x300)
|
||||
5. Check "Shutdown source when not visible" for performance
|
||||
|
||||
**Objective**: Build a fully functional standalone transcription app with GUI that works without any server
|
||||
### Multi-User Mode (Optional)
|
||||
|
||||
#### Components:
|
||||
1. **Audio Capture Module**
|
||||
- Capture system audio or microphone input
|
||||
- Support multiple audio sources (virtual audio cables, physical devices)
|
||||
- Real-time audio buffering with configurable chunk sizes
|
||||
- **Noise Suppression**: Preprocess audio to reduce background noise
|
||||
- Libraries: `pyaudio`, `sounddevice`, `noisereduce`, `webrtcvad`
|
||||
For syncing transcriptions across multiple users (e.g., multi-host streams or translation teams):
|
||||
|
||||
2. **Noise Suppression Engine**
|
||||
- Real-time noise reduction using RNNoise or noisereduce
|
||||
- Adjustable noise reduction strength
|
||||
- Optional VAD (Voice Activity Detection) to skip silent segments
|
||||
- Libraries: `noisereduce`, `rnnoise-python`, `webrtcvad`
|
||||
1. Deploy the Node.js server (see [server/nodejs/README.md](server/nodejs/README.md))
|
||||
2. In the app settings, enable **Server Sync**
|
||||
3. Enter the server URL (e.g., `http://your-server:3000/api/send`)
|
||||
4. Set a room name and passphrase (shared with other users)
|
||||
5. In OBS, use the server's display URL with your room name:
|
||||
```
|
||||
http://your-server:3000/display?room=YOURROOM&timestamps=true&maxlines=50
|
||||
```
|
||||
|
||||
3. **Transcription Engine**
|
||||
- Integrate OpenAI Whisper (or alternatives: faster-whisper, whisper.cpp)
|
||||
- Support multiple model sizes (tiny, base, small, medium, large)
|
||||
- CPU and GPU inference options
|
||||
- Model management and automatic downloading
|
||||
- Libraries: `openai-whisper`, `faster-whisper`, `torch`
|
||||
## Configuration
|
||||
|
||||
4. **Device Selection**
|
||||
- Auto-detect available compute devices (CPU, CUDA, MPS for Mac)
|
||||
- Allow user to specify preferred device via GUI
|
||||
- Graceful fallback if GPU unavailable
|
||||
- Display device status and performance metrics
|
||||
Settings are stored at `~/.local-transcription/config.yaml` and can be modified through the GUI settings panel or the REST API.
|
||||
|
||||
5. **Desktop GUI Application**
|
||||
- Cross-platform GUI using PyQt6, Tkinter, or CustomTkinter
|
||||
- Main transcription display window (scrolling text area)
|
||||
- Settings panel for configuration
|
||||
- User name input field
|
||||
- Audio input device selector
|
||||
- Model size selector
|
||||
- CPU/GPU toggle
|
||||
- Start/Stop transcription button
|
||||
- Optional: System tray integration
|
||||
- Libraries: `PyQt6`, `customtkinter`, or `tkinter`
|
||||
### Key Settings
|
||||
|
||||
6. **Local Display**
|
||||
- Real-time transcription display in GUI window
|
||||
- Scrolling text with timestamps
|
||||
- User name/label shown with transcriptions
|
||||
- Copy transcription to clipboard
|
||||
- Optional: Save transcription to file (TXT, SRT, VTT)
|
||||
| Setting | Description | Default |
|
||||
|---------|-------------|---------|
|
||||
| `transcription.model` | Whisper model to use | `base.en` |
|
||||
| `transcription.device` | Processing device (auto/cuda/cpu) | `auto` |
|
||||
| `transcription.enable_realtime_transcription` | Show preview while speaking | `false` |
|
||||
| `transcription.silero_sensitivity` | VAD sensitivity (0-1, lower = more sensitive) | `0.4` |
|
||||
| `transcription.post_speech_silence_duration` | Silence before finalizing (seconds) | `0.3` |
|
||||
| `transcription.continuous_mode` | Fast speaker mode for quick talkers | `false` |
|
||||
| `remote.mode` | Transcription mode (local/managed/byok) | `local` |
|
||||
| `display.show_timestamps` | Show timestamps with transcriptions | `true` |
|
||||
| `display.fade_after_seconds` | Fade out time (0 = never) | `10` |
|
||||
| `display.font_source` | Font type (System Font/Web-Safe/Google Font/Custom File) | `System Font` |
|
||||
| `web_server.port` | Local web server port | `8080` |
|
||||
|
||||
#### Tasks:
|
||||
- [ ] Set up project structure and dependencies
|
||||
- [ ] Implement audio capture with device selection
|
||||
- [ ] Add noise suppression and VAD preprocessing
|
||||
- [ ] Integrate Whisper model loading and inference
|
||||
- [ ] Add CPU/GPU device detection and selection logic
|
||||
- [ ] Create real-time audio buffer processing pipeline
|
||||
- [ ] Design and implement GUI layout (main window)
|
||||
- [ ] Add settings panel with user name configuration
|
||||
- [ ] Implement local transcription display area
|
||||
- [ ] Add start/stop controls and status indicators
|
||||
- [ ] Test transcription accuracy and latency
|
||||
- [ ] Test noise suppression effectiveness
|
||||
|
||||
---
|
||||
|
||||
### Phase 2: Web Server and Sync System
|
||||
|
||||
**Objective**: Create a centralized server to aggregate and serve transcriptions
|
||||
|
||||
#### Components:
|
||||
1. **Web Server**
|
||||
- FastAPI or Flask-based REST API
|
||||
- WebSocket support for real-time updates
|
||||
- User/client registration and management
|
||||
- Libraries: `fastapi`, `uvicorn`, `websockets`
|
||||
|
||||
2. **Transcription Aggregator**
|
||||
- Receive transcription chunks from multiple clients
|
||||
- Associate transcriptions with user IDs/names
|
||||
- Timestamp management and synchronization
|
||||
- Buffer management for smooth streaming
|
||||
|
||||
3. **Database/Storage** (Optional)
|
||||
- Store transcription history (SQLite for simplicity)
|
||||
- Session management
|
||||
- Export functionality (SRT, VTT, TXT formats)
|
||||
|
||||
#### API Endpoints:
|
||||
- `POST /api/register` - Register a new client
|
||||
- `POST /api/transcription` - Submit transcription chunk
|
||||
- `WS /api/stream` - WebSocket for real-time transcription stream
|
||||
- `GET /stream` - Web page for OBS browser source
|
||||
|
||||
#### Tasks:
|
||||
- [ ] Set up FastAPI server with CORS support
|
||||
- [ ] Implement WebSocket handler for real-time streaming
|
||||
- [ ] Create client registration system
|
||||
- [ ] Build transcription aggregation logic
|
||||
- [ ] Add timestamp synchronization
|
||||
- [ ] Create data models for clients and transcriptions
|
||||
|
||||
---
|
||||
|
||||
### Phase 3: Client-Server Communication (Optional Multi-user Mode)
|
||||
|
||||
**Objective**: Add optional server connectivity to enable multi-user transcription sync
|
||||
|
||||
#### Components:
|
||||
1. **HTTP/WebSocket Client**
|
||||
- Register client with server on startup
|
||||
- Send transcription chunks as they're generated
|
||||
- Handle connection drops and reconnection
|
||||
- Libraries: `requests`, `websockets`
|
||||
|
||||
2. **Configuration System**
|
||||
- Config file for server URL, API keys, user settings
|
||||
- Model preferences (size, language)
|
||||
- Audio input settings
|
||||
- Format: YAML or JSON
|
||||
|
||||
3. **Status Monitoring**
|
||||
- Connection status indicator
|
||||
- Transcription queue health
|
||||
- Error handling and logging
|
||||
|
||||
#### Tasks:
|
||||
- [ ] Add "Enable Server Sync" toggle to GUI
|
||||
- [ ] Add server URL configuration field in settings
|
||||
- [ ] Implement WebSocket client for sending transcriptions
|
||||
- [ ] Add configuration file support (YAML/JSON)
|
||||
- [ ] Create connection management with auto-reconnect
|
||||
- [ ] Add local logging and error handling
|
||||
- [ ] Add server connection status indicator to GUI
|
||||
- [ ] Allow app to function normally if server is unavailable
|
||||
|
||||
---
|
||||
|
||||
### Phase 4: Web Stream Interface (OBS Integration)
|
||||
|
||||
**Objective**: Create a web page that displays synchronized transcriptions for OBS
|
||||
|
||||
#### Components:
|
||||
1. **Web Frontend**
|
||||
- HTML/CSS/JavaScript page for displaying transcriptions
|
||||
- Responsive design with customizable styling
|
||||
- Auto-scroll with configurable retention window
|
||||
- Libraries: Vanilla JS or lightweight framework (Alpine.js, htmx)
|
||||
|
||||
2. **Styling Options**
|
||||
- Customizable fonts, colors, sizes
|
||||
- Background transparency for OBS chroma key
|
||||
- User name/ID display options
|
||||
- Timestamp display (optional)
|
||||
|
||||
3. **Display Modes**
|
||||
- Scrolling captions (like live TV captions)
|
||||
- Multi-user panel view (separate sections per user)
|
||||
- Overlay mode (minimal UI for transparency)
|
||||
|
||||
#### Tasks:
|
||||
- [ ] Create HTML template for transcription display
|
||||
- [ ] Implement WebSocket client in JavaScript
|
||||
- [ ] Add CSS styling with OBS-friendly transparency
|
||||
- [ ] Create customization controls (URL parameters or UI)
|
||||
- [ ] Test with OBS browser source
|
||||
- [ ] Add configurable retention/scroll behavior
|
||||
|
||||
---
|
||||
|
||||
### Phase 5: Advanced Features
|
||||
|
||||
**Objective**: Enhance functionality and user experience
|
||||
|
||||
#### Features:
|
||||
1. **Language Detection**
|
||||
- Auto-detect spoken language
|
||||
- Multi-language support in single stream
|
||||
- Language selector in GUI
|
||||
|
||||
2. **Speaker Diarization** (Optional)
|
||||
- Identify different speakers
|
||||
- Label transcriptions by speaker
|
||||
- Useful for multi-host streams
|
||||
|
||||
3. **Profanity Filtering**
|
||||
- Optional word filtering/replacement
|
||||
- Customizable filter lists
|
||||
- Toggle in GUI settings
|
||||
|
||||
4. **Advanced Noise Profiles**
|
||||
- Save and load custom noise profiles
|
||||
- Adaptive noise suppression
|
||||
- Different profiles for different environments
|
||||
|
||||
5. **Export Functionality**
|
||||
- Save transcriptions in multiple formats (TXT, SRT, VTT, JSON)
|
||||
- Export button in GUI
|
||||
- Automatic session saving
|
||||
|
||||
6. **Hotkey Support**
|
||||
- Global hotkeys to start/stop transcription
|
||||
- Mute/unmute hotkey
|
||||
- Quick save hotkey
|
||||
|
||||
7. **Docker Support**
|
||||
- Containerized server deployment
|
||||
- Docker Compose for easy multi-component setup
|
||||
- Pre-built images for easy deployment
|
||||
|
||||
8. **Themes and Customization**
|
||||
- Dark/light theme toggle
|
||||
- Customizable font sizes and colors for display
|
||||
- OBS-friendly transparent overlay mode
|
||||
|
||||
#### Tasks:
|
||||
- [ ] Add language detection and multi-language support
|
||||
- [ ] Implement speaker diarization
|
||||
- [ ] Create optional profanity filter
|
||||
- [ ] Add export functionality (SRT, VTT, plain text, JSON)
|
||||
- [ ] Implement global hotkey support
|
||||
- [ ] Create Docker containers for server component
|
||||
- [ ] Add theme customization options
|
||||
- [ ] Create advanced noise profile management
|
||||
|
||||
---
|
||||
|
||||
## Technology Stack
|
||||
|
||||
### Local Client:
|
||||
- **Python 3.9+**
|
||||
- **GUI**: PyQt6 / CustomTkinter / tkinter
|
||||
- **Audio**: PyAudio / sounddevice
|
||||
- **Noise Suppression**: noisereduce / rnnoise-python
|
||||
- **VAD**: webrtcvad
|
||||
- **ML Framework**: PyTorch (for Whisper)
|
||||
- **Transcription**: openai-whisper / faster-whisper
|
||||
- **Networking**: websockets, requests (optional for server sync)
|
||||
- **Config**: PyYAML / json
|
||||
|
||||
### Server:
|
||||
- **Backend**: FastAPI / Flask
|
||||
- **WebSocket**: python-websockets / FastAPI WebSockets
|
||||
- **Server**: Uvicorn / Gunicorn
|
||||
- **Database** (optional): SQLite / PostgreSQL
|
||||
- **CORS**: fastapi-cors
|
||||
|
||||
### Web Interface:
|
||||
- **Frontend**: HTML5, CSS3, JavaScript (ES6+)
|
||||
- **Real-time**: WebSocket API
|
||||
- **Styling**: CSS Grid/Flexbox for layout
|
||||
|
||||
---
|
||||
See [config/default_config.yaml](config/default_config.yaml) for all available options.
|
||||
|
||||
## Project Structure
|
||||
|
||||
```
|
||||
local-transcription/
|
||||
| ||||