- OAuth authentication via Authentik
- WebSocket connection to OpenClaw gateway
- Configurable gateway URL with first-run setup
- User preferences sync across devices
- Multi-user support with custom assistant names
- ElevenLabs TTS integration (local + remote)
- FCM push notifications for alarms
- Voice input via Google Speech API
- No hardcoded secrets or internal IPs in tracked files
320 lines
12 KiB
Kotlin
320 lines
12 KiB
Kotlin
package com.openclaw.alfred.voice
|
|
|
|
import android.content.Context
|
|
import android.media.MediaPlayer
|
|
import android.speech.tts.TextToSpeech
|
|
import android.util.Log
|
|
import com.openclaw.alfred.BuildConfig
|
|
import kotlinx.coroutines.*
|
|
import okhttp3.MediaType.Companion.toMediaType
|
|
import okhttp3.OkHttpClient
|
|
import okhttp3.Request
|
|
import okhttp3.RequestBody.Companion.toRequestBody
|
|
import org.json.JSONObject
|
|
import java.io.File
|
|
import java.io.FileOutputStream
|
|
import java.util.*
|
|
import java.util.concurrent.TimeUnit
|
|
|
|
/**
 * Speaks text aloud, preferring audio produced through the gateway's TTS proxy and falling
 * back to the platform [TextToSpeech] engine.
 *
 * Flow of [speak]:
 * 1. The text is stripped of markdown and other unspeakable characters.
 * 2. If an ElevenLabs API key is configured, the text is POSTed to `<gateway>/api/tts`; the
 *    response names an audio path that is streamed with [MediaPlayer].
 * 3. If no key is configured, or the proxy call fails, Android TTS speaks the text instead.
 *
 * Call [destroy] when the owner goes away; it releases the player, shuts down the Android
 * TTS engine and cancels in-flight requests.
 *
 * NOTE(review): Android documents that `UtteranceProgressListener` callbacks may arrive on
 * arbitrary threads, so on the Android TTS path `onComplete`/`onError` are not guaranteed to
 * run on the main thread — confirm callers tolerate this.
 */
class TTSManager(private val context: Context) {

    // HTTP client used for the TTS proxy call; the read timeout is extended for long responses.
    private val client = OkHttpClient.Builder()
        .connectTimeout(30, TimeUnit.SECONDS)
        .readTimeout(120, TimeUnit.SECONDS)
        .writeTimeout(30, TimeUnit.SECONDS)
        .build()

    // Player for the current proxied audio stream; null whenever nothing is loaded.
    private var mediaPlayer: MediaPlayer? = null

    // Scope for proxy requests; cancelled in destroy().
    private val scope = CoroutineScope(Dispatchers.IO + SupervisorJob())

    // When empty, speak() bypasses the proxy and goes straight to Android TTS.
    private val apiKey = BuildConfig.ELEVENLABS_API_KEY

    // Not referenced anywhere in this class: requests go to the gateway proxy, not to this host.
    private val baseUrl = "https://api.elevenlabs.io/v1"

    // Fallback Android TTS engine and its readiness flag (set by the init callback below).
    private var androidTTS: TextToSpeech? = null
    private var ttsReady = false

    init {
        // Initialize Android TTS as fallback. The engine reports readiness asynchronously.
        androidTTS = TextToSpeech(context) { status ->
            if (status == TextToSpeech.SUCCESS) {
                androidTTS?.language = Locale.US
                ttsReady = true
                Log.d(TAG, "Android TTS initialized successfully")
            } else {
                Log.e(TAG, "Android TTS initialization failed")
            }
        }
    }

    /**
     * Returns the voice ID stored under `tts_voice_id` in the `alfred_settings` preferences,
     * or `BuildConfig.ELEVENLABS_VOICE_ID` when none is stored.
     * (An earlier note on this function names the default as "Finn", vBKc2FfBKJfcZNyEt1n6 —
     * not verifiable from this file.)
     */
    private fun getVoiceId(): String {
        val fallback = BuildConfig.ELEVENLABS_VOICE_ID
        return context
            .getSharedPreferences("alfred_settings", Context.MODE_PRIVATE)
            .getString("tts_voice_id", fallback)
            ?: fallback
    }

    /**
     * Derives the gateway's HTTP(S) base URL from `BuildConfig.GATEWAY_URL` by swapping the
     * WebSocket scheme for the matching HTTP scheme.
     */
    private fun gatewayHttpBaseUrl(): String =
        BuildConfig.GATEWAY_URL.replace("wss://", "https://").replace("ws://", "http://")

    /**
     * Strips markdown and other characters that should not be read aloud.
     *
     * Order matters: fenced code blocks are removed before the inline-code rule runs, because
     * the inline rule would otherwise consume one backtick of each fence and leave the block's
     * contents in the spoken text.
     *
     * @param text raw text, possibly containing markdown.
     * @return the text with markup removed, whitespace collapsed and ends trimmed; may be empty.
     */
    private fun sanitizeTextForSpeech(text: String): String {
        val cleaned = text
            .replace(FENCED_CODE_BLOCK, "")      // ```code``` (dropped entirely)
            .replace(BOLD_ASTERISKS, "$1")       // **text**
            .replace(ITALIC_ASTERISK, "$1")      // *text*
            .replace(BOLD_UNDERSCORES, "$1")     // __text__
            .replace(ITALIC_UNDERSCORE, "$1")    // _text_
            .replace(STRIKETHROUGH, "$1")        // ~~text~~
            .replace(INLINE_CODE, "$1")          // `text`
            .replace(MARKDOWN_LINK, "$1")        // [text](url) -> text
            .replace(PLAIN_URL, "")              // bare http(s) URLs
            .replace(LIST_BULLET, "")            // leading -, *, +, •
            .replace(NUMBERED_ITEM, "")          // leading "1. "
            .replace(HEADER, "")                 // leading #'s
            .replace(BLOCKQUOTE, "")             // leading >
            .replace(EMOJI_SHORTCODE, "")        // :smile:
            .replace(BRACKETS, "")               // leftover [ ] ( )
            .replace(REPEATED_PUNCTUATION, "$1") // "..." -> "."
            .replace(UNSPEAKABLE, "")            // symbols, emoji, etc.
            .replace(WHITESPACE_RUN, " ")
            .trim()

        Log.d(TAG, "Sanitized for TTS: '$text' -> '$cleaned'")
        return cleaned
    }

    /**
     * Converts [text] to speech and plays it.
     *
     * The text is sanitized first on every path. If nothing speakable remains, [onComplete]
     * is invoked without playing anything.
     *
     * @param text text to speak; may contain markdown.
     * @param onComplete invoked once playback finishes (or is skipped because the text was empty).
     * @param onError invoked with a description when neither the proxy stream nor Android TTS
     *   could speak the text.
     */
    fun speak(text: String, onComplete: () -> Unit = {}, onError: (String) -> Unit = {}) {
        if (apiKey.isEmpty()) {
            Log.w(TAG, "ElevenLabs API key not configured, using Android TTS")
            // Sanitize here too, so this path does not read markdown symbols aloud.
            val cleanText = sanitizeTextForSpeech(text)
            if (cleanText.isBlank()) {
                Log.w(TAG, "Text became empty after sanitization, skipping TTS")
                onComplete()
            } else {
                speakWithAndroidTTS(cleanText, onComplete, onError)
            }
            return
        }

        scope.launch {
            // Sanitize once; the proxy path and both fallbacks below speak the same text.
            val cleanText = sanitizeTextForSpeech(text)
            if (cleanText.isBlank()) {
                Log.w(TAG, "Text became empty after sanitization, skipping TTS")
                withContext(Dispatchers.Main) { onComplete() }
                return@launch
            }

            try {
                Log.d(TAG, "Converting text to speech: ${cleanText.take(50)}...")

                // Blocking HTTP call; this coroutine runs on Dispatchers.IO.
                val audioUrl = callTTSProxy(cleanText, getVoiceId())

                // Player and TTS engine are driven from the main thread.
                withContext(Dispatchers.Main) {
                    if (audioUrl == null) {
                        Log.w(TAG, "TTS proxy failed, falling back to Android TTS")
                        speakWithAndroidTTS(cleanText, onComplete, onError)
                    } else {
                        Log.d(TAG, "TTS audio URL: $audioUrl")
                        // The proxy's audioUrl is appended to the gateway base as-is.
                        playStreamingAudio("${gatewayHttpBaseUrl()}$audioUrl", onComplete, onError)
                    }
                }
            } catch (e: CancellationException) {
                // Cancellation (e.g. destroy()) is not a TTS failure: no fallback, just propagate.
                throw e
            } catch (e: Exception) {
                Log.e(TAG, "TTS error, falling back to Android TTS", e)
                withContext(Dispatchers.Main) {
                    speakWithAndroidTTS(cleanText, onComplete, onError)
                }
            }
        }
    }

    /**
     * POSTs `{ "text", "voiceId" }` as JSON to `<gateway>/api/tts` and reads `audioUrl` from
     * the JSON response. Blocks the calling thread.
     *
     * @return the `audioUrl` value, or null on a non-2xx status, an empty body, a response
     *   without `audioUrl`, or any exception (all failures are logged).
     */
    private fun callTTSProxy(text: String, voiceId: String): String? {
        return try {
            val payload = JSONObject().apply {
                put("text", text)
                put("voiceId", voiceId)
            }
            val request = Request.Builder()
                .url("${gatewayHttpBaseUrl()}/api/tts")
                .post(payload.toString().toRequestBody("application/json".toMediaType()))
                .build()

            // use {} closes the response (and its body) on every path.
            client.newCall(request).execute().use { response ->
                if (response.isSuccessful) {
                    response.body?.string()?.let { body -> JSONObject(body).getString("audioUrl") }
                } else {
                    val errorBody = response.body?.string() ?: "no body"
                    Log.e(TAG, "TTS proxy error: ${response.code} ${response.message}")
                    Log.e(TAG, "Error body: $errorBody")
                    null
                }
            }
        } catch (e: Exception) {
            Log.e(TAG, "Failed to call TTS proxy", e)
            null
        }
    }

    /**
     * Speaks [text] with the Android built-in engine, replacing anything it is currently saying.
     *
     * Exactly one of [onComplete] / [onError] is invoked per call: the engine can both return
     * an error from `speak` and notify the listener for the same utterance, so results are
     * funnelled through a once-only guard.
     */
    private fun speakWithAndroidTTS(text: String, onComplete: () -> Unit, onError: (String) -> Unit) {
        val engine = androidTTS
        if (!ttsReady || engine == null) {
            onError("Android TTS not ready")
            return
        }

        // Alias the callback: inside the listener below, `onError` is also the listener's own method name.
        val reportError = onError
        val finished = java.util.concurrent.atomic.AtomicBoolean(false)

        try {
            engine.setOnUtteranceProgressListener(object : android.speech.tts.UtteranceProgressListener() {
                override fun onStart(utteranceId: String?) {
                    Log.d(TAG, "Android TTS started")
                }

                override fun onDone(utteranceId: String?) {
                    Log.d(TAG, "Android TTS completed")
                    if (finished.compareAndSet(false, true)) onComplete()
                }

                override fun onError(utteranceId: String?) {
                    Log.e(TAG, "Android TTS error")
                    if (finished.compareAndSet(false, true)) reportError("Android TTS error")
                }
            })

            val status = engine.speak(text, TextToSpeech.QUEUE_FLUSH, null, "alfred-${System.currentTimeMillis()}")
            if (status == TextToSpeech.ERROR) {
                // Queuing failed; without this the caller might never hear back.
                Log.e(TAG, "Android TTS rejected the utterance")
                if (finished.compareAndSet(false, true)) reportError("Android TTS error")
                return
            }
            Log.d(TAG, "Speaking with Android TTS")
        } catch (e: Exception) {
            Log.e(TAG, "Failed to use Android TTS", e)
            if (finished.compareAndSet(false, true)) reportError("Android TTS failed: ${e.message}")
        }
    }

    /**
     * Streams and plays audio from [streamUrl], stopping any current playback first.
     *
     * [onComplete] fires when playback reaches the end. [onError] fires if the player cannot be
     * set up or reports an error while preparing/playing. No Android TTS fallback happens here:
     * this function does not have the text, so the failure is only reported to the caller.
     */
    private fun playStreamingAudio(streamUrl: String, onComplete: () -> Unit, onError: (String) -> Unit) {
        try {
            // Stop any existing playback
            stopPlayback()

            val player = MediaPlayer()
            try {
                player.apply {
                    setDataSource(streamUrl)
                    setOnPreparedListener {
                        Log.d(TAG, "Stream prepared, starting playback")
                        start()
                    }
                    setOnCompletionListener {
                        Log.d(TAG, "Playback completed")
                        stopPlayback()
                        onComplete()
                    }
                    setOnErrorListener { _, what, extra ->
                        Log.e(TAG, "MediaPlayer error: what=$what extra=$extra")
                        stopPlayback()

                        // NOTE(review): despite the wording of the two messages below, nothing is
                        // spoken via Android TTS at this point; the caller only gets onError.
                        Log.w(TAG, "Streaming failed, falling back to Android TTS")
                        onError("Streaming error, using fallback")
                        true // handled: suppresses the completion callback for this failure
                    }
                    setOnInfoListener { _, what, extra ->
                        Log.d(TAG, "MediaPlayer info: what=$what extra=$extra")
                        false
                    }

                    // Prepare async to avoid blocking
                    prepareAsync()
                }
            } catch (e: Exception) {
                // The player was never published to `mediaPlayer`, so release it here or it leaks.
                player.release()
                throw e
            }

            mediaPlayer = player
            Log.d(TAG, "Streaming audio from: $streamUrl")
        } catch (e: Exception) {
            Log.e(TAG, "Failed to stream audio", e)
            onError("Failed to stream audio: ${e.message}")
        }
    }

    /**
     * Stops whatever is currently speaking: releases the streaming player (if any) and tells
     * the Android TTS engine to stop. Safe to call when nothing is playing.
     */
    fun stopPlayback() {
        // Detach the field first so listeners re-entering this method see no player.
        val player = mediaPlayer
        mediaPlayer = null

        if (player != null) {
            try {
                if (player.isPlaying) {
                    player.stop()
                }
            } catch (e: IllegalStateException) {
                // The player was in a state where isPlaying/stop is invalid; it is released anyway.
                Log.w(TAG, "MediaPlayer could not be stopped cleanly", e)
            } finally {
                player.release()
            }
        }

        // Stop Android TTS
        androidTTS?.stop()
    }

    /**
     * Reports whether the streaming player is playing or the Android TTS engine is speaking.
     */
    fun isPlaying(): Boolean =
        mediaPlayer?.isPlaying == true || androidTTS?.isSpeaking == true

    /**
     * Releases everything this manager holds: stops playback, shuts down the Android TTS
     * engine and cancels pending proxy requests. The instance should not be used afterwards.
     */
    fun destroy() {
        stopPlayback()
        androidTTS?.shutdown()
        androidTTS = null
        ttsReady = false
        scope.cancel()
    }

    private companion object {
        const val TAG = "TTSManager"

        // Compiled once instead of on every sanitize call.
        val FENCED_CODE_BLOCK = Regex("```[\\s\\S]*?```")
        val BOLD_ASTERISKS = Regex("\\*\\*([^*]+)\\*\\*")
        val ITALIC_ASTERISK = Regex("\\*([^*]+)\\*")
        val BOLD_UNDERSCORES = Regex("__([^_]+)__")
        val ITALIC_UNDERSCORE = Regex("_([^_]+)_")
        val STRIKETHROUGH = Regex("~~([^~]+)~~")
        val INLINE_CODE = Regex("`([^`]+)`")
        val MARKDOWN_LINK = Regex("\\[([^]]+)]\\([^)]+\\)")
        val PLAIN_URL = Regex("https?://\\S+")
        val LIST_BULLET = Regex("^[\\s]*[-*+•]\\s+", RegexOption.MULTILINE)
        val NUMBERED_ITEM = Regex("^[\\s]*\\d+\\.\\s+", RegexOption.MULTILINE)
        val HEADER = Regex("^#+\\s+", RegexOption.MULTILINE)
        val BLOCKQUOTE = Regex("^>\\s+", RegexOption.MULTILINE)
        val EMOJI_SHORTCODE = Regex(":[a-z_]+:")
        val BRACKETS = Regex("[\\[\\]()]")
        val REPEATED_PUNCTUATION = Regex("([.!?]){2,}")

        // Keeps letters, combining marks and digits of any script (so "café" survives) plus
        // basic punctuation; everything else (symbols, emoji) is dropped.
        val UNSPEAKABLE = Regex("[^\\p{L}\\p{M}\\p{N}\\s.,!?;:'-]")
        val WHITESPACE_RUN = Regex("\\s+")
    }
}
|