Files
alfred-mobile/app/src/main/java/com/openclaw/alfred/voice/TTSManager.kt
jknapp 6d4ae2e5c3 Initial commit: Alfred Mobile - AI Assistant Android App
- OAuth authentication via Authentik
- WebSocket connection to OpenClaw gateway
- Configurable gateway URL with first-run setup
- User preferences sync across devices
- Multi-user support with custom assistant names
- ElevenLabs TTS integration (local + remote)
- FCM push notifications for alarms
- Voice input via Google Speech API
- No hardcoded secrets or internal IPs in tracked files
2026-02-09 11:12:51 -08:00

320 lines
12 KiB
Kotlin

package com.openclaw.alfred.voice
import android.content.Context
import android.media.MediaPlayer
import android.speech.tts.TextToSpeech
import android.util.Log
import com.openclaw.alfred.BuildConfig
import kotlinx.coroutines.*
import okhttp3.MediaType.Companion.toMediaType
import okhttp3.OkHttpClient
import okhttp3.Request
import okhttp3.RequestBody.Companion.toRequestBody
import org.json.JSONObject
import java.io.File
import java.io.FileOutputStream
import java.util.*
import java.util.concurrent.TimeUnit
/**
 * Speaks assistant replies aloud.
 *
 * Primary path: the sanitized text is POSTed to the gateway's `/api/tts` proxy (derived from
 * `BuildConfig.GATEWAY_URL`), which answers with an `audioUrl` that is streamed through
 * [MediaPlayer]. The HTTP client uses an extended read timeout for long responses.
 *
 * Fallback path: the platform [TextToSpeech] engine. It is used when
 * `BuildConfig.ELEVENLABS_API_KEY` is empty, when the proxy call fails, or when an unexpected
 * exception occurs while preparing playback.
 *
 * Threading: network I/O runs on [Dispatchers.IO]; playback is started on [Dispatchers.Main], and
 * asynchronous results (`onComplete` / `onError`) are delivered on the main thread. Immediate
 * failures on the no-API-key path are reported synchronously on the calling thread.
 * NOTE(review): [stopPlayback], [isPlaying] and [destroy] do no thread switching themselves —
 * presumably they are meant to be called from the main thread; confirm against callers.
 *
 * Call [destroy] when the owner is finished with this instance.
 */
class TTSManager(private val context: Context) {

    private val client = OkHttpClient.Builder()
        .connectTimeout(30, TimeUnit.SECONDS)
        .readTimeout(120, TimeUnit.SECONDS) // Extended for long responses
        .writeTimeout(30, TimeUnit.SECONDS)
        .build()

    // Player for the audio stream returned by the proxy; null whenever nothing is loaded.
    private var mediaPlayer: MediaPlayer? = null

    private val scope = CoroutineScope(Dispatchers.IO + SupervisorJob())

    // Only checked for emptiness in speak(); this class never sends the key anywhere.
    private val apiKey = BuildConfig.ELEVENLABS_API_KEY

    // Fallback Android TTS
    private var androidTTS: TextToSpeech? = null
    private var ttsReady = false

    init {
        // Initialize Android TTS as fallback. Initialization is asynchronous; ttsReady flips
        // to true only once the engine reports success.
        androidTTS = TextToSpeech(context) { status ->
            if (status == TextToSpeech.SUCCESS) {
                val languageResult = androidTTS?.setLanguage(Locale.US)
                if (languageResult == TextToSpeech.LANG_MISSING_DATA ||
                    languageResult == TextToSpeech.LANG_NOT_SUPPORTED
                ) {
                    // Not fatal: the engine keeps whatever language it already had.
                    Log.w(TAG, "Android TTS could not switch to Locale.US (result=$languageResult)")
                }
                ttsReady = true
                Log.d(TAG, "Android TTS initialized successfully")
            } else {
                Log.e(TAG, "Android TTS initialization failed")
            }
        }
    }

    /**
     * Returns the voice ID stored under `tts_voice_id` in the `alfred_settings` preferences,
     * or `BuildConfig.ELEVENLABS_VOICE_ID` when nothing (or only blank text) is stored.
     */
    private fun getVoiceId(): String {
        val prefs = context.getSharedPreferences("alfred_settings", Context.MODE_PRIVATE)
        return prefs.getString("tts_voice_id", null)
            ?.takeIf { it.isNotBlank() }
            ?: BuildConfig.ELEVENLABS_VOICE_ID
    }

    /**
     * Derives the gateway's HTTP(S) origin from the WebSocket URL in `BuildConfig.GATEWAY_URL`
     * (`wss://` -> `https://`, `ws://` -> `http://`), without a trailing slash so that paths can
     * be appended safely.
     */
    private fun gatewayHttpBaseUrl(): String =
        BuildConfig.GATEWAY_URL
            .replace("wss://", "https://")
            .replace("ws://", "http://")
            .trimEnd('/')

    /**
     * Turns the `audioUrl` returned by the proxy into an absolute URL.
     *
     * A value starting with `/` is appended to the gateway origin (the only form the previous
     * implementation handled); an already-absolute http(s) URL is used unchanged; any other
     * value is treated as a path relative to the gateway root.
     */
    private fun resolveAudioUrl(audioUrl: String): String = when {
        audioUrl.startsWith("http://") || audioUrl.startsWith("https://") -> audioUrl
        audioUrl.startsWith("/") -> gatewayHttpBaseUrl() + audioUrl
        else -> "${gatewayHttpBaseUrl()}/$audioUrl"
    }

    /**
     * Sanitize text for TTS by removing markdown and characters that should not be spoken.
     *
     * Applies the precompiled rules in [SPEECH_REWRITES] in order, then trims the result.
     *
     * @param text raw message text, possibly containing markdown.
     * @return plain text suitable for a speech engine; may be empty.
     */
    private fun sanitizeTextForSpeech(text: String): String {
        val cleaned = SPEECH_REWRITES
            .fold(text) { acc, (pattern, replacement) -> pattern.replace(acc, replacement) }
            .trim()
        Log.d(TAG, "Sanitized for TTS: '$text' -> '$cleaned'")
        return cleaned
    }

    /**
     * Convert text to speech and play it.
     *
     * The text is sanitized first on every path. If nothing speakable remains, [onComplete] is
     * invoked and no audio is produced.
     *
     * @param text message to speak; markdown is stripped before synthesis.
     * @param onComplete invoked when playback finishes (or was skipped because the text was empty).
     * @param onError invoked with a human-readable reason when no audio could be produced.
     */
    fun speak(text: String, onComplete: () -> Unit = {}, onError: (String) -> Unit = {}) {
        if (apiKey.isEmpty()) {
            Log.w(TAG, "ElevenLabs API key not configured, using Android TTS")
            // Sanitize here too, so the fallback engine never reads raw markdown aloud.
            val cleanText = sanitizeTextForSpeech(text)
            if (cleanText.isBlank()) {
                Log.w(TAG, "Text became empty after sanitization, skipping TTS")
                onComplete()
            } else {
                speakWithAndroidTTS(cleanText, onComplete, onError)
            }
            return
        }

        scope.launch {
            // Computed once so the exception fallback below can reuse it.
            val cleanText = sanitizeTextForSpeech(text)
            try {
                if (cleanText.isBlank()) {
                    Log.w(TAG, "Text became empty after sanitization, skipping TTS")
                    withContext(Dispatchers.Main) { onComplete() }
                    return@launch
                }

                Log.d(TAG, "Converting text to speech: ${cleanText.take(50)}...")

                // Blocking network call; we are on Dispatchers.IO here.
                val audioUrl = callTTSProxy(cleanText, getVoiceId())

                // Playback objects are touched on the main thread only.
                withContext(Dispatchers.Main) {
                    if (audioUrl == null) {
                        Log.w(TAG, "TTS proxy failed, falling back to Android TTS")
                        speakWithAndroidTTS(cleanText, onComplete, onError)
                    } else {
                        Log.d(TAG, "TTS audio URL: $audioUrl")
                        playStreamingAudio(resolveAudioUrl(audioUrl), onComplete, onError)
                    }
                }
            } catch (e: CancellationException) {
                // Cancellation (e.g. destroy()) is not a TTS failure: no fallback, just propagate.
                throw e
            } catch (e: Exception) {
                Log.e(TAG, "TTS error, falling back to Android TTS", e)
                withContext(Dispatchers.Main) {
                    speakWithAndroidTTS(cleanText, onComplete, onError)
                }
            }
        }
    }

    /**
     * Call TTS proxy and get audio URL.
     *
     * Performs a blocking HTTP POST of `{"text", "voiceId"}` to `<gateway>/api/tts`; must not be
     * called on the main thread.
     *
     * @return the non-blank `audioUrl` field of the JSON response, or `null` on any failure
     *         (non-2xx status, missing/empty/null field, I/O or parse error). Failures are logged,
     *         never thrown.
     */
    private fun callTTSProxy(text: String, voiceId: String): String? {
        try {
            val payload = JSONObject().apply {
                put("text", text)
                put("voiceId", voiceId)
            }
            val request = Request.Builder()
                .url("${gatewayHttpBaseUrl()}/api/tts")
                .post(payload.toString().toRequestBody("application/json".toMediaType()))
                .build()

            client.newCall(request).execute().use { response ->
                if (!response.isSuccessful) {
                    val errorBody = response.body?.string() ?: "no body"
                    Log.e(TAG, "TTS proxy error: ${response.code} ${response.message}")
                    Log.e(TAG, "Error body: $errorBody")
                    return null
                }

                val responseBody = response.body?.string() ?: return null
                val responseJson = JSONObject(responseBody)
                // A JSON null would otherwise be stringified as "null" and an empty value would
                // produce a URL pointing at the gateway root; treat both as failure.
                val audioUrl = if (responseJson.isNull("audioUrl")) "" else responseJson.optString("audioUrl")
                if (audioUrl.isBlank()) {
                    Log.e(TAG, "TTS proxy response did not contain an audioUrl")
                    return null
                }
                return audioUrl
            }
        } catch (e: Exception) {
            // Deliberately best-effort: any failure means "use the fallback engine".
            Log.e(TAG, "Failed to call TTS proxy", e)
            return null
        }
    }

    /**
     * Speak using Android built-in TTS.
     *
     * Reports "not ready" and queueing failures synchronously through [onError]. Results that
     * arrive through the engine's progress listener are re-dispatched to the main thread, so
     * callers see the same callback thread as on the streaming path.
     */
    private fun speakWithAndroidTTS(text: String, onComplete: () -> Unit, onError: (String) -> Unit) {
        val tts = androidTTS
        if (!ttsReady || tts == null) {
            onError("Android TTS not ready")
            return
        }

        // Aliases keep the calls below unambiguous next to the listener's own onError override.
        val notifyDone = onComplete
        val notifyFailure = onError

        try {
            tts.setOnUtteranceProgressListener(object : android.speech.tts.UtteranceProgressListener() {
                override fun onStart(utteranceId: String?) {
                    Log.d(TAG, "Android TTS started")
                }

                override fun onDone(utteranceId: String?) {
                    Log.d(TAG, "Android TTS completed")
                    // Listener callbacks arrive on a TTS engine thread; hop to main.
                    scope.launch(Dispatchers.Main) { notifyDone() }
                }

                override fun onError(utteranceId: String?) {
                    Log.e(TAG, "Android TTS error")
                    scope.launch(Dispatchers.Main) { notifyFailure("Android TTS error") }
                }
            })

            val utteranceId = "alfred-${System.currentTimeMillis()}"
            val result = tts.speak(text, TextToSpeech.QUEUE_FLUSH, null, utteranceId)
            if (result == TextToSpeech.ERROR) {
                // The engine rejected the request, so the listener will never fire for it.
                Log.e(TAG, "Android TTS rejected the utterance")
                notifyFailure("Android TTS failed to queue utterance")
                return
            }
            Log.d(TAG, "Speaking with Android TTS")
        } catch (e: Exception) {
            Log.e(TAG, "Failed to use Android TTS", e)
            notifyFailure("Android TTS failed: ${e.message}")
        }
    }

    /**
     * Play streaming audio from URL.
     *
     * Stops whatever is currently playing, then prepares a new [MediaPlayer] asynchronously and
     * starts it once prepared. The player is released on completion, on error, and when setup
     * itself throws.
     */
    private fun playStreamingAudio(streamUrl: String, onComplete: () -> Unit, onError: (String) -> Unit) {
        try {
            // Stop any existing playback
            stopPlayback()

            // Store the player before configuring it, so a failure below can still release it.
            val player = MediaPlayer()
            mediaPlayer = player

            player.setDataSource(streamUrl)
            player.setOnPreparedListener { prepared ->
                Log.d(TAG, "Stream prepared, starting playback")
                prepared.start()
            }
            player.setOnCompletionListener {
                Log.d(TAG, "Playback completed")
                stopPlayback()
                onComplete()
            }
            player.setOnErrorListener { _, what, extra ->
                Log.e(TAG, "MediaPlayer error: what=$what extra=$extra")
                stopPlayback()
                // The text is not available here, so no fallback is attempted at this level;
                // the caller decides what to do. The message text is kept unchanged for callers.
                Log.w(TAG, "Streaming failed, reporting error to caller")
                onError("Streaming error, using fallback")
                true // handled: suppresses the completion callback for this failure
            }
            player.setOnInfoListener { _, what, extra ->
                Log.d(TAG, "MediaPlayer info: what=$what extra=$extra")
                false
            }

            // Prepare async to avoid blocking
            player.prepareAsync()
            Log.d(TAG, "Streaming audio from: $streamUrl")
        } catch (e: Exception) {
            Log.e(TAG, "Failed to stream audio", e)
            releaseMediaPlayer()
            onError("Failed to stream audio: ${e.message}")
        }
    }

    /**
     * Stops and releases the current [MediaPlayer], if any.
     *
     * `release()` always runs, even when the player is in a state where `isPlaying`/`stop()`
     * throw [IllegalStateException] (for example after an error).
     */
    private fun releaseMediaPlayer() {
        val player = mediaPlayer ?: return
        mediaPlayer = null
        try {
            if (player.isPlaying) {
                player.stop()
            }
        } catch (e: IllegalStateException) {
            Log.w(TAG, "MediaPlayer was not in a stoppable state", e)
        } finally {
            player.release()
        }
    }

    /**
     * Stop current playback on both engines (streamed audio and Android TTS).
     */
    fun stopPlayback() {
        // Stop MediaPlayer (ElevenLabs)
        releaseMediaPlayer()

        // Stop Android TTS
        androidTTS?.stop()
    }

    /**
     * Check if currently playing on either engine.
     *
     * @return `true` while streamed audio is playing or Android TTS is speaking.
     */
    fun isPlaying(): Boolean {
        val streaming = try {
            mediaPlayer?.isPlaying == true
        } catch (e: IllegalStateException) {
            // A player in an invalid state is, by definition, not playing.
            false
        }
        return streaming || androidTTS?.isSpeaking == true
    }

    /**
     * Cleanup resources. The instance must not be used afterwards.
     */
    fun destroy() {
        // Cancel first so an in-flight request cannot start playback after teardown.
        scope.cancel()
        stopPlayback()
        androidTTS?.shutdown()
        androidTTS = null
        ttsReady = false
    }

    private companion object {
        const val TAG = "TTSManager"

        /**
         * Ordered (pattern, replacement) rules used by [sanitizeTextForSpeech]; compiled once.
         * Order matters: fenced code blocks are removed before the inline-code rule, which would
         * otherwise unwrap the innermost backtick pair and leave the code in the spoken text.
         */
        val SPEECH_REWRITES: List<Pair<Regex, String>> = listOf(
            // Remove code blocks
            Regex("```[\\s\\S]*?```") to "",
            // Remove markdown formatting, keeping the wrapped text
            Regex("\\*\\*([^*]+)\\*\\*") to "$1", // Bold: **text**
            Regex("\\*([^*]+)\\*") to "$1", // Italic: *text*
            Regex("__([^_]+)__") to "$1", // Bold: __text__
            Regex("_([^_]+)_") to "$1", // Italic: _text_
            Regex("~~([^~]+)~~") to "$1", // Strikethrough: ~~text~~
            Regex("`([^`]+)`") to "$1", // Inline code: `text`
            // Remove links but keep link text
            Regex("\\[([^]]+)]\\([^)]+\\)") to "$1", // [text](url)
            Regex("https?://\\S+") to "", // Plain URLs
            // Remove list markers
            Regex("^[\\s]*[-*+•]\\s+", RegexOption.MULTILINE) to "", // List bullets
            Regex("^[\\s]*\\d+\\.\\s+", RegexOption.MULTILINE) to "", // Numbered lists
            // Remove headers
            Regex("^#+\\s+", RegexOption.MULTILINE) to "",
            // Remove blockquotes
            Regex("^>\\s+", RegexOption.MULTILINE) to "",
            // Remove emoji shortcodes
            Regex(":[a-z_]+:") to "",
            // Remove brackets and parentheses (but keep content)
            Regex("[\\[\\]()]") to "",
            // Collapse repeated punctuation (e.g., "..." -> ".")
            Regex("([.!?]){2,}") to "$1",
            // Typographic apostrophes become ASCII so contractions survive the whitelist below
            Regex("[\\u2018\\u2019]") to "'",
            // Emoji variation selectors / keycap marks are combining marks and would otherwise
            // pass the whitelist as invisible residue
            Regex("[\\uFE00-\\uFE0F\\u20E3]") to "",
            // Keep letters (any script), combining marks, digits and basic punctuation only
            Regex("[^\\p{L}\\p{M}\\p{N}\\s.,!?;:'-]") to "",
            // Clean up whitespace
            Regex("\\s+") to " ",
        )
    }
}