package com.openclaw.alfred.voice

import android.content.Context
import android.media.MediaPlayer
import android.speech.tts.TextToSpeech
import android.util.Log
import com.openclaw.alfred.BuildConfig
import kotlinx.coroutines.*
import okhttp3.MediaType.Companion.toMediaType
import okhttp3.OkHttpClient
import okhttp3.Request
import okhttp3.RequestBody.Companion.toRequestBody
import org.json.JSONObject
import java.io.File
import java.io.FileOutputStream
import java.util.*
import java.util.concurrent.TimeUnit

/**
 * Manages text-to-speech playback.
 *
 * Text is sanitized, sent to the gateway's `/api/tts` proxy together with the
 * configured voice ID, and the audio URL returned by the proxy is streamed
 * through a [MediaPlayer]. Whenever the proxy is unavailable (or no API key is
 * configured) the manager falls back to Android's built-in [TextToSpeech].
 *
 * Call [destroy] when the manager is no longer needed.
 */
class TTSManager(private val context: Context) {

    private val client = OkHttpClient.Builder()
        .connectTimeout(30, TimeUnit.SECONDS)
        .readTimeout(120, TimeUnit.SECONDS) // Extended for long responses
        .writeTimeout(30, TimeUnit.SECONDS)
        .build()

    private var mediaPlayer: MediaPlayer? = null
    private val scope = CoroutineScope(Dispatchers.IO + SupervisorJob())

    private val apiKey = BuildConfig.ELEVENLABS_API_KEY

    // Fallback Android TTS
    private var androidTTS: TextToSpeech? = null
    private var ttsReady = false

    init {
        // Initialize Android TTS as fallback
        androidTTS = TextToSpeech(context) { status ->
            if (status == TextToSpeech.SUCCESS) {
                androidTTS?.language = Locale.US
                ttsReady = true
                Log.d(TAG, "Android TTS initialized successfully")
            } else {
                Log.e(TAG, "Android TTS initialization failed")
            }
        }
    }

    /**
     * Reads the voice ID from the `alfred_settings` preferences, falling back
     * to the build-time default when no value is stored.
     */
    private fun getVoiceId(): String {
        val prefs = context.getSharedPreferences("alfred_settings", Context.MODE_PRIVATE)
        return prefs.getString("tts_voice_id", BuildConfig.ELEVENLABS_VOICE_ID)
            ?: BuildConfig.ELEVENLABS_VOICE_ID
    }

    /**
     * Builds an HTTP(S) URL on the gateway for [path].
     *
     * The gateway's `ws://` / `wss://` scheme is mapped to `http://` / `https://`
     * and the two parts are joined with exactly one `/`. A [path] that is
     * already an absolute HTTP(S) URL is returned unchanged.
     */
    private fun resolveGatewayUrl(path: String): String {
        if (path.startsWith("http://") || path.startsWith("https://")) return path
        val gatewayBase = BuildConfig.GATEWAY_URL
            .replace("wss://", "https://")
            .replace("ws://", "http://")
            .trimEnd('/')
        return "$gatewayBase/${path.trimStart('/')}"
    }

    /**
     * Sanitize text for TTS by removing markdown and special characters.
     *
     * The rules in [SANITIZE_RULES] are applied in order; see the comments
     * there for why the order matters.
     *
     * @return the cleaned text, which may be empty.
     */
    private fun sanitizeTextForSpeech(text: String): String {
        val cleaned = SANITIZE_RULES
            .fold(text) { acc, (pattern, replacement) -> acc.replace(pattern, replacement) }
            .trim()

        Log.d(TAG, "Sanitized for TTS: '$text' -> '$cleaned'")
        return cleaned
    }

    /**
     * Convert text to speech and play it.
     *
     * The text is sanitized first; if nothing speakable remains, [onComplete]
     * is invoked without playing anything.
     *
     * Callbacks are delivered on the main thread, except on the synchronous
     * paths taken when no API key is configured (blank text, Android TTS not
     * ready, or the utterance being rejected), where they run on the caller's
     * thread before this function returns.
     *
     * @param text raw text, possibly containing markdown.
     * @param onComplete invoked when playback finishes or is skipped.
     * @param onError invoked with a description when playback fails.
     */
    fun speak(text: String, onComplete: () -> Unit = {}, onError: (String) -> Unit = {}) {
        if (apiKey.isEmpty()) {
            Log.w(TAG, "ElevenLabs API key not configured, using Android TTS")
            // Sanitize here as well so markdown is never read aloud.
            val cleanText = sanitizeTextForSpeech(text)
            if (cleanText.isBlank()) {
                Log.w(TAG, "Text became empty after sanitization, skipping TTS")
                onComplete()
            } else {
                speakWithAndroidTTS(cleanText, onComplete, onError)
            }
            return
        }

        scope.launch {
            // Sanitize once so the fallback path below can reuse the result.
            val cleanText = sanitizeTextForSpeech(text)
            try {
                if (cleanText.isBlank()) {
                    Log.w(TAG, "Text became empty after sanitization, skipping TTS")
                    withContext(Dispatchers.Main) { onComplete() }
                    return@launch
                }

                Log.d(TAG, "Converting text to speech: ${cleanText.take(50)}...")

                // Call TTS proxy endpoint
                val audioUrl = callTTSProxy(cleanText, getVoiceId())

                // MediaPlayer and the fallback engine are both driven from the main thread
                withContext(Dispatchers.Main) {
                    if (audioUrl == null) {
                        Log.w(TAG, "TTS proxy failed, falling back to Android TTS")
                        speakWithAndroidTTS(cleanText, onComplete, onError)
                    } else {
                        Log.d(TAG, "TTS audio URL: $audioUrl")
                        playStreamingAudio(resolveGatewayUrl(audioUrl), onComplete, onError)
                    }
                }
            } catch (e: CancellationException) {
                // Cancellation (e.g. from destroy()) is not a TTS failure; let it propagate.
                throw e
            } catch (e: Exception) {
                Log.e(TAG, "TTS error, falling back to Android TTS", e)
                withContext(Dispatchers.Main) {
                    speakWithAndroidTTS(cleanText, onComplete, onError)
                }
            }
        }
    }

    /**
     * Call TTS proxy and get audio URL.
     *
     * Performs a blocking HTTP POST, so it must not be called on the main thread.
     *
     * @return the `audioUrl` value from the proxy's JSON response, or `null`
     *   if the request fails, the response is unsuccessful, or it cannot be parsed.
     */
    private fun callTTSProxy(text: String, voiceId: String): String? {
        return try {
            val payload = JSONObject().apply {
                put("text", text)
                put("voiceId", voiceId)
            }
            val request = Request.Builder()
                .url(resolveGatewayUrl("/api/tts"))
                .post(payload.toString().toRequestBody("application/json".toMediaType()))
                .build()

            client.newCall(request).execute().use { response ->
                if (!response.isSuccessful) {
                    val errorBody = response.body?.string() ?: "no body"
                    Log.e(TAG, "TTS proxy error: ${response.code} ${response.message}")
                    Log.e(TAG, "Error body: $errorBody")
                    return@use null
                }
                val responseBody = response.body?.string() ?: return@use null
                // getString throws if the key is missing; handled by the catch below.
                JSONObject(responseBody).getString("audioUrl")
            }
        } catch (e: Exception) {
            Log.e(TAG, "Failed to call TTS proxy", e)
            null
        }
    }

    /**
     * Speak using Android built-in TTS.
     *
     * Completion and engine-reported errors are posted to the main thread.
     * "Not ready" and "rejected utterance" failures are reported synchronously.
     */
    private fun speakWithAndroidTTS(text: String, onComplete: () -> Unit, onError: (String) -> Unit) {
        val tts = androidTTS
        if (!ttsReady || tts == null) {
            onError("Android TTS not ready")
            return
        }

        // Distinct names: the listener below declares its own onError(String?) override,
        // so calling the parameter by its original name inside it would be ambiguous to read.
        val notifyComplete = onComplete
        val notifyError = onError

        try {
            tts.setOnUtteranceProgressListener(object : android.speech.tts.UtteranceProgressListener() {
                override fun onStart(utteranceId: String?) {
                    Log.d(TAG, "Android TTS started")
                }

                override fun onDone(utteranceId: String?) {
                    Log.d(TAG, "Android TTS completed")
                    // Listener callbacks arrive off the main thread; match the streaming path.
                    scope.launch(Dispatchers.Main) { notifyComplete() }
                }

                override fun onError(utteranceId: String?) {
                    Log.e(TAG, "Android TTS error")
                    scope.launch(Dispatchers.Main) { notifyError("Android TTS error") }
                }
            })

            val result = tts.speak(text, TextToSpeech.QUEUE_FLUSH, null, "alfred-${System.currentTimeMillis()}")
            if (result == TextToSpeech.ERROR) {
                // The utterance was never queued, so the listener will not fire for it.
                Log.e(TAG, "Android TTS rejected the utterance")
                notifyError("Android TTS failed to queue utterance")
            } else {
                Log.d(TAG, "Speaking with Android TTS")
            }
        } catch (e: Exception) {
            Log.e(TAG, "Failed to use Android TTS", e)
            notifyError("Android TTS failed: ${e.message}")
        }
    }

    /**
     * Play streaming audio from URL.
     *
     * Any current playback is stopped first. The new player is tracked in
     * [mediaPlayer] before it is configured so that it is released if setup fails.
     */
    private fun playStreamingAudio(streamUrl: String, onComplete: () -> Unit, onError: (String) -> Unit) {
        try {
            // Stop any existing playback
            stopPlayback()

            val player = MediaPlayer().also { mediaPlayer = it }
            player.setDataSource(streamUrl)
            player.setOnPreparedListener { prepared ->
                Log.d(TAG, "Stream prepared, starting playback")
                prepared.start()
            }
            player.setOnCompletionListener {
                Log.d(TAG, "Playback completed")
                stopPlayback()
                onComplete()
            }
            player.setOnErrorListener { _, what, extra ->
                Log.e(TAG, "MediaPlayer error: what=$what extra=$extra")
                stopPlayback()
                // The original text is not available here, so the caller decides how to recover.
                Log.w(TAG, "Streaming failed, falling back to Android TTS")
                onError("Streaming error, using fallback")
                true
            }
            player.setOnInfoListener { _, what, extra ->
                Log.d(TAG, "MediaPlayer info: what=$what extra=$extra")
                false
            }

            // Prepare async to avoid blocking
            player.prepareAsync()
            Log.d(TAG, "Streaming audio from: $streamUrl")
        } catch (e: Exception) {
            Log.e(TAG, "Failed to stream audio", e)
            // Do not leak a half-configured player.
            releaseMediaPlayer()
            onError("Failed to stream audio: ${e.message}")
        }
    }

    /**
     * Stops and releases the current [MediaPlayer], if any.
     *
     * The player is always released, even if it is in a state where
     * `isPlaying` / `stop` are rejected.
     */
    private fun releaseMediaPlayer() {
        val player = mediaPlayer ?: return
        mediaPlayer = null
        try {
            if (player.isPlaying) {
                player.stop()
            }
        } catch (e: IllegalStateException) {
            Log.w(TAG, "MediaPlayer was in an invalid state while stopping", e)
        } finally {
            player.release()
        }
    }

    /**
     * Stop current playback (both streamed audio and Android TTS).
     */
    fun stopPlayback() {
        releaseMediaPlayer()
        androidTTS?.stop()
    }

    /**
     * Check if currently playing.
     */
    fun isPlaying(): Boolean =
        mediaPlayer?.isPlaying == true || androidTTS?.isSpeaking == true

    /**
     * Cleanup resources: stops playback, shuts down Android TTS and cancels
     * any in-flight TTS requests.
     */
    fun destroy() {
        stopPlayback()
        androidTTS?.shutdown()
        androidTTS = null
        scope.cancel()
    }

    private companion object {
        private const val TAG = "TTSManager"

        /**
         * Ordered (pattern, replacement) pairs used to turn markdown into speakable text.
         * Compiled once instead of on every call.
         */
        private val SANITIZE_RULES: List<Pair<Regex, String>> = listOf(
            // Fenced code blocks go first; otherwise the inline-code rule eats the
            // inner backticks and the fence no longer matches.
            Regex("```[\\s\\S]*?```") to "",
            // Links keep their text; bare URLs are dropped.
            Regex("\\[([^]]+)]\\([^)]+\\)") to "$1",
            Regex("https?://\\S+") to "",
            // Line-anchored markers run before emphasis so "* item" bullets are not
            // mistaken for the start of an *italic* span.
            Regex("^[\\s]*[-*+•]\\s+", RegexOption.MULTILINE) to "",
            Regex("^[\\s]*\\d+\\.\\s+", RegexOption.MULTILINE) to "",
            Regex("^#+\\s+", RegexOption.MULTILINE) to "",
            Regex("^>\\s+", RegexOption.MULTILINE) to "",
            // Emphasis and inline code keep their content.
            Regex("\\*\\*([^*]+)\\*\\*") to "$1",
            Regex("\\*([^*]+)\\*") to "$1",
            Regex("__([^_]+)__") to "$1",
            Regex("_([^_]+)_") to "$1",
            Regex("~~([^~]+)~~") to "$1",
            Regex("`([^`]+)`") to "$1",
            // Emoji shortcodes such as :smile:
            Regex(":[a-z_]+:") to "",
            // Brackets and parentheses (content is kept)
            Regex("[\\[\\]()]") to "",
            // Repeated punctuation, e.g. "..." -> "."
            Regex("([.!?]){2,}") to "$1",
            // Drop remaining symbols but keep letters, marks and digits of any script
            // so accented and non-Latin words are not mangled.
            Regex("[^\\p{L}\\p{M}\\p{N}\\s.,!?;:'-]") to "",
            // Collapse whitespace
            Regex("\\s+") to " ",
        )
    }
}