- OAuth authentication via Authentik - WebSocket connection to OpenClaw gateway - Configurable gateway URL with first-run setup - User preferences sync across devices - Multi-user support with custom assistant names - ElevenLabs TTS integration (local + remote) - FCM push notifications for alarms - Voice input via Google Speech API - No hardcoded secrets or internal IPs in tracked files
208 lines
7.8 KiB
Kotlin
208 lines
7.8 KiB
Kotlin
package com.openclaw.alfred.voice
|
|
|
|
import android.content.Context
|
|
import android.content.Intent
|
|
import android.os.Bundle
|
|
import android.speech.RecognitionListener
|
|
import android.speech.RecognizerIntent
|
|
import android.speech.SpeechRecognizer
|
|
import android.util.Log
|
|
import java.util.*
|
|
|
|
/**
 * Manages voice-to-text input using Android's [SpeechRecognizer].
 *
 * Every listening session uses a fresh recognizer: [startListening] destroys the previous
 * instance, waits [RESTART_DELAY_MS] and only then creates and starts a new one.
 *
 * Per the Android documentation, [SpeechRecognizer] methods must be invoked on the main
 * application thread, so the public methods of this class should be called from there too.
 *
 * @param context used to query recognition availability and to create recognizers.
 * @param onResult receives the first recognition hypothesis when the recognizer returns one.
 * @param onError receives a human-readable description of a failure.
 * @param onListening receives `true` once the recognizer is ready for speech and `false` when
 *   speech ends, results arrive, an error occurs, or listening is stopped or cancelled.
 */
class VoiceInputManager(
    private val context: Context,
    private val onResult: (String) -> Unit,
    private val onError: (String) -> Unit,
    private val onListening: (Boolean) -> Unit
) {

    private var speechRecognizer: SpeechRecognizer? = null
    private var isListening = false
    private val handler = android.os.Handler(android.os.Looper.getMainLooper())

    /** The delayed start posted by [startListening] that has not run yet, or `null`. */
    private var pendingStart: Runnable? = null

    init {
        if (SpeechRecognizer.isRecognitionAvailable(context)) {
            speechRecognizer = SpeechRecognizer.createSpeechRecognizer(context)
            speechRecognizer?.setRecognitionListener(createRecognitionListener())
        } else {
            Log.e(TAG, "Speech recognition not available on this device")
            onError("Speech recognition not available")
        }
    }

    /**
     * Builds the listener that translates recognizer callbacks into [onResult], [onError]
     * and [onListening] invocations and keeps [isListening] up to date.
     */
    private fun createRecognitionListener() = object : RecognitionListener {
        override fun onReadyForSpeech(params: Bundle?) {
            Log.d(TAG, "Ready for speech")
            isListening = true
            onListening(true)
        }

        override fun onBeginningOfSpeech() {
            Log.d(TAG, "Speech started")
        }

        override fun onRmsChanged(rmsdB: Float) {
            // Audio level changed - could show visual feedback
        }

        override fun onBufferReceived(buffer: ByteArray?) {
            // Partial audio buffer
        }

        override fun onEndOfSpeech() {
            Log.d(TAG, "Speech ended")
            isListening = false
            onListening(false)
        }

        override fun onError(error: Int) {
            Log.e(TAG, "Recognition error: $error")
            isListening = false
            onListening(false)
            // Qualified so it is obvious this is the constructor callback, not this override.
            this@VoiceInputManager.onError(describeError(error))
        }

        override fun onResults(results: Bundle?) {
            Log.d(TAG, "Got results")
            val text = results
                ?.getStringArrayList(SpeechRecognizer.RESULTS_RECOGNITION)
                ?.firstOrNull()
            if (text != null) {
                Log.d(TAG, "Recognized: $text")
                onResult(text)
            } else {
                // No hypothesis to forward; leave a trace instead of dropping it silently.
                Log.w(TAG, "Recognizer returned no results")
            }
            isListening = false
            onListening(false)
        }

        override fun onPartialResults(partialResults: Bundle?) {
            // Partial recognition results (if enabled)
        }

        override fun onEvent(eventType: Int, params: Bundle?) {
            // Recognition event
        }
    }

    /** Maps a [SpeechRecognizer] error code to the message passed to [onError]. */
    private fun describeError(error: Int): String = when (error) {
        SpeechRecognizer.ERROR_AUDIO -> "Audio recording error (microphone busy or unavailable)"
        SpeechRecognizer.ERROR_CLIENT -> "Client error (recognizer not ready - try again)"
        SpeechRecognizer.ERROR_INSUFFICIENT_PERMISSIONS -> "Missing permissions"
        SpeechRecognizer.ERROR_NETWORK -> "Network error"
        SpeechRecognizer.ERROR_NETWORK_TIMEOUT -> "Network timeout"
        SpeechRecognizer.ERROR_NO_MATCH -> "No speech detected - try again"
        SpeechRecognizer.ERROR_RECOGNIZER_BUSY -> "Microphone busy - please wait and try again"
        SpeechRecognizer.ERROR_SERVER -> "Server error"
        SpeechRecognizer.ERROR_SPEECH_TIMEOUT -> "Speech timeout"
        ERROR_CODE_11 -> "Recognizer initialization error (try again in a moment)"
        else -> "Unknown error: $error"
    }

    /**
     * Start listening for voice input.
     *
     * Does nothing if a session is already active or a start is already scheduled. Otherwise
     * the previous recognizer is destroyed and a new session is started after
     * [RESTART_DELAY_MS]; failures are reported through [onError].
     */
    fun startListening() {
        if (isListening) {
            Log.w(TAG, "Already listening")
            return
        }
        if (pendingStart != null) {
            // A second call inside the delay window used to queue a second start, which
            // created two recognizers and leaked the first one.
            Log.w(TAG, "Start already pending")
            return
        }

        // Destroy previous SpeechRecognizer instance
        releaseRecognizer()

        // Add delay to ensure Android speech service has fully released resources.
        // This prevents error 11 (initialization error) caused by race condition.
        val start = Runnable {
            pendingStart = null
            beginRecognition()
        }
        pendingStart = start
        handler.postDelayed(start, RESTART_DELAY_MS)
    }

    /** Creates a new recognizer and starts a session; runs on the main looper via [handler]. */
    private fun beginRecognition() {
        if (!SpeechRecognizer.isRecognitionAvailable(context)) {
            Log.e(TAG, "Speech recognition not available on this device")
            onError("Speech recognition not available")
            return
        }

        // Create new SpeechRecognizer instance
        try {
            speechRecognizer = SpeechRecognizer.createSpeechRecognizer(context)
            speechRecognizer?.setRecognitionListener(createRecognitionListener())
        } catch (e: Exception) {
            Log.e(TAG, "Failed to create speech recognizer", e)
            onError("Failed to initialize: ${e.message}")
            return
        }

        val intent = buildRecognizerIntent()

        // Start listening
        try {
            speechRecognizer?.startListening(intent)
            Log.d(TAG, "Started listening")
        } catch (e: Exception) {
            Log.e(TAG, "Failed to start listening", e)
            isListening = false
            onListening(false)
            onError("Failed to start: ${e.message}")
        }
    }

    /** Builds the free-form recognition intent with extended silence timeouts. */
    private fun buildRecognizerIntent(): Intent =
        Intent(RecognizerIntent.ACTION_RECOGNIZE_SPEECH).apply {
            putExtra(RecognizerIntent.EXTRA_LANGUAGE_MODEL, RecognizerIntent.LANGUAGE_MODEL_FREE_FORM)
            // EXTRA_LANGUAGE is documented as an IETF BCP 47 tag string (e.g. "en-US");
            // a Locale object stored as a Serializable extra is not a valid value.
            putExtra(RecognizerIntent.EXTRA_LANGUAGE, Locale.getDefault().toLanguageTag())
            putExtra(RecognizerIntent.EXTRA_MAX_RESULTS, 1)
            putExtra(RecognizerIntent.EXTRA_PARTIAL_RESULTS, false)

            // Extend silence detection timeouts for longer pauses
            putExtra(
                RecognizerIntent.EXTRA_SPEECH_INPUT_COMPLETE_SILENCE_LENGTH_MILLIS,
                COMPLETE_SILENCE_MS
            )
            putExtra(
                RecognizerIntent.EXTRA_SPEECH_INPUT_POSSIBLY_COMPLETE_SILENCE_LENGTH_MILLIS,
                POSSIBLY_COMPLETE_SILENCE_MS
            )
            putExtra(RecognizerIntent.EXTRA_SPEECH_INPUT_MINIMUM_LENGTH_MILLIS, MINIMUM_SPEECH_MS)
        }

    /**
     * Stop listening.
     *
     * Also withdraws a start that is still waiting out the restart delay, so recognition
     * does not begin after the caller asked to stop.
     */
    fun stopListening() {
        cancelPendingStart()
        if (!isListening) return

        speechRecognizer?.stopListening()
        isListening = false
        onListening(false)
        Log.d(TAG, "Stopped listening")
    }

    /**
     * Cancel listening.
     *
     * Also withdraws a start that is still waiting out the restart delay.
     */
    fun cancel() {
        cancelPendingStart()
        if (!isListening) return

        speechRecognizer?.cancel()
        isListening = false
        onListening(false)
        Log.d(TAG, "Cancelled listening")
    }

    /**
     * Cleanup resources.
     *
     * Destroys the recognizer, drops any scheduled start and resets the listening flag so
     * that [isListening] is accurate and a later [startListening] is not refused. No
     * [onListening] callback is emitted from here.
     */
    fun destroy() {
        cancelPendingStart()
        handler.removeCallbacksAndMessages(null)
        releaseRecognizer()
        isListening = false
        Log.d(TAG, "Destroyed")
    }

    /**
     * Check if currently listening.
     */
    fun isListening(): Boolean = isListening

    /** Removes the not-yet-executed delayed start, if there is one. */
    private fun cancelPendingStart() {
        pendingStart?.let { handler.removeCallbacks(it) }
        pendingStart = null
    }

    /** Destroys the current recognizer (best effort) and always clears the reference. */
    private fun releaseRecognizer() {
        try {
            speechRecognizer?.destroy()
        } catch (e: Exception) {
            Log.w(TAG, "Error destroying previous recognizer", e)
        } finally {
            speechRecognizer = null
        }
    }

    private companion object {
        const val TAG = "VoiceInputManager"

        /** Pause between destroying the old recognizer and creating the new one. */
        const val RESTART_DELAY_MS = 150L

        const val COMPLETE_SILENCE_MS = 6500L
        const val POSSIBLY_COMPLETE_SILENCE_MS = 5000L
        const val MINIMUM_SPEECH_MS = 12000L

        /**
         * Raw error code 11. In the Android SDK this value is
         * `SpeechRecognizer.ERROR_SERVER_DISCONNECTED` (added in API 31); the literal is kept
         * so the file does not depend on that compile SDK level.
         */
        const val ERROR_CODE_11 = 11
    }
}
|