mirror of
https://fastgit.cc/github.com/openclaw/openclaw
synced 2026-05-01 06:36:23 +08:00
fix(talk): honor configured speech locale
This commit is contained in:
@@ -59,6 +59,8 @@ Docs: https://docs.openclaw.ai
|
||||
|
||||
### Fixes
|
||||
|
||||
- iOS/macOS Talk Mode: allow `talk.speechLocale` to set the speech
|
||||
recognition locale for non-English voice conversations. Fixes #44688.
|
||||
- Plugins/Voice Call: treat missing provider credentials as setup-incomplete
|
||||
during Gateway startup and log the missing keys as a warning instead of a
|
||||
runtime startup error, while keeping explicit command/tool errors when used. Thanks
|
||||
|
||||
@@ -21,6 +21,7 @@ struct SettingsTab: View {
|
||||
@AppStorage("node.instanceId") private var instanceId: String = UUID().uuidString
|
||||
@AppStorage("voiceWake.enabled") private var voiceWakeEnabled: Bool = false
|
||||
@AppStorage("talk.enabled") private var talkEnabled: Bool = false
|
||||
@AppStorage(TalkSpeechLocale.storageKey) private var talkSpeechLocale: String = TalkSpeechLocale.automaticID
|
||||
@AppStorage("talk.button.enabled") private var talkButtonEnabled: Bool = true
|
||||
@AppStorage("talk.background.enabled") private var talkBackgroundEnabled: Bool = false
|
||||
@AppStorage("camera.enabled") private var cameraEnabled: Bool = true
|
||||
@@ -278,6 +279,11 @@ struct SettingsTab: View {
|
||||
help: "Enables voice conversation mode with your connected OpenClaw agent.") { newValue in
|
||||
self.appModel.setTalkEnabled(newValue)
|
||||
}
|
||||
Picker("Speech Language", selection: self.$talkSpeechLocale) {
|
||||
ForEach(TalkSpeechLocale.supportedOptions()) { option in
|
||||
Text(option.label).tag(option.id)
|
||||
}
|
||||
}
|
||||
self.featureToggle(
|
||||
"Background Listening",
|
||||
isOn: self.$talkBackgroundEnabled,
|
||||
|
||||
@@ -12,6 +12,7 @@ struct TalkModeGatewayConfigState {
|
||||
let rawConfigApiKey: String?
|
||||
let interruptOnSpeech: Bool?
|
||||
let silenceTimeoutMs: Int
|
||||
let speechLocaleID: String?
|
||||
}
|
||||
|
||||
enum TalkModeGatewayConfigParser {
|
||||
@@ -53,6 +54,7 @@ enum TalkModeGatewayConfigParser {
|
||||
let silenceTimeoutMs = TalkConfigParsing.resolvedSilenceTimeoutMs(
|
||||
talk,
|
||||
fallback: defaultSilenceTimeoutMs)
|
||||
let speechLocaleID = TalkConfigParsing.resolvedSpeechLocaleID(talk)
|
||||
|
||||
return TalkModeGatewayConfigState(
|
||||
activeProvider: activeProvider,
|
||||
@@ -64,6 +66,7 @@ enum TalkModeGatewayConfigParser {
|
||||
defaultOutputFormat: defaultOutputFormat,
|
||||
rawConfigApiKey: rawConfigApiKey,
|
||||
interruptOnSpeech: interruptOnSpeech,
|
||||
silenceTimeoutMs: silenceTimeoutMs)
|
||||
silenceTimeoutMs: silenceTimeoutMs,
|
||||
speechLocaleID: speechLocaleID)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -87,6 +87,7 @@ final class TalkModeManager: NSObject {
|
||||
private var apiKey: String?
|
||||
private var voiceAliases: [String: String] = [:]
|
||||
private var interruptOnSpeech: Bool = true
|
||||
private var gatewaySpeechLocaleID: String?
|
||||
private var mainSessionKey: String = "main"
|
||||
private var fallbackVoiceId: String?
|
||||
private var lastPlaybackWasPCM: Bool = false
|
||||
@@ -500,12 +501,17 @@ final class TalkModeManager: NSObject {
|
||||
#endif
|
||||
|
||||
self.stopRecognition()
|
||||
self.speechRecognizer = SFSpeechRecognizer()
|
||||
let localSpeechLocale = UserDefaults.standard.string(forKey: TalkSpeechLocale.storageKey)
|
||||
let resolvedSpeech = TalkSpeechLocale.makeRecognizer(
|
||||
localSelection: localSpeechLocale,
|
||||
gatewaySelection: self.gatewaySpeechLocaleID)
|
||||
self.speechRecognizer = resolvedSpeech.recognizer
|
||||
guard let recognizer = self.speechRecognizer else {
|
||||
throw NSError(domain: "TalkMode", code: 1, userInfo: [
|
||||
NSLocalizedDescriptionKey: "Speech recognizer unavailable",
|
||||
])
|
||||
}
|
||||
GatewayDiagnostics.log("talk speech: locale=\(resolvedSpeech.localeID ?? "default")")
|
||||
|
||||
self.recognitionRequest = SFSpeechAudioBufferRecognitionRequest()
|
||||
self.recognitionRequest?.shouldReportPartialResults = true
|
||||
@@ -2027,6 +2033,7 @@ extension TalkModeManager {
|
||||
if let interrupt = parsed.interruptOnSpeech {
|
||||
self.interruptOnSpeech = interrupt
|
||||
}
|
||||
self.gatewaySpeechLocaleID = parsed.speechLocaleID
|
||||
self.silenceWindow = TimeInterval(parsed.silenceTimeoutMs) / 1000
|
||||
if parsed.normalizedPayload || parsed.defaultVoiceId != nil || parsed.rawConfigApiKey != nil {
|
||||
GatewayDiagnostics.log(
|
||||
@@ -2041,6 +2048,7 @@ extension TalkModeManager {
|
||||
self.gatewayTalkDefaultModelId = nil
|
||||
self.gatewayTalkApiKeyConfigured = false
|
||||
self.gatewayTalkConfigLoaded = false
|
||||
self.gatewaySpeechLocaleID = nil
|
||||
self.silenceWindow = TimeInterval(Self.defaultSilenceTimeoutMs) / 1000
|
||||
}
|
||||
}
|
||||
|
||||
100
apps/ios/Sources/Voice/TalkSpeechLocale.swift
Normal file
100
apps/ios/Sources/Voice/TalkSpeechLocale.swift
Normal file
@@ -0,0 +1,100 @@
|
||||
import Foundation
|
||||
import OpenClawKit
|
||||
import Speech
|
||||
|
||||
enum TalkSpeechLocale {
    /// UserDefaults key backing the user's Talk speech-locale selection.
    static let storageKey = "talk.speechLocale"
    /// Sentinel option id meaning "no explicit choice — defer to gateway/device".
    static let automaticID = "auto"
    /// Last-resort locale used when neither a selection nor the device locale is supported.
    static let fallbackLocaleID = "en-US"

    /// One entry of the "Speech Language" picker.
    struct Option: Identifiable {
        let id: String      // canonical BCP 47 id (dashes, not underscores)
        let label: String   // human-readable name, e.g. "Russian (Russia)"
    }

    /// Builds the picker options: "Automatic" first, then every locale the
    /// speech recognizer supports, de-duplicated by canonical id and sorted
    /// by display name.
    ///
    /// - Parameter supportedLocales: injectable for tests; defaults to the
    ///   locales `SFSpeechRecognizer` reports on this device.
    static func supportedOptions(
        supportedLocales: Set<Locale> = SFSpeechRecognizer.supportedLocales()
    ) -> [Option] {
        var seen = Set<String>()
        let dynamic: [Option] = supportedLocales
            .compactMap { locale in
                let id = self.canonicalID(locale.identifier)
                // Canonicalization can collapse distinct identifiers; keep the first.
                guard seen.insert(id).inserted else { return nil }
                return Option(id: id, label: self.friendlyName(for: locale))
            }
            .sorted { (lhs: Option, rhs: Option) in
                lhs.label.localizedCaseInsensitiveCompare(rhs.label) == .orderedAscending
            }
        return [Option(id: self.automaticID, label: "Automatic")] + dynamic
    }

    /// Resolves the recognition locale, in priority order: explicit local
    /// selection > gateway config > device locale > `fallbackLocaleID`.
    /// Returns nil when no candidate appears in `supportedLocaleIDs`.
    static func resolvedLocaleID(
        localSelection: String?,
        gatewaySelection: String?,
        deviceLocaleID: String = Locale.autoupdatingCurrent.identifier,
        fallbackLocaleID: String = Self.fallbackLocaleID,
        supportedLocaleIDs: Set<String>
    ) -> String? {
        TalkConfigParsing.resolvedSpeechRecognitionLocaleID(
            preferredLocaleIDs: [
                TalkConfigParsing.normalizedExplicitSpeechLocaleID(localSelection),
                TalkConfigParsing.normalizedExplicitSpeechLocaleID(gatewaySelection),
                deviceLocaleID,
            ],
            fallbackLocaleID: fallbackLocaleID,
            supportedLocaleIDs: supportedLocaleIDs)
    }

    /// Creates a speech recognizer for the resolved locale, falling back to
    /// the system-default recognizer when resolution fails or the framework
    /// rejects the locale. Returns the recognizer (nil when speech recognition
    /// is unavailable) plus the locale id actually in effect.
    static func makeRecognizer(
        localSelection: String?,
        gatewaySelection: String?,
        supportedLocales: Set<Locale> = SFSpeechRecognizer.supportedLocales()
    ) -> (recognizer: SFSpeechRecognizer?, localeID: String?) {
        let supportedIDs = Set(supportedLocales.map(\.identifier))
        if let localeID = self.resolvedLocaleID(
            localSelection: localSelection,
            gatewaySelection: gatewaySelection,
            supportedLocaleIDs: supportedIDs),
            let recognizer = SFSpeechRecognizer(locale: Locale(identifier: localeID))
        {
            return (recognizer, localeID)
        }
        // No usable explicit locale: let the framework pick the device default.
        let recognizer = SFSpeechRecognizer()
        return (recognizer, recognizer?.locale.identifier)
    }

    /// Normalizes a stored selection, mapping the "auto" sentinel to nil so it
    /// never acts as an explicit locale choice.
    static func normalizedExplicitLocaleID(_ raw: String?) -> String? {
        TalkConfigParsing.normalizedExplicitSpeechLocaleID(raw, automaticID: self.automaticID)
    }

    /// Canonicalizes a locale identifier to BCP 47 dashes ("ru_RU" -> "ru-RU").
    private static func canonicalID(_ raw: String) -> String {
        raw.replacingOccurrences(of: "_", with: "-")
    }

    /// Human-readable "Language (Region)" label for a locale, degrading to the
    /// bare language name, then to the raw identifier.
    private static func friendlyName(for locale: Locale) -> String {
        let id = self.canonicalID(locale.identifier)
        let cleanLocale = Locale(identifier: id)
        if let langCode = cleanLocale.language.languageCode?.identifier,
           let lang = cleanLocale.localizedString(forLanguageCode: langCode),
           let regionCode = cleanLocale.region?.identifier,
           let region = cleanLocale.localizedString(forRegionCode: regionCode)
        {
            return "\(lang) (\(region))"
        }
        if let langCode = cleanLocale.language.languageCode?.identifier,
           let lang = cleanLocale.localizedString(forLanguageCode: langCode)
        {
            return lang
        }
        return cleanLocale.localizedString(forIdentifier: id) ?? id
    }
}
|
||||
@@ -47,6 +47,16 @@ private let iOSSilenceTimeoutMs = 900
|
||||
fallback: iOSSilenceTimeoutMs) == 1500)
|
||||
}
|
||||
|
||||
@Test func readsConfiguredSpeechLocale() {
|
||||
let talk: [String: Any] = [
|
||||
"speechLocale": " ru-RU ",
|
||||
]
|
||||
|
||||
#expect(
|
||||
TalkConfigParsing.resolvedSpeechLocaleID(
|
||||
TalkConfigParsing.bridgeFoundationDictionary(talk)) == "ru-RU")
|
||||
}
|
||||
|
||||
@Test func defaultsSilenceTimeoutMsWhenMissing() {
|
||||
#expect(TalkConfigParsing.resolvedSilenceTimeoutMs(nil, fallback: iOSSilenceTimeoutMs) == iOSSilenceTimeoutMs)
|
||||
}
|
||||
|
||||
41
apps/ios/Tests/TalkSpeechLocaleTests.swift
Normal file
41
apps/ios/Tests/TalkSpeechLocaleTests.swift
Normal file
@@ -0,0 +1,41 @@
|
||||
import Foundation
|
||||
import Testing
|
||||
@testable import OpenClaw
|
||||
|
||||
/// Covers the locale-resolution priority chain used by Talk-mode speech
/// recognition: local user choice > gateway config > device locale > en-US.
@Suite struct TalkSpeechLocaleTests {
    /// An explicit on-device selection must win over the gateway's setting.
    @Test func localSelectionOverridesGatewayConfig() {
        let resolved = TalkSpeechLocale.resolvedLocaleID(
            localSelection: "de-DE",
            gatewaySelection: "ru-RU",
            deviceLocaleID: "en-US",
            supportedLocaleIDs: ["de-DE", "ru-RU", "en-US"])

        #expect(resolved == "de-DE")
    }

    /// The "auto" local sentinel defers to the gateway value, which is also
    /// normalized from underscore form ("ru_RU" -> "ru-RU").
    @Test func automaticLocalSelectionAllowsGatewayConfig() {
        let resolved = TalkSpeechLocale.resolvedLocaleID(
            localSelection: TalkSpeechLocale.automaticID,
            gatewaySelection: "ru_RU",
            deviceLocaleID: "en-US",
            supportedLocaleIDs: ["ru-RU", "en-US"])

        #expect(resolved == "ru-RU")
    }

    /// An unsupported configured locale falls through to the device locale,
    /// then to the en-US fallback when the device locale is unsupported too.
    @Test func unsupportedConfiguredLocaleFallsBackToDeviceThenEnglish() {
        let viaDevice = TalkSpeechLocale.resolvedLocaleID(
            localSelection: "zz-ZZ",
            gatewaySelection: nil,
            deviceLocaleID: "fr-FR",
            supportedLocaleIDs: ["fr-FR", "en-US"])
        let viaFallback = TalkSpeechLocale.resolvedLocaleID(
            localSelection: "zz-ZZ",
            gatewaySelection: nil,
            deviceLocaleID: "yy-YY",
            supportedLocaleIDs: ["en-US"])

        #expect(viaDevice == "fr-FR")
        #expect(viaFallback == "en-US")
    }
}
|
||||
@@ -11,6 +11,7 @@ struct TalkModeGatewayConfigState {
|
||||
let outputFormat: String?
|
||||
let interruptOnSpeech: Bool
|
||||
let silenceTimeoutMs: Int
|
||||
let speechLocaleID: String?
|
||||
let apiKey: String?
|
||||
let seamColorHex: String?
|
||||
}
|
||||
@@ -53,6 +54,7 @@ enum TalkModeGatewayConfigParser {
|
||||
}
|
||||
let outputFormat = activeConfig?["outputFormat"]?.stringValue
|
||||
let interrupt = talk?["interruptOnSpeech"]?.boolValue
|
||||
let speechLocaleID = TalkConfigParsing.resolvedSpeechLocaleID(talk)
|
||||
let apiKey = activeConfig?["apiKey"]?.stringValue
|
||||
let resolvedVoice: String? = if activeProvider == defaultProvider {
|
||||
(voice?.trimmingCharacters(in: .whitespacesAndNewlines).isEmpty == false ? voice : nil) ??
|
||||
@@ -78,6 +80,7 @@ enum TalkModeGatewayConfigParser {
|
||||
outputFormat: outputFormat,
|
||||
interruptOnSpeech: interrupt ?? true,
|
||||
silenceTimeoutMs: silenceTimeoutMs,
|
||||
speechLocaleID: speechLocaleID,
|
||||
apiKey: resolvedApiKey,
|
||||
seamColorHex: rawSeam.isEmpty ? nil : rawSeam)
|
||||
}
|
||||
@@ -104,6 +107,7 @@ enum TalkModeGatewayConfigParser {
|
||||
outputFormat: nil,
|
||||
interruptOnSpeech: true,
|
||||
silenceTimeoutMs: defaultSilenceTimeoutMs,
|
||||
speechLocaleID: nil,
|
||||
apiKey: resolvedApiKey,
|
||||
seamColorHex: nil)
|
||||
}
|
||||
|
||||
@@ -70,6 +70,7 @@ actor TalkModeRuntime {
|
||||
private var defaultOutputFormat: String?
|
||||
private var interruptOnSpeech: Bool = true
|
||||
private var activeTalkProvider = TalkModeRuntime.defaultTalkProvider
|
||||
private var speechLocaleID: String?
|
||||
private var lastInterruptedAtSeconds: Double?
|
||||
private var voiceAliases: [String: String] = [:]
|
||||
private var lastSpokenText: String?
|
||||
@@ -186,12 +187,23 @@ actor TalkModeRuntime {
|
||||
self.recognitionGeneration &+= 1
|
||||
let generation = self.recognitionGeneration
|
||||
|
||||
let locale = await MainActor.run { AppStateStore.shared.voiceWakeLocaleID }
|
||||
self.recognizer = SFSpeechRecognizer(locale: Locale(identifier: locale))
|
||||
let voiceWakeLocale = await MainActor.run { AppStateStore.shared.voiceWakeLocaleID }
|
||||
let supportedLocaleIDs = Set(SFSpeechRecognizer.supportedLocales().map(\.identifier))
|
||||
let localeID = TalkConfigParsing.resolvedSpeechRecognitionLocaleID(
|
||||
preferredLocaleIDs: [
|
||||
self.speechLocaleID,
|
||||
voiceWakeLocale,
|
||||
Locale.autoupdatingCurrent.identifier,
|
||||
],
|
||||
supportedLocaleIDs: supportedLocaleIDs)
|
||||
self.recognizer = localeID
|
||||
.map { SFSpeechRecognizer(locale: Locale(identifier: $0)) }
|
||||
?? SFSpeechRecognizer()
|
||||
guard let recognizer, recognizer.isAvailable else {
|
||||
self.logger.error("talk recognizer unavailable")
|
||||
return
|
||||
}
|
||||
self.logger.debug("talk recognizer locale=\(recognizer.locale.identifier, privacy: .public)")
|
||||
|
||||
let request = SFSpeechAudioBufferRecognitionRequest()
|
||||
Self.configureRecognitionRequest(request)
|
||||
@@ -1010,6 +1022,7 @@ extension TalkModeRuntime {
|
||||
self.interruptOnSpeech = cfg.interruptOnSpeech
|
||||
self.activeTalkProvider = cfg.activeProvider
|
||||
self.silenceWindow = TimeInterval(cfg.silenceTimeoutMs) / 1000
|
||||
self.speechLocaleID = cfg.speechLocaleID
|
||||
self.apiKey = cfg.apiKey
|
||||
let hasApiKey = (cfg.apiKey?.isEmpty == false)
|
||||
let voiceLabel = (cfg.voiceId?.isEmpty == false) ? cfg.voiceId! : "none"
|
||||
@@ -1021,7 +1034,8 @@ extension TalkModeRuntime {
|
||||
"modelId=\(modelLabel, privacy: .public) " +
|
||||
"apiKey=\(hasApiKey, privacy: .public) " +
|
||||
"interrupt=\(cfg.interruptOnSpeech, privacy: .public) " +
|
||||
"silenceTimeoutMs=\(cfg.silenceTimeoutMs, privacy: .public)")
|
||||
"silenceTimeoutMs=\(cfg.silenceTimeoutMs, privacy: .public) " +
|
||||
"speechLocale=\(cfg.speechLocaleID ?? "device", privacy: .public)")
|
||||
}
|
||||
|
||||
static func selectTalkProviderConfig(
|
||||
|
||||
@@ -25,10 +25,10 @@ struct TalkModeGatewayConfigTests {
|
||||
"voiceId": "unused-voice",
|
||||
],
|
||||
],
|
||||
"speechLocale": "ru-RU",
|
||||
]),
|
||||
],
|
||||
issues: nil
|
||||
)
|
||||
issues: nil)
|
||||
|
||||
let parsed = TalkModeGatewayConfigParser.parse(
|
||||
snapshot: snapshot,
|
||||
@@ -37,12 +37,12 @@ struct TalkModeGatewayConfigTests {
|
||||
defaultSilenceTimeoutMs: TalkDefaults.silenceTimeoutMs,
|
||||
envVoice: "env-voice",
|
||||
sagVoice: "sag-voice",
|
||||
envApiKey: "env-key"
|
||||
)
|
||||
envApiKey: "env-key")
|
||||
|
||||
#expect(parsed.activeProvider == "mlx")
|
||||
#expect(parsed.modelId == nil)
|
||||
#expect(parsed.apiKey == nil)
|
||||
#expect(parsed.voiceId == "unused-voice")
|
||||
#expect(parsed.speechLocaleID == "ru-RU")
|
||||
}
|
||||
}
|
||||
|
||||
@@ -56,6 +56,46 @@ public enum TalkConfigParsing {
|
||||
self.resolvedPositiveInt(talk?["silenceTimeoutMs"], fallback: fallback)
|
||||
}
|
||||
|
||||
public static func normalizedSpeechLocaleID(_ value: String?) -> String? {
|
||||
let trimmed = (value ?? "").trimmingCharacters(in: .whitespacesAndNewlines)
|
||||
return trimmed.isEmpty ? nil : trimmed.replacingOccurrences(of: "_", with: "-")
|
||||
}
|
||||
|
||||
public static func resolvedSpeechLocaleID(
|
||||
_ talk: [String: AnyCodable]?,
|
||||
fallback: String? = nil
|
||||
) -> String? {
|
||||
self.normalizedSpeechLocaleID(talk?["speechLocale"]?.stringValue)
|
||||
?? self.normalizedSpeechLocaleID(fallback)
|
||||
}
|
||||
|
||||
public static func normalizedExplicitSpeechLocaleID(
|
||||
_ value: String?,
|
||||
automaticID: String = "auto"
|
||||
) -> String? {
|
||||
let normalized = self.normalizedSpeechLocaleID(value)
|
||||
return normalized == automaticID ? nil : normalized
|
||||
}
|
||||
|
||||
public static func resolvedSpeechRecognitionLocaleID(
|
||||
preferredLocaleIDs: [String?],
|
||||
fallbackLocaleID: String = "en-US",
|
||||
supportedLocaleIDs: Set<String>
|
||||
) -> String? {
|
||||
let supported = Set(supportedLocaleIDs.compactMap(self.normalizedSpeechLocaleID))
|
||||
var seen = Set<String>()
|
||||
let candidates = (preferredLocaleIDs + [fallbackLocaleID])
|
||||
.compactMap(self.normalizedSpeechLocaleID)
|
||||
|
||||
for candidate in candidates {
|
||||
guard seen.insert(candidate).inserted else { continue }
|
||||
if supported.isEmpty || supported.contains(candidate) {
|
||||
return candidate
|
||||
}
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
private static func normalizedTalkProviderID(_ raw: String?) -> String? {
|
||||
let trimmed = (raw ?? "").trimmingCharacters(in: .whitespacesAndNewlines).lowercased()
|
||||
return trimmed.isEmpty ? nil : trimmed
|
||||
|
||||
@@ -116,4 +116,21 @@ struct TalkConfigParsingTests {
|
||||
#expect(TalkConfigParsing.resolvedPositiveInt(AnyCodable(true), fallback: 700) == 700)
|
||||
#expect(TalkConfigParsing.resolvedPositiveInt(AnyCodable("1500"), fallback: 700) == 700)
|
||||
}
|
||||
|
||||
@Test func resolvesSpeechLocaleID() {
|
||||
#expect(TalkConfigParsing.resolvedSpeechLocaleID(["speechLocale": AnyCodable(" ru_RU ")]) == "ru-RU")
|
||||
#expect(TalkConfigParsing.resolvedSpeechLocaleID(["speechLocale": AnyCodable("")], fallback: "en-US") == "en-US")
|
||||
}
|
||||
|
||||
@Test func resolvesSpeechRecognitionLocaleFromSupportedFallbacks() {
|
||||
let locale = TalkConfigParsing.resolvedSpeechRecognitionLocaleID(
|
||||
preferredLocaleIDs: ["zz-ZZ", "fr-FR"],
|
||||
supportedLocaleIDs: ["fr-FR", "en-US"])
|
||||
let fallback = TalkConfigParsing.resolvedSpeechRecognitionLocaleID(
|
||||
preferredLocaleIDs: ["zz-ZZ", "yy-YY"],
|
||||
supportedLocaleIDs: ["en-US"])
|
||||
|
||||
#expect(locale == "fr-FR")
|
||||
#expect(fallback == "en-US")
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
6ed33ef102e7c92816243bfabc3626222a679c3270c12ec5ea47b28b66204b3b config-baseline.json
|
||||
f86cb4d57ec1f5fd75008be0ab86151194945eb013a47ab4bdeaddafd3780da7 config-baseline.core.json
|
||||
9ac3d271f9bfa9611557f0b52e4d0a600693bdd1de75cc1bafc320fc4d4f0075 config-baseline.json
|
||||
271fdf1d6652927e0fc160a6f25276bf6dccb8f1b27fab15e0fc2620e8cacab4 config-baseline.core.json
|
||||
7cd9c908f066c143eab2a201efbc9640f483ab28bba92ddeca1d18cc2b528bc3 config-baseline.channel.json
|
||||
7825b56a5b3fcdbe2e09ef8fe5d9f12ac3598435afebe20413051e45b0d1968e config-baseline.plugin.json
|
||||
|
||||
@@ -1332,6 +1332,7 @@ Defaults for Talk mode (macOS/iOS/Android).
|
||||
},
|
||||
system: {},
|
||||
},
|
||||
speechLocale: "ru-RU",
|
||||
silenceTimeoutMs: 1500,
|
||||
interruptOnSpeech: true,
|
||||
},
|
||||
@@ -1346,6 +1347,7 @@ Defaults for Talk mode (macOS/iOS/Android).
|
||||
- `providers.*.voiceAliases` lets Talk directives use friendly names.
|
||||
- `providers.mlx.modelId` selects the Hugging Face repo used by the macOS local MLX helper. If omitted, macOS uses `mlx-community/Soprano-80M-bf16`.
|
||||
- macOS MLX playback runs through the bundled `openclaw-mlx-tts` helper when present, or an executable on `PATH`; `OPENCLAW_MLX_TTS_BIN` overrides the helper path for development.
|
||||
- `speechLocale` sets the BCP 47 locale id used by iOS/macOS Talk speech recognition. Leave unset to use the device default.
|
||||
- `silenceTimeoutMs` controls how long Talk mode waits after user silence before it sends the transcript. Unset keeps the platform default pause window (`700 ms on macOS and Android, 900 ms on iOS`).
|
||||
|
||||
---
|
||||
|
||||
@@ -43,6 +43,7 @@ Moved to a dedicated page — see
|
||||
- `session.*` (session lifecycle, compaction, pruning)
|
||||
- `messages.*` (message delivery, TTS, markdown rendering)
|
||||
- `talk.*` (Talk mode)
|
||||
- `talk.speechLocale`: optional BCP 47 locale id for Talk speech recognition on iOS/macOS
|
||||
- `talk.silenceTimeoutMs`: when unset, Talk keeps the platform default pause window before sending the transcript (`700 ms on macOS and Android, 900 ms on iOS`)
|
||||
|
||||
## Tools and custom providers
|
||||
|
||||
@@ -63,6 +63,7 @@ Supported keys:
|
||||
},
|
||||
system: {},
|
||||
},
|
||||
speechLocale: "ru-RU",
|
||||
silenceTimeoutMs: 1500,
|
||||
interruptOnSpeech: true,
|
||||
},
|
||||
@@ -78,6 +79,7 @@ Defaults:
|
||||
- `providers.elevenlabs.modelId`: defaults to `eleven_v3` when unset.
|
||||
- `providers.mlx.modelId`: defaults to `mlx-community/Soprano-80M-bf16` when unset.
|
||||
- `providers.elevenlabs.apiKey`: falls back to `ELEVENLABS_API_KEY` (or gateway shell profile if available).
|
||||
- `speechLocale`: optional BCP 47 locale id for on-device Talk speech recognition on iOS/macOS. Leave unset to use the device default.
|
||||
- `outputFormat`: defaults to `pcm_44100` on macOS/iOS and `pcm_24000` on Android (set `mp3_*` to force MP3 streaming)
|
||||
|
||||
## macOS UI
|
||||
|
||||
@@ -21075,6 +21075,12 @@ export const GENERATED_BASE_CONFIG_SCHEMA: BaseConfigSchemaResponse = {
|
||||
description:
|
||||
"Provider-specific Talk settings keyed by provider id. During migration, prefer this over legacy talk.* keys.",
|
||||
},
|
||||
speechLocale: {
|
||||
type: "string",
|
||||
title: "Talk Speech Locale",
|
||||
description:
|
||||
'BCP 47 locale id for Talk speech recognition on device nodes, for example "ru-RU". Leave unset to use each device default.',
|
||||
},
|
||||
interruptOnSpeech: {
|
||||
type: "boolean",
|
||||
title: "Talk Interrupt on Speech",
|
||||
@@ -27273,6 +27279,11 @@ export const GENERATED_BASE_CONFIG_SCHEMA: BaseConfigSchemaResponse = {
|
||||
help: "Enables automatic live-reload behavior for canvas assets during development workflows. Keep disabled in production-like environments where deterministic output is preferred.",
|
||||
tags: ["reliability"],
|
||||
},
|
||||
"talk.speechLocale": {
|
||||
label: "Talk Speech Locale",
|
||||
help: 'BCP 47 locale id for Talk speech recognition on device nodes, for example "ru-RU". Leave unset to use each device default.',
|
||||
tags: ["media"],
|
||||
},
|
||||
"talk.interruptOnSpeech": {
|
||||
label: "Talk Interrupt on Speech",
|
||||
help: "If true (default), stop assistant speech when the user starts speaking in Talk mode. Keep enabled for conversational turn-taking.",
|
||||
|
||||
@@ -148,6 +148,8 @@ export const FIELD_HELP: Record<string, string> = {
|
||||
"Provider-specific Talk settings keyed by provider id. During migration, prefer this over legacy talk.* keys.",
|
||||
"talk.providers.*": "Provider-owned Talk config fields for the matching provider id.",
|
||||
"talk.providers.*.apiKey": "Provider API key for Talk mode.", // pragma: allowlist secret
|
||||
"talk.speechLocale":
|
||||
'BCP 47 locale id for Talk speech recognition on device nodes, for example "ru-RU". Leave unset to use each device default.',
|
||||
"talk.interruptOnSpeech":
|
||||
"If true (default), stop assistant speech when the user starts speaking in Talk mode. Keep enabled for conversational turn-taking.",
|
||||
"talk.silenceTimeoutMs": `Milliseconds of user silence before Talk mode finalizes and sends the current transcript. Leave unset to keep the platform default pause window (${describeTalkSilenceTimeoutDefaults()}).`,
|
||||
|
||||
@@ -786,6 +786,7 @@ export const FIELD_LABELS: Record<string, string> = {
|
||||
"canvasHost.port": "Canvas Host Port",
|
||||
"canvasHost.liveReload": "Canvas Host Live Reload",
|
||||
talk: "Talk",
|
||||
"talk.speechLocale": "Talk Speech Locale",
|
||||
"talk.interruptOnSpeech": "Talk Interrupt on Speech",
|
||||
"talk.silenceTimeoutMs": "Talk Silence Timeout (ms)",
|
||||
messages: "Messages",
|
||||
|
||||
@@ -10,11 +10,13 @@ describe("talk normalization", () => {
|
||||
modelId: "eleven_v3",
|
||||
outputFormat: "pcm_44100",
|
||||
apiKey: "secret-key", // pragma: allowlist secret
|
||||
speechLocale: " ru-RU ",
|
||||
interruptOnSpeech: false,
|
||||
silenceTimeoutMs: 1500,
|
||||
} as unknown as never);
|
||||
|
||||
expect(normalized).toEqual({
|
||||
speechLocale: "ru-RU",
|
||||
interruptOnSpeech: false,
|
||||
silenceTimeoutMs: 1500,
|
||||
});
|
||||
@@ -77,6 +79,7 @@ describe("talk normalization", () => {
|
||||
modelId: "acme-model",
|
||||
},
|
||||
},
|
||||
speechLocale: "ru-RU",
|
||||
interruptOnSpeech: true,
|
||||
});
|
||||
|
||||
@@ -95,6 +98,7 @@ describe("talk normalization", () => {
|
||||
modelId: "acme-model",
|
||||
},
|
||||
},
|
||||
speechLocale: "ru-RU",
|
||||
interruptOnSpeech: true,
|
||||
});
|
||||
});
|
||||
|
||||
@@ -105,6 +105,10 @@ export function normalizeTalkSection(value: TalkConfig | undefined): TalkConfig
|
||||
|
||||
const source = value as Record<string, unknown>;
|
||||
const normalized: TalkConfig = {};
|
||||
const speechLocale = normalizeOptionalString(source.speechLocale);
|
||||
if (speechLocale) {
|
||||
normalized.speechLocale = speechLocale;
|
||||
}
|
||||
if (typeof source.interruptOnSpeech === "boolean") {
|
||||
normalized.interruptOnSpeech = source.interruptOnSpeech;
|
||||
}
|
||||
@@ -172,6 +176,9 @@ export function buildTalkConfigResponse(value: unknown): TalkConfigResponse | un
|
||||
if (typeof normalized?.silenceTimeoutMs === "number") {
|
||||
payload.silenceTimeoutMs = normalized.silenceTimeoutMs;
|
||||
}
|
||||
if (typeof normalized?.speechLocale === "string") {
|
||||
payload.speechLocale = normalized.speechLocale;
|
||||
}
|
||||
if (normalized?.providers && Object.keys(normalized.providers).length > 0) {
|
||||
payload.providers = normalized.providers;
|
||||
}
|
||||
|
||||
@@ -67,6 +67,8 @@ export type TalkConfig = {
|
||||
provider?: string;
|
||||
/** Provider-specific Talk config keyed by provider id. */
|
||||
providers?: Record<string, TalkProviderConfig>;
|
||||
/** BCP 47 locale id used for Talk speech recognition on device nodes. */
|
||||
speechLocale?: string;
|
||||
/** Stop speaking when user starts talking (default: true). */
|
||||
interruptOnSpeech?: boolean;
|
||||
/** Milliseconds of user silence before Talk mode sends the transcript after a pause. */
|
||||
|
||||
@@ -186,6 +186,7 @@ const TalkSchema = z
|
||||
.object({
|
||||
provider: z.string().optional(),
|
||||
providers: z.record(z.string(), TalkProviderEntrySchema).optional(),
|
||||
speechLocale: z.string().optional(),
|
||||
interruptOnSpeech: z.boolean().optional(),
|
||||
silenceTimeoutMs: z.number().int().positive().optional(),
|
||||
})
|
||||
|
||||
@@ -78,6 +78,7 @@ const TalkConfigSchema = Type.Object(
|
||||
provider: Type.Optional(Type.String()),
|
||||
providers: Type.Optional(Type.Record(Type.String(), TalkProviderConfigSchema)),
|
||||
resolved: ResolvedTalkConfigSchema,
|
||||
speechLocale: Type.Optional(Type.String()),
|
||||
interruptOnSpeech: Type.Optional(Type.Boolean()),
|
||||
silenceTimeoutMs: Type.Optional(Type.Integer({ minimum: 1 })),
|
||||
},
|
||||
|
||||
@@ -34,6 +34,7 @@ type TalkConfigPayload = {
|
||||
provider?: string;
|
||||
config?: { voiceId?: string; apiKey?: string | SecretRef };
|
||||
};
|
||||
speechLocale?: string;
|
||||
silenceTimeoutMs?: number;
|
||||
};
|
||||
session?: { mainKey?: string };
|
||||
@@ -144,6 +145,7 @@ function expectTalkConfig(
|
||||
apiKey?: string | SecretRef;
|
||||
providerApiKey?: string | SecretRef;
|
||||
resolvedApiKey?: string | SecretRef;
|
||||
speechLocale?: string;
|
||||
silenceTimeoutMs?: number;
|
||||
},
|
||||
) {
|
||||
@@ -162,6 +164,9 @@ function expectTalkConfig(
|
||||
if ("resolvedApiKey" in expected) {
|
||||
expect(talk?.resolved?.config?.apiKey).toEqual(expected.resolvedApiKey);
|
||||
}
|
||||
if ("speechLocale" in expected) {
|
||||
expect(talk?.speechLocale).toBe(expected.speechLocale);
|
||||
}
|
||||
if ("silenceTimeoutMs" in expected) {
|
||||
expect(talk?.silenceTimeoutMs).toBe(expected.silenceTimeoutMs);
|
||||
}
|
||||
@@ -179,6 +184,7 @@ describe("gateway talk.config", () => {
|
||||
apiKey: "secret-key-abc", // pragma: allowlist secret
|
||||
},
|
||||
},
|
||||
speechLocale: "ru-RU",
|
||||
silenceTimeoutMs: 1500,
|
||||
},
|
||||
session: {
|
||||
@@ -196,6 +202,7 @@ describe("gateway talk.config", () => {
|
||||
provider: GENERIC_TALK_PROVIDER_ID,
|
||||
voiceId: "voice-123",
|
||||
apiKey: "__OPENCLAW_REDACTED__",
|
||||
speechLocale: "ru-RU",
|
||||
silenceTimeoutMs: 1500,
|
||||
});
|
||||
expect(res.payload?.config?.session?.mainKey).toBe("main-test");
|
||||
|
||||
Reference in New Issue
Block a user