From 02f3e9cfa25f26826e8dfd79d4bf174f2f9d8f54 Mon Sep 17 00:00:00 2001
From: Peter Steinberger
Date: Sat, 25 Apr 2026 21:05:15 +0100
Subject: [PATCH] fix(talk): honor configured speech locale

---
 CHANGELOG.md                                  |   2 +
 apps/ios/Sources/Settings/SettingsTab.swift   |   6 ++
 .../Sources/Voice/TalkModeGatewayConfig.swift |   5 +-
 apps/ios/Sources/Voice/TalkModeManager.swift  |  10 +-
 apps/ios/Sources/Voice/TalkSpeechLocale.swift | 100 ++++++++++++++++++
 .../Tests/Logic/TalkConfigParsingTests.swift  |  10 ++
 apps/ios/Tests/TalkSpeechLocaleTests.swift    |  41 +++++++
 .../OpenClaw/TalkModeGatewayConfig.swift      |   4 +
 .../Sources/OpenClaw/TalkModeRuntime.swift    |  20 +++-
 .../TalkModeGatewayConfigTests.swift          |   8 +-
 .../OpenClawKit/TalkConfigParsing.swift       |  40 +++++++
 .../TalkConfigParsingTests.swift              |  17 +++
 docs/.generated/config-baseline.sha256        |   4 +-
 docs/gateway/config-agents.md                 |   2 +
 docs/gateway/configuration-reference.md       |   1 +
 docs/nodes/talk.md                            |   2 +
 src/config/schema.base.generated.ts           |  11 ++
 src/config/schema.help.ts                     |   2 +
 src/config/schema.labels.ts                   |   1 +
 src/config/talk.normalize.test.ts             |   4 +
 src/config/talk.ts                            |   7 ++
 src/config/types.gateway.ts                   |   2 +
 src/config/zod-schema.ts                      |   1 +
 src/gateway/protocol/schema/channels.ts       |   1 +
 src/gateway/server.talk-config.test.ts        |   7 ++
 25 files changed, 297 insertions(+), 11 deletions(-)
 create mode 100644 apps/ios/Sources/Voice/TalkSpeechLocale.swift
 create mode 100644 apps/ios/Tests/TalkSpeechLocaleTests.swift

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 1aff0c720cd..ee7aefad616 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -59,6 +59,8 @@ Docs: https://docs.openclaw.ai

 ### Fixes

+- iOS/macOS Talk Mode: allow `talk.speechLocale` to set the speech
+  recognition locale for non-English voice conversations. Fixes #44688.
 - Plugins/Voice Call: treat missing provider credentials as setup-incomplete
   during Gateway startup and log the missing keys as a warning instead of a
   runtime startup error, while keeping explicit command/tool errors when used.
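Note: the heart of this patch is one shared resolution order for the recognition locale, implemented in `TalkConfigParsing.resolvedSpeechRecognitionLocaleID` and used by both apps: explicit selections first, then the device locale, then `en-US`, each checked against the locales the OS recognizer supports. A minimal standalone sketch of that order (simplified; the real implementation appears in the OpenClawKit hunk further down):

```swift
import Foundation

// Simplified sketch of the resolution order this patch introduces. Candidate
// IDs are normalized ("ru_RU" -> "ru-RU") before being checked against the
// locales SFSpeechRecognizer actually supports on the device.
func resolveLocale(candidates: [String?], supported: Set<String>) -> String? {
    let normalized = (candidates + ["en-US"]).compactMap { raw -> String? in
        guard let raw else { return nil }
        let trimmed = raw.trimmingCharacters(in: .whitespacesAndNewlines)
        return trimmed.isEmpty ? nil : trimmed.replacingOccurrences(of: "_", with: "-")
    }
    // First supported candidate wins; nil means "use the system default".
    return normalized.first { supported.isEmpty || supported.contains($0) }
}
```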
diff --git a/apps/ios/Sources/Settings/SettingsTab.swift b/apps/ios/Sources/Settings/SettingsTab.swift
index 2d2d936f315..f1a3f7bb0d9 100644
--- a/apps/ios/Sources/Settings/SettingsTab.swift
+++ b/apps/ios/Sources/Settings/SettingsTab.swift
@@ -21,6 +21,7 @@ struct SettingsTab: View {
   @AppStorage("node.instanceId") private var instanceId: String = UUID().uuidString
   @AppStorage("voiceWake.enabled") private var voiceWakeEnabled: Bool = false
   @AppStorage("talk.enabled") private var talkEnabled: Bool = false
+  @AppStorage(TalkSpeechLocale.storageKey) private var talkSpeechLocale: String = TalkSpeechLocale.automaticID
   @AppStorage("talk.button.enabled") private var talkButtonEnabled: Bool = true
   @AppStorage("talk.background.enabled") private var talkBackgroundEnabled: Bool = false
   @AppStorage("camera.enabled") private var cameraEnabled: Bool = true
@@ -278,6 +279,11 @@ struct SettingsTab: View {
         help: "Enables voice conversation mode with your connected OpenClaw agent.") { newValue in
           self.appModel.setTalkEnabled(newValue)
         }
+      Picker("Speech Language", selection: self.$talkSpeechLocale) {
+        ForEach(TalkSpeechLocale.supportedOptions()) { option in
+          Text(option.label).tag(option.id)
+        }
+      }
       self.featureToggle(
         "Background Listening",
         isOn: self.$talkBackgroundEnabled,
diff --git a/apps/ios/Sources/Voice/TalkModeGatewayConfig.swift b/apps/ios/Sources/Voice/TalkModeGatewayConfig.swift
index 7215bc7d1af..b8054165121 100644
--- a/apps/ios/Sources/Voice/TalkModeGatewayConfig.swift
+++ b/apps/ios/Sources/Voice/TalkModeGatewayConfig.swift
@@ -12,6 +12,7 @@ struct TalkModeGatewayConfigState {
   let rawConfigApiKey: String?
   let interruptOnSpeech: Bool?
   let silenceTimeoutMs: Int
+  let speechLocaleID: String?
 }

 enum TalkModeGatewayConfigParser {
@@ -53,6 +54,7 @@ enum TalkModeGatewayConfigParser {
     let silenceTimeoutMs = TalkConfigParsing.resolvedSilenceTimeoutMs(
       talk,
       fallback: defaultSilenceTimeoutMs)
+    let speechLocaleID = TalkConfigParsing.resolvedSpeechLocaleID(talk)

     return TalkModeGatewayConfigState(
       activeProvider: activeProvider,
@@ -64,6 +66,7 @@ enum TalkModeGatewayConfigParser {
       defaultOutputFormat: defaultOutputFormat,
       rawConfigApiKey: rawConfigApiKey,
       interruptOnSpeech: interruptOnSpeech,
-      silenceTimeoutMs: silenceTimeoutMs)
+      silenceTimeoutMs: silenceTimeoutMs,
+      speechLocaleID: speechLocaleID)
   }
 }
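Note: the parser above hands `speechLocaleID` to `TalkModeManager` (next hunk). The value comes straight from the gateway's `talk` payload, and the shared helper trims whitespace and converts underscores to hyphens. A hypothetical payload as the parser sees it (values illustrative only):

```swift
// Hypothetical gateway `talk` payload; resolvedSpeechLocaleID yields "ru-RU".
let talk: [String: Any] = [
    "speechLocale": " ru_RU ",   // trimmed and normalized to "ru-RU"
    "interruptOnSpeech": true,
    "silenceTimeoutMs": 1500,
]
```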
diff --git a/apps/ios/Sources/Voice/TalkModeManager.swift b/apps/ios/Sources/Voice/TalkModeManager.swift
index ed1378e5b6b..ec46bf9bcde 100644
--- a/apps/ios/Sources/Voice/TalkModeManager.swift
+++ b/apps/ios/Sources/Voice/TalkModeManager.swift
@@ -87,6 +87,7 @@ final class TalkModeManager: NSObject {
   private var apiKey: String?
   private var voiceAliases: [String: String] = [:]
   private var interruptOnSpeech: Bool = true
+  private var gatewaySpeechLocaleID: String?
   private var mainSessionKey: String = "main"
   private var fallbackVoiceId: String?
   private var lastPlaybackWasPCM: Bool = false
@@ -500,12 +501,17 @@ final class TalkModeManager: NSObject {
     #endif

     self.stopRecognition()
-    self.speechRecognizer = SFSpeechRecognizer()
+    let localSpeechLocale = UserDefaults.standard.string(forKey: TalkSpeechLocale.storageKey)
+    let resolvedSpeech = TalkSpeechLocale.makeRecognizer(
+      localSelection: localSpeechLocale,
+      gatewaySelection: self.gatewaySpeechLocaleID)
+    self.speechRecognizer = resolvedSpeech.recognizer
     guard let recognizer = self.speechRecognizer else {
       throw NSError(domain: "TalkMode", code: 1, userInfo: [
         NSLocalizedDescriptionKey: "Speech recognizer unavailable",
       ])
     }
+    GatewayDiagnostics.log("talk speech: locale=\(resolvedSpeech.localeID ?? "default")")

     self.recognitionRequest = SFSpeechAudioBufferRecognitionRequest()
     self.recognitionRequest?.shouldReportPartialResults = true
@@ -2027,6 +2033,7 @@ extension TalkModeManager {
       if let interrupt = parsed.interruptOnSpeech {
         self.interruptOnSpeech = interrupt
       }
+      self.gatewaySpeechLocaleID = parsed.speechLocaleID
       self.silenceWindow = TimeInterval(parsed.silenceTimeoutMs) / 1000
       if parsed.normalizedPayload || parsed.defaultVoiceId != nil || parsed.rawConfigApiKey != nil {
         GatewayDiagnostics.log(
@@ -2041,6 +2048,7 @@ extension TalkModeManager {
       self.gatewayTalkDefaultModelId = nil
       self.gatewayTalkApiKeyConfigured = false
       self.gatewayTalkConfigLoaded = false
+      self.gatewaySpeechLocaleID = nil
       self.silenceWindow = TimeInterval(Self.defaultSilenceTimeoutMs) / 1000
     }
   }
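Note: the manager now defers recognizer construction to `TalkSpeechLocale.makeRecognizer`, defined in the new file below. A usage sketch matching the unit tests later in the patch, assuming the device supports German recognition:

```swift
// The explicit per-device Settings pick beats the gateway selection.
let result = TalkSpeechLocale.makeRecognizer(
    localSelection: "de-DE",        // Settings picker value
    gatewaySelection: "ru-RU")      // gateway talk.speechLocale
// result.localeID == "de-DE". If the resolved locale cannot produce a
// recognizer, the tuple falls back to SFSpeechRecognizer()'s system default.
```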
diff --git a/apps/ios/Sources/Voice/TalkSpeechLocale.swift b/apps/ios/Sources/Voice/TalkSpeechLocale.swift
new file mode 100644
index 00000000000..07cdf045cb3
--- /dev/null
+++ b/apps/ios/Sources/Voice/TalkSpeechLocale.swift
@@ -0,0 +1,100 @@
+import Foundation
+import OpenClawKit
+import Speech
+
+enum TalkSpeechLocale {
+  static let storageKey = "talk.speechLocale"
+  static let automaticID = "auto"
+  static let fallbackLocaleID = "en-US"
+
+  struct Option: Identifiable {
+    let id: String
+    let label: String
+  }
+
+  static func supportedOptions(
+    supportedLocales: Set<Locale> = SFSpeechRecognizer.supportedLocales()
+  ) -> [Option] {
+    var seen = Set<String>()
+    let dynamic: [Option] = supportedLocales
+      .compactMap { locale in
+        let id = self.canonicalID(locale.identifier)
+        guard seen.insert(id).inserted else { return nil }
+        return Option(id: id, label: self.friendlyName(for: locale))
+      }
+      .sorted { (lhs: Option, rhs: Option) in
+        lhs.label.localizedCaseInsensitiveCompare(rhs.label) == .orderedAscending
+      }
+    return [Option(id: self.automaticID, label: "Automatic")] + dynamic
+  }
+
+  static func resolvedLocaleID(
+    localSelection: String?,
+    gatewaySelection: String?,
+    deviceLocaleID: String = Locale.autoupdatingCurrent.identifier,
+    fallbackLocaleID: String = Self.fallbackLocaleID,
+    supportedLocaleIDs: Set<String>
+  ) -> String? {
+    TalkConfigParsing.resolvedSpeechRecognitionLocaleID(
+      preferredLocaleIDs: [
+        TalkConfigParsing.normalizedExplicitSpeechLocaleID(localSelection),
+        TalkConfigParsing.normalizedExplicitSpeechLocaleID(gatewaySelection),
+        deviceLocaleID,
+      ],
+      fallbackLocaleID: fallbackLocaleID,
+      supportedLocaleIDs: supportedLocaleIDs)
+  }
+
+  static func makeRecognizer(
+    localSelection: String?,
+    gatewaySelection: String?,
+    supportedLocales: Set<Locale> = SFSpeechRecognizer.supportedLocales()
+  ) -> (recognizer: SFSpeechRecognizer?, localeID: String?) {
+    let supportedIDs = Set(supportedLocales.map(\.identifier))
+    guard let localeID = self.resolvedLocaleID(
+      localSelection: localSelection,
+      gatewaySelection: gatewaySelection,
+      supportedLocaleIDs: supportedIDs)
+    else {
+      let recognizer = SFSpeechRecognizer()
+      return (recognizer, recognizer?.locale.identifier)
+    }
+
+    if let recognizer = SFSpeechRecognizer(locale: Locale(identifier: localeID)) {
+      return (recognizer, localeID)
+    }
+
+    let recognizer = SFSpeechRecognizer()
+    return (recognizer, recognizer?.locale.identifier)
+  }
+
+  static func normalizedExplicitLocaleID(_ raw: String?) -> String? {
+    TalkConfigParsing.normalizedExplicitSpeechLocaleID(raw, automaticID: self.automaticID)
+  }
+
+  private static func normalizedLocaleID(_ raw: String?) -> String? {
+    TalkConfigParsing.normalizedSpeechLocaleID(raw)
+  }
+
+  private static func canonicalID(_ raw: String) -> String {
+    raw.replacingOccurrences(of: "_", with: "-")
+  }
+
+  private static func friendlyName(for locale: Locale) -> String {
+    let id = self.canonicalID(locale.identifier)
+    let cleanLocale = Locale(identifier: id)
+    if let langCode = cleanLocale.language.languageCode?.identifier,
+      let lang = cleanLocale.localizedString(forLanguageCode: langCode),
+      let regionCode = cleanLocale.region?.identifier,
+      let region = cleanLocale.localizedString(forRegionCode: regionCode)
+    {
+      return "\(lang) (\(region))"
+    }
+    if let langCode = cleanLocale.language.languageCode?.identifier,
+      let lang = cleanLocale.localizedString(forLanguageCode: langCode)
+    {
+      return lang
+    }
+    return cleanLocale.localizedString(forIdentifier: id) ?? id
+  }
+}
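Note: `supportedOptions` dedupes on the canonical hyphenated ID, sorts by display name, and pins "Automatic" first. A sketch of what it produces for a small supported set (labels come from `Locale.localizedString` and here assume an English display language):

```swift
let options = TalkSpeechLocale.supportedOptions(
    supportedLocales: [Locale(identifier: "fr-FR"), Locale(identifier: "de-DE")])
// -> [Option(id: "auto",  label: "Automatic"),
//     Option(id: "fr-FR", label: "French (France)"),
//     Option(id: "de-DE", label: "German (Germany)")]
```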
diff --git a/apps/ios/Tests/Logic/TalkConfigParsingTests.swift b/apps/ios/Tests/Logic/TalkConfigParsingTests.swift
index c7fb9b0e209..d5176f98cc4 100644
--- a/apps/ios/Tests/Logic/TalkConfigParsingTests.swift
+++ b/apps/ios/Tests/Logic/TalkConfigParsingTests.swift
@@ -47,6 +47,16 @@ private let iOSSilenceTimeoutMs = 900
         fallback: iOSSilenceTimeoutMs) == 1500)
   }

+  @Test func readsConfiguredSpeechLocale() {
+    let talk: [String: Any] = [
+      "speechLocale": " ru-RU ",
+    ]
+
+    #expect(
+      TalkConfigParsing.resolvedSpeechLocaleID(
+        TalkConfigParsing.bridgeFoundationDictionary(talk)) == "ru-RU")
+  }
+
   @Test func defaultsSilenceTimeoutMsWhenMissing() {
     #expect(TalkConfigParsing.resolvedSilenceTimeoutMs(nil, fallback: iOSSilenceTimeoutMs) == iOSSilenceTimeoutMs)
   }
diff --git a/apps/ios/Tests/TalkSpeechLocaleTests.swift b/apps/ios/Tests/TalkSpeechLocaleTests.swift
new file mode 100644
index 00000000000..563bfd0fefb
--- /dev/null
+++ b/apps/ios/Tests/TalkSpeechLocaleTests.swift
@@ -0,0 +1,41 @@
+import Foundation
+import Testing
+@testable import OpenClaw
+
+@Suite struct TalkSpeechLocaleTests {
+  @Test func localSelectionOverridesGatewayConfig() {
+    let locale = TalkSpeechLocale.resolvedLocaleID(
+      localSelection: "de-DE",
+      gatewaySelection: "ru-RU",
+      deviceLocaleID: "en-US",
+      supportedLocaleIDs: ["de-DE", "ru-RU", "en-US"])
+
+    #expect(locale == "de-DE")
+  }
+
+  @Test func automaticLocalSelectionAllowsGatewayConfig() {
+    let locale = TalkSpeechLocale.resolvedLocaleID(
+      localSelection: TalkSpeechLocale.automaticID,
+      gatewaySelection: "ru_RU",
+      deviceLocaleID: "en-US",
+      supportedLocaleIDs: ["ru-RU", "en-US"])
+
+    #expect(locale == "ru-RU")
+  }
+
+  @Test func unsupportedConfiguredLocaleFallsBackToDeviceThenEnglish() {
+    let deviceLocale = TalkSpeechLocale.resolvedLocaleID(
+      localSelection: "zz-ZZ",
+      gatewaySelection: nil,
+      deviceLocaleID: "fr-FR",
+      supportedLocaleIDs: ["fr-FR", "en-US"])
+    let english = TalkSpeechLocale.resolvedLocaleID(
+      localSelection: "zz-ZZ",
+      gatewaySelection: nil,
+      deviceLocaleID: "yy-YY",
+      supportedLocaleIDs: ["en-US"])
+
+    #expect(deviceLocale == "fr-FR")
+    #expect(english == "en-US")
+  }
+}
diff --git a/apps/macos/Sources/OpenClaw/TalkModeGatewayConfig.swift b/apps/macos/Sources/OpenClaw/TalkModeGatewayConfig.swift
index a548321a3b1..34fc1631862 100644
--- a/apps/macos/Sources/OpenClaw/TalkModeGatewayConfig.swift
+++ b/apps/macos/Sources/OpenClaw/TalkModeGatewayConfig.swift
@@ -11,6 +11,7 @@ struct TalkModeGatewayConfigState {
   let outputFormat: String?
   let interruptOnSpeech: Bool
   let silenceTimeoutMs: Int
+  let speechLocaleID: String?
   let apiKey: String?
   let seamColorHex: String?
 }
@@ -53,6 +54,7 @@ enum TalkModeGatewayConfigParser {
     }
     let outputFormat = activeConfig?["outputFormat"]?.stringValue
     let interrupt = talk?["interruptOnSpeech"]?.boolValue
+    let speechLocaleID = TalkConfigParsing.resolvedSpeechLocaleID(talk)
     let apiKey = activeConfig?["apiKey"]?.stringValue
     let resolvedVoice: String? = if activeProvider == defaultProvider {
       (voice?.trimmingCharacters(in: .whitespacesAndNewlines).isEmpty == false ? voice : nil) ??
@@ -78,6 +80,7 @@
       outputFormat: outputFormat,
       interruptOnSpeech: interrupt ?? true,
       silenceTimeoutMs: silenceTimeoutMs,
+      speechLocaleID: speechLocaleID,
       apiKey: resolvedApiKey,
       seamColorHex: rawSeam.isEmpty ? nil : rawSeam)
   }
@@ -104,6 +107,7 @@
       outputFormat: nil,
       interruptOnSpeech: true,
       silenceTimeoutMs: defaultSilenceTimeoutMs,
+      speechLocaleID: nil,
       apiKey: resolvedApiKey,
       seamColorHex: nil)
   }
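Note: both platform parsers delegate to the shared OpenClawKit helper, so a configured value parses identically on iOS and macOS. A quick equivalence check mirroring the OpenClawKit tests further down (assumes `import OpenClawKit`):

```swift
let talk = ["speechLocale": AnyCodable(" ru_RU ")]
assert(TalkConfigParsing.resolvedSpeechLocaleID(talk) == "ru-RU")
assert(TalkConfigParsing.resolvedSpeechLocaleID(
    ["speechLocale": AnyCodable("")], fallback: "en-US") == "en-US")
```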
diff --git a/apps/macos/Sources/OpenClaw/TalkModeRuntime.swift b/apps/macos/Sources/OpenClaw/TalkModeRuntime.swift
index 712324519c5..8f407093b9f 100644
--- a/apps/macos/Sources/OpenClaw/TalkModeRuntime.swift
+++ b/apps/macos/Sources/OpenClaw/TalkModeRuntime.swift
@@ -70,6 +70,7 @@ actor TalkModeRuntime {
   private var defaultOutputFormat: String?
   private var interruptOnSpeech: Bool = true
   private var activeTalkProvider = TalkModeRuntime.defaultTalkProvider
+  private var speechLocaleID: String?
   private var lastInterruptedAtSeconds: Double?
   private var voiceAliases: [String: String] = [:]
   private var lastSpokenText: String?
@@ -186,12 +187,23 @@ actor TalkModeRuntime {
     self.recognitionGeneration &+= 1
     let generation = self.recognitionGeneration

-    let locale = await MainActor.run { AppStateStore.shared.voiceWakeLocaleID }
-    self.recognizer = SFSpeechRecognizer(locale: Locale(identifier: locale))
+    let voiceWakeLocale = await MainActor.run { AppStateStore.shared.voiceWakeLocaleID }
+    let supportedLocaleIDs = Set(SFSpeechRecognizer.supportedLocales().map(\.identifier))
+    let localeID = TalkConfigParsing.resolvedSpeechRecognitionLocaleID(
+      preferredLocaleIDs: [
+        self.speechLocaleID,
+        voiceWakeLocale,
+        Locale.autoupdatingCurrent.identifier,
+      ],
+      supportedLocaleIDs: supportedLocaleIDs)
+    self.recognizer = localeID
+      .map { SFSpeechRecognizer(locale: Locale(identifier: $0)) }
+      ?? SFSpeechRecognizer()
     guard let recognizer, recognizer.isAvailable else {
       self.logger.error("talk recognizer unavailable")
       return
     }
+    self.logger.debug("talk recognizer locale=\(recognizer.locale.identifier, privacy: .public)")

     let request = SFSpeechAudioBufferRecognitionRequest()
     Self.configureRecognitionRequest(request)
@@ -1010,6 +1022,7 @@ extension TalkModeRuntime {
     self.interruptOnSpeech = cfg.interruptOnSpeech
     self.activeTalkProvider = cfg.activeProvider
     self.silenceWindow = TimeInterval(cfg.silenceTimeoutMs) / 1000
+    self.speechLocaleID = cfg.speechLocaleID
     self.apiKey = cfg.apiKey
     let hasApiKey = (cfg.apiKey?.isEmpty == false)
     let voiceLabel = (cfg.voiceId?.isEmpty == false) ? cfg.voiceId! : "none"
@@ -1021,7 +1034,8 @@ extension TalkModeRuntime {
         "modelId=\(modelLabel, privacy: .public) " +
         "apiKey=\(hasApiKey, privacy: .public) " +
         "interrupt=\(cfg.interruptOnSpeech, privacy: .public) " +
-        "silenceTimeoutMs=\(cfg.silenceTimeoutMs, privacy: .public)")
+        "silenceTimeoutMs=\(cfg.silenceTimeoutMs, privacy: .public) " +
+        "speechLocale=\(cfg.speechLocaleID ?? "device", privacy: .public)")
   }

   static func selectTalkProviderConfig(
diff --git a/apps/macos/Tests/OpenClawIPCTests/TalkModeGatewayConfigTests.swift b/apps/macos/Tests/OpenClawIPCTests/TalkModeGatewayConfigTests.swift
index 73fd4b2ea05..cf7ea24c850 100644
--- a/apps/macos/Tests/OpenClawIPCTests/TalkModeGatewayConfigTests.swift
+++ b/apps/macos/Tests/OpenClawIPCTests/TalkModeGatewayConfigTests.swift
@@ -25,10 +25,10 @@ struct TalkModeGatewayConfigTests {
             "voiceId": "unused-voice",
           ],
         ],
+        "speechLocale": "ru-RU",
       ]),
     ],
-      issues: nil
-    )
+      issues: nil)

     let parsed = TalkModeGatewayConfigParser.parse(
       snapshot: snapshot,
@@ -37,12 +37,12 @@
       defaultSilenceTimeoutMs: TalkDefaults.silenceTimeoutMs,
       envVoice: "env-voice",
       sagVoice: "sag-voice",
-      envApiKey: "env-key"
-    )
+      envApiKey: "env-key")

     #expect(parsed.activeProvider == "mlx")
     #expect(parsed.modelId == nil)
     #expect(parsed.apiKey == nil)
     #expect(parsed.voiceId == "unused-voice")
+    #expect(parsed.speechLocaleID == "ru-RU")
   }
 }
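Note: the macOS candidate order differs from iOS. There is no per-device Talk picker here, so the gateway's `talk.speechLocale` comes first, then the existing Voice Wake locale, then the device locale. A sketch with illustrative IDs:

```swift
// macOS precedence as wired in TalkModeRuntime above.
let localeID = TalkConfigParsing.resolvedSpeechRecognitionLocaleID(
    preferredLocaleIDs: [
        "ru-RU",   // gateway talk.speechLocale, if set
        "en-GB",   // Voice Wake locale from AppStateStore
        "en-US",   // Locale.autoupdatingCurrent.identifier
    ],
    supportedLocaleIDs: ["ru-RU", "en-GB", "en-US"])
// localeID == "ru-RU"
```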
diff --git a/apps/shared/OpenClawKit/Sources/OpenClawKit/TalkConfigParsing.swift b/apps/shared/OpenClawKit/Sources/OpenClawKit/TalkConfigParsing.swift
index 6bdd6b9f244..38efece3e1d 100644
--- a/apps/shared/OpenClawKit/Sources/OpenClawKit/TalkConfigParsing.swift
+++ b/apps/shared/OpenClawKit/Sources/OpenClawKit/TalkConfigParsing.swift
@@ -56,6 +56,46 @@ public enum TalkConfigParsing {
     self.resolvedPositiveInt(talk?["silenceTimeoutMs"], fallback: fallback)
   }

+  public static func normalizedSpeechLocaleID(_ value: String?) -> String? {
+    let trimmed = (value ?? "").trimmingCharacters(in: .whitespacesAndNewlines)
+    return trimmed.isEmpty ? nil : trimmed.replacingOccurrences(of: "_", with: "-")
+  }
+
+  public static func resolvedSpeechLocaleID(
+    _ talk: [String: AnyCodable]?,
+    fallback: String? = nil
+  ) -> String? {
+    self.normalizedSpeechLocaleID(talk?["speechLocale"]?.stringValue)
+      ?? self.normalizedSpeechLocaleID(fallback)
+  }
+
+  public static func normalizedExplicitSpeechLocaleID(
+    _ value: String?,
+    automaticID: String = "auto"
+  ) -> String? {
+    let normalized = self.normalizedSpeechLocaleID(value)
+    return normalized == automaticID ? nil : normalized
+  }
+
+  public static func resolvedSpeechRecognitionLocaleID(
+    preferredLocaleIDs: [String?],
+    fallbackLocaleID: String = "en-US",
+    supportedLocaleIDs: Set<String>
+  ) -> String? {
+    let supported = Set(supportedLocaleIDs.compactMap(self.normalizedSpeechLocaleID))
+    var seen = Set<String>()
+    let candidates = (preferredLocaleIDs + [fallbackLocaleID])
+      .compactMap(self.normalizedSpeechLocaleID)
+
+    for candidate in candidates {
+      guard seen.insert(candidate).inserted else { continue }
+      if supported.isEmpty || supported.contains(candidate) {
+        return candidate
+      }
+    }
+    return nil
+  }
+
   private static func normalizedTalkProviderID(_ raw: String?) -> String? {
     let trimmed = (raw ?? "").trimmingCharacters(in: .whitespacesAndNewlines).lowercased()
     return trimmed.isEmpty ? nil : trimmed
diff --git a/apps/shared/OpenClawKit/Tests/OpenClawKitTests/TalkConfigParsingTests.swift b/apps/shared/OpenClawKit/Tests/OpenClawKitTests/TalkConfigParsingTests.swift
index 5a8d5dd11d3..5215cdca73a 100644
--- a/apps/shared/OpenClawKit/Tests/OpenClawKitTests/TalkConfigParsingTests.swift
+++ b/apps/shared/OpenClawKit/Tests/OpenClawKitTests/TalkConfigParsingTests.swift
@@ -116,4 +116,21 @@ struct TalkConfigParsingTests {
     #expect(TalkConfigParsing.resolvedPositiveInt(AnyCodable(true), fallback: 700) == 700)
     #expect(TalkConfigParsing.resolvedPositiveInt(AnyCodable("1500"), fallback: 700) == 700)
   }
+
+  @Test func resolvesSpeechLocaleID() {
+    #expect(TalkConfigParsing.resolvedSpeechLocaleID(["speechLocale": AnyCodable(" ru_RU ")]) == "ru-RU")
+    #expect(TalkConfigParsing.resolvedSpeechLocaleID(["speechLocale": AnyCodable("")], fallback: "en-US") == "en-US")
+  }
+
+  @Test func resolvesSpeechRecognitionLocaleFromSupportedFallbacks() {
+    let locale = TalkConfigParsing.resolvedSpeechRecognitionLocaleID(
+      preferredLocaleIDs: ["zz-ZZ", "fr-FR"],
+      supportedLocaleIDs: ["fr-FR", "en-US"])
+    let fallback = TalkConfigParsing.resolvedSpeechRecognitionLocaleID(
+      preferredLocaleIDs: ["zz-ZZ", "yy-YY"],
+      supportedLocaleIDs: ["en-US"])
+
+    #expect(locale == "fr-FR")
+    #expect(fallback == "en-US")
+  }
 }
diff --git a/docs/.generated/config-baseline.sha256 b/docs/.generated/config-baseline.sha256
index 630b4ee97d3..6bc941f4abb 100644
--- a/docs/.generated/config-baseline.sha256
+++ b/docs/.generated/config-baseline.sha256
@@ -1,4 +1,4 @@
-6ed33ef102e7c92816243bfabc3626222a679c3270c12ec5ea47b28b66204b3b config-baseline.json
-f86cb4d57ec1f5fd75008be0ab86151194945eb013a47ab4bdeaddafd3780da7 config-baseline.core.json
+9ac3d271f9bfa9611557f0b52e4d0a600693bdd1de75cc1bafc320fc4d4f0075 config-baseline.json
+271fdf1d6652927e0fc160a6f25276bf6dccb8f1b27fab15e0fc2620e8cacab4 config-baseline.core.json
 7cd9c908f066c143eab2a201efbc9640f483ab28bba92ddeca1d18cc2b528bc3 config-baseline.channel.json
 7825b56a5b3fcdbe2e09ef8fe5d9f12ac3598435afebe20413051e45b0d1968e config-baseline.plugin.json
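Note: two edge cases of `resolvedSpeechRecognitionLocaleID` follow directly from the implementation above. An empty supported set (for example when recognition is unavailable and nothing can be enumerated) trusts the caller and returns the first candidate; when nothing matches, not even the fallback, it returns nil and the callers fall back to `SFSpeechRecognizer()`'s own default:

```swift
let first = TalkConfigParsing.resolvedSpeechRecognitionLocaleID(
    preferredLocaleIDs: ["de-DE"],
    supportedLocaleIDs: [])            // "de-DE": empty set = trust the caller
let none = TalkConfigParsing.resolvedSpeechRecognitionLocaleID(
    preferredLocaleIDs: ["de-DE"],
    supportedLocaleIDs: ["xx-XX"])     // nil: callers use the system default
```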
diff --git a/docs/gateway/config-agents.md b/docs/gateway/config-agents.md
index c735efa1ecf..097611f5f29 100644
--- a/docs/gateway/config-agents.md
+++ b/docs/gateway/config-agents.md
@@ -1332,6 +1332,7 @@ Defaults for Talk mode (macOS/iOS/Android).
       },
       system: {},
     },
+    speechLocale: "ru-RU",
     silenceTimeoutMs: 1500,
     interruptOnSpeech: true,
   },
@@ -1346,6 +1347,7 @@ Defaults for Talk mode (macOS/iOS/Android).
 - `providers.*.voiceAliases` lets Talk directives use friendly names.
 - `providers.mlx.modelId` selects the Hugging Face repo used by the macOS local MLX helper. If omitted, macOS uses `mlx-community/Soprano-80M-bf16`.
 - macOS MLX playback runs through the bundled `openclaw-mlx-tts` helper when present, or an executable on `PATH`; `OPENCLAW_MLX_TTS_BIN` overrides the helper path for development.
+- `speechLocale` sets the BCP 47 locale id used by iOS/macOS Talk speech recognition. Leave unset to use the device default.
 - `silenceTimeoutMs` controls how long Talk mode waits after user silence before it sends the transcript. Unset keeps the platform default pause window (`700 ms on macOS and Android, 900 ms on iOS`).

 ---
diff --git a/docs/gateway/configuration-reference.md b/docs/gateway/configuration-reference.md
index d1eb0fec488..6e4c27b2d2e 100644
--- a/docs/gateway/configuration-reference.md
+++ b/docs/gateway/configuration-reference.md
@@ -43,6 +43,7 @@ Moved to a dedicated page — see
 - `session.*` (session lifecycle, compaction, pruning)
 - `messages.*` (message delivery, TTS, markdown rendering)
 - `talk.*` (Talk mode)
+  - `talk.speechLocale`: optional BCP 47 locale id for Talk speech recognition on iOS/macOS
   - `talk.silenceTimeoutMs`: when unset, Talk keeps the platform default pause window (`700 ms on macOS and Android, 900 ms on iOS`)

 ## Tools and custom providers
diff --git a/docs/nodes/talk.md b/docs/nodes/talk.md
index c8d40b08957..fac21310050 100644
--- a/docs/nodes/talk.md
+++ b/docs/nodes/talk.md
@@ -63,6 +63,7 @@ Supported keys:
       },
       system: {},
     },
+    speechLocale: "ru-RU",
     silenceTimeoutMs: 1500,
     interruptOnSpeech: true,
   },
@@ -78,6 +79,7 @@ Defaults:
 - `providers.elevenlabs.modelId`: defaults to `eleven_v3` when unset.
 - `providers.mlx.modelId`: defaults to `mlx-community/Soprano-80M-bf16` when unset.
 - `providers.elevenlabs.apiKey`: falls back to `ELEVENLABS_API_KEY` (or gateway shell profile if available).
+- `speechLocale`: optional BCP 47 locale id for on-device Talk speech recognition on iOS/macOS. Leave unset to use the device default.
 - `outputFormat`: defaults to `pcm_44100` on macOS/iOS and `pcm_24000` on Android (set `mp3_*` to force MP3 streaming)

 ## macOS UI
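Note: normalization is applied on both ends and is idempotent, so whichever spelling lands in the config file, devices ultimately see the canonical hyphenated form documented above (sketch, per the shared helper shown earlier):

```swift
TalkConfigParsing.normalizedSpeechLocaleID("ru-RU")    // "ru-RU"
TalkConfigParsing.normalizedSpeechLocaleID(" ru_RU ")  // "ru-RU"
TalkConfigParsing.normalizedSpeechLocaleID("")         // nil -> device default
```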
diff --git a/src/config/schema.base.generated.ts b/src/config/schema.base.generated.ts
index 51c6768e7d3..722646c7c2e 100644
--- a/src/config/schema.base.generated.ts
+++ b/src/config/schema.base.generated.ts
@@ -21075,6 +21075,12 @@ export const GENERATED_BASE_CONFIG_SCHEMA: BaseConfigSchemaResponse = {
           description:
             "Provider-specific Talk settings keyed by provider id. During migration, prefer this over legacy talk.* keys.",
         },
+        speechLocale: {
+          type: "string",
+          title: "Talk Speech Locale",
+          description:
+            'BCP 47 locale id for Talk speech recognition on device nodes, for example "ru-RU". Leave unset to use each device default.',
+        },
         interruptOnSpeech: {
           type: "boolean",
           title: "Talk Interrupt on Speech",
@@ -27273,6 +27279,11 @@ export const GENERATED_BASE_CONFIG_SCHEMA: BaseConfigSchemaResponse = {
       help: "Enables automatic live-reload behavior for canvas assets during development workflows. Keep disabled in production-like environments where deterministic output is preferred.",
      tags: ["reliability"],
     },
+    "talk.speechLocale": {
+      label: "Talk Speech Locale",
+      help: 'BCP 47 locale id for Talk speech recognition on device nodes, for example "ru-RU". Leave unset to use each device default.',
+      tags: ["media"],
+    },
     "talk.interruptOnSpeech": {
       label: "Talk Interrupt on Speech",
       help: "If true (default), stop assistant speech when the user starts speaking in Talk mode. Keep enabled for conversational turn-taking.",
diff --git a/src/config/schema.help.ts b/src/config/schema.help.ts
index 444a12678b5..e15371b4909 100644
--- a/src/config/schema.help.ts
+++ b/src/config/schema.help.ts
@@ -148,6 +148,8 @@ export const FIELD_HELP: Record<string, string> = {
     "Provider-specific Talk settings keyed by provider id. During migration, prefer this over legacy talk.* keys.",
   "talk.providers.*": "Provider-owned Talk config fields for the matching provider id.",
   "talk.providers.*.apiKey": "Provider API key for Talk mode.", // pragma: allowlist secret
+  "talk.speechLocale":
+    'BCP 47 locale id for Talk speech recognition on device nodes, for example "ru-RU". Leave unset to use each device default.',
   "talk.interruptOnSpeech":
     "If true (default), stop assistant speech when the user starts speaking in Talk mode. Keep enabled for conversational turn-taking.",
   "talk.silenceTimeoutMs": `Milliseconds of user silence before Talk mode finalizes and sends the current transcript. Leave unset to keep the platform default pause window (${describeTalkSilenceTimeoutDefaults()}).`,
diff --git a/src/config/schema.labels.ts b/src/config/schema.labels.ts
index 0cf0f1d3fef..e2fc55c6959 100644
--- a/src/config/schema.labels.ts
+++ b/src/config/schema.labels.ts
@@ -786,6 +786,7 @@ export const FIELD_LABELS: Record<string, string> = {
   "canvasHost.port": "Canvas Host Port",
   "canvasHost.liveReload": "Canvas Host Live Reload",
   talk: "Talk",
+  "talk.speechLocale": "Talk Speech Locale",
   "talk.interruptOnSpeech": "Talk Interrupt on Speech",
   "talk.silenceTimeoutMs": "Talk Silence Timeout (ms)",
   messages: "Messages",
diff --git a/src/config/talk.normalize.test.ts b/src/config/talk.normalize.test.ts
index f1150b05278..54e6d3b72e0 100644
--- a/src/config/talk.normalize.test.ts
+++ b/src/config/talk.normalize.test.ts
@@ -10,11 +10,13 @@ describe("talk normalization", () => {
       modelId: "eleven_v3",
       outputFormat: "pcm_44100",
       apiKey: "secret-key", // pragma: allowlist secret
+      speechLocale: " ru-RU ",
       interruptOnSpeech: false,
       silenceTimeoutMs: 1500,
     } as unknown as never);

     expect(normalized).toEqual({
+      speechLocale: "ru-RU",
       interruptOnSpeech: false,
       silenceTimeoutMs: 1500,
     });
@@ -77,6 +79,7 @@ describe("talk normalization", () => {
           modelId: "acme-model",
         },
       },
+      speechLocale: "ru-RU",
       interruptOnSpeech: true,
     });

@@ -95,6 +98,7 @@ describe("talk normalization", () => {
           modelId: "acme-model",
         },
       },
+      speechLocale: "ru-RU",
       interruptOnSpeech: true,
     });
   });
diff --git a/src/config/talk.ts b/src/config/talk.ts
index af0bc95f0fe..a47f13d7358 100644
--- a/src/config/talk.ts
+++ b/src/config/talk.ts
@@ -105,6 +105,10 @@ export function normalizeTalkSection(value: TalkConfig | undefined): TalkConfig
   const source = value as Record<string, unknown>;
   const normalized: TalkConfig = {};

+  const speechLocale = normalizeOptionalString(source.speechLocale);
+  if (speechLocale) {
+    normalized.speechLocale = speechLocale;
+  }
   if (typeof source.interruptOnSpeech === "boolean") {
     normalized.interruptOnSpeech = source.interruptOnSpeech;
   }
@@ -172,6 +176,9 @@ export function buildTalkConfigResponse(value: unknown): TalkConfigResponse | un
   if (typeof normalized?.silenceTimeoutMs === "number") {
     payload.silenceTimeoutMs = normalized.silenceTimeoutMs;
   }
+  if (typeof normalized?.speechLocale === "string") {
+    payload.speechLocale = normalized.speechLocale;
+  }
   if (normalized?.providers && Object.keys(normalized.providers).length > 0) {
     payload.providers = normalized.providers;
   }
diff --git a/src/config/types.gateway.ts b/src/config/types.gateway.ts
index 2e7527ffdc3..57ac647fe9e 100644
--- a/src/config/types.gateway.ts
+++ b/src/config/types.gateway.ts
@@ -67,6 +67,8 @@ export type TalkConfig = {
   provider?: string;
   /** Provider-specific Talk config keyed by provider id. */
   providers?: Record<string, TalkProviderConfig>;
+  /** BCP 47 locale id used for Talk speech recognition on device nodes. */
+  speechLocale?: string;
   /** Stop speaking when user starts talking (default: true). */
   interruptOnSpeech?: boolean;
   /** Milliseconds of user silence before Talk mode sends the transcript after a pause. */
diff --git a/src/config/zod-schema.ts b/src/config/zod-schema.ts
index 10a4d561e5c..26bd902750e 100644
--- a/src/config/zod-schema.ts
+++ b/src/config/zod-schema.ts
@@ -186,6 +186,7 @@ const TalkSchema = z
   .object({
     provider: z.string().optional(),
     providers: z.record(z.string(), TalkProviderEntrySchema).optional(),
+    speechLocale: z.string().optional(),
     interruptOnSpeech: z.boolean().optional(),
     silenceTimeoutMs: z.number().int().positive().optional(),
   })
diff --git a/src/gateway/protocol/schema/channels.ts b/src/gateway/protocol/schema/channels.ts
index 62c62dcb726..45769210a18 100644
--- a/src/gateway/protocol/schema/channels.ts
+++ b/src/gateway/protocol/schema/channels.ts
@@ -78,6 +78,7 @@ const TalkConfigSchema = Type.Object(
     provider: Type.Optional(Type.String()),
     providers: Type.Optional(Type.Record(Type.String(), TalkProviderConfigSchema)),
     resolved: ResolvedTalkConfigSchema,
+    speechLocale: Type.Optional(Type.String()),
     interruptOnSpeech: Type.Optional(Type.Boolean()),
     silenceTimeoutMs: Type.Optional(Type.Integer({ minimum: 1 })),
   },
diff --git a/src/gateway/server.talk-config.test.ts b/src/gateway/server.talk-config.test.ts
index 8d77762e161..dbc85861092 100644
--- a/src/gateway/server.talk-config.test.ts
+++ b/src/gateway/server.talk-config.test.ts
@@ -34,6 +34,7 @@ type TalkConfigPayload = {
     provider?: string;
     config?: { voiceId?: string; apiKey?: string | SecretRef };
   };
+  speechLocale?: string;
   silenceTimeoutMs?: number;
 };
 session?: { mainKey?: string };
@@ -144,6 +145,7 @@ function expectTalkConfig(
     apiKey?: string | SecretRef;
     providerApiKey?: string | SecretRef;
     resolvedApiKey?: string | SecretRef;
+    speechLocale?: string;
     silenceTimeoutMs?: number;
   },
 ) {
@@ -162,6 +164,9 @@ function expectTalkConfig(
   if ("resolvedApiKey" in expected) {
     expect(talk?.resolved?.config?.apiKey).toEqual(expected.resolvedApiKey);
   }
+  if ("speechLocale" in expected) {
+    expect(talk?.speechLocale).toBe(expected.speechLocale);
+  }
   if ("silenceTimeoutMs" in expected) {
     expect(talk?.silenceTimeoutMs).toBe(expected.silenceTimeoutMs);
   }
@@ -179,6 +184,7 @@ describe("gateway talk.config", () => {
         apiKey: "secret-key-abc", // pragma: allowlist secret
       },
     },
+    speechLocale: "ru-RU",
     silenceTimeoutMs: 1500,
   },
   session: {
@@ -196,6 +202,7 @@ describe("gateway talk.config", () => {
       provider: GENERIC_TALK_PROVIDER_ID,
       voiceId: "voice-123",
       apiKey: "__OPENCLAW_REDACTED__",
+      speechLocale: "ru-RU",
       silenceTimeoutMs: 1500,
     });
     expect(res.payload?.config?.session?.mainKey).toBe("main-test");