diff --git a/CHANGELOG.md b/CHANGELOG.md index 96ab09e9a83..f26b566a95b 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -82,6 +82,7 @@ Docs: https://docs.openclaw.ai - Providers/OpenAI-compatible: treat singular MLX-style `finish_reason: "tool_call"` as tool use instead of a provider error. Fixes #61499. - Providers/ElevenLabs: omit the MP3-only `Accept` header for PCM telephony synthesis, so Voice Call requests for `pcm_22050` no longer receive MP3 audio. Fixes #67340. Thanks @marcchabot. - Providers/Microsoft TTS: honor legacy `messages.tts.providers.edge` voice settings after normalizing Edge TTS to the Microsoft provider. Fixes #64153. +- macOS Talk Mode: retry failed local ElevenLabs stream playback through gateway `talk.speak` before falling back to the system voice, so configured ElevenLabs voices still play when streaming playback fails. Fixes #65662. - Plugins/Voice Call: reap stale pre-answer calls by default, honor configured TTS timeouts for Twilio media-stream playback, and fail empty telephony audio instead of completing as silence. Fixes #42071; supersedes #60957. Thanks @Ryce and @sliekens. - Plugins/Voice Call: terminate expired restored call sessions with the provider and restart restored max-duration timers with only the remaining duration, preventing stale outbound retry loops after Gateway restarts. Fixes #48739. Thanks @mira-solari. - Plugins/Voice Call: start provider STT after Telnyx outbound conversation greetings and pass configured Telnyx voice IDs through to the speak action. Fixes #56091. Thanks @Roshan. diff --git a/apps/macos/Sources/OpenClaw/GatewayConnection.swift b/apps/macos/Sources/OpenClaw/GatewayConnection.swift index eb12323244a..5368703ad08 100644 --- a/apps/macos/Sources/OpenClaw/GatewayConnection.swift +++ b/apps/macos/Sources/OpenClaw/GatewayConnection.swift @@ -70,6 +70,7 @@ actor GatewayConnection { case wizardStatus = "wizard.status" case talkConfig = "talk.config" case talkMode = "talk.mode" + case talkSpeak = "talk.speak" case webLoginStart = "web.login.start" case webLoginWait = "web.login.wait" case channelsLogout = "channels.logout" diff --git a/apps/macos/Sources/OpenClaw/TalkModeRuntime.swift b/apps/macos/Sources/OpenClaw/TalkModeRuntime.swift index bcd8057f268..712324519c5 100644 --- a/apps/macos/Sources/OpenClaw/TalkModeRuntime.swift +++ b/apps/macos/Sources/OpenClaw/TalkModeRuntime.swift @@ -2,6 +2,7 @@ import AVFoundation import Foundation import OpenClawChatUI import OpenClawKit +import OpenClawProtocol import OSLog import Speech @@ -475,7 +476,16 @@ actor TalkModeRuntime { self.ttsLogger .error( "talk TTS failed: \(error.localizedDescription, privacy: .public); " + - "falling back to system voice") + "retrying gateway talk.speak") + do { + try await self.playGatewayTalkSpeak(input: input) + return + } catch { + self.ttsLogger + .error( + "talk gateway TTS failed: \(error.localizedDescription, privacy: .public); " + + "falling back to system voice") + } do { try await self.playSystemVoice(input: input) } catch { @@ -720,6 +730,42 @@ actor TalkModeRuntime { return await self.playMP3(stream: stream) } + private func playGatewayTalkSpeak(input: TalkPlaybackInput) async throws { + let params = Self.makeTalkSpeakParams( + text: input.cleanedText, + voiceId: input.voiceId, + modelId: self.currentModelId ?? self.defaultModelId, + outputFormat: self.defaultOutputFormat, + directive: input.directive) + let result: TalkSpeakResult = try await GatewayConnection.shared.requestDecoded( + method: .talkSpeak, + params: params, + timeoutMs: max(30000, input.synthTimeoutSeconds * 1000 + 5000)) + guard let audioData = Data(base64Encoded: result.audiobase64), !audioData.isEmpty else { + throw NSError(domain: "TalkSpeak", code: 1, userInfo: [ + NSLocalizedDescriptionKey: "gateway talk.speak returned empty audio", + ]) + } + _ = await self.stopPCM() + _ = await self.stopMP3() + if self.interruptOnSpeech { + guard await self.prepareForPlayback(generation: input.generation) else { return } + } + await MainActor.run { TalkModeController.shared.updatePhase(.speaking) } + self.phase = .speaking + let playback = await self.playTalkAudio(data: audioData) + self.ttsLogger + .info( + "talk gateway audio provider=\(result.provider, privacy: .public) " + + "format=\(result.outputformat ?? "unknown", privacy: .public) " + + "finished=\(playback.finished, privacy: .public)") + if !playback.finished, playback.interruptedAt == nil { + throw NSError(domain: "TalkSpeak", code: 2, userInfo: [ + NSLocalizedDescriptionKey: "gateway talk.speak audio playback failed", + ]) + } + } + private func playSystemVoice(input: TalkPlaybackInput) async throws { self.ttsLogger.info("talk system voice start chars=\(input.cleanedText.count, privacy: .public)") if self.interruptOnSpeech { @@ -847,6 +893,54 @@ actor TalkModeRuntime { } extension TalkModeRuntime { + static func makeTalkSpeakParams( + text: String, + voiceId: String?, + modelId: String?, + outputFormat: String?, + directive: TalkDirective?) -> [String: AnyCodable] + { + var params: [String: AnyCodable] = ["text": AnyCodable(text)] + + func addString(_ key: String, _ value: String?) { + let trimmed = value?.trimmingCharacters(in: .whitespacesAndNewlines) ?? "" + guard !trimmed.isEmpty else { return } + params[key] = AnyCodable(trimmed) + } + + addString("voiceId", voiceId) + addString("modelId", directive?.modelId ?? modelId) + addString("outputFormat", directive?.outputFormat ?? outputFormat) + if let speed = directive?.speed { + params["speed"] = AnyCodable(speed) + } + if let rateWPM = directive?.rateWPM { + params["rateWpm"] = AnyCodable(rateWPM) + } + if let stability = directive?.stability { + params["stability"] = AnyCodable(stability) + } + if let similarity = directive?.similarity { + params["similarity"] = AnyCodable(similarity) + } + if let style = directive?.style { + params["style"] = AnyCodable(style) + } + if let speakerBoost = directive?.speakerBoost { + params["speakerBoost"] = AnyCodable(speakerBoost) + } + if let seed = directive?.seed { + params["seed"] = AnyCodable(seed) + } + addString("normalize", directive?.normalize) + addString("language", directive?.language) + if let latencyTier = directive?.latencyTier { + params["latencyTier"] = AnyCodable(latencyTier) + } + + return params + } + // MARK: - Audio playback (MainActor helpers) @MainActor diff --git a/apps/macos/Tests/OpenClawIPCTests/TalkModeRuntimeSpeechTests.swift b/apps/macos/Tests/OpenClawIPCTests/TalkModeRuntimeSpeechTests.swift index 90100b4593f..64818be2aab 100644 --- a/apps/macos/Tests/OpenClawIPCTests/TalkModeRuntimeSpeechTests.swift +++ b/apps/macos/Tests/OpenClawIPCTests/TalkModeRuntimeSpeechTests.swift @@ -1,3 +1,4 @@ +import OpenClawKit import Speech import Testing @testable import OpenClaw @@ -16,23 +17,19 @@ struct TalkModeRuntimeSpeechTests { let elevenLabsPlan = TalkModeRuntime.playbackPlan( provider: "elevenlabs", apiKey: "key", - voiceId: "voice" - ) + voiceId: "voice") let missingKeyPlan = TalkModeRuntime.playbackPlan( provider: "elevenlabs", apiKey: nil, - voiceId: "voice" - ) + voiceId: "voice") let missingVoicePlan = TalkModeRuntime.playbackPlan( provider: "elevenlabs", apiKey: "key", - voiceId: nil - ) + voiceId: nil) let blankKeyPlan = TalkModeRuntime.playbackPlan( provider: "elevenlabs", apiKey: "", - voiceId: "voice" - ) + voiceId: "voice") let mlxPlan = TalkModeRuntime.playbackPlan(provider: "mlx", apiKey: nil, voiceId: nil) let systemPlan = TalkModeRuntime.playbackPlan(provider: "system", apiKey: nil, voiceId: nil) @@ -43,4 +40,40 @@ struct TalkModeRuntimeSpeechTests { #expect(mlxPlan == .mlxThenSystemVoice) #expect(systemPlan == .systemVoiceOnly) } + + @Test func `talk speak params carry resolved voice and directive overrides`() { + let params = TalkModeRuntime.makeTalkSpeakParams( + text: "hello", + voiceId: "voice-123", + modelId: "eleven_v3", + outputFormat: "mp3_44100_128", + directive: TalkDirective( + modelId: "eleven_turbo_v2_5", + speed: 1.1, + rateWPM: 180, + stability: 0.4, + similarity: 0.7, + style: 0.2, + speakerBoost: true, + seed: 42, + normalize: "auto", + language: "en", + outputFormat: "mp3_44100_128", + latencyTier: 3)) + + #expect(params["text"]?.value as? String == "hello") + #expect(params["voiceId"]?.value as? String == "voice-123") + #expect(params["modelId"]?.value as? String == "eleven_turbo_v2_5") + #expect(params["outputFormat"]?.value as? String == "mp3_44100_128") + #expect(params["speed"]?.value as? Double == 1.1) + #expect(params["rateWpm"]?.value as? Int == 180) + #expect(params["stability"]?.value as? Double == 0.4) + #expect(params["similarity"]?.value as? Double == 0.7) + #expect(params["style"]?.value as? Double == 0.2) + #expect(params["speakerBoost"]?.value as? Bool == true) + #expect(params["seed"]?.value as? Int == 42) + #expect(params["normalize"]?.value as? String == "auto") + #expect(params["language"]?.value as? String == "en") + #expect(params["latencyTier"]?.value as? Int == 3) + } }