fix(macos): retry talk tts via gateway

This commit is contained in:
Peter Steinberger
2026-04-25 04:09:37 +01:00
parent 9a0b26cafc
commit 3731a7c8f2
4 changed files with 138 additions and 9 deletions

View File

@@ -82,6 +82,7 @@ Docs: https://docs.openclaw.ai
- Providers/OpenAI-compatible: treat singular MLX-style `finish_reason: "tool_call"` as tool use instead of a provider error. Fixes #61499.
- Providers/ElevenLabs: omit the MP3-only `Accept` header for PCM telephony synthesis, so Voice Call requests for `pcm_22050` no longer receive MP3 audio. Fixes #67340. Thanks @marcchabot.
- Providers/Microsoft TTS: honor legacy `messages.tts.providers.edge` voice settings after normalizing Edge TTS to the Microsoft provider. Fixes #64153.
- macOS Talk Mode: retry failed local ElevenLabs stream playback through gateway `talk.speak` before falling back to the system voice, so configured ElevenLabs voices still play when streaming playback fails. Fixes #65662.
- Plugins/Voice Call: reap stale pre-answer calls by default, honor configured TTS timeouts for Twilio media-stream playback, and fail empty telephony audio instead of completing as silence. Fixes #42071; supersedes #60957. Thanks @Ryce and @sliekens.
- Plugins/Voice Call: terminate expired restored call sessions with the provider and restart restored max-duration timers with only the remaining duration, preventing stale outbound retry loops after Gateway restarts. Fixes #48739. Thanks @mira-solari.
- Plugins/Voice Call: start provider STT after Telnyx outbound conversation greetings and pass configured Telnyx voice IDs through to the speak action. Fixes #56091. Thanks @Roshan.

View File

@@ -70,6 +70,7 @@ actor GatewayConnection {
case wizardStatus = "wizard.status"
case talkConfig = "talk.config"
case talkMode = "talk.mode"
case talkSpeak = "talk.speak"
case webLoginStart = "web.login.start"
case webLoginWait = "web.login.wait"
case channelsLogout = "channels.logout"

View File

@@ -2,6 +2,7 @@ import AVFoundation
import Foundation
import OpenClawChatUI
import OpenClawKit
import OpenClawProtocol
import OSLog
import Speech
@@ -475,7 +476,16 @@ actor TalkModeRuntime {
self.ttsLogger
.error(
"talk TTS failed: \(error.localizedDescription, privacy: .public); " +
"falling back to system voice")
"retrying gateway talk.speak")
do {
try await self.playGatewayTalkSpeak(input: input)
return
} catch {
self.ttsLogger
.error(
"talk gateway TTS failed: \(error.localizedDescription, privacy: .public); " +
"falling back to system voice")
}
do {
try await self.playSystemVoice(input: input)
} catch {
@@ -720,6 +730,42 @@ actor TalkModeRuntime {
return await self.playMP3(stream: stream)
}
/// Retry path for failed local TTS: asks the Gateway to synthesize `input` via the
/// `talk.speak` RPC, then plays the returned base64 audio locally.
/// - Parameter input: the playback request (cleaned text, resolved voice, directive overrides).
/// - Throws: an `NSError` (domain "TalkSpeak") when the gateway returns empty audio (code 1)
///   or when local playback of the returned audio fails without interruption (code 2).
private func playGatewayTalkSpeak(input: TalkPlaybackInput) async throws {
// Directive overrides (modelId/outputFormat/etc.) take precedence inside makeTalkSpeakParams.
let params = Self.makeTalkSpeakParams(
text: input.cleanedText,
voiceId: input.voiceId,
modelId: self.currentModelId ?? self.defaultModelId,
outputFormat: self.defaultOutputFormat,
directive: input.directive)
// Timeout: at least 30s, otherwise the synth timeout plus a 5s network cushion.
let result: TalkSpeakResult = try await GatewayConnection.shared.requestDecoded(
method: .talkSpeak,
params: params,
timeoutMs: max(30000, input.synthTimeoutSeconds * 1000 + 5000))
// Treat undecodable or empty audio as a failure so the caller can fall back to the system voice.
guard let audioData = Data(base64Encoded: result.audiobase64), !audioData.isEmpty else {
throw NSError(domain: "TalkSpeak", code: 1, userInfo: [
NSLocalizedDescriptionKey: "gateway talk.speak returned empty audio",
])
}
// Stop any in-flight local playback before starting the gateway audio.
_ = await self.stopPCM()
_ = await self.stopMP3()
if self.interruptOnSpeech {
// prepareForPlayback returning false means this generation was superseded; bail silently.
guard await self.prepareForPlayback(generation: input.generation) else { return }
}
// Keep the UI controller and the actor's own phase in sync.
await MainActor.run { TalkModeController.shared.updatePhase(.speaking) }
self.phase = .speaking
let playback = await self.playTalkAudio(data: audioData)
self.ttsLogger
.info(
"talk gateway audio provider=\(result.provider, privacy: .public) " +
"format=\(result.outputformat ?? "unknown", privacy: .public) " +
"finished=\(playback.finished, privacy: .public)")
// A deliberate interruption is not an error; only an unexplained early stop is.
if !playback.finished, playback.interruptedAt == nil {
throw NSError(domain: "TalkSpeak", code: 2, userInfo: [
NSLocalizedDescriptionKey: "gateway talk.speak audio playback failed",
])
}
}
private func playSystemVoice(input: TalkPlaybackInput) async throws {
self.ttsLogger.info("talk system voice start chars=\(input.cleanedText.count, privacy: .public)")
if self.interruptOnSpeech {
@@ -847,6 +893,54 @@ actor TalkModeRuntime {
}
extension TalkModeRuntime {
/// Builds the `talk.speak` RPC parameter dictionary.
///
/// Strings are trimmed and dropped when blank; directive values override the
/// resolved `modelId`/`outputFormat`; numeric and boolean directive fields are
/// forwarded verbatim when present.
static func makeTalkSpeakParams(
    text: String,
    voiceId: String?,
    modelId: String?,
    outputFormat: String?,
    directive: TalkDirective?) -> [String: AnyCodable]
{
    var payload: [String: AnyCodable] = ["text": AnyCodable(text)]

    // Insert a string field only when it has non-whitespace content.
    func putTrimmed(_ key: String, _ raw: String?) {
        guard let raw else { return }
        let cleaned = raw.trimmingCharacters(in: .whitespacesAndNewlines)
        if !cleaned.isEmpty { payload[key] = AnyCodable(cleaned) }
    }
    // Insert an optional value verbatim when present.
    func putIfPresent<T>(_ key: String, _ value: T?) {
        if let value { payload[key] = AnyCodable(value) }
    }

    putTrimmed("voiceId", voiceId)
    putTrimmed("modelId", directive?.modelId ?? modelId)
    putTrimmed("outputFormat", directive?.outputFormat ?? outputFormat)
    putIfPresent("speed", directive?.speed)
    putIfPresent("rateWpm", directive?.rateWPM)
    putIfPresent("stability", directive?.stability)
    putIfPresent("similarity", directive?.similarity)
    putIfPresent("style", directive?.style)
    putIfPresent("speakerBoost", directive?.speakerBoost)
    putIfPresent("seed", directive?.seed)
    putTrimmed("normalize", directive?.normalize)
    putTrimmed("language", directive?.language)
    putIfPresent("latencyTier", directive?.latencyTier)
    return payload
}
// MARK: - Audio playback (MainActor helpers)
@MainActor

View File

@@ -1,3 +1,4 @@
import OpenClawKit
import Speech
import Testing
@testable import OpenClaw
@@ -16,23 +17,19 @@ struct TalkModeRuntimeSpeechTests {
let elevenLabsPlan = TalkModeRuntime.playbackPlan(
provider: "elevenlabs",
apiKey: "key",
voiceId: "voice"
)
voiceId: "voice")
let missingKeyPlan = TalkModeRuntime.playbackPlan(
provider: "elevenlabs",
apiKey: nil,
voiceId: "voice"
)
voiceId: "voice")
let missingVoicePlan = TalkModeRuntime.playbackPlan(
provider: "elevenlabs",
apiKey: "key",
voiceId: nil
)
voiceId: nil)
let blankKeyPlan = TalkModeRuntime.playbackPlan(
provider: "elevenlabs",
apiKey: "",
voiceId: "voice"
)
voiceId: "voice")
let mlxPlan = TalkModeRuntime.playbackPlan(provider: "mlx", apiKey: nil, voiceId: nil)
let systemPlan = TalkModeRuntime.playbackPlan(provider: "system", apiKey: nil, voiceId: nil)
@@ -43,4 +40,40 @@ struct TalkModeRuntimeSpeechTests {
#expect(mlxPlan == .mlxThenSystemVoice)
#expect(systemPlan == .systemVoiceOnly)
}
// Verifies makeTalkSpeakParams: the resolved voiceId passes through, directive
// modelId/outputFormat override the resolved values, and every numeric/boolean/string
// directive field lands under its expected wire key (note rateWPM -> "rateWpm").
@Test func `talk speak params carry resolved voice and directive overrides`() {
let params = TalkModeRuntime.makeTalkSpeakParams(
text: "hello",
voiceId: "voice-123",
modelId: "eleven_v3",
outputFormat: "mp3_44100_128",
directive: TalkDirective(
modelId: "eleven_turbo_v2_5",
speed: 1.1,
rateWPM: 180,
stability: 0.4,
similarity: 0.7,
style: 0.2,
speakerBoost: true,
seed: 42,
normalize: "auto",
language: "en",
outputFormat: "mp3_44100_128",
latencyTier: 3))
#expect(params["text"]?.value as? String == "hello")
#expect(params["voiceId"]?.value as? String == "voice-123")
// Directive modelId wins over the resolved "eleven_v3".
#expect(params["modelId"]?.value as? String == "eleven_turbo_v2_5")
#expect(params["outputFormat"]?.value as? String == "mp3_44100_128")
#expect(params["speed"]?.value as? Double == 1.1)
#expect(params["rateWpm"]?.value as? Int == 180)
#expect(params["stability"]?.value as? Double == 0.4)
#expect(params["similarity"]?.value as? Double == 0.7)
#expect(params["style"]?.value as? Double == 0.2)
#expect(params["speakerBoost"]?.value as? Bool == true)
#expect(params["seed"]?.value as? Int == 42)
#expect(params["normalize"]?.value as? String == "auto")
#expect(params["language"]?.value as? String == "en")
#expect(params["latencyTier"]?.value as? Int == 3)
}
}