fix(macos): retry talk tts via gateway

This commit is contained in:
Peter Steinberger
2026-04-25 04:09:37 +01:00
parent 9a0b26cafc
commit 3731a7c8f2
4 changed files with 138 additions and 9 deletions

View File

@@ -82,6 +82,7 @@ Docs: https://docs.openclaw.ai
- Providers/OpenAI-compatible: treat singular MLX-style `finish_reason: "tool_call"` as tool use instead of a provider error. Fixes #61499.
- Providers/ElevenLabs: omit the MP3-only `Accept` header for PCM telephony synthesis, so Voice Call requests for `pcm_22050` no longer receive MP3 audio. Fixes #67340. Thanks @marcchabot.
- Providers/Microsoft TTS: honor legacy `messages.tts.providers.edge` voice settings after normalizing Edge TTS to the Microsoft provider. Fixes #64153.
- macOS Talk Mode: retry failed local ElevenLabs stream playback through gateway `talk.speak` before falling back to the system voice, so configured ElevenLabs voices still play when streaming playback fails. Fixes #65662.
- Plugins/Voice Call: reap stale pre-answer calls by default, honor configured TTS timeouts for Twilio media-stream playback, and fail empty telephony audio instead of completing as silence. Fixes #42071; supersedes #60957. Thanks @Ryce and @sliekens.
- Plugins/Voice Call: terminate expired restored call sessions with the provider and restart restored max-duration timers with only the remaining duration, preventing stale outbound retry loops after Gateway restarts. Fixes #48739. Thanks @mira-solari.
- Plugins/Voice Call: start provider STT after Telnyx outbound conversation greetings and pass configured Telnyx voice IDs through to the speak action. Fixes #56091. Thanks @Roshan.

View File

@@ -70,6 +70,7 @@ actor GatewayConnection {
case wizardStatus = "wizard.status"
case talkConfig = "talk.config"
case talkMode = "talk.mode"
case talkSpeak = "talk.speak"
case webLoginStart = "web.login.start"
case webLoginWait = "web.login.wait"
case channelsLogout = "channels.logout"

View File

@@ -2,6 +2,7 @@ import AVFoundation
import Foundation
import OpenClawChatUI
import OpenClawKit
import OpenClawProtocol
import OSLog
import Speech
@@ -475,7 +476,16 @@ actor TalkModeRuntime {
self.ttsLogger
.error(
"talk TTS failed: \(error.localizedDescription, privacy: .public); " +
"falling back to system voice")
"retrying gateway talk.speak")
do {
try await self.playGatewayTalkSpeak(input: input)
return
} catch {
self.ttsLogger
.error(
"talk gateway TTS failed: \(error.localizedDescription, privacy: .public); " +
"falling back to system voice")
}
do {
try await self.playSystemVoice(input: input)
} catch {
@@ -720,6 +730,42 @@ actor TalkModeRuntime {
return await self.playMP3(stream: stream)
}
/// Retry path for failed local TTS: asks the Gateway to synthesize `input` via the
/// `talk.speak` RPC, then plays the returned base64 audio locally.
/// - Parameter input: the playback request (cleaned text, resolved voice, directive overrides).
/// - Throws: an `NSError` (domain "TalkSpeak") when the gateway returns empty audio (code 1)
///   or when local playback of the returned audio fails without interruption (code 2).
private func playGatewayTalkSpeak(input: TalkPlaybackInput) async throws {
// Directive overrides (modelId/outputFormat/etc.) take precedence inside makeTalkSpeakParams.
let params = Self.makeTalkSpeakParams(
text: input.cleanedText,
voiceId: input.voiceId,
modelId: self.currentModelId ?? self.defaultModelId,
outputFormat: self.defaultOutputFormat,
directive: input.directive)
// Timeout: at least 30s, otherwise the synth timeout plus a 5s network cushion.
let result: TalkSpeakResult = try await GatewayConnection.shared.requestDecoded(
method: .talkSpeak,
params: params,
timeoutMs: max(30000, input.synthTimeoutSeconds * 1000 + 5000))
// Treat undecodable or empty audio as a failure so the caller can fall back to the system voice.
guard let audioData = Data(base64Encoded: result.audiobase64), !audioData.isEmpty else {
throw NSError(domain: "TalkSpeak", code: 1, userInfo: [
NSLocalizedDescriptionKey: "gateway talk.speak returned empty audio",
])
}
// Stop any in-flight local playback before starting the gateway audio.
_ = await self.stopPCM()
_ = await self.stopMP3()
if self.interruptOnSpeech {
// prepareForPlayback returning false means this generation was superseded; bail silently.
guard await self.prepareForPlayback(generation: input.generation) else { return }
}
// Keep the UI controller and the actor's own phase in sync.
await MainActor.run { TalkModeController.shared.updatePhase(.speaking) }
self.phase = .speaking
let playback = await self.playTalkAudio(data: audioData)
self.ttsLogger
.info(
"talk gateway audio provider=\(result.provider, privacy: .public) " +
"format=\(result.outputformat ?? "unknown", privacy: .public) " +
"finished=\(playback.finished, privacy: .public)")
// A deliberate interruption is not an error; only an unexplained early stop is.
if !playback.finished, playback.interruptedAt == nil {
throw NSError(domain: "TalkSpeak", code: 2, userInfo: [
NSLocalizedDescriptionKey: "gateway talk.speak audio playback failed",
])
}
}
private func playSystemVoice(input: TalkPlaybackInput) async throws {
self.ttsLogger.info("talk system voice start chars=\(input.cleanedText.count, privacy: .public)")
if self.interruptOnSpeech {
@@ -847,6 +893,54 @@ actor TalkModeRuntime {
}
extension TalkModeRuntime {
/// Builds the `talk.speak` RPC parameter dictionary.
///
/// Strings are trimmed and dropped when blank; directive values override the
/// resolved `modelId`/`outputFormat`; numeric and boolean directive fields are
/// forwarded verbatim when present.
static func makeTalkSpeakParams(
    text: String,
    voiceId: String?,
    modelId: String?,
    outputFormat: String?,
    directive: TalkDirective?) -> [String: AnyCodable]
{
    var payload: [String: AnyCodable] = ["text": AnyCodable(text)]

    // Insert a string field only when it has non-whitespace content.
    func putTrimmed(_ key: String, _ raw: String?) {
        guard let raw else { return }
        let cleaned = raw.trimmingCharacters(in: .whitespacesAndNewlines)
        if !cleaned.isEmpty { payload[key] = AnyCodable(cleaned) }
    }
    // Insert an optional value verbatim when present.
    func putIfPresent<T>(_ key: String, _ value: T?) {
        if let value { payload[key] = AnyCodable(value) }
    }

    putTrimmed("voiceId", voiceId)
    putTrimmed("modelId", directive?.modelId ?? modelId)
    putTrimmed("outputFormat", directive?.outputFormat ?? outputFormat)
    putIfPresent("speed", directive?.speed)
    putIfPresent("rateWpm", directive?.rateWPM)
    putIfPresent("stability", directive?.stability)
    putIfPresent("similarity", directive?.similarity)
    putIfPresent("style", directive?.style)
    putIfPresent("speakerBoost", directive?.speakerBoost)
    putIfPresent("seed", directive?.seed)
    putTrimmed("normalize", directive?.normalize)
    putTrimmed("language", directive?.language)
    putIfPresent("latencyTier", directive?.latencyTier)
    return payload
}
// MARK: - Audio playback (MainActor helpers)
@MainActor

View File

@@ -1,3 +1,4 @@
import OpenClawKit
import Speech
import Testing
@testable import OpenClaw
@@ -16,23 +17,19 @@ struct TalkModeRuntimeSpeechTests {
let elevenLabsPlan = TalkModeRuntime.playbackPlan(
provider: "elevenlabs",
apiKey: "key",
voiceId: "voice"
)
voiceId: "voice")
let missingKeyPlan = TalkModeRuntime.playbackPlan(
provider: "elevenlabs",
apiKey: nil,
voiceId: "voice"
)
voiceId: "voice")
let missingVoicePlan = TalkModeRuntime.playbackPlan(
provider: "elevenlabs",
apiKey: "key",
voiceId: nil
)
voiceId: nil)
let blankKeyPlan = TalkModeRuntime.playbackPlan(
provider: "elevenlabs",
apiKey: "",
voiceId: "voice"
)
voiceId: "voice")
let mlxPlan = TalkModeRuntime.playbackPlan(provider: "mlx", apiKey: nil, voiceId: nil)
let systemPlan = TalkModeRuntime.playbackPlan(provider: "system", apiKey: nil, voiceId: nil)
@@ -43,4 +40,40 @@ struct TalkModeRuntimeSpeechTests {
#expect(mlxPlan == .mlxThenSystemVoice)
#expect(systemPlan == .systemVoiceOnly)
}
// Verifies makeTalkSpeakParams: the resolved voiceId passes through, directive
// modelId/outputFormat override the resolved values, and every numeric/boolean/string
// directive field lands under its expected wire key (note rateWPM -> "rateWpm").
@Test func `talk speak params carry resolved voice and directive overrides`() {
let params = TalkModeRuntime.makeTalkSpeakParams(
text: "hello",
voiceId: "voice-123",
modelId: "eleven_v3",
outputFormat: "mp3_44100_128",
directive: TalkDirective(
modelId: "eleven_turbo_v2_5",
speed: 1.1,
rateWPM: 180,
stability: 0.4,
similarity: 0.7,
style: 0.2,
speakerBoost: true,
seed: 42,
normalize: "auto",
language: "en",
outputFormat: "mp3_44100_128",
latencyTier: 3))
#expect(params["text"]?.value as? String == "hello")
#expect(params["voiceId"]?.value as? String == "voice-123")
// Directive modelId wins over the resolved "eleven_v3".
#expect(params["modelId"]?.value as? String == "eleven_turbo_v2_5")
#expect(params["outputFormat"]?.value as? String == "mp3_44100_128")
#expect(params["speed"]?.value as? Double == 1.1)
#expect(params["rateWpm"]?.value as? Int == 180)
#expect(params["stability"]?.value as? Double == 0.4)
#expect(params["similarity"]?.value as? Double == 0.7)
#expect(params["style"]?.value as? Double == 0.2)
#expect(params["speakerBoost"]?.value as? Bool == true)
#expect(params["seed"]?.value as? Int == 42)
#expect(params["normalize"]?.value as? String == "auto")
#expect(params["language"]?.value as? String == "en")
#expect(params["latencyTier"]?.value as? Int == 3)
}
}