mirror of
https://fastgit.cc/github.com/openclaw/openclaw
synced 2026-05-01 06:36:23 +08:00
fix(macos): retry talk tts via gateway
This commit is contained in:
@@ -82,6 +82,7 @@ Docs: https://docs.openclaw.ai
|
||||
- Providers/OpenAI-compatible: treat singular MLX-style `finish_reason: "tool_call"` as tool use instead of a provider error. Fixes #61499.
|
||||
- Providers/ElevenLabs: omit the MP3-only `Accept` header for PCM telephony synthesis, so Voice Call requests for `pcm_22050` no longer receive MP3 audio. Fixes #67340. Thanks @marcchabot.
|
||||
- Providers/Microsoft TTS: honor legacy `messages.tts.providers.edge` voice settings after normalizing Edge TTS to the Microsoft provider. Fixes #64153.
|
||||
- macOS Talk Mode: retry failed local ElevenLabs stream playback through gateway `talk.speak` before falling back to the system voice, so configured ElevenLabs voices still play when streaming playback fails. Fixes #65662.
|
||||
- Plugins/Voice Call: reap stale pre-answer calls by default, honor configured TTS timeouts for Twilio media-stream playback, and fail empty telephony audio instead of completing as silence. Fixes #42071; supersedes #60957. Thanks @Ryce and @sliekens.
|
||||
- Plugins/Voice Call: terminate expired restored call sessions with the provider and restart restored max-duration timers with only the remaining duration, preventing stale outbound retry loops after Gateway restarts. Fixes #48739. Thanks @mira-solari.
|
||||
- Plugins/Voice Call: start provider STT after Telnyx outbound conversation greetings and pass configured Telnyx voice IDs through to the speak action. Fixes #56091. Thanks @Roshan.
|
||||
|
||||
@@ -70,6 +70,7 @@ actor GatewayConnection {
|
||||
case wizardStatus = "wizard.status"
|
||||
case talkConfig = "talk.config"
|
||||
case talkMode = "talk.mode"
|
||||
case talkSpeak = "talk.speak"
|
||||
case webLoginStart = "web.login.start"
|
||||
case webLoginWait = "web.login.wait"
|
||||
case channelsLogout = "channels.logout"
|
||||
|
||||
@@ -2,6 +2,7 @@ import AVFoundation
|
||||
import Foundation
|
||||
import OpenClawChatUI
|
||||
import OpenClawKit
|
||||
import OpenClawProtocol
|
||||
import OSLog
|
||||
import Speech
|
||||
|
||||
@@ -475,7 +476,16 @@ actor TalkModeRuntime {
|
||||
self.ttsLogger
|
||||
.error(
|
||||
"talk TTS failed: \(error.localizedDescription, privacy: .public); " +
|
||||
"falling back to system voice")
|
||||
"retrying gateway talk.speak")
|
||||
do {
|
||||
try await self.playGatewayTalkSpeak(input: input)
|
||||
return
|
||||
} catch {
|
||||
self.ttsLogger
|
||||
.error(
|
||||
"talk gateway TTS failed: \(error.localizedDescription, privacy: .public); " +
|
||||
"falling back to system voice")
|
||||
}
|
||||
do {
|
||||
try await self.playSystemVoice(input: input)
|
||||
} catch {
|
||||
@@ -720,6 +730,42 @@ actor TalkModeRuntime {
|
||||
return await self.playMP3(stream: stream)
|
||||
}
|
||||
|
||||
private func playGatewayTalkSpeak(input: TalkPlaybackInput) async throws {
|
||||
let params = Self.makeTalkSpeakParams(
|
||||
text: input.cleanedText,
|
||||
voiceId: input.voiceId,
|
||||
modelId: self.currentModelId ?? self.defaultModelId,
|
||||
outputFormat: self.defaultOutputFormat,
|
||||
directive: input.directive)
|
||||
let result: TalkSpeakResult = try await GatewayConnection.shared.requestDecoded(
|
||||
method: .talkSpeak,
|
||||
params: params,
|
||||
timeoutMs: max(30000, input.synthTimeoutSeconds * 1000 + 5000))
|
||||
guard let audioData = Data(base64Encoded: result.audiobase64), !audioData.isEmpty else {
|
||||
throw NSError(domain: "TalkSpeak", code: 1, userInfo: [
|
||||
NSLocalizedDescriptionKey: "gateway talk.speak returned empty audio",
|
||||
])
|
||||
}
|
||||
_ = await self.stopPCM()
|
||||
_ = await self.stopMP3()
|
||||
if self.interruptOnSpeech {
|
||||
guard await self.prepareForPlayback(generation: input.generation) else { return }
|
||||
}
|
||||
await MainActor.run { TalkModeController.shared.updatePhase(.speaking) }
|
||||
self.phase = .speaking
|
||||
let playback = await self.playTalkAudio(data: audioData)
|
||||
self.ttsLogger
|
||||
.info(
|
||||
"talk gateway audio provider=\(result.provider, privacy: .public) " +
|
||||
"format=\(result.outputformat ?? "unknown", privacy: .public) " +
|
||||
"finished=\(playback.finished, privacy: .public)")
|
||||
if !playback.finished, playback.interruptedAt == nil {
|
||||
throw NSError(domain: "TalkSpeak", code: 2, userInfo: [
|
||||
NSLocalizedDescriptionKey: "gateway talk.speak audio playback failed",
|
||||
])
|
||||
}
|
||||
}
|
||||
|
||||
private func playSystemVoice(input: TalkPlaybackInput) async throws {
|
||||
self.ttsLogger.info("talk system voice start chars=\(input.cleanedText.count, privacy: .public)")
|
||||
if self.interruptOnSpeech {
|
||||
@@ -847,6 +893,54 @@ actor TalkModeRuntime {
|
||||
}
|
||||
|
||||
extension TalkModeRuntime {
|
||||
static func makeTalkSpeakParams(
|
||||
text: String,
|
||||
voiceId: String?,
|
||||
modelId: String?,
|
||||
outputFormat: String?,
|
||||
directive: TalkDirective?) -> [String: AnyCodable]
|
||||
{
|
||||
var params: [String: AnyCodable] = ["text": AnyCodable(text)]
|
||||
|
||||
func addString(_ key: String, _ value: String?) {
|
||||
let trimmed = value?.trimmingCharacters(in: .whitespacesAndNewlines) ?? ""
|
||||
guard !trimmed.isEmpty else { return }
|
||||
params[key] = AnyCodable(trimmed)
|
||||
}
|
||||
|
||||
addString("voiceId", voiceId)
|
||||
addString("modelId", directive?.modelId ?? modelId)
|
||||
addString("outputFormat", directive?.outputFormat ?? outputFormat)
|
||||
if let speed = directive?.speed {
|
||||
params["speed"] = AnyCodable(speed)
|
||||
}
|
||||
if let rateWPM = directive?.rateWPM {
|
||||
params["rateWpm"] = AnyCodable(rateWPM)
|
||||
}
|
||||
if let stability = directive?.stability {
|
||||
params["stability"] = AnyCodable(stability)
|
||||
}
|
||||
if let similarity = directive?.similarity {
|
||||
params["similarity"] = AnyCodable(similarity)
|
||||
}
|
||||
if let style = directive?.style {
|
||||
params["style"] = AnyCodable(style)
|
||||
}
|
||||
if let speakerBoost = directive?.speakerBoost {
|
||||
params["speakerBoost"] = AnyCodable(speakerBoost)
|
||||
}
|
||||
if let seed = directive?.seed {
|
||||
params["seed"] = AnyCodable(seed)
|
||||
}
|
||||
addString("normalize", directive?.normalize)
|
||||
addString("language", directive?.language)
|
||||
if let latencyTier = directive?.latencyTier {
|
||||
params["latencyTier"] = AnyCodable(latencyTier)
|
||||
}
|
||||
|
||||
return params
|
||||
}
|
||||
|
||||
// MARK: - Audio playback (MainActor helpers)
|
||||
|
||||
@MainActor
|
||||
|
||||
@@ -1,3 +1,4 @@
|
||||
import OpenClawKit
|
||||
import Speech
|
||||
import Testing
|
||||
@testable import OpenClaw
|
||||
@@ -16,23 +17,19 @@ struct TalkModeRuntimeSpeechTests {
|
||||
let elevenLabsPlan = TalkModeRuntime.playbackPlan(
|
||||
provider: "elevenlabs",
|
||||
apiKey: "key",
|
||||
voiceId: "voice"
|
||||
)
|
||||
voiceId: "voice")
|
||||
let missingKeyPlan = TalkModeRuntime.playbackPlan(
|
||||
provider: "elevenlabs",
|
||||
apiKey: nil,
|
||||
voiceId: "voice"
|
||||
)
|
||||
voiceId: "voice")
|
||||
let missingVoicePlan = TalkModeRuntime.playbackPlan(
|
||||
provider: "elevenlabs",
|
||||
apiKey: "key",
|
||||
voiceId: nil
|
||||
)
|
||||
voiceId: nil)
|
||||
let blankKeyPlan = TalkModeRuntime.playbackPlan(
|
||||
provider: "elevenlabs",
|
||||
apiKey: "",
|
||||
voiceId: "voice"
|
||||
)
|
||||
voiceId: "voice")
|
||||
let mlxPlan = TalkModeRuntime.playbackPlan(provider: "mlx", apiKey: nil, voiceId: nil)
|
||||
let systemPlan = TalkModeRuntime.playbackPlan(provider: "system", apiKey: nil, voiceId: nil)
|
||||
|
||||
@@ -43,4 +40,40 @@ struct TalkModeRuntimeSpeechTests {
|
||||
#expect(mlxPlan == .mlxThenSystemVoice)
|
||||
#expect(systemPlan == .systemVoiceOnly)
|
||||
}
|
||||
|
||||
@Test func `talk speak params carry resolved voice and directive overrides`() {
|
||||
let params = TalkModeRuntime.makeTalkSpeakParams(
|
||||
text: "hello",
|
||||
voiceId: "voice-123",
|
||||
modelId: "eleven_v3",
|
||||
outputFormat: "mp3_44100_128",
|
||||
directive: TalkDirective(
|
||||
modelId: "eleven_turbo_v2_5",
|
||||
speed: 1.1,
|
||||
rateWPM: 180,
|
||||
stability: 0.4,
|
||||
similarity: 0.7,
|
||||
style: 0.2,
|
||||
speakerBoost: true,
|
||||
seed: 42,
|
||||
normalize: "auto",
|
||||
language: "en",
|
||||
outputFormat: "mp3_44100_128",
|
||||
latencyTier: 3))
|
||||
|
||||
#expect(params["text"]?.value as? String == "hello")
|
||||
#expect(params["voiceId"]?.value as? String == "voice-123")
|
||||
#expect(params["modelId"]?.value as? String == "eleven_turbo_v2_5")
|
||||
#expect(params["outputFormat"]?.value as? String == "mp3_44100_128")
|
||||
#expect(params["speed"]?.value as? Double == 1.1)
|
||||
#expect(params["rateWpm"]?.value as? Int == 180)
|
||||
#expect(params["stability"]?.value as? Double == 0.4)
|
||||
#expect(params["similarity"]?.value as? Double == 0.7)
|
||||
#expect(params["style"]?.value as? Double == 0.2)
|
||||
#expect(params["speakerBoost"]?.value as? Bool == true)
|
||||
#expect(params["seed"]?.value as? Int == 42)
|
||||
#expect(params["normalize"]?.value as? String == "auto")
|
||||
#expect(params["language"]?.value as? String == "en")
|
||||
#expect(params["latencyTier"]?.value as? Int == 3)
|
||||
}
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user