diff --git a/CHANGELOG.md b/CHANGELOG.md index 05b2340b51c..c99a9564015 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -71,6 +71,7 @@ Docs: https://docs.openclaw.ai - Dashboard/Windows: open Control UI and OAuth URLs through the system URL handler without `cmd.exe` parsing or PATH-based `rundll32` lookup, and reject non-HTTP browser-open inputs. Fixes #71098. Thanks @Sanjays2402. - Providers/OpenAI: separate API-key and Codex sign-in onboarding groups, and avoid replaying stale OpenAI Responses reasoning blocks after a model route switch. - Providers/ElevenLabs: omit the MP3-only `Accept` header for PCM telephony synthesis, so Voice Call requests for `pcm_22050` no longer receive MP3 audio. Fixes #67340. Thanks @marcchabot. +- Plugins/Voice Call: honor configured TTS timeouts for Twilio media-stream playback and fail playback when telephony synthesis returns empty audio instead of completing as silence. Fixes #42071; supersedes #60957. Thanks @Ryce and @sliekens. - Skills: honor legacy `metadata.clawdbot` requirements and installer hints when `metadata.openclaw` is absent, so older skills no longer appear ready when required binaries are missing. Fixes #71323. Thanks @chen-zhang-cs-code. - Browser/config: expand `~` in `browser.executablePath` before Chromium launch, so home-relative custom browser paths no longer fail with `ENOENT`. Fixes #67264. Thanks @Quratulain-bilal. - Telegram/streaming: hide tool-progress status updates by default while keeping explicit `streaming.preview.toolProgress` opt-in support for edited preview messages. Fixes #71320. Thanks @neeravmakwana. 
diff --git a/extensions/voice-call/src/providers/twilio.test.ts b/extensions/voice-call/src/providers/twilio.test.ts index 836701422f7..ec1f7793015 100644 --- a/extensions/voice-call/src/providers/twilio.test.ts +++ b/extensions/voice-call/src/providers/twilio.test.ts @@ -335,6 +335,7 @@ describe("TwilioProvider", () => { provider.setMediaStreamHandler(mediaStreamHandler as never); provider.setTTSProvider({ + synthesisTimeoutMs: 5000, synthesizeForTelephony: async () => await new Promise(() => {}), }); @@ -344,8 +345,8 @@ describe("TwilioProvider", () => { providerCallId: "CA-timeout", text: "Timeout me", }), - ).rejects.toThrow("Telephony TTS synthesis timed out"); - await vi.advanceTimersByTimeAsync(8_100); + ).rejects.toThrow("Telephony TTS synthesis timed out after 5000ms"); + await vi.advanceTimersByTimeAsync(5_100); await playExpectation; expect(sendAudio).toHaveBeenCalled(); expect(sendMark).not.toHaveBeenCalled(); @@ -373,6 +374,7 @@ describe("TwilioProvider", () => { provider.setMediaStreamHandler(mediaStreamHandler as never); provider.setTTSProvider({ + synthesisTimeoutMs: 5000, synthesizeForTelephony: async () => Buffer.alloc(320), }); @@ -386,4 +388,38 @@ describe("TwilioProvider", () => { expect(sendAudio).toHaveBeenCalled(); expect(sendMark).toHaveBeenCalledTimes(1); }); + + it("fails stream playback when telephony synthesis returns empty audio", async () => { + const provider = createProvider(); + provider.registerCallStream("CA-empty", "MZ-empty"); + + const sendAudio = vi.fn(); + const sendMark = vi.fn(); + const mediaStreamHandler = { + queueTts: async ( + _streamSid: string, + playFn: (signal: AbortSignal) => Promise, + ): Promise => { + await playFn(new AbortController().signal); + }, + sendAudio, + sendMark, + }; + + provider.setMediaStreamHandler(mediaStreamHandler as never); + provider.setTTSProvider({ + synthesisTimeoutMs: 5000, + synthesizeForTelephony: async () => Buffer.alloc(0), + }); + + await expect( + provider.playTts({ + callId: 
"call-empty", + providerCallId: "CA-empty", + text: "Empty audio", + }), + ).rejects.toThrow("Telephony TTS produced no audio"); + expect(sendAudio).toHaveBeenCalled(); + expect(sendMark).not.toHaveBeenCalled(); + }); }); diff --git a/extensions/voice-call/src/providers/twilio.ts b/extensions/voice-call/src/providers/twilio.ts index e977b94c00f..0a7bb0377c8 100644 --- a/extensions/voice-call/src/providers/twilio.ts +++ b/extensions/voice-call/src/providers/twilio.ts @@ -63,7 +63,6 @@ type StreamSendResult = { export class TwilioProvider implements VoiceCallProvider { readonly name = "twilio" as const; - private static readonly TTS_SYNTH_TIMEOUT_MS = 8000; private readonly accountSid: string; private readonly authToken: string; @@ -672,16 +671,13 @@ export class TwilioProvider implements VoiceCallProvider { // Generate audio with core TTS (returns mu-law at 8kHz) let muLawAudio: Buffer; let synthTimeout: ReturnType | null = null; + const synthTimeoutMs = ttsProvider.synthesisTimeoutMs; try { const synthPromise = ttsProvider.synthesizeForTelephony(text); const timeoutPromise = new Promise((_, reject) => { synthTimeout = setTimeout(() => { - reject( - new Error( - `Telephony TTS synthesis timed out after ${TwilioProvider.TTS_SYNTH_TIMEOUT_MS}ms`, - ), - ); - }, TwilioProvider.TTS_SYNTH_TIMEOUT_MS); + reject(new Error(`Telephony TTS synthesis timed out after ${synthTimeoutMs}ms`)); + }, synthTimeoutMs); }); muLawAudio = await Promise.race([synthPromise, timeoutPromise]); } finally { @@ -691,6 +687,10 @@ export class TwilioProvider implements VoiceCallProvider { clearInterval(keepAlive); } + if (muLawAudio.length === 0) { + throw new Error("Telephony TTS produced no audio"); + } + let chunkAttempts = 0; let chunkDelivered = 0; let nextChunkDueAt = Date.now() + CHUNK_DELAY_MS; diff --git a/extensions/voice-call/src/telephony-tts.test.ts b/extensions/voice-call/src/telephony-tts.test.ts index 75782787d0b..c1e26f868fc 100644 --- 
a/extensions/voice-call/src/telephony-tts.test.ts +++ b/extensions/voice-call/src/telephony-tts.test.ts @@ -116,4 +116,34 @@ describe("createTelephonyTtsProvider deepMerge hardening", () => { "[voice-call] Telephony TTS fallback used from=elevenlabs to=microsoft attempts=elevenlabs -> microsoft", ); }); + + it("exposes configured timeoutMs as synthesisTimeoutMs", () => { + const provider = createTelephonyTtsProvider({ + coreConfig: { messages: { tts: { provider: "openai", timeoutMs: 15000 } } }, + runtime: { + textToSpeechTelephony: async () => ({ + success: true, + audioBuffer: Buffer.alloc(2), + sampleRate: 8000, + }), + }, + }); + + expect(provider.synthesisTimeoutMs).toBe(15000); + }); + + it("keeps the telephony timeout default when timeoutMs is not configured", () => { + const provider = createTelephonyTtsProvider({ + coreConfig: createCoreConfig(), + runtime: { + textToSpeechTelephony: async () => ({ + success: true, + audioBuffer: Buffer.alloc(2), + sampleRate: 8000, + }), + }, + }); + + expect(provider.synthesisTimeoutMs).toBe(8000); + }); }); diff --git a/extensions/voice-call/src/telephony-tts.ts b/extensions/voice-call/src/telephony-tts.ts index bfedb4db1f7..98e319943c5 100644 --- a/extensions/voice-call/src/telephony-tts.ts +++ b/extensions/voice-call/src/telephony-tts.ts @@ -20,9 +20,12 @@ export type TelephonyTtsRuntime = { }; export type TelephonyTtsProvider = { + synthesisTimeoutMs: number; synthesizeForTelephony: (text: string) => Promise; }; +const TELEPHONY_DEFAULT_TTS_TIMEOUT_MS = 8000; + export function createTelephonyTtsProvider(params: { coreConfig: CoreConfig; ttsOverride?: VoiceCallTtsConfig; @@ -33,8 +36,11 @@ export function createTelephonyTtsProvider(params: { }): TelephonyTtsProvider { const { coreConfig, ttsOverride, runtime, logger } = params; const mergedConfig = applyTtsOverride(coreConfig, ttsOverride); + const synthesisTimeoutMs = + mergedConfig.messages?.tts?.timeoutMs ?? 
TELEPHONY_DEFAULT_TTS_TIMEOUT_MS; return { + synthesisTimeoutMs, synthesizeForTelephony: async (text: string) => { const result = await runtime.textToSpeechTelephony({ text,