fix(voice-call): honor telephony tts timeout

2026-04-30 22:12:32 +08:00 · 2026-04-25 03:47:49 +01:00
parent e5babbb5e7
commit 7dc005fab6
5 changed files with 82 additions and 9 deletions
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -71,6 +71,7 @@ Docs: https://docs.openclaw.ai
 - Dashboard/Windows: open Control UI and OAuth URLs through the system URL handler without `cmd.exe` parsing or PATH-based `rundll32` lookup, and reject non-HTTP browser-open inputs. Fixes #71098. Thanks @Sanjays2402.
 - Providers/OpenAI: separate API-key and Codex sign-in onboarding groups, and avoid replaying stale OpenAI Responses reasoning blocks after a model route switch.
 - Providers/ElevenLabs: omit the MP3-only `Accept` header for PCM telephony synthesis, so Voice Call requests for `pcm_22050` no longer receive MP3 audio. Fixes #67340. Thanks @marcchabot.
+- Plugins/Voice Call: honor configured TTS timeouts for Twilio media-stream playback, and fail playback when telephony synthesis returns empty audio instead of completing as silence. Fixes #42071; supersedes #60957. Thanks @Ryce and @sliekens.
 - Skills: honor legacy `metadata.clawdbot` requirements and installer hints when `metadata.openclaw` is absent, so older skills no longer appear ready when required binaries are missing. Fixes #71323. Thanks @chen-zhang-cs-code.
 - Browser/config: expand `~` in `browser.executablePath` before Chromium launch, so home-relative custom browser paths no longer fail with `ENOENT`. Fixes #67264. Thanks @Quratulain-bilal.
 - Telegram/streaming: hide tool-progress status updates by default while keeping explicit `streaming.preview.toolProgress` opt-in support for edited preview messages. Fixes #71320. Thanks @neeravmakwana.
--- a/extensions/voice-call/src/providers/twilio.test.ts
+++ b/extensions/voice-call/src/providers/twilio.test.ts
@@ -335,6 +335,7 @@ describe("TwilioProvider", () => {

      provider.setMediaStreamHandler(mediaStreamHandler as never);
      provider.setTTSProvider({
+        synthesisTimeoutMs: 5000,
        synthesizeForTelephony: async () => await new Promise<Buffer>(() => {}),
      });

@@ -344,8 +345,8 @@ describe("TwilioProvider", () => {
          providerCallId: "CA-timeout",
          text: "Timeout me",
        }),
-      ).rejects.toThrow("Telephony TTS synthesis timed out");
-      await vi.advanceTimersByTimeAsync(8_100);
+      ).rejects.toThrow("Telephony TTS synthesis timed out after 5000ms");
+      await vi.advanceTimersByTimeAsync(5_100);
      await playExpectation;
      expect(sendAudio).toHaveBeenCalled();
      expect(sendMark).not.toHaveBeenCalled();
@@ -373,6 +374,7 @@ describe("TwilioProvider", () => {

    provider.setMediaStreamHandler(mediaStreamHandler as never);
    provider.setTTSProvider({
+      synthesisTimeoutMs: 5000,
      synthesizeForTelephony: async () => Buffer.alloc(320),
    });

@@ -386,4 +388,38 @@ describe("TwilioProvider", () => {
    expect(sendAudio).toHaveBeenCalled();
    expect(sendMark).toHaveBeenCalledTimes(1);
  });
+
+  it("fails stream playback when telephony synthesis returns empty audio", async () => {
+    const provider = createProvider();
+    provider.registerCallStream("CA-empty", "MZ-empty");
+
+    const sendAudio = vi.fn();
+    const sendMark = vi.fn();
+    const mediaStreamHandler = {
+      queueTts: async (
+        _streamSid: string,
+        playFn: (signal: AbortSignal) => Promise<void>,
+      ): Promise<void> => {
+        await playFn(new AbortController().signal);
+      },
+      sendAudio,
+      sendMark,
+    };
+
+    provider.setMediaStreamHandler(mediaStreamHandler as never);
+    provider.setTTSProvider({
+      synthesisTimeoutMs: 5000,
+      synthesizeForTelephony: async () => Buffer.alloc(0),
+    });
+
+    await expect(
+      provider.playTts({
+        callId: "call-empty",
+        providerCallId: "CA-empty",
+        text: "Empty audio",
+      }),
+    ).rejects.toThrow("Telephony TTS produced no audio");
+    expect(sendAudio).toHaveBeenCalled();
+    expect(sendMark).not.toHaveBeenCalled();
+  });
 });
--- a/extensions/voice-call/src/providers/twilio.ts
+++ b/extensions/voice-call/src/providers/twilio.ts
@@ -63,7 +63,6 @@ type StreamSendResult = {

 export class TwilioProvider implements VoiceCallProvider {
  readonly name = "twilio" as const;
-  private static readonly TTS_SYNTH_TIMEOUT_MS = 8000;

  private readonly accountSid: string;
  private readonly authToken: string;
@@ -672,16 +671,13 @@ export class TwilioProvider implements VoiceCallProvider {
      // Generate audio with core TTS (returns mu-law at 8kHz)
      let muLawAudio: Buffer;
      let synthTimeout: ReturnType<typeof setTimeout> | null = null;
+      const synthTimeoutMs = ttsProvider.synthesisTimeoutMs;
      try {
        const synthPromise = ttsProvider.synthesizeForTelephony(text);
        const timeoutPromise = new Promise<Buffer>((_, reject) => {
          synthTimeout = setTimeout(() => {
-            reject(
-              new Error(
-                `Telephony TTS synthesis timed out after ${TwilioProvider.TTS_SYNTH_TIMEOUT_MS}ms`,
-              ),
-            );
-          }, TwilioProvider.TTS_SYNTH_TIMEOUT_MS);
+            reject(new Error(`Telephony TTS synthesis timed out after ${synthTimeoutMs}ms`));
+          }, synthTimeoutMs);
        });
        muLawAudio = await Promise.race([synthPromise, timeoutPromise]);
      } finally {
@@ -691,6 +687,10 @@ export class TwilioProvider implements VoiceCallProvider {
        clearInterval(keepAlive);
      }

+      if (muLawAudio.length === 0) {
+        throw new Error("Telephony TTS produced no audio");
+      }
+
      let chunkAttempts = 0;
      let chunkDelivered = 0;
      let nextChunkDueAt = Date.now() + CHUNK_DELAY_MS;
--- a/extensions/voice-call/src/telephony-tts.test.ts
+++ b/extensions/voice-call/src/telephony-tts.test.ts
@@ -116,4 +116,34 @@ describe("createTelephonyTtsProvider deepMerge hardening", () => {
      "[voice-call] Telephony TTS fallback used from=elevenlabs to=microsoft attempts=elevenlabs -> microsoft",
    );
  });
+
+  it("exposes configured timeoutMs as synthesisTimeoutMs", () => {
+    const provider = createTelephonyTtsProvider({
+      coreConfig: { messages: { tts: { provider: "openai", timeoutMs: 15000 } } },
+      runtime: {
+        textToSpeechTelephony: async () => ({
+          success: true,
+          audioBuffer: Buffer.alloc(2),
+          sampleRate: 8000,
+        }),
+      },
+    });
+
+    expect(provider.synthesisTimeoutMs).toBe(15000);
+  });
+
+  it("keeps the telephony timeout default when timeoutMs is not configured", () => {
+    const provider = createTelephonyTtsProvider({
+      coreConfig: createCoreConfig(),
+      runtime: {
+        textToSpeechTelephony: async () => ({
+          success: true,
+          audioBuffer: Buffer.alloc(2),
+          sampleRate: 8000,
+        }),
+      },
+    });
+
+    expect(provider.synthesisTimeoutMs).toBe(8000);
+  });
 });
--- a/extensions/voice-call/src/telephony-tts.ts
+++ b/extensions/voice-call/src/telephony-tts.ts
@@ -20,9 +20,12 @@ export type TelephonyTtsRuntime = {
 };

 export type TelephonyTtsProvider = {
+  synthesisTimeoutMs: number;
  synthesizeForTelephony: (text: string) => Promise<Buffer>;
 };

+const TELEPHONY_DEFAULT_TTS_TIMEOUT_MS = 8000;
+
 export function createTelephonyTtsProvider(params: {
  coreConfig: CoreConfig;
  ttsOverride?: VoiceCallTtsConfig;
@@ -33,8 +36,11 @@ export function createTelephonyTtsProvider(params: {
 }): TelephonyTtsProvider {
  const { coreConfig, ttsOverride, runtime, logger } = params;
  const mergedConfig = applyTtsOverride(coreConfig, ttsOverride);
+  const synthesisTimeoutMs =
+    mergedConfig.messages?.tts?.timeoutMs ?? TELEPHONY_DEFAULT_TTS_TIMEOUT_MS;

  return {
+    synthesisTimeoutMs,
    synthesizeForTelephony: async (text: string) => {
      const result = await runtime.textToSpeechTelephony({
        text,