fix(voice-call): honor telephony tts timeout

This commit is contained in:
Peter Steinberger
2026-04-25 03:47:49 +01:00
parent e5babbb5e7
commit 7dc005fab6
5 changed files with 82 additions and 9 deletions

View File

@@ -71,6 +71,7 @@ Docs: https://docs.openclaw.ai
- Dashboard/Windows: open Control UI and OAuth URLs through the system URL handler without `cmd.exe` parsing or PATH-based `rundll32` lookup, and reject non-HTTP browser-open inputs. Fixes #71098. Thanks @Sanjays2402.
- Providers/OpenAI: separate API-key and Codex sign-in onboarding groups, and avoid replaying stale OpenAI Responses reasoning blocks after a model route switch.
- Providers/ElevenLabs: omit the MP3-only `Accept` header for PCM telephony synthesis, so Voice Call requests for `pcm_22050` no longer receive MP3 audio. Fixes #67340. Thanks @marcchabot.
- Plugins/Voice Call: honor configured TTS timeouts for Twilio media-stream playback and fail empty telephony audio instead of completing as silence. Fixes #42071; supersedes #60957. Thanks @Ryce and @sliekens.
- Skills: honor legacy `metadata.clawdbot` requirements and installer hints when `metadata.openclaw` is absent, so older skills no longer appear ready when required binaries are missing. Fixes #71323. Thanks @chen-zhang-cs-code.
- Browser/config: expand `~` in `browser.executablePath` before Chromium launch, so home-relative custom browser paths no longer fail with `ENOENT`. Fixes #67264. Thanks @Quratulain-bilal.
- Telegram/streaming: hide tool-progress status updates by default while keeping explicit `streaming.preview.toolProgress` opt-in support for edited preview messages. Fixes #71320. Thanks @neeravmakwana.

View File

@@ -335,6 +335,7 @@ describe("TwilioProvider", () => {
provider.setMediaStreamHandler(mediaStreamHandler as never);
provider.setTTSProvider({
synthesisTimeoutMs: 5000,
synthesizeForTelephony: async () => await new Promise<Buffer>(() => {}),
});
@@ -344,8 +345,8 @@ describe("TwilioProvider", () => {
providerCallId: "CA-timeout",
text: "Timeout me",
}),
).rejects.toThrow("Telephony TTS synthesis timed out");
await vi.advanceTimersByTimeAsync(8_100);
).rejects.toThrow("Telephony TTS synthesis timed out after 5000ms");
await vi.advanceTimersByTimeAsync(5_100);
await playExpectation;
expect(sendAudio).toHaveBeenCalled();
expect(sendMark).not.toHaveBeenCalled();
@@ -373,6 +374,7 @@ describe("TwilioProvider", () => {
provider.setMediaStreamHandler(mediaStreamHandler as never);
provider.setTTSProvider({
synthesisTimeoutMs: 5000,
synthesizeForTelephony: async () => Buffer.alloc(320),
});
@@ -386,4 +388,38 @@ describe("TwilioProvider", () => {
expect(sendAudio).toHaveBeenCalled();
expect(sendMark).toHaveBeenCalledTimes(1);
});
it("fails stream playback when telephony synthesis returns empty audio", async () => {
  // Spies standing in for the media stream's outbound audio/mark channels.
  const audioSpy = vi.fn();
  const markSpy = vi.fn();

  const twilio = createProvider();
  twilio.registerCallStream("CA-empty", "MZ-empty");

  // Minimal handler: run the queued playback callback right away with a
  // signal that is never aborted.
  const streamHandler = {
    sendAudio: audioSpy,
    sendMark: markSpy,
    async queueTts(
      _sid: string,
      play: (signal: AbortSignal) => Promise<void>,
    ): Promise<void> {
      const controller = new AbortController();
      await play(controller.signal);
    },
  };
  twilio.setMediaStreamHandler(streamHandler as never);

  // Synthesis resolves immediately, but with a zero-length buffer.
  twilio.setTTSProvider({
    synthesisTimeoutMs: 5000,
    synthesizeForTelephony: async () => {
      return Buffer.alloc(0);
    },
  });

  const playback = twilio.playTts({
    callId: "call-empty",
    providerCallId: "CA-empty",
    text: "Empty audio",
  });

  // Playback must reject rather than finish without any speech.
  await expect(playback).rejects.toThrow("Telephony TTS produced no audio");
  expect(audioSpy).toHaveBeenCalled();
  // No completion mark may be emitted for a failed playback.
  expect(markSpy).not.toHaveBeenCalled();
});
});

View File

@@ -63,7 +63,6 @@ type StreamSendResult = {
export class TwilioProvider implements VoiceCallProvider {
readonly name = "twilio" as const;
private static readonly TTS_SYNTH_TIMEOUT_MS = 8000;
private readonly accountSid: string;
private readonly authToken: string;
@@ -672,16 +671,13 @@ export class TwilioProvider implements VoiceCallProvider {
// Generate audio with core TTS (returns mu-law at 8kHz)
let muLawAudio: Buffer;
let synthTimeout: ReturnType<typeof setTimeout> | null = null;
const synthTimeoutMs = ttsProvider.synthesisTimeoutMs;
try {
const synthPromise = ttsProvider.synthesizeForTelephony(text);
const timeoutPromise = new Promise<Buffer>((_, reject) => {
synthTimeout = setTimeout(() => {
reject(
new Error(
`Telephony TTS synthesis timed out after ${TwilioProvider.TTS_SYNTH_TIMEOUT_MS}ms`,
),
);
}, TwilioProvider.TTS_SYNTH_TIMEOUT_MS);
reject(new Error(`Telephony TTS synthesis timed out after ${synthTimeoutMs}ms`));
}, synthTimeoutMs);
});
muLawAudio = await Promise.race([synthPromise, timeoutPromise]);
} finally {
@@ -691,6 +687,10 @@ export class TwilioProvider implements VoiceCallProvider {
clearInterval(keepAlive);
}
if (muLawAudio.length === 0) {
throw new Error("Telephony TTS produced no audio");
}
let chunkAttempts = 0;
let chunkDelivered = 0;
let nextChunkDueAt = Date.now() + CHUNK_DELAY_MS;

View File

@@ -116,4 +116,34 @@ describe("createTelephonyTtsProvider deepMerge hardening", () => {
"[voice-call] Telephony TTS fallback used from=elevenlabs to=microsoft attempts=elevenlabs -> microsoft",
);
});
it("exposes configured timeoutMs as synthesisTimeoutMs", () => {
  // The runtime stub is never invoked here; only the provider's timeout
  // field is read.
  const { synthesisTimeoutMs } = createTelephonyTtsProvider({
    coreConfig: {
      messages: {
        tts: { provider: "openai", timeoutMs: 15000 },
      },
    },
    runtime: {
      textToSpeechTelephony: async () => {
        return { success: true, audioBuffer: Buffer.alloc(2), sampleRate: 8000 };
      },
    },
  });

  // The configured value is surfaced unchanged.
  expect(synthesisTimeoutMs).toBe(15000);
});
it("keeps the telephony timeout default when timeoutMs is not configured", () => {
  // Uses the suite's core-config helper; the test expects it to leave the
  // TTS timeout unset so that the default applies.
  const { synthesisTimeoutMs } = createTelephonyTtsProvider({
    coreConfig: createCoreConfig(),
    runtime: {
      textToSpeechTelephony: async () => {
        return { success: true, audioBuffer: Buffer.alloc(2), sampleRate: 8000 };
      },
    },
  });

  // With no configured timeout, the 8000 ms telephony default is reported.
  expect(synthesisTimeoutMs).toBe(8000);
});
});

View File

@@ -20,9 +20,12 @@ export type TelephonyTtsRuntime = {
};
/**
 * Text-to-speech provider handed to voice-call telephony providers.
 *
 * The Twilio provider reads both members: it starts `synthesizeForTelephony`
 * and races it against a timer of `synthesisTimeoutMs`.
 */
export type TelephonyTtsProvider = {
/**
 * Upper bound, in milliseconds, that a caller should wait for
 * `synthesizeForTelephony` before treating the synthesis as timed out.
 */
synthesisTimeoutMs: number;
/**
 * Synthesizes `text` and resolves with the audio bytes.
 * NOTE(review): the Twilio call site describes the result as mu-law at 8kHz
 * and rejects a zero-length buffer — confirm the encoding against the
 * implementation.
 */
synthesizeForTelephony: (text: string) => Promise<Buffer>;
};
/**
 * Synthesis timeout, in milliseconds, used by `createTelephonyTtsProvider`
 * when the merged config has no `messages.tts.timeoutMs` value.
 */
const TELEPHONY_DEFAULT_TTS_TIMEOUT_MS = 8000;
export function createTelephonyTtsProvider(params: {
coreConfig: CoreConfig;
ttsOverride?: VoiceCallTtsConfig;
@@ -33,8 +36,11 @@ export function createTelephonyTtsProvider(params: {
}): TelephonyTtsProvider {
const { coreConfig, ttsOverride, runtime, logger } = params;
const mergedConfig = applyTtsOverride(coreConfig, ttsOverride);
const synthesisTimeoutMs =
mergedConfig.messages?.tts?.timeoutMs ?? TELEPHONY_DEFAULT_TTS_TIMEOUT_MS;
return {
synthesisTimeoutMs,
synthesizeForTelephony: async (text: string) => {
const result = await runtime.textToSpeechTelephony({
text,