Mirror of https://fastgit.cc/github.com/openclaw/openclaw (synced 2026-04-30 22:12:32 +08:00)
fix(voice-call): honor telephony tts timeout
This commit is contained in:
@@ -71,6 +71,7 @@ Docs: https://docs.openclaw.ai
|
||||
- Dashboard/Windows: open Control UI and OAuth URLs through the system URL handler without `cmd.exe` parsing or PATH-based `rundll32` lookup, and reject non-HTTP browser-open inputs. Fixes #71098. Thanks @Sanjays2402.
|
||||
- Providers/OpenAI: separate API-key and Codex sign-in onboarding groups, and avoid replaying stale OpenAI Responses reasoning blocks after a model route switch.
|
||||
- Providers/ElevenLabs: omit the MP3-only `Accept` header for PCM telephony synthesis, so Voice Call requests for `pcm_22050` no longer receive MP3 audio. Fixes #67340. Thanks @marcchabot.
|
||||
- Plugins/Voice Call: honor configured TTS timeouts for Twilio media-stream playback and fail empty telephony audio instead of completing as silence. Fixes #42071; supersedes #60957. Thanks @Ryce and @sliekens.
|
||||
- Skills: honor legacy `metadata.clawdbot` requirements and installer hints when `metadata.openclaw` is absent, so older skills no longer appear ready when required binaries are missing. Fixes #71323. Thanks @chen-zhang-cs-code.
|
||||
- Browser/config: expand `~` in `browser.executablePath` before Chromium launch, so home-relative custom browser paths no longer fail with `ENOENT`. Fixes #67264. Thanks @Quratulain-bilal.
|
||||
- Telegram/streaming: hide tool-progress status updates by default while keeping explicit `streaming.preview.toolProgress` opt-in support for edited preview messages. Fixes #71320. Thanks @neeravmakwana.
|
||||
|
||||
@@ -335,6 +335,7 @@ describe("TwilioProvider", () => {
|
||||
|
||||
provider.setMediaStreamHandler(mediaStreamHandler as never);
|
||||
provider.setTTSProvider({
|
||||
synthesisTimeoutMs: 5000,
|
||||
synthesizeForTelephony: async () => await new Promise<Buffer>(() => {}),
|
||||
});
|
||||
|
||||
@@ -344,8 +345,8 @@ describe("TwilioProvider", () => {
|
||||
providerCallId: "CA-timeout",
|
||||
text: "Timeout me",
|
||||
}),
|
||||
).rejects.toThrow("Telephony TTS synthesis timed out");
|
||||
await vi.advanceTimersByTimeAsync(8_100);
|
||||
).rejects.toThrow("Telephony TTS synthesis timed out after 5000ms");
|
||||
await vi.advanceTimersByTimeAsync(5_100);
|
||||
await playExpectation;
|
||||
expect(sendAudio).toHaveBeenCalled();
|
||||
expect(sendMark).not.toHaveBeenCalled();
|
||||
@@ -373,6 +374,7 @@ describe("TwilioProvider", () => {
|
||||
|
||||
provider.setMediaStreamHandler(mediaStreamHandler as never);
|
||||
provider.setTTSProvider({
|
||||
synthesisTimeoutMs: 5000,
|
||||
synthesizeForTelephony: async () => Buffer.alloc(320),
|
||||
});
|
||||
|
||||
@@ -386,4 +388,38 @@ describe("TwilioProvider", () => {
|
||||
expect(sendAudio).toHaveBeenCalled();
|
||||
expect(sendMark).toHaveBeenCalledTimes(1);
|
||||
});
|
||||
|
||||
it("fails stream playback when telephony synthesis returns empty audio", async () => {
|
||||
const provider = createProvider();
|
||||
provider.registerCallStream("CA-empty", "MZ-empty");
|
||||
|
||||
const sendAudio = vi.fn();
|
||||
const sendMark = vi.fn();
|
||||
const mediaStreamHandler = {
|
||||
queueTts: async (
|
||||
_streamSid: string,
|
||||
playFn: (signal: AbortSignal) => Promise<void>,
|
||||
): Promise<void> => {
|
||||
await playFn(new AbortController().signal);
|
||||
},
|
||||
sendAudio,
|
||||
sendMark,
|
||||
};
|
||||
|
||||
provider.setMediaStreamHandler(mediaStreamHandler as never);
|
||||
provider.setTTSProvider({
|
||||
synthesisTimeoutMs: 5000,
|
||||
synthesizeForTelephony: async () => Buffer.alloc(0),
|
||||
});
|
||||
|
||||
await expect(
|
||||
provider.playTts({
|
||||
callId: "call-empty",
|
||||
providerCallId: "CA-empty",
|
||||
text: "Empty audio",
|
||||
}),
|
||||
).rejects.toThrow("Telephony TTS produced no audio");
|
||||
expect(sendAudio).toHaveBeenCalled();
|
||||
expect(sendMark).not.toHaveBeenCalled();
|
||||
});
|
||||
});
|
||||
|
||||
@@ -63,7 +63,6 @@ type StreamSendResult = {
|
||||
|
||||
export class TwilioProvider implements VoiceCallProvider {
|
||||
readonly name = "twilio" as const;
|
||||
private static readonly TTS_SYNTH_TIMEOUT_MS = 8000;
|
||||
|
||||
private readonly accountSid: string;
|
||||
private readonly authToken: string;
|
||||
@@ -672,16 +671,13 @@ export class TwilioProvider implements VoiceCallProvider {
|
||||
// Generate audio with core TTS (returns mu-law at 8kHz)
|
||||
let muLawAudio: Buffer;
|
||||
let synthTimeout: ReturnType<typeof setTimeout> | null = null;
|
||||
const synthTimeoutMs = ttsProvider.synthesisTimeoutMs;
|
||||
try {
|
||||
const synthPromise = ttsProvider.synthesizeForTelephony(text);
|
||||
const timeoutPromise = new Promise<Buffer>((_, reject) => {
|
||||
synthTimeout = setTimeout(() => {
|
||||
reject(
|
||||
new Error(
|
||||
`Telephony TTS synthesis timed out after ${TwilioProvider.TTS_SYNTH_TIMEOUT_MS}ms`,
|
||||
),
|
||||
);
|
||||
}, TwilioProvider.TTS_SYNTH_TIMEOUT_MS);
|
||||
reject(new Error(`Telephony TTS synthesis timed out after ${synthTimeoutMs}ms`));
|
||||
}, synthTimeoutMs);
|
||||
});
|
||||
muLawAudio = await Promise.race([synthPromise, timeoutPromise]);
|
||||
} finally {
|
||||
@@ -691,6 +687,10 @@ export class TwilioProvider implements VoiceCallProvider {
|
||||
clearInterval(keepAlive);
|
||||
}
|
||||
|
||||
if (muLawAudio.length === 0) {
|
||||
throw new Error("Telephony TTS produced no audio");
|
||||
}
|
||||
|
||||
let chunkAttempts = 0;
|
||||
let chunkDelivered = 0;
|
||||
let nextChunkDueAt = Date.now() + CHUNK_DELAY_MS;
|
||||
|
||||
@@ -116,4 +116,34 @@ describe("createTelephonyTtsProvider deepMerge hardening", () => {
|
||||
"[voice-call] Telephony TTS fallback used from=elevenlabs to=microsoft attempts=elevenlabs -> microsoft",
|
||||
);
|
||||
});
|
||||
|
||||
it("exposes configured timeoutMs as synthesisTimeoutMs", () => {
|
||||
const provider = createTelephonyTtsProvider({
|
||||
coreConfig: { messages: { tts: { provider: "openai", timeoutMs: 15000 } } },
|
||||
runtime: {
|
||||
textToSpeechTelephony: async () => ({
|
||||
success: true,
|
||||
audioBuffer: Buffer.alloc(2),
|
||||
sampleRate: 8000,
|
||||
}),
|
||||
},
|
||||
});
|
||||
|
||||
expect(provider.synthesisTimeoutMs).toBe(15000);
|
||||
});
|
||||
|
||||
it("keeps the telephony timeout default when timeoutMs is not configured", () => {
|
||||
const provider = createTelephonyTtsProvider({
|
||||
coreConfig: createCoreConfig(),
|
||||
runtime: {
|
||||
textToSpeechTelephony: async () => ({
|
||||
success: true,
|
||||
audioBuffer: Buffer.alloc(2),
|
||||
sampleRate: 8000,
|
||||
}),
|
||||
},
|
||||
});
|
||||
|
||||
expect(provider.synthesisTimeoutMs).toBe(8000);
|
||||
});
|
||||
});
|
||||
|
||||
@@ -20,9 +20,12 @@ export type TelephonyTtsRuntime = {
|
||||
};
|
||||
|
||||
export type TelephonyTtsProvider = {
|
||||
synthesisTimeoutMs: number;
|
||||
synthesizeForTelephony: (text: string) => Promise<Buffer>;
|
||||
};
|
||||
|
||||
const TELEPHONY_DEFAULT_TTS_TIMEOUT_MS = 8000;
|
||||
|
||||
export function createTelephonyTtsProvider(params: {
|
||||
coreConfig: CoreConfig;
|
||||
ttsOverride?: VoiceCallTtsConfig;
|
||||
@@ -33,8 +36,11 @@ export function createTelephonyTtsProvider(params: {
|
||||
}): TelephonyTtsProvider {
|
||||
const { coreConfig, ttsOverride, runtime, logger } = params;
|
||||
const mergedConfig = applyTtsOverride(coreConfig, ttsOverride);
|
||||
const synthesisTimeoutMs =
|
||||
mergedConfig.messages?.tts?.timeoutMs ?? TELEPHONY_DEFAULT_TTS_TIMEOUT_MS;
|
||||
|
||||
return {
|
||||
synthesisTimeoutMs,
|
||||
synthesizeForTelephony: async (text: string) => {
|
||||
const result = await runtime.textToSpeechTelephony({
|
||||
text,
|
||||
|
||||
Reference in New Issue
Block a user