mirror of
https://fastgit.cc/github.com/openclaw/openclaw
synced 2026-04-30 14:02:56 +08:00
refactor(tts): resolve voice delivery from channel capabilities
This commit is contained in:
@@ -150,6 +150,8 @@ Docs: https://docs.openclaw.ai
|
||||
same inbound audio twice. Fixes #70580.
|
||||
- TTS/BlueBubbles: deliver compatible auto-TTS audio as iMessage voice memo
|
||||
bubbles instead of plain MP3/CAF file attachments. Fixes #16848.
|
||||
- TTS: resolve voice-note and voice-memo routing from channel plugin
|
||||
capabilities instead of speech-core-owned channel id lists.
|
||||
- ACP: send subagent and async-task completion wakes to external ACP harnesses as
|
||||
plain prompts instead of OpenClaw internal runtime-context envelopes, while
|
||||
keeping those envelopes out of ACP transcripts.
|
||||
|
||||
@@ -1,2 +1,2 @@
|
||||
690c1cd4c0c2c3d31577958120e14ac0bf555af529e03aa5e7965b1d04659c49 plugin-sdk-api-baseline.json
|
||||
a0e6ba472ddd3acea34c0a8fda8cbb7d1172b1671a671d5fef5a9f42d749ce0d plugin-sdk-api-baseline.jsonl
|
||||
a81b6ddeb1fd24bf234a3b7ba1d51d18d7060afa49378dd92988f326e140db13 plugin-sdk-api-baseline.json
|
||||
90a6e45404c2c017c23ab9ee75e71503ec683a680f64266504fdab69e43f288b plugin-sdk-api-baseline.jsonl
|
||||
|
||||
@@ -646,6 +646,44 @@ or `messages.tts.prefsPath`.
|
||||
These override the effective config from `messages.tts` plus the active
|
||||
`agents.list[].tts` block for that host.
|
||||
|
||||
## Output formats (fixed)
|
||||
|
||||
TTS voice delivery is channel-capability driven. Channel plugins advertise
|
||||
whether voice-style TTS should ask providers for a native `voice-note` target or
|
||||
keep normal `audio-file` synthesis and only mark compatible output for voice
|
||||
delivery.
|
||||
|
||||
- **Voice-note capable channels**: voice-note replies prefer Opus (`opus_48000_64` from ElevenLabs, `opus` from OpenAI).
|
||||
- 48kHz / 64kbps is a good voice message tradeoff.
|
||||
- **Feishu / WhatsApp**: when a voice-note reply is produced as MP3/WebM/WAV/M4A
|
||||
or another likely audio file, the channel plugin transcodes it to 48kHz
|
||||
Ogg/Opus with `ffmpeg` before sending the native voice message. WhatsApp sends
|
||||
the result through the Baileys `audio` payload with `ptt: true` and
|
||||
`audio/ogg; codecs=opus`. If conversion fails, Feishu receives the original
|
||||
file as an attachment; the WhatsApp send fails rather than posting an incompatible
|
||||
PTT payload.
|
||||
- **BlueBubbles**: keeps provider synthesis on the normal audio-file path; MP3
|
||||
and CAF outputs are marked for iMessage voice memo delivery.
|
||||
- **Other channels**: MP3 (`mp3_44100_128` from ElevenLabs, `mp3` from OpenAI).
|
||||
- 44.1kHz / 128kbps is the default balance for speech clarity.
|
||||
- **MiniMax**: MP3 (`speech-2.8-hd` model, 32kHz sample rate) for normal audio attachments. For channel-advertised voice-note targets, OpenClaw transcodes the MiniMax MP3 to 48kHz Opus with `ffmpeg` before delivery when the channel advertises transcoding.
|
||||
- **Xiaomi MiMo**: MP3 by default, or WAV when configured. For channel-advertised voice-note targets, OpenClaw transcodes Xiaomi output to 48kHz Opus with `ffmpeg` before delivery when the channel advertises transcoding.
|
||||
- **Local CLI**: uses the configured `outputFormat`. Voice-note targets are
|
||||
converted to Ogg/Opus, and telephony output is converted to raw 16 kHz mono PCM
|
||||
with `ffmpeg`.
|
||||
- **Google Gemini**: Gemini API TTS returns raw 24kHz PCM. OpenClaw wraps it as WAV for audio attachments, transcodes it to 48kHz Opus for voice-note targets, and returns PCM directly for Talk/telephony.
|
||||
- **Gradium**: WAV for audio attachments, Opus for voice-note targets, and `ulaw_8000` at 8 kHz for telephony.
|
||||
- **Inworld**: MP3 for normal audio attachments, native `OGG_OPUS` for voice-note targets, and raw `PCM` at 22050 Hz for Talk/telephony.
|
||||
- **xAI**: MP3 by default; `responseFormat` may be `mp3`, `wav`, `pcm`, `mulaw`, or `alaw`. OpenClaw uses xAI's batch REST TTS endpoint and returns a complete audio attachment; xAI's streaming TTS WebSocket is not used by this provider path. Native Opus voice-note format is not supported by this path.
|
||||
- **Microsoft**: uses `microsoft.outputFormat` (default `audio-24khz-48kbitrate-mono-mp3`).
|
||||
- The bundled transport accepts an `outputFormat`, but not all formats are available from the service.
|
||||
- Output format values follow Microsoft Speech output formats (including Ogg/WebM Opus).
|
||||
- Telegram `sendVoice` accepts OGG/MP3/M4A; use OpenAI/ElevenLabs if you need
|
||||
guaranteed Opus voice messages.
|
||||
- If the configured Microsoft output format fails, OpenClaw retries with MP3.
|
||||
|
||||
OpenAI/ElevenLabs output formats are fixed per channel (see above).
|
||||
|
||||
## Auto-TTS behavior
|
||||
|
||||
When `messages.tts.auto` is enabled, OpenClaw:
|
||||
|
||||
@@ -1,13 +1,16 @@
|
||||
import { readFileSync } from "node:fs";
|
||||
import { resolve } from "node:path";
|
||||
import { beforeEach, describe, expect, it, vi } from "vitest";
|
||||
import { afterEach, beforeEach, describe, expect, it, vi } from "vitest";
|
||||
import type { OpenClawConfig } from "../../src/config/config.js";
|
||||
import { buildPluginApi } from "../../src/plugins/api-builder.js";
|
||||
import type { PluginRuntime } from "../../src/plugins/runtime/types.js";
|
||||
import { registerSingleProviderPlugin } from "../../test/helpers/plugins/plugin-registration.js";
|
||||
import { resetBedrockDiscoveryCacheForTest } from "./discovery.js";
|
||||
import amazonBedrockPlugin from "./index.js";
|
||||
import { resetBedrockAppProfileCacheEligibilityForTest } from "./register.sync.runtime.js";
|
||||
import {
|
||||
resetBedrockAppProfileCacheEligibilityForTest,
|
||||
setBedrockAppProfileControlPlaneForTest,
|
||||
} from "./register.sync.runtime.js";
|
||||
|
||||
type BedrockClientResult =
|
||||
| {
|
||||
@@ -211,6 +214,19 @@ describe("amazon-bedrock provider plugin", () => {
|
||||
sendBedrockCommand.mockClear();
|
||||
resetBedrockDiscoveryCacheForTest();
|
||||
resetBedrockAppProfileCacheEligibilityForTest();
|
||||
setBedrockAppProfileControlPlaneForTest((region) => ({
|
||||
async getInferenceProfile(input) {
|
||||
class GetInferenceProfileCommand {
|
||||
constructor(readonly input: Record<string, unknown> = {}) {}
|
||||
}
|
||||
bedrockClientConfigs.push(region ? { region } : {});
|
||||
return await sendBedrockCommand(new GetInferenceProfileCommand(input));
|
||||
},
|
||||
}));
|
||||
});
|
||||
|
||||
afterEach(() => {
|
||||
setBedrockAppProfileControlPlaneForTest(undefined);
|
||||
});
|
||||
|
||||
it("marks Claude 4.6 Bedrock models as adaptive by default", async () => {
|
||||
|
||||
@@ -153,10 +153,42 @@ function resolvedModelSupportsCaching(modelArn: string): boolean {
|
||||
*/
|
||||
const appProfileCacheEligibleCache = new Map<string, boolean>();
|
||||
|
||||
/** Subset of the GetInferenceProfile response that this module reads: the profile's model list. */
type BedrockGetInferenceProfileResponse = {
|
||||
models?: Array<{ modelArn?: string }>;
|
||||
};
|
||||
|
||||
/** Minimal control-plane surface used here: a single inference-profile lookup by identifier. */
type BedrockControlPlane = {
|
||||
getInferenceProfile: (input: {
|
||||
inferenceProfileIdentifier: string;
|
||||
}) => Promise<BedrockGetInferenceProfileResponse>;
|
||||
};
|
||||
|
||||
/** Builds a control plane for an optional region (`undefined` means no explicit region). */
type BedrockControlPlaneFactory = (region: string | undefined) => BedrockControlPlane;
|
||||
|
||||
// When set, createBedrockControlPlane returns this factory's result instead of constructing an SDK client.
let bedrockControlPlaneOverride: BedrockControlPlaneFactory | undefined;
|
||||
|
||||
/** Test helper: removes every entry from the module-level `appProfileCacheEligibleCache` map. */
export function resetBedrockAppProfileCacheEligibilityForTest(): void {
|
||||
appProfileCacheEligibleCache.clear();
|
||||
}
|
||||
|
||||
/**
 * Test helper: installs a control-plane factory override, or removes it when
 * `undefined` is passed, and then clears the cache-eligibility map.
 *
 * @param controlPlane Factory stored in `bedrockControlPlaneOverride`, or `undefined` to unset it.
 */
export function setBedrockAppProfileControlPlaneForTest(
|
||||
controlPlane: BedrockControlPlaneFactory | undefined,
|
||||
): void {
|
||||
bedrockControlPlaneOverride = controlPlane;
|
||||
// Drop cached eligibility results whenever the override changes.
resetBedrockAppProfileCacheEligibilityForTest();
|
||||
}
|
||||
|
||||
/**
 * Returns the control plane used for inference-profile lookups.
 *
 * If an override factory is installed, its result for `region` is returned.
 * Otherwise `@aws-sdk/client-bedrock` is imported dynamically and a
 * `BedrockClient` is wrapped so that `getInferenceProfile` sends a
 * `GetInferenceProfileCommand`.
 *
 * @param region Region passed to the client config; when falsy the client is built with `{}`.
 * @returns A `BedrockControlPlane` exposing `getInferenceProfile`.
 */
async function createBedrockControlPlane(region: string | undefined): Promise<BedrockControlPlane> {
|
||||
// The override short-circuits before the SDK import below.
if (bedrockControlPlaneOverride) {
|
||||
return bedrockControlPlaneOverride(region);
|
||||
}
|
||||
const { BedrockClient, GetInferenceProfileCommand } = await import("@aws-sdk/client-bedrock");
|
||||
const client = new BedrockClient(region ? { region } : {});
|
||||
return {
|
||||
getInferenceProfile: async (input) => await client.send(new GetInferenceProfileCommand(input)),
|
||||
};
|
||||
}
|
||||
|
||||
async function resolveAppProfileCacheEligible(
|
||||
modelId: string,
|
||||
fallbackRegion: string | undefined,
|
||||
@@ -165,12 +197,9 @@ async function resolveAppProfileCacheEligible(
|
||||
return appProfileCacheEligibleCache.get(modelId)!;
|
||||
}
|
||||
try {
|
||||
const { BedrockClient, GetInferenceProfileCommand } = await import("@aws-sdk/client-bedrock");
|
||||
const region = extractRegionFromArn(modelId) ?? fallbackRegion;
|
||||
const client = new BedrockClient(region ? { region } : {});
|
||||
const resp = await client.send(
|
||||
new GetInferenceProfileCommand({ inferenceProfileIdentifier: modelId }),
|
||||
);
|
||||
const controlPlane = await createBedrockControlPlane(region);
|
||||
const resp = await controlPlane.getInferenceProfile({ inferenceProfileIdentifier: modelId });
|
||||
const models = resp.models ?? [];
|
||||
const eligible =
|
||||
models.length > 0 &&
|
||||
|
||||
@@ -31,6 +31,12 @@ export const bluebubblesMeta = {
|
||||
export const bluebubblesCapabilities: ChannelPlugin<ResolvedBlueBubblesAccount>["capabilities"] = {
|
||||
chatTypes: ["direct", "group"],
|
||||
media: true,
|
||||
tts: {
|
||||
voice: {
|
||||
synthesisTarget: "audio-file",
|
||||
audioFileFormats: ["mp3", "caf", "audio/mpeg", "audio/x-caf"],
|
||||
},
|
||||
},
|
||||
reactions: true,
|
||||
edit: true,
|
||||
unsend: true,
|
||||
|
||||
@@ -96,6 +96,11 @@ export function createDiscordPluginBase(params: {
|
||||
reactions: true,
|
||||
threads: true,
|
||||
media: true,
|
||||
tts: {
|
||||
voice: {
|
||||
synthesisTarget: "voice-note",
|
||||
},
|
||||
},
|
||||
nativeCommands: true,
|
||||
},
|
||||
commands: {
|
||||
|
||||
@@ -588,6 +588,12 @@ export const feishuPlugin: ChannelPlugin<ResolvedFeishuAccount, FeishuProbeResul
|
||||
polls: false,
|
||||
threads: true,
|
||||
media: true,
|
||||
tts: {
|
||||
voice: {
|
||||
synthesisTarget: "voice-note",
|
||||
transcodesAudio: true,
|
||||
},
|
||||
},
|
||||
reactions: true,
|
||||
edit: true,
|
||||
reply: true,
|
||||
|
||||
@@ -334,6 +334,11 @@ export const matrixPlugin: ChannelPlugin<ResolvedMatrixAccount, MatrixProbe> =
|
||||
reactions: true,
|
||||
threads: true,
|
||||
media: true,
|
||||
tts: {
|
||||
voice: {
|
||||
synthesisTarget: "voice-note",
|
||||
},
|
||||
},
|
||||
},
|
||||
reload: { configPrefixes: ["channels.matrix"] },
|
||||
configSchema: buildChannelConfigSchema(MatrixConfigSchema),
|
||||
|
||||
@@ -21,12 +21,6 @@ const runtimeMocks = vi.hoisted(() => ({
|
||||
refreshOpenAICodexToken: vi.fn(),
|
||||
}));
|
||||
|
||||
type OpenAIRefreshDelegateGlobal = typeof globalThis & {
|
||||
__OPENCLAW_TEST_REFRESH_OPENAI_CODEX_TOKEN__?: (...args: unknown[]) => unknown;
|
||||
};
|
||||
|
||||
const openAIRefreshDelegateGlobal = () => globalThis as OpenAIRefreshDelegateGlobal;
|
||||
|
||||
vi.mock("openclaw/plugin-sdk/runtime-env", async () => {
|
||||
const actual = await vi.importActual<typeof import("openclaw/plugin-sdk/runtime-env")>(
|
||||
"openclaw/plugin-sdk/runtime-env",
|
||||
@@ -41,12 +35,10 @@ vi.mock("@mariozechner/pi-ai/oauth", () => ({
|
||||
getOAuthApiKey: vi.fn(),
|
||||
getOAuthProviders: () => [],
|
||||
loginOpenAICodex: vi.fn(),
|
||||
refreshOpenAICodexToken: vi.fn((...args: unknown[]) =>
|
||||
openAIRefreshDelegateGlobal().__OPENCLAW_TEST_REFRESH_OPENAI_CODEX_TOKEN__?.(...args),
|
||||
),
|
||||
refreshOpenAICodexToken: vi.fn(),
|
||||
}));
|
||||
|
||||
import { refreshOpenAICodexToken } from "./openai-codex-provider.runtime.js";
|
||||
import { createOpenAICodexProviderRuntime } from "./openai-codex-provider.runtime.js";
|
||||
|
||||
const _registerOpenAIPlugin = async () =>
|
||||
registerProviderPlugin({
|
||||
@@ -312,19 +304,19 @@ describe("openai plugin", () => {
|
||||
expires: Date.now() + 60_000,
|
||||
};
|
||||
runtimeMocks.refreshOpenAICodexToken.mockResolvedValue(refreshed);
|
||||
openAIRefreshDelegateGlobal().__OPENCLAW_TEST_REFRESH_OPENAI_CODEX_TOKEN__ =
|
||||
runtimeMocks.refreshOpenAICodexToken;
|
||||
try {
|
||||
await expect(refreshOpenAICodexToken("refresh-token")).resolves.toBe(refreshed);
|
||||
const runtime = createOpenAICodexProviderRuntime({
|
||||
ensureGlobalUndiciEnvProxyDispatcher: runtimeMocks.ensureGlobalUndiciEnvProxyDispatcher,
|
||||
getOAuthApiKey: vi.fn(),
|
||||
refreshOpenAICodexToken: runtimeMocks.refreshOpenAICodexToken,
|
||||
});
|
||||
|
||||
expect(runtimeMocks.ensureGlobalUndiciEnvProxyDispatcher).toHaveBeenCalledOnce();
|
||||
expect(runtimeMocks.refreshOpenAICodexToken).toHaveBeenCalledOnce();
|
||||
expect(
|
||||
runtimeMocks.ensureGlobalUndiciEnvProxyDispatcher.mock.invocationCallOrder[0],
|
||||
).toBeLessThan(runtimeMocks.refreshOpenAICodexToken.mock.invocationCallOrder[0]);
|
||||
} finally {
|
||||
delete openAIRefreshDelegateGlobal().__OPENCLAW_TEST_REFRESH_OPENAI_CODEX_TOKEN__;
|
||||
}
|
||||
await expect(runtime.refreshOpenAICodexToken("refresh-token")).resolves.toBe(refreshed);
|
||||
|
||||
expect(runtimeMocks.ensureGlobalUndiciEnvProxyDispatcher).toHaveBeenCalledOnce();
|
||||
expect(runtimeMocks.refreshOpenAICodexToken).toHaveBeenCalledOnce();
|
||||
expect(
|
||||
runtimeMocks.ensureGlobalUndiciEnvProxyDispatcher.mock.invocationCallOrder[0],
|
||||
).toBeLessThan(runtimeMocks.refreshOpenAICodexToken.mock.invocationCallOrder[0]);
|
||||
});
|
||||
|
||||
it("registers provider-owned OpenAI tool compat hooks for openai and codex", async () => {
|
||||
|
||||
@@ -4,16 +4,42 @@ import {
|
||||
} from "@mariozechner/pi-ai/oauth";
|
||||
import { ensureGlobalUndiciEnvProxyDispatcher } from "openclaw/plugin-sdk/runtime-env";
|
||||
|
||||
/** Injectable dependencies for the runtime factory; each field is typed after the imported function it stands in for. */
type OpenAICodexProviderRuntimeDeps = {
|
||||
ensureGlobalUndiciEnvProxyDispatcher: typeof ensureGlobalUndiciEnvProxyDispatcher;
|
||||
getOAuthApiKey: typeof getOAuthApiKeyFromPi;
|
||||
refreshOpenAICodexToken: typeof refreshOpenAICodexTokenFromPi;
|
||||
};
|
||||
|
||||
/**
 * Builds the OAuth runtime from injected dependencies.
 *
 * Both returned methods call `deps.ensureGlobalUndiciEnvProxyDispatcher()`
 * first and then await the matching injected function with the same arguments.
 *
 * @param deps Dispatcher setup function plus the two OAuth functions to wrap.
 * @returns An object with `getOAuthApiKey` and `refreshOpenAICodexToken` wrappers.
 */
export function createOpenAICodexProviderRuntime(deps: OpenAICodexProviderRuntimeDeps): {
|
||||
getOAuthApiKey: typeof getOAuthApiKey;
|
||||
refreshOpenAICodexToken: typeof refreshOpenAICodexToken;
|
||||
} {
|
||||
return {
|
||||
async getOAuthApiKey(...args) {
|
||||
// Dispatcher setup runs before the delegated call.
deps.ensureGlobalUndiciEnvProxyDispatcher();
|
||||
return await deps.getOAuthApiKey(...args);
|
||||
},
|
||||
async refreshOpenAICodexToken(...args) {
|
||||
// Same ordering as above: dispatcher setup, then the token refresh.
deps.ensureGlobalUndiciEnvProxyDispatcher();
|
||||
return await deps.refreshOpenAICodexToken(...args);
|
||||
},
|
||||
};
|
||||
}
|
||||
|
||||
const runtime = createOpenAICodexProviderRuntime({
|
||||
ensureGlobalUndiciEnvProxyDispatcher,
|
||||
getOAuthApiKey: getOAuthApiKeyFromPi,
|
||||
refreshOpenAICodexToken: refreshOpenAICodexTokenFromPi,
|
||||
});
|
||||
|
||||
export async function getOAuthApiKey(
|
||||
...args: Parameters<typeof getOAuthApiKeyFromPi>
|
||||
): Promise<Awaited<ReturnType<typeof getOAuthApiKeyFromPi>>> {
|
||||
ensureGlobalUndiciEnvProxyDispatcher();
|
||||
return await getOAuthApiKeyFromPi(...args);
|
||||
return await runtime.getOAuthApiKey(...args);
|
||||
}
|
||||
|
||||
export async function refreshOpenAICodexToken(
|
||||
...args: Parameters<typeof refreshOpenAICodexTokenFromPi>
|
||||
): Promise<Awaited<ReturnType<typeof refreshOpenAICodexTokenFromPi>>> {
|
||||
ensureGlobalUndiciEnvProxyDispatcher();
|
||||
return await refreshOpenAICodexTokenFromPi(...args);
|
||||
return await runtime.refreshOpenAICodexToken(...args);
|
||||
}
|
||||
|
||||
@@ -30,6 +30,22 @@ const getSpeechProviderMock = vi.hoisted(() => vi.fn());
|
||||
|
||||
// Test stub for the channel-targets module: channel ids are trimmed and lowercased,
// and voice delivery is answered from a fixed per-channel table instead of a plugin registry.
vi.mock("openclaw/plugin-sdk/channel-targets", () => ({
|
||||
normalizeChannelId: (channel: string | undefined) => channel?.trim().toLowerCase() ?? null,
|
||||
resolveChannelTtsVoiceDelivery: (channel: string | undefined) => {
|
||||
const normalized = channel?.trim().toLowerCase();
|
||||
// Audio-file synthesis with an explicit list of accepted formats.
if (normalized === "bluebubbles") {
|
||||
return {
|
||||
synthesisTarget: "audio-file",
|
||||
audioFileFormats: ["mp3", "caf", "audio/mpeg", "audio/x-caf"],
|
||||
};
|
||||
}
|
||||
// Voice-note target, with the channel flagged as transcoding audio.
if (normalized === "feishu" || normalized === "whatsapp") {
|
||||
return { synthesisTarget: "voice-note", transcodesAudio: true };
|
||||
}
|
||||
// Voice-note target without the transcoding flag.
if (normalized === "discord" || normalized === "matrix" || normalized === "telegram") {
|
||||
return { synthesisTarget: "voice-note" };
|
||||
}
|
||||
// Any other channel (or no channel) has no voice delivery entry.
return undefined;
|
||||
},
|
||||
}));
|
||||
|
||||
vi.mock("../api.js", async () => {
|
||||
@@ -152,7 +168,7 @@ describe("speech-core native voice-note routing", () => {
|
||||
installSpeechProviders([createMockSpeechProvider()]);
|
||||
});
|
||||
|
||||
it("keeps native voice-note channel support centralized", () => {
|
||||
it("resolves voice delivery support from channel capabilities", () => {
|
||||
for (const channel of nativeVoiceNoteChannels) {
|
||||
expect(_test.supportsNativeVoiceNoteTts(channel)).toBe(true);
|
||||
expect(_test.supportsNativeVoiceNoteTts(channel.toUpperCase())).toBe(true);
|
||||
|
||||
@@ -9,7 +9,7 @@ import {
|
||||
unlinkSync,
|
||||
} from "node:fs";
|
||||
import path from "node:path";
|
||||
import { normalizeChannelId, type ChannelId } from "openclaw/plugin-sdk/channel-targets";
|
||||
import { resolveChannelTtsVoiceDelivery } from "openclaw/plugin-sdk/channel-targets";
|
||||
import type {
|
||||
OpenClawConfig,
|
||||
ResolvedTtsPersona,
|
||||
@@ -738,52 +738,34 @@ export function setLastTtsAttempt(entry: TtsStatusEntry | undefined): void {
|
||||
lastTtsAttempt = entry;
|
||||
}
|
||||
|
||||
const VOICE_DELIVERY_CHANNELS = new Set([
|
||||
"bluebubbles",
|
||||
"telegram",
|
||||
"feishu",
|
||||
"whatsapp",
|
||||
"matrix",
|
||||
"discord",
|
||||
]);
|
||||
const OPUS_CHANNELS = new Set(["telegram", "feishu", "whatsapp", "matrix", "discord"]);
|
||||
const TRANSCODED_VOICE_NOTE_CHANNELS = new Set(["feishu", "whatsapp"]);
|
||||
const AUDIO_FILE_VOICE_MEMO_CHANNELS = new Set(["bluebubbles"]);
|
||||
|
||||
function resolveChannelId(channel: string | undefined): ChannelId | null {
|
||||
return channel ? normalizeChannelId(channel) : null;
|
||||
}
|
||||
|
||||
function supportsNativeVoiceNoteTts(channel: string | undefined): boolean {
|
||||
const channelId = resolveChannelId(channel);
|
||||
return channelId !== null && VOICE_DELIVERY_CHANNELS.has(channelId);
|
||||
return resolveChannelTtsVoiceDelivery(channel) !== undefined;
|
||||
}
|
||||
|
||||
function supportsTranscodedVoiceNoteTts(channel: string | undefined): boolean {
|
||||
const channelId = resolveChannelId(channel);
|
||||
return channelId !== null && TRANSCODED_VOICE_NOTE_CHANNELS.has(channelId);
|
||||
const delivery = resolveChannelTtsVoiceDelivery(channel);
|
||||
return delivery?.synthesisTarget === "voice-note" && delivery.transcodesAudio === true;
|
||||
}
|
||||
|
||||
function resolveTtsSynthesisTarget(channel: string | undefined): "audio-file" | "voice-note" {
|
||||
const channelId = resolveChannelId(channel);
|
||||
return channelId !== null && OPUS_CHANNELS.has(channelId) ? "voice-note" : "audio-file";
|
||||
return resolveChannelTtsVoiceDelivery(channel)?.synthesisTarget ?? "audio-file";
|
||||
}
|
||||
|
||||
function supportsAudioFileVoiceMemoOutput(params: {
|
||||
fileExtension?: string;
|
||||
outputFormat?: string;
|
||||
audioFileFormats?: readonly string[];
|
||||
}): boolean {
|
||||
const formats = new Set(params.audioFileFormats?.map((format) => format.trim().toLowerCase()));
|
||||
if (formats.size === 0) {
|
||||
return false;
|
||||
}
|
||||
const extension = params.fileExtension?.trim().toLowerCase();
|
||||
if (extension === ".mp3" || extension === ".caf") {
|
||||
if (extension && formats.has(extension.replace(/^\./, ""))) {
|
||||
return true;
|
||||
}
|
||||
const outputFormat = params.outputFormat?.trim().toLowerCase();
|
||||
return (
|
||||
outputFormat === "mp3" ||
|
||||
outputFormat === "caf" ||
|
||||
outputFormat === "audio/mpeg" ||
|
||||
outputFormat === "audio/x-caf"
|
||||
);
|
||||
return outputFormat ? formats.has(outputFormat) : false;
|
||||
}
|
||||
|
||||
function shouldDeliverTtsAsVoice(params: {
|
||||
@@ -793,17 +775,24 @@ function shouldDeliverTtsAsVoice(params: {
|
||||
fileExtension?: string;
|
||||
outputFormat?: string;
|
||||
}): boolean {
|
||||
const channelId = resolveChannelId(params.channel);
|
||||
if (channelId === null || !supportsNativeVoiceNoteTts(channelId)) {
|
||||
const delivery = resolveChannelTtsVoiceDelivery(params.channel);
|
||||
if (!delivery) {
|
||||
return false;
|
||||
}
|
||||
if (AUDIO_FILE_VOICE_MEMO_CHANNELS.has(channelId)) {
|
||||
return params.target === "audio-file" && supportsAudioFileVoiceMemoOutput(params);
|
||||
if (delivery.synthesisTarget === "audio-file") {
|
||||
return (
|
||||
params.target === "audio-file" &&
|
||||
supportsAudioFileVoiceMemoOutput({
|
||||
fileExtension: params.fileExtension,
|
||||
outputFormat: params.outputFormat,
|
||||
audioFileFormats: delivery.audioFileFormats,
|
||||
})
|
||||
);
|
||||
}
|
||||
if (params.target !== "voice-note") {
|
||||
return false;
|
||||
}
|
||||
return params.voiceCompatible === true || supportsTranscodedVoiceNoteTts(params.channel);
|
||||
return params.voiceCompatible === true || delivery.transcodesAudio === true;
|
||||
}
|
||||
|
||||
export function resolveTtsProviderOrder(primary: TtsProvider, cfg?: OpenClawConfig): TtsProvider[] {
|
||||
|
||||
@@ -142,6 +142,11 @@ export function createTelegramPluginBase(params: {
|
||||
reactions: true,
|
||||
threads: true,
|
||||
media: true,
|
||||
tts: {
|
||||
voice: {
|
||||
synthesisTarget: "voice-note",
|
||||
},
|
||||
},
|
||||
polls: true,
|
||||
nativeCommands: true,
|
||||
blockStreaming: true,
|
||||
|
||||
@@ -212,6 +212,12 @@ export function createWhatsAppPluginBase(params: {
|
||||
polls: true,
|
||||
reactions: true,
|
||||
media: true,
|
||||
tts: {
|
||||
voice: {
|
||||
synthesisTarget: "voice-note",
|
||||
transcodesAudio: true,
|
||||
},
|
||||
},
|
||||
},
|
||||
reload: { configPrefixes: ["web"], noopPrefixes: ["channels.whatsapp"] },
|
||||
gatewayMethods: ["web.login.start", "web.login.wait"],
|
||||
|
||||
111
src/channels/plugins/tts-capabilities.test.ts
Normal file
111
src/channels/plugins/tts-capabilities.test.ts
Normal file
@@ -0,0 +1,111 @@
|
||||
import { afterEach, describe, expect, it } from "vitest";
|
||||
import { createEmptyPluginRegistry } from "../../plugins/registry-empty.js";
|
||||
import { setActivePluginRegistry } from "../../plugins/runtime.js";
|
||||
import {
|
||||
createChannelTestPluginBase,
|
||||
createTestRegistry,
|
||||
} from "../../test-utils/channel-plugins.js";
|
||||
import { resolveChannelTtsVoiceDelivery } from "./tts-capabilities.js";
|
||||
import type { ChannelPlugin } from "./types.js";
|
||||
|
||||
/**
 * Test helper: builds a channel plugin through `createChannelTestPluginBase`,
 * using `id` as both id and label and listing a single "default" account.
 *
 * @param id Channel id, also used as the label.
 * @param capabilities Capabilities object passed through unchanged.
 */
function createChannelPlugin(
|
||||
id: string,
|
||||
capabilities: ChannelPlugin["capabilities"],
|
||||
): ChannelPlugin {
|
||||
return createChannelTestPluginBase({
|
||||
id,
|
||||
label: id,
|
||||
capabilities,
|
||||
config: {
|
||||
listAccountIds: () => ["default"],
|
||||
},
|
||||
});
|
||||
}
|
||||
|
||||
describe("resolveChannelTtsVoiceDelivery", () => {
|
||||
afterEach(() => {
|
||||
setActivePluginRegistry(createEmptyPluginRegistry());
|
||||
});
|
||||
|
||||
it("reads voice delivery behavior from channel plugin capabilities", () => {
|
||||
setActivePluginRegistry(
|
||||
createTestRegistry([
|
||||
{
|
||||
pluginId: "bluebubbles",
|
||||
plugin: createChannelPlugin("bluebubbles", {
|
||||
chatTypes: ["direct"],
|
||||
tts: {
|
||||
voice: {
|
||||
synthesisTarget: "audio-file",
|
||||
audioFileFormats: ["mp3", "caf", "audio/mpeg", "audio/x-caf"],
|
||||
},
|
||||
},
|
||||
}),
|
||||
source: "test",
|
||||
},
|
||||
{
|
||||
pluginId: "discord",
|
||||
plugin: createChannelPlugin("discord", {
|
||||
chatTypes: ["direct"],
|
||||
tts: { voice: { synthesisTarget: "voice-note" } },
|
||||
}),
|
||||
source: "test",
|
||||
},
|
||||
{
|
||||
pluginId: "feishu",
|
||||
plugin: createChannelPlugin("feishu", {
|
||||
chatTypes: ["direct"],
|
||||
tts: { voice: { synthesisTarget: "voice-note", transcodesAudio: true } },
|
||||
}),
|
||||
source: "test",
|
||||
},
|
||||
{
|
||||
pluginId: "matrix",
|
||||
plugin: createChannelPlugin("matrix", {
|
||||
chatTypes: ["direct"],
|
||||
tts: { voice: { synthesisTarget: "voice-note" } },
|
||||
}),
|
||||
source: "test",
|
||||
},
|
||||
{
|
||||
pluginId: "telegram",
|
||||
plugin: createChannelPlugin("telegram", {
|
||||
chatTypes: ["direct"],
|
||||
tts: { voice: { synthesisTarget: "voice-note" } },
|
||||
}),
|
||||
source: "test",
|
||||
},
|
||||
{
|
||||
pluginId: "whatsapp",
|
||||
plugin: createChannelPlugin("whatsapp", {
|
||||
chatTypes: ["direct"],
|
||||
tts: { voice: { synthesisTarget: "voice-note", transcodesAudio: true } },
|
||||
}),
|
||||
source: "test",
|
||||
},
|
||||
]),
|
||||
);
|
||||
expect(resolveChannelTtsVoiceDelivery("bluebubbles")).toEqual({
|
||||
synthesisTarget: "audio-file",
|
||||
audioFileFormats: ["mp3", "caf", "audio/mpeg", "audio/x-caf"],
|
||||
});
|
||||
expect(resolveChannelTtsVoiceDelivery("discord")).toEqual({
|
||||
synthesisTarget: "voice-note",
|
||||
});
|
||||
expect(resolveChannelTtsVoiceDelivery("feishu")).toEqual({
|
||||
synthesisTarget: "voice-note",
|
||||
transcodesAudio: true,
|
||||
});
|
||||
expect(resolveChannelTtsVoiceDelivery("matrix")).toEqual({
|
||||
synthesisTarget: "voice-note",
|
||||
});
|
||||
expect(resolveChannelTtsVoiceDelivery("telegram")).toEqual({
|
||||
synthesisTarget: "voice-note",
|
||||
});
|
||||
expect(resolveChannelTtsVoiceDelivery("whatsapp")).toEqual({
|
||||
synthesisTarget: "voice-note",
|
||||
transcodesAudio: true,
|
||||
});
|
||||
expect(resolveChannelTtsVoiceDelivery("slack")).toBeUndefined();
|
||||
});
|
||||
});
|
||||
13
src/channels/plugins/tts-capabilities.ts
Normal file
13
src/channels/plugins/tts-capabilities.ts
Normal file
@@ -0,0 +1,13 @@
|
||||
import { normalizeChannelId } from "./registry.js";
|
||||
import { getChannelPlugin } from "./registry.js";
|
||||
import type { ChannelTtsVoiceDeliveryCapabilities } from "./types.core.js";
|
||||
|
||||
/**
 * Looks up the TTS voice delivery capabilities advertised by a channel's plugin.
 *
 * @param channel Raw channel name; it is passed through `normalizeChannelId` first.
 * @returns The plugin's `capabilities.tts.voice` value, or `undefined` when the
 *   channel does not normalize to an id, no plugin is found, or the plugin
 *   declares no TTS voice capabilities.
 */
export function resolveChannelTtsVoiceDelivery(
|
||||
channel: string | undefined,
|
||||
): ChannelTtsVoiceDeliveryCapabilities | undefined {
|
||||
const channelId = normalizeChannelId(channel);
|
||||
if (!channelId) {
|
||||
return undefined;
|
||||
}
|
||||
// Optional chaining covers both a missing plugin and a plugin without a `tts` block.
return getChannelPlugin(channelId)?.capabilities.tts?.voice;
|
||||
}
|
||||
@@ -272,6 +272,13 @@ export type ChannelGroupContext = {
|
||||
senderE164?: string | null;
|
||||
};
|
||||
|
||||
/** TTS voice delivery behavior advertised by a channel plugin. */
|
||||
export type ChannelTtsVoiceDeliveryCapabilities = {
|
||||
// Synthesis target requested for this channel: a native "voice-note" or a normal "audio-file".
synthesisTarget: "audio-file" | "voice-note";
|
||||
// NOTE(review): appears to mean the channel converts audio itself before sending, so
// voice-note delivery need not depend on provider output being voice-compatible; confirm against speech-core.
transcodesAudio?: boolean;
|
||||
// NOTE(review): presumably extensions without the dot and/or MIME types (e.g. "mp3", "audio/mpeg")
// that an "audio-file" channel can deliver as voice; confirm matching rules against speech-core.
audioFileFormats?: readonly string[];
|
||||
};
|
||||
|
||||
/** Static capability flags advertised by a channel plugin. */
|
||||
export type ChannelCapabilities = {
|
||||
chatTypes: Array<ChatType | "thread">;
|
||||
@@ -284,6 +291,9 @@ export type ChannelCapabilities = {
|
||||
groupManagement?: boolean;
|
||||
threads?: boolean;
|
||||
media?: boolean;
|
||||
tts?: {
|
||||
voice?: ChannelTtsVoiceDeliveryCapabilities;
|
||||
};
|
||||
nativeCommands?: boolean;
|
||||
blockStreaming?: boolean;
|
||||
};
|
||||
|
||||
@@ -39,6 +39,7 @@ export {
|
||||
} from "../channels/plugins/chat-target-prefixes.js";
|
||||
export type { ChannelId } from "../channels/plugins/types.public.js";
|
||||
export { normalizeChannelId } from "../channels/plugins/registry.js";
|
||||
export { resolveChannelTtsVoiceDelivery } from "../channels/plugins/tts-capabilities.js";
|
||||
export {
|
||||
buildUnresolvedTargetResults,
|
||||
resolveTargetsWithOptionalToken,
|
||||
|
||||
@@ -112,6 +112,8 @@ export type TtsTestFacade = {
|
||||
channel: string | undefined;
|
||||
target: TtsSpeechTarget | undefined;
|
||||
voiceCompatible: boolean | undefined;
|
||||
fileExtension?: string;
|
||||
outputFormat?: string;
|
||||
}) => boolean;
|
||||
summarizeText: (...args: unknown[]) => Promise<SummarizeResult>;
|
||||
getResolvedSpeechProviderConfig: (
|
||||
|
||||
Reference in New Issue
Block a user