diff --git a/CHANGELOG.md b/CHANGELOG.md index 86d2f7eb9f1..bffeffa3455 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -150,6 +150,8 @@ Docs: https://docs.openclaw.ai same inbound audio twice. Fixes #70580. - TTS/BlueBubbles: deliver compatible auto-TTS audio as iMessage voice memo bubbles instead of plain MP3/CAF file attachments. Fixes #16848. +- TTS: resolve voice-note and voice-memo routing from channel plugin + capabilities instead of speech-core-owned channel id lists. - ACP: send subagent and async-task completion wakes to external ACP harnesses as plain prompts instead of OpenClaw internal runtime-context envelopes, while keeping those envelopes out of ACP transcripts. diff --git a/docs/.generated/plugin-sdk-api-baseline.sha256 b/docs/.generated/plugin-sdk-api-baseline.sha256 index c7c1dbe1f1a..79ff8ee2da4 100644 --- a/docs/.generated/plugin-sdk-api-baseline.sha256 +++ b/docs/.generated/plugin-sdk-api-baseline.sha256 @@ -1,2 +1,2 @@ -690c1cd4c0c2c3d31577958120e14ac0bf555af529e03aa5e7965b1d04659c49 plugin-sdk-api-baseline.json -a0e6ba472ddd3acea34c0a8fda8cbb7d1172b1671a671d5fef5a9f42d749ce0d plugin-sdk-api-baseline.jsonl +a81b6ddeb1fd24bf234a3b7ba1d51d18d7060afa49378dd92988f326e140db13 plugin-sdk-api-baseline.json +90a6e45404c2c017c23ab9ee75e71503ec683a680f64266504fdab69e43f288b plugin-sdk-api-baseline.jsonl diff --git a/docs/tools/tts.md b/docs/tools/tts.md index 752fee30196..c84b328aad1 100644 --- a/docs/tools/tts.md +++ b/docs/tools/tts.md @@ -646,6 +646,44 @@ or `messages.tts.prefsPath`. These override the effective config from `messages.tts` plus the active `agents.list[].tts` block for that host. +## Output formats (fixed) + +TTS voice delivery is channel-capability driven. Channel plugins advertise +whether voice-style TTS should ask providers for a native `voice-note` target or +keep normal `audio-file` synthesis and only mark compatible output for voice +delivery. 
+ +- **Voice-note capable channels**: voice-note replies prefer Opus (`opus_48000_64` from ElevenLabs, `opus` from OpenAI). + - 48kHz / 64kbps is a good voice message tradeoff. +- **Feishu / WhatsApp**: when a voice-note reply is produced as MP3/WebM/WAV/M4A + or another likely audio file, the channel plugin transcodes it to 48kHz + Ogg/Opus with `ffmpeg` before sending the native voice message. WhatsApp sends + the result through the Baileys `audio` payload with `ptt: true` and + `audio/ogg; codecs=opus`. If conversion fails, Feishu receives the original + file as an attachment; WhatsApp send fails rather than posting an incompatible + PTT payload. +- **BlueBubbles**: keeps provider synthesis on the normal audio-file path; MP3 + and CAF outputs are marked for iMessage voice memo delivery. +- **Other channels**: MP3 (`mp3_44100_128` from ElevenLabs, `mp3` from OpenAI). + - 44.1kHz / 128kbps is the default balance for speech clarity. +- **MiniMax**: MP3 (`speech-2.8-hd` model, 32kHz sample rate) for normal audio attachments. For channel-advertised voice-note targets, OpenClaw transcodes the MiniMax MP3 to 48kHz Opus with `ffmpeg` before delivery when the channel advertises transcoding. +- **Xiaomi MiMo**: MP3 by default, or WAV when configured. For channel-advertised voice-note targets, OpenClaw transcodes Xiaomi output to 48kHz Opus with `ffmpeg` before delivery when the channel advertises transcoding. +- **Local CLI**: uses the configured `outputFormat`. Voice-note targets are + converted to Ogg/Opus and telephony output is converted to raw 16 kHz mono PCM + with `ffmpeg`. +- **Google Gemini**: Gemini API TTS returns raw 24kHz PCM. OpenClaw wraps it as WAV for audio attachments, transcodes it to 48kHz Opus for voice-note targets, and returns PCM directly for Talk/telephony. +- **Gradium**: WAV for audio attachments, Opus for voice-note targets, and `ulaw_8000` at 8 kHz for telephony. 
+- **Inworld**: MP3 for normal audio attachments, native `OGG_OPUS` for voice-note targets, and raw `PCM` at 22050 Hz for Talk/telephony. +- **xAI**: MP3 by default; `responseFormat` may be `mp3`, `wav`, `pcm`, `mulaw`, or `alaw`. OpenClaw uses xAI's batch REST TTS endpoint and returns a complete audio attachment; xAI's streaming TTS WebSocket is not used by this provider path. Native Opus voice-note format is not supported by this path. +- **Microsoft**: uses `microsoft.outputFormat` (default `audio-24khz-48kbitrate-mono-mp3`). + - The bundled transport accepts an `outputFormat`, but not all formats are available from the service. + - Output format values follow Microsoft Speech output formats (including Ogg/WebM Opus). + - Telegram `sendVoice` accepts OGG/MP3/M4A; use OpenAI/ElevenLabs if you need + guaranteed Opus voice messages. + - If the configured Microsoft output format fails, OpenClaw retries with MP3. + +OpenAI/ElevenLabs output formats are fixed per channel (see above). + ## Auto-TTS behavior When `messages.tts.auto` is enabled, OpenClaw: diff --git a/extensions/amazon-bedrock/index.test.ts b/extensions/amazon-bedrock/index.test.ts index 7e0e924023f..571d14881f0 100644 --- a/extensions/amazon-bedrock/index.test.ts +++ b/extensions/amazon-bedrock/index.test.ts @@ -1,13 +1,16 @@ import { readFileSync } from "node:fs"; import { resolve } from "node:path"; -import { beforeEach, describe, expect, it, vi } from "vitest"; +import { afterEach, beforeEach, describe, expect, it, vi } from "vitest"; import type { OpenClawConfig } from "../../src/config/config.js"; import { buildPluginApi } from "../../src/plugins/api-builder.js"; import type { PluginRuntime } from "../../src/plugins/runtime/types.js"; import { registerSingleProviderPlugin } from "../../test/helpers/plugins/plugin-registration.js"; import { resetBedrockDiscoveryCacheForTest } from "./discovery.js"; import amazonBedrockPlugin from "./index.js"; -import { resetBedrockAppProfileCacheEligibilityForTest 
} from "./register.sync.runtime.js"; +import { + resetBedrockAppProfileCacheEligibilityForTest, + setBedrockAppProfileControlPlaneForTest, +} from "./register.sync.runtime.js"; type BedrockClientResult = | { @@ -211,6 +214,19 @@ describe("amazon-bedrock provider plugin", () => { sendBedrockCommand.mockClear(); resetBedrockDiscoveryCacheForTest(); resetBedrockAppProfileCacheEligibilityForTest(); + setBedrockAppProfileControlPlaneForTest((region) => ({ + async getInferenceProfile(input) { + class GetInferenceProfileCommand { + constructor(readonly input: Record<string, unknown> = {}) {} + } + bedrockClientConfigs.push(region ? { region } : {}); + return await sendBedrockCommand(new GetInferenceProfileCommand(input)); + }, + })); + }); + + afterEach(() => { + setBedrockAppProfileControlPlaneForTest(undefined); }); it("marks Claude 4.6 Bedrock models as adaptive by default", async () => { diff --git a/extensions/amazon-bedrock/register.sync.runtime.ts b/extensions/amazon-bedrock/register.sync.runtime.ts index ed7535c6173..018fa4c03e9 100644 --- a/extensions/amazon-bedrock/register.sync.runtime.ts +++ b/extensions/amazon-bedrock/register.sync.runtime.ts @@ -153,10 +153,42 @@ function resolvedModelSupportsCaching(modelArn: string): boolean { */ const appProfileCacheEligibleCache = new Map<string, boolean>(); +type BedrockGetInferenceProfileResponse = { + models?: Array<{ modelArn?: string }>; +}; + +type BedrockControlPlane = { + getInferenceProfile: (input: { + inferenceProfileIdentifier: string; + }) => Promise<BedrockGetInferenceProfileResponse>; +}; + +type BedrockControlPlaneFactory = (region: string | undefined) => BedrockControlPlane; + +let bedrockControlPlaneOverride: BedrockControlPlaneFactory | undefined; + export function resetBedrockAppProfileCacheEligibilityForTest(): void { appProfileCacheEligibleCache.clear(); } +export function setBedrockAppProfileControlPlaneForTest( + controlPlane: BedrockControlPlaneFactory | undefined, +): void { + bedrockControlPlaneOverride = controlPlane; +
resetBedrockAppProfileCacheEligibilityForTest(); +} + +async function createBedrockControlPlane(region: string | undefined): Promise<BedrockControlPlane> { + if (bedrockControlPlaneOverride) { + return bedrockControlPlaneOverride(region); + } + const { BedrockClient, GetInferenceProfileCommand } = await import("@aws-sdk/client-bedrock"); + const client = new BedrockClient(region ? { region } : {}); + return { + getInferenceProfile: async (input) => await client.send(new GetInferenceProfileCommand(input)), + }; +} + async function resolveAppProfileCacheEligible( modelId: string, fallbackRegion: string | undefined, @@ -165,12 +197,9 @@ async function resolveAppProfileCacheEligible( return appProfileCacheEligibleCache.get(modelId)!; } try { - const { BedrockClient, GetInferenceProfileCommand } = await import("@aws-sdk/client-bedrock"); const region = extractRegionFromArn(modelId) ?? fallbackRegion; - const client = new BedrockClient(region ? { region } : {}); - const resp = await client.send( - new GetInferenceProfileCommand({ inferenceProfileIdentifier: modelId }), - ); + const controlPlane = await createBedrockControlPlane(region); + const resp = await controlPlane.getInferenceProfile({ inferenceProfileIdentifier: modelId }); const models = resp.models ??
[]; const eligible = models.length > 0 && diff --git a/extensions/bluebubbles/src/channel-shared.ts b/extensions/bluebubbles/src/channel-shared.ts index 2f1f854a2ab..fc260c55610 100644 --- a/extensions/bluebubbles/src/channel-shared.ts +++ b/extensions/bluebubbles/src/channel-shared.ts @@ -31,6 +31,12 @@ export const bluebubblesMeta = { export const bluebubblesCapabilities: ChannelPlugin["capabilities"] = { chatTypes: ["direct", "group"], media: true, + tts: { + voice: { + synthesisTarget: "audio-file", + audioFileFormats: ["mp3", "caf", "audio/mpeg", "audio/x-caf"], + }, + }, reactions: true, edit: true, unsend: true, diff --git a/extensions/discord/src/shared.ts b/extensions/discord/src/shared.ts index f9776a7e011..c38ae5859e3 100644 --- a/extensions/discord/src/shared.ts +++ b/extensions/discord/src/shared.ts @@ -96,6 +96,11 @@ export function createDiscordPluginBase(params: { reactions: true, threads: true, media: true, + tts: { + voice: { + synthesisTarget: "voice-note", + }, + }, nativeCommands: true, }, commands: { diff --git a/extensions/feishu/src/channel.ts b/extensions/feishu/src/channel.ts index 07844d57a0c..13965f204e2 100644 --- a/extensions/feishu/src/channel.ts +++ b/extensions/feishu/src/channel.ts @@ -588,6 +588,12 @@ export const feishuPlugin: ChannelPlugin = reactions: true, threads: true, media: true, + tts: { + voice: { + synthesisTarget: "voice-note", + transcodesAudio: true, + }, + }, }, reload: { configPrefixes: ["channels.matrix"] }, configSchema: buildChannelConfigSchema(MatrixConfigSchema), diff --git a/extensions/openai/index.test.ts b/extensions/openai/index.test.ts index d79233b0dc3..bb5cfa8f78e 100644 --- a/extensions/openai/index.test.ts +++ b/extensions/openai/index.test.ts @@ -21,12 +21,6 @@ const runtimeMocks = vi.hoisted(() => ({ refreshOpenAICodexToken: vi.fn(), })); -type OpenAIRefreshDelegateGlobal = typeof globalThis & { - __OPENCLAW_TEST_REFRESH_OPENAI_CODEX_TOKEN__?: (...args: unknown[]) => unknown; -}; - -const openAIRefreshDelegateGlobal = () =>
globalThis as OpenAIRefreshDelegateGlobal; - vi.mock("openclaw/plugin-sdk/runtime-env", async () => { const actual = await vi.importActual( "openclaw/plugin-sdk/runtime-env", @@ -41,12 +35,10 @@ vi.mock("@mariozechner/pi-ai/oauth", () => ({ getOAuthApiKey: vi.fn(), getOAuthProviders: () => [], loginOpenAICodex: vi.fn(), - refreshOpenAICodexToken: vi.fn((...args: unknown[]) => - openAIRefreshDelegateGlobal().__OPENCLAW_TEST_REFRESH_OPENAI_CODEX_TOKEN__?.(...args), - ), + refreshOpenAICodexToken: vi.fn(), })); -import { refreshOpenAICodexToken } from "./openai-codex-provider.runtime.js"; +import { createOpenAICodexProviderRuntime } from "./openai-codex-provider.runtime.js"; const _registerOpenAIPlugin = async () => registerProviderPlugin({ @@ -312,19 +304,19 @@ describe("openai plugin", () => { expires: Date.now() + 60_000, }; runtimeMocks.refreshOpenAICodexToken.mockResolvedValue(refreshed); - openAIRefreshDelegateGlobal().__OPENCLAW_TEST_REFRESH_OPENAI_CODEX_TOKEN__ = - runtimeMocks.refreshOpenAICodexToken; - try { - await expect(refreshOpenAICodexToken("refresh-token")).resolves.toBe(refreshed); + const runtime = createOpenAICodexProviderRuntime({ + ensureGlobalUndiciEnvProxyDispatcher: runtimeMocks.ensureGlobalUndiciEnvProxyDispatcher, + getOAuthApiKey: vi.fn(), + refreshOpenAICodexToken: runtimeMocks.refreshOpenAICodexToken, + }); - expect(runtimeMocks.ensureGlobalUndiciEnvProxyDispatcher).toHaveBeenCalledOnce(); - expect(runtimeMocks.refreshOpenAICodexToken).toHaveBeenCalledOnce(); - expect( - runtimeMocks.ensureGlobalUndiciEnvProxyDispatcher.mock.invocationCallOrder[0], - ).toBeLessThan(runtimeMocks.refreshOpenAICodexToken.mock.invocationCallOrder[0]); - } finally { - delete openAIRefreshDelegateGlobal().__OPENCLAW_TEST_REFRESH_OPENAI_CODEX_TOKEN__; - } + await expect(runtime.refreshOpenAICodexToken("refresh-token")).resolves.toBe(refreshed); + + expect(runtimeMocks.ensureGlobalUndiciEnvProxyDispatcher).toHaveBeenCalledOnce(); + 
expect(runtimeMocks.refreshOpenAICodexToken).toHaveBeenCalledOnce(); + expect( + runtimeMocks.ensureGlobalUndiciEnvProxyDispatcher.mock.invocationCallOrder[0], + ).toBeLessThan(runtimeMocks.refreshOpenAICodexToken.mock.invocationCallOrder[0]); }); it("registers provider-owned OpenAI tool compat hooks for openai and codex", async () => { diff --git a/extensions/openai/openai-codex-provider.runtime.ts b/extensions/openai/openai-codex-provider.runtime.ts index 4b029f40f4e..7f328bc7c80 100644 --- a/extensions/openai/openai-codex-provider.runtime.ts +++ b/extensions/openai/openai-codex-provider.runtime.ts @@ -4,16 +4,42 @@ import { } from "@mariozechner/pi-ai/oauth"; import { ensureGlobalUndiciEnvProxyDispatcher } from "openclaw/plugin-sdk/runtime-env"; +type OpenAICodexProviderRuntimeDeps = { + ensureGlobalUndiciEnvProxyDispatcher: typeof ensureGlobalUndiciEnvProxyDispatcher; + getOAuthApiKey: typeof getOAuthApiKeyFromPi; + refreshOpenAICodexToken: typeof refreshOpenAICodexTokenFromPi; +}; + +export function createOpenAICodexProviderRuntime(deps: OpenAICodexProviderRuntimeDeps): { + getOAuthApiKey: typeof getOAuthApiKey; + refreshOpenAICodexToken: typeof refreshOpenAICodexToken; +} { + return { + async getOAuthApiKey(...args) { + deps.ensureGlobalUndiciEnvProxyDispatcher(); + return await deps.getOAuthApiKey(...args); + }, + async refreshOpenAICodexToken(...args) { + deps.ensureGlobalUndiciEnvProxyDispatcher(); + return await deps.refreshOpenAICodexToken(...args); + }, + }; +} + +const runtime = createOpenAICodexProviderRuntime({ + ensureGlobalUndiciEnvProxyDispatcher, + getOAuthApiKey: getOAuthApiKeyFromPi, + refreshOpenAICodexToken: refreshOpenAICodexTokenFromPi, +}); + export async function getOAuthApiKey( ...args: Parameters<typeof getOAuthApiKeyFromPi> ): Promise<Awaited<ReturnType<typeof getOAuthApiKeyFromPi>>> { - ensureGlobalUndiciEnvProxyDispatcher(); - return await getOAuthApiKeyFromPi(...args); + return await runtime.getOAuthApiKey(...args); } export async function refreshOpenAICodexToken( ...args: Parameters<typeof refreshOpenAICodexTokenFromPi> ): Promise<Awaited<ReturnType<typeof refreshOpenAICodexTokenFromPi>>> { -
ensureGlobalUndiciEnvProxyDispatcher(); - return await refreshOpenAICodexTokenFromPi(...args); + return await runtime.refreshOpenAICodexToken(...args); } diff --git a/extensions/speech-core/src/tts.test.ts b/extensions/speech-core/src/tts.test.ts index c91ece82bda..1da0bc9f196 100644 --- a/extensions/speech-core/src/tts.test.ts +++ b/extensions/speech-core/src/tts.test.ts @@ -30,6 +30,22 @@ const getSpeechProviderMock = vi.hoisted(() => vi.fn()); vi.mock("openclaw/plugin-sdk/channel-targets", () => ({ normalizeChannelId: (channel: string | undefined) => channel?.trim().toLowerCase() ?? null, + resolveChannelTtsVoiceDelivery: (channel: string | undefined) => { + const normalized = channel?.trim().toLowerCase(); + if (normalized === "bluebubbles") { + return { + synthesisTarget: "audio-file", + audioFileFormats: ["mp3", "caf", "audio/mpeg", "audio/x-caf"], + }; + } + if (normalized === "feishu" || normalized === "whatsapp") { + return { synthesisTarget: "voice-note", transcodesAudio: true }; + } + if (normalized === "discord" || normalized === "matrix" || normalized === "telegram") { + return { synthesisTarget: "voice-note" }; + } + return undefined; + }, })); vi.mock("../api.js", async () => { @@ -152,7 +168,7 @@ describe("speech-core native voice-note routing", () => { installSpeechProviders([createMockSpeechProvider()]); }); - it("keeps native voice-note channel support centralized", () => { + it("resolves voice delivery support from channel capabilities", () => { for (const channel of nativeVoiceNoteChannels) { expect(_test.supportsNativeVoiceNoteTts(channel)).toBe(true); expect(_test.supportsNativeVoiceNoteTts(channel.toUpperCase())).toBe(true); diff --git a/extensions/speech-core/src/tts.ts b/extensions/speech-core/src/tts.ts index 23fb38ccc7a..c0932a682b2 100644 --- a/extensions/speech-core/src/tts.ts +++ b/extensions/speech-core/src/tts.ts @@ -9,7 +9,7 @@ import { unlinkSync, } from "node:fs"; import path from "node:path"; -import { normalizeChannelId, type 
ChannelId } from "openclaw/plugin-sdk/channel-targets"; +import { resolveChannelTtsVoiceDelivery } from "openclaw/plugin-sdk/channel-targets"; import type { OpenClawConfig, ResolvedTtsPersona, @@ -738,52 +738,34 @@ export function setLastTtsAttempt(entry: TtsStatusEntry | undefined): void { lastTtsAttempt = entry; } -const VOICE_DELIVERY_CHANNELS = new Set([ - "bluebubbles", - "telegram", - "feishu", - "whatsapp", - "matrix", - "discord", -]); -const OPUS_CHANNELS = new Set(["telegram", "feishu", "whatsapp", "matrix", "discord"]); -const TRANSCODED_VOICE_NOTE_CHANNELS = new Set(["feishu", "whatsapp"]); -const AUDIO_FILE_VOICE_MEMO_CHANNELS = new Set(["bluebubbles"]); - -function resolveChannelId(channel: string | undefined): ChannelId | null { - return channel ? normalizeChannelId(channel) : null; -} - function supportsNativeVoiceNoteTts(channel: string | undefined): boolean { - const channelId = resolveChannelId(channel); - return channelId !== null && VOICE_DELIVERY_CHANNELS.has(channelId); + return resolveChannelTtsVoiceDelivery(channel) !== undefined; } function supportsTranscodedVoiceNoteTts(channel: string | undefined): boolean { - const channelId = resolveChannelId(channel); - return channelId !== null && TRANSCODED_VOICE_NOTE_CHANNELS.has(channelId); + const delivery = resolveChannelTtsVoiceDelivery(channel); + return delivery?.synthesisTarget === "voice-note" && delivery.transcodesAudio === true; } function resolveTtsSynthesisTarget(channel: string | undefined): "audio-file" | "voice-note" { - const channelId = resolveChannelId(channel); - return channelId !== null && OPUS_CHANNELS.has(channelId) ? "voice-note" : "audio-file"; + return resolveChannelTtsVoiceDelivery(channel)?.synthesisTarget ?? 
"audio-file"; } function supportsAudioFileVoiceMemoOutput(params: { fileExtension?: string; outputFormat?: string; + audioFileFormats?: readonly string[]; }): boolean { + const formats = new Set(params.audioFileFormats?.map((format) => format.trim().toLowerCase())); + if (formats.size === 0) { + return false; + } const extension = params.fileExtension?.trim().toLowerCase(); - if (extension === ".mp3" || extension === ".caf") { + if (extension && formats.has(extension.replace(/^\./, ""))) { return true; } const outputFormat = params.outputFormat?.trim().toLowerCase(); - return ( - outputFormat === "mp3" || - outputFormat === "caf" || - outputFormat === "audio/mpeg" || - outputFormat === "audio/x-caf" - ); + return outputFormat ? formats.has(outputFormat) : false; } function shouldDeliverTtsAsVoice(params: { @@ -793,17 +775,24 @@ function shouldDeliverTtsAsVoice(params: { fileExtension?: string; outputFormat?: string; }): boolean { - const channelId = resolveChannelId(params.channel); - if (channelId === null || !supportsNativeVoiceNoteTts(channelId)) { + const delivery = resolveChannelTtsVoiceDelivery(params.channel); + if (!delivery) { return false; } - if (AUDIO_FILE_VOICE_MEMO_CHANNELS.has(channelId)) { - return params.target === "audio-file" && supportsAudioFileVoiceMemoOutput(params); + if (delivery.synthesisTarget === "audio-file") { + return ( + params.target === "audio-file" && + supportsAudioFileVoiceMemoOutput({ + fileExtension: params.fileExtension, + outputFormat: params.outputFormat, + audioFileFormats: delivery.audioFileFormats, + }) + ); } if (params.target !== "voice-note") { return false; } - return params.voiceCompatible === true || supportsTranscodedVoiceNoteTts(params.channel); + return params.voiceCompatible === true || delivery.transcodesAudio === true; } export function resolveTtsProviderOrder(primary: TtsProvider, cfg?: OpenClawConfig): TtsProvider[] { diff --git a/extensions/telegram/src/shared.ts b/extensions/telegram/src/shared.ts index 
40ec20df11b..a0cdc6404e0 100644 --- a/extensions/telegram/src/shared.ts +++ b/extensions/telegram/src/shared.ts @@ -142,6 +142,11 @@ export function createTelegramPluginBase(params: { reactions: true, threads: true, media: true, + tts: { + voice: { + synthesisTarget: "voice-note", + }, + }, polls: true, nativeCommands: true, blockStreaming: true, diff --git a/extensions/whatsapp/src/shared.ts b/extensions/whatsapp/src/shared.ts index 79c1872a33a..e2e0f9cbaf0 100644 --- a/extensions/whatsapp/src/shared.ts +++ b/extensions/whatsapp/src/shared.ts @@ -212,6 +212,12 @@ export function createWhatsAppPluginBase(params: { polls: true, reactions: true, media: true, + tts: { + voice: { + synthesisTarget: "voice-note", + transcodesAudio: true, + }, + }, }, reload: { configPrefixes: ["web"], noopPrefixes: ["channels.whatsapp"] }, gatewayMethods: ["web.login.start", "web.login.wait"], diff --git a/src/channels/plugins/tts-capabilities.test.ts b/src/channels/plugins/tts-capabilities.test.ts new file mode 100644 index 00000000000..4892bc0adce --- /dev/null +++ b/src/channels/plugins/tts-capabilities.test.ts @@ -0,0 +1,111 @@ +import { afterEach, describe, expect, it } from "vitest"; +import { createEmptyPluginRegistry } from "../../plugins/registry-empty.js"; +import { setActivePluginRegistry } from "../../plugins/runtime.js"; +import { + createChannelTestPluginBase, + createTestRegistry, +} from "../../test-utils/channel-plugins.js"; +import { resolveChannelTtsVoiceDelivery } from "./tts-capabilities.js"; +import type { ChannelPlugin } from "./types.js"; + +function createChannelPlugin( + id: string, + capabilities: ChannelPlugin["capabilities"], +): ChannelPlugin { + return createChannelTestPluginBase({ + id, + label: id, + capabilities, + config: { + listAccountIds: () => ["default"], + }, + }); +} + +describe("resolveChannelTtsVoiceDelivery", () => { + afterEach(() => { + setActivePluginRegistry(createEmptyPluginRegistry()); + }); + + it("reads voice delivery behavior from 
channel plugin capabilities", () => { + setActivePluginRegistry( + createTestRegistry([ + { + pluginId: "bluebubbles", + plugin: createChannelPlugin("bluebubbles", { + chatTypes: ["direct"], + tts: { + voice: { + synthesisTarget: "audio-file", + audioFileFormats: ["mp3", "caf", "audio/mpeg", "audio/x-caf"], + }, + }, + }), + source: "test", + }, + { + pluginId: "discord", + plugin: createChannelPlugin("discord", { + chatTypes: ["direct"], + tts: { voice: { synthesisTarget: "voice-note" } }, + }), + source: "test", + }, + { + pluginId: "feishu", + plugin: createChannelPlugin("feishu", { + chatTypes: ["direct"], + tts: { voice: { synthesisTarget: "voice-note", transcodesAudio: true } }, + }), + source: "test", + }, + { + pluginId: "matrix", + plugin: createChannelPlugin("matrix", { + chatTypes: ["direct"], + tts: { voice: { synthesisTarget: "voice-note" } }, + }), + source: "test", + }, + { + pluginId: "telegram", + plugin: createChannelPlugin("telegram", { + chatTypes: ["direct"], + tts: { voice: { synthesisTarget: "voice-note" } }, + }), + source: "test", + }, + { + pluginId: "whatsapp", + plugin: createChannelPlugin("whatsapp", { + chatTypes: ["direct"], + tts: { voice: { synthesisTarget: "voice-note", transcodesAudio: true } }, + }), + source: "test", + }, + ]), + ); + expect(resolveChannelTtsVoiceDelivery("bluebubbles")).toEqual({ + synthesisTarget: "audio-file", + audioFileFormats: ["mp3", "caf", "audio/mpeg", "audio/x-caf"], + }); + expect(resolveChannelTtsVoiceDelivery("discord")).toEqual({ + synthesisTarget: "voice-note", + }); + expect(resolveChannelTtsVoiceDelivery("feishu")).toEqual({ + synthesisTarget: "voice-note", + transcodesAudio: true, + }); + expect(resolveChannelTtsVoiceDelivery("matrix")).toEqual({ + synthesisTarget: "voice-note", + }); + expect(resolveChannelTtsVoiceDelivery("telegram")).toEqual({ + synthesisTarget: "voice-note", + }); + expect(resolveChannelTtsVoiceDelivery("whatsapp")).toEqual({ + synthesisTarget: "voice-note", + 
transcodesAudio: true, + }); + expect(resolveChannelTtsVoiceDelivery("slack")).toBeUndefined(); + }); +}); diff --git a/src/channels/plugins/tts-capabilities.ts b/src/channels/plugins/tts-capabilities.ts new file mode 100644 index 00000000000..c3a1a6ffcfb --- /dev/null +++ b/src/channels/plugins/tts-capabilities.ts @@ -0,0 +1,13 @@ +import { normalizeChannelId } from "./registry.js"; +import { getChannelPlugin } from "./registry.js"; +import type { ChannelTtsVoiceDeliveryCapabilities } from "./types.core.js"; + +export function resolveChannelTtsVoiceDelivery( + channel: string | undefined, +): ChannelTtsVoiceDeliveryCapabilities | undefined { + const channelId = normalizeChannelId(channel); + if (!channelId) { + return undefined; + } + return getChannelPlugin(channelId)?.capabilities.tts?.voice; +} diff --git a/src/channels/plugins/types.core.ts b/src/channels/plugins/types.core.ts index c476246f8fe..8433d65163b 100644 --- a/src/channels/plugins/types.core.ts +++ b/src/channels/plugins/types.core.ts @@ -272,6 +272,13 @@ export type ChannelGroupContext = { senderE164?: string | null; }; +/** TTS voice delivery behavior advertised by a channel plugin. */ +export type ChannelTtsVoiceDeliveryCapabilities = { + synthesisTarget: "audio-file" | "voice-note"; + transcodesAudio?: boolean; + audioFileFormats?: readonly string[]; +}; + /** Static capability flags advertised by a channel plugin. 
*/ export type ChannelCapabilities = { chatTypes: Array; @@ -284,6 +291,9 @@ export type ChannelCapabilities = { groupManagement?: boolean; threads?: boolean; media?: boolean; + tts?: { + voice?: ChannelTtsVoiceDeliveryCapabilities; + }; nativeCommands?: boolean; blockStreaming?: boolean; }; diff --git a/src/plugin-sdk/channel-targets.ts b/src/plugin-sdk/channel-targets.ts index 15ffbf118c5..2d6003ca739 100644 --- a/src/plugin-sdk/channel-targets.ts +++ b/src/plugin-sdk/channel-targets.ts @@ -39,6 +39,7 @@ export { } from "../channels/plugins/chat-target-prefixes.js"; export type { ChannelId } from "../channels/plugins/types.public.js"; export { normalizeChannelId } from "../channels/plugins/registry.js"; +export { resolveChannelTtsVoiceDelivery } from "../channels/plugins/tts-capabilities.js"; export { buildUnresolvedTargetResults, resolveTargetsWithOptionalToken, diff --git a/src/plugin-sdk/tts-runtime.types.ts b/src/plugin-sdk/tts-runtime.types.ts index 60e0c24a4ca..a3a608daaa8 100644 --- a/src/plugin-sdk/tts-runtime.types.ts +++ b/src/plugin-sdk/tts-runtime.types.ts @@ -112,6 +112,8 @@ export type TtsTestFacade = { channel: string | undefined; target: TtsSpeechTarget | undefined; voiceCompatible: boolean | undefined; + fileExtension?: string; + outputFormat?: string; }) => boolean; summarizeText: (...args: unknown[]) => Promise; getResolvedSpeechProviderConfig: (