refactor(tts): resolve voice delivery from channel capabilities

2026-04-30 14:02:56 +08:00 · 2026-04-26 06:51:19 +01:00
parent 2784710f4d
commit d613c8e29b
20 changed files with 349 additions and 71 deletions
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -150,6 +150,8 @@ Docs: https://docs.openclaw.ai
  same inbound audio twice. Fixes #70580.
 - TTS/BlueBubbles: deliver compatible auto-TTS audio as iMessage voice memo
  bubbles instead of plain MP3/CAF file attachments. Fixes #16848.
+- TTS: resolve voice-note and voice-memo routing from channel plugin
+  capabilities instead of speech-core-owned channel id lists.
 - ACP: send subagent and async-task completion wakes to external ACP harnesses as
  plain prompts instead of OpenClaw internal runtime-context envelopes, while
  keeping those envelopes out of ACP transcripts.
--- a/docs/.generated/plugin-sdk-api-baseline.sha256
+++ b/docs/.generated/plugin-sdk-api-baseline.sha256
@@ -1,2 +1,2 @@
-690c1cd4c0c2c3d31577958120e14ac0bf555af529e03aa5e7965b1d04659c49  plugin-sdk-api-baseline.json
-a0e6ba472ddd3acea34c0a8fda8cbb7d1172b1671a671d5fef5a9f42d749ce0d  plugin-sdk-api-baseline.jsonl
+a81b6ddeb1fd24bf234a3b7ba1d51d18d7060afa49378dd92988f326e140db13  plugin-sdk-api-baseline.json
+90a6e45404c2c017c23ab9ee75e71503ec683a680f64266504fdab69e43f288b  plugin-sdk-api-baseline.jsonl
--- a/docs/tools/tts.md
+++ b/docs/tools/tts.md
@@ -646,6 +646,44 @@ or `messages.tts.prefsPath`.
 These override the effective config from `messages.tts` plus the active
 `agents.list[].tts` block for that host.

+## Output formats (fixed)
+
+TTS voice delivery is channel-capability driven. Channel plugins advertise
+whether voice-style TTS should ask providers for a native `voice-note` target or
+keep normal `audio-file` synthesis and only mark compatible output for voice
+delivery.
+
+- **Voice-note capable channels**: voice-note replies prefer Opus (`opus_48000_64` from ElevenLabs, `opus` from OpenAI).
+  - 48kHz / 64kbps is a good voice message tradeoff.
+- **Feishu / WhatsApp**: when a voice-note reply is produced as MP3/WebM/WAV/M4A
+  or another likely audio file, the channel plugin transcodes it to 48kHz
+  Ogg/Opus with `ffmpeg` before sending the native voice message. WhatsApp sends
+  the result through the Baileys `audio` payload with `ptt: true` and
+  `audio/ogg; codecs=opus`. If conversion fails, Feishu receives the original
+  file as an attachment, while the WhatsApp send fails rather than posting an
+  incompatible PTT payload.
+- **BlueBubbles**: keeps provider synthesis on the normal audio-file path; MP3
+  and CAF outputs are marked for iMessage voice memo delivery.
+- **Other channels**: MP3 (`mp3_44100_128` from ElevenLabs, `mp3` from OpenAI).
+  - 44.1kHz / 128kbps is the default balance for speech clarity.
+- **MiniMax**: MP3 (`speech-2.8-hd` model, 32kHz sample rate) for normal audio attachments. For voice-note targets on channels that advertise transcoding, OpenClaw transcodes the MiniMax MP3 to 48kHz Opus with `ffmpeg` before delivery.
+- **Xiaomi MiMo**: MP3 by default, or WAV when configured. For voice-note targets on channels that advertise transcoding, OpenClaw transcodes Xiaomi output to 48kHz Opus with `ffmpeg` before delivery.
+- **Local CLI**: uses the configured `outputFormat`. Voice-note targets are
+  converted to Ogg/Opus and telephony output is converted to raw 16 kHz mono PCM
+  with `ffmpeg`.
+- **Google Gemini**: Gemini API TTS returns raw 24kHz PCM. OpenClaw wraps it as WAV for audio attachments, transcodes it to 48kHz Opus for voice-note targets, and returns PCM directly for Talk/telephony.
+- **Gradium**: WAV for audio attachments, Opus for voice-note targets, and `ulaw_8000` at 8 kHz for telephony.
+- **Inworld**: MP3 for normal audio attachments, native `OGG_OPUS` for voice-note targets, and raw `PCM` at 22050 Hz for Talk/telephony.
+- **xAI**: MP3 by default; `responseFormat` may be `mp3`, `wav`, `pcm`, `mulaw`, or `alaw`. OpenClaw uses xAI's batch REST TTS endpoint and returns a complete audio attachment; xAI's streaming TTS WebSocket is not used by this provider path. Native Opus voice-note format is not supported by this path.
+- **Microsoft**: uses `microsoft.outputFormat` (default `audio-24khz-48kbitrate-mono-mp3`).
+  - The bundled transport accepts an `outputFormat`, but not all formats are available from the service.
+  - Output format values follow Microsoft Speech output formats (including Ogg/WebM Opus).
+  - Telegram `sendVoice` accepts OGG/MP3/M4A; use OpenAI/ElevenLabs if you need
+    guaranteed Opus voice messages.
+  - If the configured Microsoft output format fails, OpenClaw retries with MP3.
+
+OpenAI/ElevenLabs output formats are fixed per channel (see above).
+
 ## Auto-TTS behavior

 When `messages.tts.auto` is enabled, OpenClaw:
--- a/extensions/amazon-bedrock/index.test.ts
+++ b/extensions/amazon-bedrock/index.test.ts
@@ -1,13 +1,16 @@
 import { readFileSync } from "node:fs";
 import { resolve } from "node:path";
-import { beforeEach, describe, expect, it, vi } from "vitest";
+import { afterEach, beforeEach, describe, expect, it, vi } from "vitest";
 import type { OpenClawConfig } from "../../src/config/config.js";
 import { buildPluginApi } from "../../src/plugins/api-builder.js";
 import type { PluginRuntime } from "../../src/plugins/runtime/types.js";
 import { registerSingleProviderPlugin } from "../../test/helpers/plugins/plugin-registration.js";
 import { resetBedrockDiscoveryCacheForTest } from "./discovery.js";
 import amazonBedrockPlugin from "./index.js";
-import { resetBedrockAppProfileCacheEligibilityForTest } from "./register.sync.runtime.js";
+import {
+  resetBedrockAppProfileCacheEligibilityForTest,
+  setBedrockAppProfileControlPlaneForTest,
+} from "./register.sync.runtime.js";

 type BedrockClientResult =
  | {
@@ -211,6 +214,19 @@ describe("amazon-bedrock provider plugin", () => {
    sendBedrockCommand.mockClear();
    resetBedrockDiscoveryCacheForTest();
    resetBedrockAppProfileCacheEligibilityForTest();
+    setBedrockAppProfileControlPlaneForTest((region) => ({
+      async getInferenceProfile(input) {
+        class GetInferenceProfileCommand {
+          constructor(readonly input: Record<string, unknown> = {}) {}
+        }
+        bedrockClientConfigs.push(region ? { region } : {});
+        return await sendBedrockCommand(new GetInferenceProfileCommand(input));
+      },
+    }));
+  });
+
+  afterEach(() => {
+    setBedrockAppProfileControlPlaneForTest(undefined);
  });

  it("marks Claude 4.6 Bedrock models as adaptive by default", async () => {
--- a/extensions/amazon-bedrock/register.sync.runtime.ts
+++ b/extensions/amazon-bedrock/register.sync.runtime.ts
@@ -153,10 +153,42 @@ function resolvedModelSupportsCaching(modelArn: string): boolean {
 */
 const appProfileCacheEligibleCache = new Map<string, boolean>();

+type BedrockGetInferenceProfileResponse = { // narrowed view of the GetInferenceProfile result
+  models?: Array<{ modelArn?: string }>;
+};
+
+type BedrockControlPlane = { // the one control-plane call this module needs, so a fake can stand in
+  getInferenceProfile: (input: {
+    inferenceProfileIdentifier: string;
+  }) => Promise<BedrockGetInferenceProfileResponse>;
+};
+
+type BedrockControlPlaneFactory = (region: string | undefined) => BedrockControlPlane;
+
+let bedrockControlPlaneOverride: BedrockControlPlaneFactory | undefined; // assigned by setBedrockAppProfileControlPlaneForTest
+
 export function resetBedrockAppProfileCacheEligibilityForTest(): void {
  appProfileCacheEligibleCache.clear();
 }

+export function setBedrockAppProfileControlPlaneForTest( // test hook; pass undefined to remove the override
+  controlPlane: BedrockControlPlaneFactory | undefined,
+): void {
+  bedrockControlPlaneOverride = controlPlane;
+  resetBedrockAppProfileCacheEligibilityForTest(); // clear cached eligibility computed with the previous control plane
+}
+
+async function createBedrockControlPlane(region: string | undefined): Promise<BedrockControlPlane> {
+  if (bedrockControlPlaneOverride) { // an injected factory wins; the AWS SDK import below is skipped
+    return bedrockControlPlaneOverride(region);
+  }
+  const { BedrockClient, GetInferenceProfileCommand } = await import("@aws-sdk/client-bedrock"); // dynamic import, non-override path only
+  const client = new BedrockClient(region ? { region } : {}); // no region: construct with an empty config
+  return {
+    getInferenceProfile: async (input) => await client.send(new GetInferenceProfileCommand(input)),
+  };
+}
+
 async function resolveAppProfileCacheEligible(
  modelId: string,
  fallbackRegion: string | undefined,
@@ -165,12 +197,9 @@ async function resolveAppProfileCacheEligible(
    return appProfileCacheEligibleCache.get(modelId)!;
  }
  try {
-    const { BedrockClient, GetInferenceProfileCommand } = await import("@aws-sdk/client-bedrock");
    const region = extractRegionFromArn(modelId) ?? fallbackRegion;
-    const client = new BedrockClient(region ? { region } : {});
-    const resp = await client.send(
-      new GetInferenceProfileCommand({ inferenceProfileIdentifier: modelId }),
-    );
+    const controlPlane = await createBedrockControlPlane(region);
+    const resp = await controlPlane.getInferenceProfile({ inferenceProfileIdentifier: modelId });
    const models = resp.models ?? [];
    const eligible =
      models.length > 0 &&
--- a/extensions/bluebubbles/src/channel-shared.ts
+++ b/extensions/bluebubbles/src/channel-shared.ts
@@ -31,6 +31,12 @@ export const bluebubblesMeta = {
 export const bluebubblesCapabilities: ChannelPlugin<ResolvedBlueBubblesAccount>["capabilities"] = {
  chatTypes: ["direct", "group"],
  media: true,
+  tts: {
+    voice: {
+      synthesisTarget: "audio-file",
+      audioFileFormats: ["mp3", "caf", "audio/mpeg", "audio/x-caf"],
+    },
+  },
  reactions: true,
  edit: true,
  unsend: true,
--- a/extensions/discord/src/shared.ts
+++ b/extensions/discord/src/shared.ts
@@ -96,6 +96,11 @@ export function createDiscordPluginBase(params: {
      reactions: true,
      threads: true,
      media: true,
+      tts: {
+        voice: {
+          synthesisTarget: "voice-note",
+        },
+      },
      nativeCommands: true,
    },
    commands: {
--- a/extensions/feishu/src/channel.ts
+++ b/extensions/feishu/src/channel.ts
@@ -588,6 +588,12 @@ export const feishuPlugin: ChannelPlugin<ResolvedFeishuAccount, FeishuProbeResul
        polls: false,
        threads: true,
        media: true,
+        tts: {
+          voice: {
+            synthesisTarget: "voice-note",
+            transcodesAudio: true,
+          },
+        },
        reactions: true,
        edit: true,
        reply: true,
--- a/extensions/matrix/src/channel.ts
+++ b/extensions/matrix/src/channel.ts
@@ -334,6 +334,11 @@ export const matrixPlugin: ChannelPlugin<ResolvedMatrixAccount, MatrixProbe> =
        reactions: true,
        threads: true,
        media: true,
+        tts: {
+          voice: {
+            synthesisTarget: "voice-note",
+          },
+        },
      },
      reload: { configPrefixes: ["channels.matrix"] },
      configSchema: buildChannelConfigSchema(MatrixConfigSchema),
--- a/extensions/openai/index.test.ts
+++ b/extensions/openai/index.test.ts
@@ -21,12 +21,6 @@ const runtimeMocks = vi.hoisted(() => ({
  refreshOpenAICodexToken: vi.fn(),
 }));

-type OpenAIRefreshDelegateGlobal = typeof globalThis & {
-  __OPENCLAW_TEST_REFRESH_OPENAI_CODEX_TOKEN__?: (...args: unknown[]) => unknown;
-};
-
-const openAIRefreshDelegateGlobal = () => globalThis as OpenAIRefreshDelegateGlobal;
-
 vi.mock("openclaw/plugin-sdk/runtime-env", async () => {
  const actual = await vi.importActual<typeof import("openclaw/plugin-sdk/runtime-env")>(
    "openclaw/plugin-sdk/runtime-env",
@@ -41,12 +35,10 @@ vi.mock("@mariozechner/pi-ai/oauth", () => ({
  getOAuthApiKey: vi.fn(),
  getOAuthProviders: () => [],
  loginOpenAICodex: vi.fn(),
-  refreshOpenAICodexToken: vi.fn((...args: unknown[]) =>
-    openAIRefreshDelegateGlobal().__OPENCLAW_TEST_REFRESH_OPENAI_CODEX_TOKEN__?.(...args),
-  ),
+  refreshOpenAICodexToken: vi.fn(),
 }));

-import { refreshOpenAICodexToken } from "./openai-codex-provider.runtime.js";
+import { createOpenAICodexProviderRuntime } from "./openai-codex-provider.runtime.js";

 const _registerOpenAIPlugin = async () =>
  registerProviderPlugin({
@@ -312,19 +304,19 @@ describe("openai plugin", () => {
      expires: Date.now() + 60_000,
    };
    runtimeMocks.refreshOpenAICodexToken.mockResolvedValue(refreshed);
-    openAIRefreshDelegateGlobal().__OPENCLAW_TEST_REFRESH_OPENAI_CODEX_TOKEN__ =
-      runtimeMocks.refreshOpenAICodexToken;
-    try {
-      await expect(refreshOpenAICodexToken("refresh-token")).resolves.toBe(refreshed);
+    const runtime = createOpenAICodexProviderRuntime({
+      ensureGlobalUndiciEnvProxyDispatcher: runtimeMocks.ensureGlobalUndiciEnvProxyDispatcher,
+      getOAuthApiKey: vi.fn(),
+      refreshOpenAICodexToken: runtimeMocks.refreshOpenAICodexToken,
+    });

-      expect(runtimeMocks.ensureGlobalUndiciEnvProxyDispatcher).toHaveBeenCalledOnce();
-      expect(runtimeMocks.refreshOpenAICodexToken).toHaveBeenCalledOnce();
-      expect(
-        runtimeMocks.ensureGlobalUndiciEnvProxyDispatcher.mock.invocationCallOrder[0],
-      ).toBeLessThan(runtimeMocks.refreshOpenAICodexToken.mock.invocationCallOrder[0]);
-    } finally {
-      delete openAIRefreshDelegateGlobal().__OPENCLAW_TEST_REFRESH_OPENAI_CODEX_TOKEN__;
-    }
+    await expect(runtime.refreshOpenAICodexToken("refresh-token")).resolves.toBe(refreshed);
+
+    expect(runtimeMocks.ensureGlobalUndiciEnvProxyDispatcher).toHaveBeenCalledOnce();
+    expect(runtimeMocks.refreshOpenAICodexToken).toHaveBeenCalledOnce();
+    expect(
+      runtimeMocks.ensureGlobalUndiciEnvProxyDispatcher.mock.invocationCallOrder[0],
+    ).toBeLessThan(runtimeMocks.refreshOpenAICodexToken.mock.invocationCallOrder[0]);
  });

  it("registers provider-owned OpenAI tool compat hooks for openai and codex", async () => {
--- a/extensions/openai/openai-codex-provider.runtime.ts
+++ b/extensions/openai/openai-codex-provider.runtime.ts
@@ -4,16 +4,42 @@ import {
 } from "@mariozechner/pi-ai/oauth";
 import { ensureGlobalUndiciEnvProxyDispatcher } from "openclaw/plugin-sdk/runtime-env";

+type OpenAICodexProviderRuntimeDeps = { // collaborators injected into createOpenAICodexProviderRuntime
+  ensureGlobalUndiciEnvProxyDispatcher: typeof ensureGlobalUndiciEnvProxyDispatcher;
+  getOAuthApiKey: typeof getOAuthApiKeyFromPi;
+  refreshOpenAICodexToken: typeof refreshOpenAICodexTokenFromPi;
+};
+
+export function createOpenAICodexProviderRuntime(deps: OpenAICodexProviderRuntimeDeps): {
+  getOAuthApiKey: typeof getOAuthApiKey;
+  refreshOpenAICodexToken: typeof refreshOpenAICodexToken;
+} { // builds wrappers around the injected OAuth functions
+  return {
+    async getOAuthApiKey(...args) {
+      deps.ensureGlobalUndiciEnvProxyDispatcher(); // runs before every delegated OAuth call
+      return await deps.getOAuthApiKey(...args);
+    },
+    async refreshOpenAICodexToken(...args) {
+      deps.ensureGlobalUndiciEnvProxyDispatcher(); // ordering before the refresh is asserted in index.test.ts
+      return await deps.refreshOpenAICodexToken(...args);
+    },
+  };
+}
+
+const runtime = createOpenAICodexProviderRuntime({ // default instance wired to the real imports
+  ensureGlobalUndiciEnvProxyDispatcher,
+  getOAuthApiKey: getOAuthApiKeyFromPi,
+  refreshOpenAICodexToken: refreshOpenAICodexTokenFromPi,
+});
+
 export async function getOAuthApiKey(
  ...args: Parameters<typeof getOAuthApiKeyFromPi>
 ): Promise<Awaited<ReturnType<typeof getOAuthApiKeyFromPi>>> {
-  ensureGlobalUndiciEnvProxyDispatcher();
-  return await getOAuthApiKeyFromPi(...args);
+  return await runtime.getOAuthApiKey(...args);
 }

 export async function refreshOpenAICodexToken(
  ...args: Parameters<typeof refreshOpenAICodexTokenFromPi>
 ): Promise<Awaited<ReturnType<typeof refreshOpenAICodexTokenFromPi>>> {
-  ensureGlobalUndiciEnvProxyDispatcher();
-  return await refreshOpenAICodexTokenFromPi(...args);
+  return await runtime.refreshOpenAICodexToken(...args);
 }
--- a/extensions/speech-core/src/tts.test.ts
+++ b/extensions/speech-core/src/tts.test.ts
@@ -30,6 +30,22 @@ const getSpeechProviderMock = vi.hoisted(() => vi.fn());

 vi.mock("openclaw/plugin-sdk/channel-targets", () => ({
  normalizeChannelId: (channel: string | undefined) => channel?.trim().toLowerCase() ?? null,
+  resolveChannelTtsVoiceDelivery: (channel: string | undefined) => { // stub mirroring the capabilities the channel plugins declare
+    const normalized = channel?.trim().toLowerCase();
+    if (normalized === "bluebubbles") {
+      return {
+        synthesisTarget: "audio-file",
+        audioFileFormats: ["mp3", "caf", "audio/mpeg", "audio/x-caf"],
+      };
+    }
+    if (normalized === "feishu" || normalized === "whatsapp") {
+      return { synthesisTarget: "voice-note", transcodesAudio: true };
+    }
+    if (normalized === "discord" || normalized === "matrix" || normalized === "telegram") {
+      return { synthesisTarget: "voice-note" };
+    }
+    return undefined; // any other channel advertises no voice delivery
+  },
 }));

 vi.mock("../api.js", async () => {
@@ -152,7 +168,7 @@ describe("speech-core native voice-note routing", () => {
    installSpeechProviders([createMockSpeechProvider()]);
  });

-  it("keeps native voice-note channel support centralized", () => {
+  it("resolves voice delivery support from channel capabilities", () => {
    for (const channel of nativeVoiceNoteChannels) {
      expect(_test.supportsNativeVoiceNoteTts(channel)).toBe(true);
      expect(_test.supportsNativeVoiceNoteTts(channel.toUpperCase())).toBe(true);
--- a/extensions/speech-core/src/tts.ts
+++ b/extensions/speech-core/src/tts.ts
@@ -9,7 +9,7 @@ import {
  unlinkSync,
 } from "node:fs";
 import path from "node:path";
-import { normalizeChannelId, type ChannelId } from "openclaw/plugin-sdk/channel-targets";
+import { resolveChannelTtsVoiceDelivery } from "openclaw/plugin-sdk/channel-targets";
 import type {
  OpenClawConfig,
  ResolvedTtsPersona,
@@ -738,52 +738,34 @@ export function setLastTtsAttempt(entry: TtsStatusEntry | undefined): void {
  lastTtsAttempt = entry;
 }

-const VOICE_DELIVERY_CHANNELS = new Set([
-  "bluebubbles",
-  "telegram",
-  "feishu",
-  "whatsapp",
-  "matrix",
-  "discord",
-]);
-const OPUS_CHANNELS = new Set(["telegram", "feishu", "whatsapp", "matrix", "discord"]);
-const TRANSCODED_VOICE_NOTE_CHANNELS = new Set(["feishu", "whatsapp"]);
-const AUDIO_FILE_VOICE_MEMO_CHANNELS = new Set(["bluebubbles"]);
-
-function resolveChannelId(channel: string | undefined): ChannelId | null {
-  return channel ? normalizeChannelId(channel) : null;
-}
-
 function supportsNativeVoiceNoteTts(channel: string | undefined): boolean {
-  const channelId = resolveChannelId(channel);
-  return channelId !== null && VOICE_DELIVERY_CHANNELS.has(channelId);
+  return resolveChannelTtsVoiceDelivery(channel) !== undefined;
 }

 function supportsTranscodedVoiceNoteTts(channel: string | undefined): boolean {
-  const channelId = resolveChannelId(channel);
-  return channelId !== null && TRANSCODED_VOICE_NOTE_CHANNELS.has(channelId);
+  const delivery = resolveChannelTtsVoiceDelivery(channel);
+  return delivery?.synthesisTarget === "voice-note" && delivery.transcodesAudio === true;
 }

 function resolveTtsSynthesisTarget(channel: string | undefined): "audio-file" | "voice-note" {
-  const channelId = resolveChannelId(channel);
-  return channelId !== null && OPUS_CHANNELS.has(channelId) ? "voice-note" : "audio-file";
+  return resolveChannelTtsVoiceDelivery(channel)?.synthesisTarget ?? "audio-file";
 }

 function supportsAudioFileVoiceMemoOutput(params: {
  fileExtension?: string;
  outputFormat?: string;
+  audioFileFormats?: readonly string[]; // formats advertised by the channel plugin
 }): boolean {
+  const formats = new Set(params.audioFileFormats?.map((format) => format.trim().toLowerCase()));
+  if (formats.size === 0) { // nothing advertised: output is never eligible
+    return false;
+  }
  const extension = params.fileExtension?.trim().toLowerCase();
-  if (extension === ".mp3" || extension === ".caf") {
+  if (extension && formats.has(extension.replace(/^\./, ""))) { // NOTE(review): advertised formats are not dot-stripped, so ".mp3" would never match
    return true;
  }
  const outputFormat = params.outputFormat?.trim().toLowerCase();
-  return (
-    outputFormat === "mp3" ||
-    outputFormat === "caf" ||
-    outputFormat === "audio/mpeg" ||
-    outputFormat === "audio/x-caf"
-  );
+  return outputFormat ? formats.has(outputFormat) : false; // exact, case-insensitive match against the advertised list
 }

 function shouldDeliverTtsAsVoice(params: {
@@ -793,17 +775,24 @@ function shouldDeliverTtsAsVoice(params: {
  fileExtension?: string;
  outputFormat?: string;
 }): boolean {
-  const channelId = resolveChannelId(params.channel);
-  if (channelId === null || !supportsNativeVoiceNoteTts(channelId)) {
+  const delivery = resolveChannelTtsVoiceDelivery(params.channel);
+  if (!delivery) {
    return false;
  }
-  if (AUDIO_FILE_VOICE_MEMO_CHANNELS.has(channelId)) {
-    return params.target === "audio-file" && supportsAudioFileVoiceMemoOutput(params);
+  if (delivery.synthesisTarget === "audio-file") {
+    return (
+      params.target === "audio-file" &&
+      supportsAudioFileVoiceMemoOutput({
+        fileExtension: params.fileExtension,
+        outputFormat: params.outputFormat,
+        audioFileFormats: delivery.audioFileFormats,
+      })
+    );
  }
  if (params.target !== "voice-note") {
    return false;
  }
-  return params.voiceCompatible === true || supportsTranscodedVoiceNoteTts(params.channel);
+  return params.voiceCompatible === true || delivery.transcodesAudio === true;
 }

 export function resolveTtsProviderOrder(primary: TtsProvider, cfg?: OpenClawConfig): TtsProvider[] {
--- a/extensions/telegram/src/shared.ts
+++ b/extensions/telegram/src/shared.ts
@@ -142,6 +142,11 @@ export function createTelegramPluginBase(params: {
      reactions: true,
      threads: true,
      media: true,
+      tts: {
+        voice: {
+          synthesisTarget: "voice-note",
+        },
+      },
      polls: true,
      nativeCommands: true,
      blockStreaming: true,
--- a/extensions/whatsapp/src/shared.ts
+++ b/extensions/whatsapp/src/shared.ts
@@ -212,6 +212,12 @@ export function createWhatsAppPluginBase(params: {
      polls: true,
      reactions: true,
      media: true,
+      tts: {
+        voice: {
+          synthesisTarget: "voice-note",
+          transcodesAudio: true,
+        },
+      },
    },
    reload: { configPrefixes: ["web"], noopPrefixes: ["channels.whatsapp"] },
    gatewayMethods: ["web.login.start", "web.login.wait"],
--- a/src/channels/plugins/tts-capabilities.test.ts
+++ b/src/channels/plugins/tts-capabilities.test.ts
@@ -0,0 +1,111 @@
+import { afterEach, describe, expect, it } from "vitest";
+import { createEmptyPluginRegistry } from "../../plugins/registry-empty.js";
+import { setActivePluginRegistry } from "../../plugins/runtime.js";
+import {
+  createChannelTestPluginBase,
+  createTestRegistry,
+} from "../../test-utils/channel-plugins.js";
+import { resolveChannelTtsVoiceDelivery } from "./tts-capabilities.js";
+import type { ChannelPlugin } from "./types.js";
+
+function createChannelPlugin( // test plugin built on createChannelTestPluginBase with the given id and capabilities
+  id: string,
+  capabilities: ChannelPlugin["capabilities"],
+): ChannelPlugin {
+  return createChannelTestPluginBase({
+    id,
+    label: id,
+    capabilities,
+    config: {
+      listAccountIds: () => ["default"],
+    },
+  });
+}
+
+describe("resolveChannelTtsVoiceDelivery", () => {
+  afterEach(() => {
+    setActivePluginRegistry(createEmptyPluginRegistry()); // replace the registry installed by the test with an empty one
+  });
+
+  it("reads voice delivery behavior from channel plugin capabilities", () => {
+    setActivePluginRegistry(
+      createTestRegistry([
+        {
+          pluginId: "bluebubbles",
+          plugin: createChannelPlugin("bluebubbles", {
+            chatTypes: ["direct"],
+            tts: {
+              voice: {
+                synthesisTarget: "audio-file",
+                audioFileFormats: ["mp3", "caf", "audio/mpeg", "audio/x-caf"],
+              },
+            },
+          }),
+          source: "test",
+        },
+        {
+          pluginId: "discord",
+          plugin: createChannelPlugin("discord", {
+            chatTypes: ["direct"],
+            tts: { voice: { synthesisTarget: "voice-note" } },
+          }),
+          source: "test",
+        },
+        {
+          pluginId: "feishu",
+          plugin: createChannelPlugin("feishu", {
+            chatTypes: ["direct"],
+            tts: { voice: { synthesisTarget: "voice-note", transcodesAudio: true } },
+          }),
+          source: "test",
+        },
+        {
+          pluginId: "matrix",
+          plugin: createChannelPlugin("matrix", {
+            chatTypes: ["direct"],
+            tts: { voice: { synthesisTarget: "voice-note" } },
+          }),
+          source: "test",
+        },
+        {
+          pluginId: "telegram",
+          plugin: createChannelPlugin("telegram", {
+            chatTypes: ["direct"],
+            tts: { voice: { synthesisTarget: "voice-note" } },
+          }),
+          source: "test",
+        },
+        {
+          pluginId: "whatsapp",
+          plugin: createChannelPlugin("whatsapp", {
+            chatTypes: ["direct"],
+            tts: { voice: { synthesisTarget: "voice-note", transcodesAudio: true } },
+          }),
+          source: "test",
+        },
+      ]),
+    );
+    expect(resolveChannelTtsVoiceDelivery("bluebubbles")).toEqual({
+      synthesisTarget: "audio-file",
+      audioFileFormats: ["mp3", "caf", "audio/mpeg", "audio/x-caf"],
+    });
+    expect(resolveChannelTtsVoiceDelivery("discord")).toEqual({
+      synthesisTarget: "voice-note",
+    });
+    expect(resolveChannelTtsVoiceDelivery("feishu")).toEqual({
+      synthesisTarget: "voice-note",
+      transcodesAudio: true,
+    });
+    expect(resolveChannelTtsVoiceDelivery("matrix")).toEqual({
+      synthesisTarget: "voice-note",
+    });
+    expect(resolveChannelTtsVoiceDelivery("telegram")).toEqual({
+      synthesisTarget: "voice-note",
+    });
+    expect(resolveChannelTtsVoiceDelivery("whatsapp")).toEqual({
+      synthesisTarget: "voice-note",
+      transcodesAudio: true,
+    });
+    expect(resolveChannelTtsVoiceDelivery("slack")).toBeUndefined(); // "slack" is not in the registry built above
+  });
+});
--- a/src/channels/plugins/tts-capabilities.ts
+++ b/src/channels/plugins/tts-capabilities.ts
@@ -0,0 +1,13 @@
+import { getChannelPlugin, normalizeChannelId } from "./registry.js";
+import type { ChannelTtsVoiceDeliveryCapabilities } from "./types.core.js";
+
+/** Resolves the TTS voice capability advertised by the channel's plugin, if any. */
+export function resolveChannelTtsVoiceDelivery(
+  channel: string | undefined,
+): ChannelTtsVoiceDeliveryCapabilities | undefined {
+  const channelId = normalizeChannelId(channel);
+  if (!channelId) {
+    return undefined; // unknown or missing channel id: no voice delivery
+  }
+  return getChannelPlugin(channelId)?.capabilities.tts?.voice;
+}
--- a/src/channels/plugins/types.core.ts
+++ b/src/channels/plugins/types.core.ts
@@ -272,6 +272,13 @@ export type ChannelGroupContext = {
  senderE164?: string | null;
 };

+/** TTS voice delivery behavior advertised by a channel plugin. */
+export type ChannelTtsVoiceDeliveryCapabilities = {
+  synthesisTarget: "audio-file" | "voice-note"; // synthesis target speech-core resolves for this channel
+  transcodesAudio?: boolean; // true: voice-note output is sent as voice even when not marked voice-compatible
+  audioFileFormats?: readonly string[]; // audio-file path only: extensions (no dot) or output formats eligible for voice delivery
+};
+
 /** Static capability flags advertised by a channel plugin. */
 export type ChannelCapabilities = {
  chatTypes: Array<ChatType | "thread">;
@@ -284,6 +291,9 @@ export type ChannelCapabilities = {
  groupManagement?: boolean;
  threads?: boolean;
  media?: boolean;
+  tts?: {
+    voice?: ChannelTtsVoiceDeliveryCapabilities;
+  };
  nativeCommands?: boolean;
  blockStreaming?: boolean;
 };
--- a/src/plugin-sdk/channel-targets.ts
+++ b/src/plugin-sdk/channel-targets.ts
@@ -39,6 +39,7 @@ export {
 } from "../channels/plugins/chat-target-prefixes.js";
 export type { ChannelId } from "../channels/plugins/types.public.js";
 export { normalizeChannelId } from "../channels/plugins/registry.js";
+export { resolveChannelTtsVoiceDelivery } from "../channels/plugins/tts-capabilities.js";
 export {
  buildUnresolvedTargetResults,
  resolveTargetsWithOptionalToken,
--- a/src/plugin-sdk/tts-runtime.types.ts
+++ b/src/plugin-sdk/tts-runtime.types.ts
@@ -112,6 +112,8 @@ export type TtsTestFacade = {
    channel: string | undefined;
    target: TtsSpeechTarget | undefined;
    voiceCompatible: boolean | undefined;
+    fileExtension?: string;
+    outputFormat?: string;
  }) => boolean;
  summarizeText: (...args: unknown[]) => Promise<SummarizeResult>;
  getResolvedSpeechProviderConfig: (