diff --git a/CHANGELOG.md b/CHANGELOG.md
index 09d59976580..b9c10facbc7 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -29,6 +29,7 @@ Docs: https://docs.openclaw.ai
- Plugins/Google Meet: default Chrome realtime sessions to OpenAI plus SoX `rec`/`play` audio bridge commands, so the usual setup only needs the plugin enabled and `OPENAI_API_KEY`.
- Plugins/Google Meet: add a `chrome-node` transport so a paired macOS node, such as a Parallels VM, can own Chrome, BlackHole, and SoX while the Gateway machine keeps the agent and model key.
- Plugins/Bonjour: move LAN Gateway discovery advertising into a default-enabled bundled plugin with its own `@homebridge/ciao` dependency, so users can disable Bonjour without cutting wide-area discovery. Thanks @vincentkoc.
+- Providers/Google: add a Gemini Live realtime voice provider for backend Voice Call and Google Meet audio bridges, with bidirectional audio and function-call support.
- Providers/OpenAI: add image generation and reference-image editing through Codex OAuth, so `openai/gpt-image-2` works without an `OPENAI_API_KEY`. Fixes #70703.
- Providers/OpenRouter: add image generation and reference-image editing through `image_generate`, so OpenRouter image models work with `OPENROUTER_API_KEY`. Fixes #55066 via #67668. Thanks @notamicrodose.
- Image generation: let agents request provider-supported quality and output format hints, and pass OpenAI-specific background, moderation, compression, and user hints through the `image_generate` tool. (#70503) Thanks @ottodeng.
diff --git a/docs/providers/google.md b/docs/providers/google.md
index 97aacb9e59d..d805a1bf6f2 100644
--- a/docs/providers/google.md
+++ b/docs/providers/google.md
@@ -132,6 +132,7 @@ Choose your preferred auth method and follow the setup steps.
| Image generation | Yes |
| Music generation | Yes |
| Text-to-speech | Yes |
+| Realtime voice | Yes (Google Live API) |
| Image understanding | Yes |
| Audio transcription | Yes |
| Video understanding | Yes |
@@ -281,6 +282,63 @@ A Google Cloud Console API key restricted to the Gemini API is valid for this
provider. This is not the separate Cloud Text-to-Speech API path.
+## Realtime voice
+
+The bundled `google` plugin registers a realtime voice provider backed by the
+Gemini Live API for backend audio bridges such as Voice Call and Google Meet.
+
+| Setting | Config path | Default |
+| --------------------- | ------------------------------------------------------------------- | ------------------------------------------------------------------------------------- |
+| Model | `plugins.entries.voice-call.config.realtime.providers.google.model` | `gemini-2.5-flash-native-audio-preview-12-2025` |
+| Voice | `...google.voice` | `Kore` |
+| Temperature | `...google.temperature` | (unset) |
+| VAD start sensitivity | `...google.startSensitivity` | (unset) |
+| VAD end sensitivity | `...google.endSensitivity` | (unset) |
+| Silence duration | `...google.silenceDurationMs` | (unset) |
+| API key | `...google.apiKey` | Falls back to `models.providers.google.apiKey`, `GEMINI_API_KEY`, or `GOOGLE_API_KEY` |
+
+Example Voice Call realtime config:
+
+```json5
+{
+ plugins: {
+ entries: {
+ "voice-call": {
+ enabled: true,
+ config: {
+ realtime: {
+ enabled: true,
+ provider: "google",
+ providers: {
+ google: {
+ model: "gemini-2.5-flash-native-audio-preview-12-2025",
+ voice: "Kore",
+ },
+ },
+ },
+ },
+ },
+ },
+ },
+}
+```
+
+
+Google Live API uses bidirectional audio and function calling over a WebSocket.
+OpenClaw adapts telephony/Meet bridge audio to Gemini's PCM Live API stream and
+keeps tool calls on the shared realtime voice contract. Leave `temperature`
+unset unless you need sampling changes; OpenClaw omits non-positive values
+because Google Live can return transcripts without audio for `temperature: 0`.
+Gemini API transcription is enabled without `languageCodes`; the current Google
+SDK rejects language-code hints on this API path.
+
+
+
+Control UI Talk browser sessions still require a realtime voice provider with a
+browser WebRTC session implementation. Today that path is OpenAI Realtime; the
+Google provider is for backend realtime bridges.
+
+
## Advanced configuration
diff --git a/docs/web/control-ui.md b/docs/web/control-ui.md
index 0520c1b0ce7..5aa4e101708 100644
--- a/docs/web/control-ui.md
+++ b/docs/web/control-ui.md
@@ -156,12 +156,14 @@ Cron jobs panel notes:
- `chat.history` also strips display-only inline directive tags from visible assistant text (for example `[[reply_to_*]]` and `[[audio_as_voice]]`), plain-text tool-call XML payloads (including `...`, `...`, `...`, `...`, and truncated tool-call blocks), and leaked ASCII/full-width model control tokens, and omits assistant entries whose whole visible text is only the exact silent token `NO_REPLY` / `no_reply`.
- `chat.inject` appends an assistant note to the session transcript and broadcasts a `chat` event for UI-only updates (no agent run, no channel delivery).
- The chat header model and thinking pickers patch the active session immediately through `sessions.patch`; they are persistent session overrides, not one-turn-only send options.
-- Talk mode uses the registered realtime voice provider. Configure OpenAI with
- `talk.provider: "openai"` plus `talk.providers.openai.apiKey`, or reuse the
- Voice Call realtime provider config. The browser never receives the standard
- OpenAI API key; it receives only the ephemeral Realtime client secret. The
- Realtime session prompt is assembled by the Gateway; `talk.realtime.session`
- does not accept caller-provided instruction overrides.
+- Talk mode uses a registered realtime voice provider that supports browser
+ WebRTC sessions. Configure OpenAI with `talk.provider: "openai"` plus
+ `talk.providers.openai.apiKey`, or reuse the Voice Call realtime provider
+ config. The browser never receives the standard OpenAI API key; it receives
+ only the ephemeral Realtime client secret. Google Live realtime voice is
+ supported for backend Voice Call and Google Meet bridges, but not this browser
+ WebRTC path yet. The Realtime session prompt is assembled by the Gateway;
+ `talk.realtime.session` does not accept caller-provided instruction overrides.
- In the Chat composer, the Talk control is the waves button next to the
microphone dictation button. When Talk starts, the composer status row shows
`Connecting Talk...`, then `Talk live` while audio is connected, or
diff --git a/extensions/google/index.ts b/extensions/google/index.ts
index c022b655c70..8474ef7bd61 100644
--- a/extensions/google/index.ts
+++ b/extensions/google/index.ts
@@ -11,6 +11,7 @@ import {
} from "./generation-provider-metadata.js";
import { geminiMemoryEmbeddingProviderAdapter } from "./memory-embedding-adapter.js";
import { registerGoogleProvider } from "./provider-registration.js";
+import { buildGoogleRealtimeVoiceProvider } from "./realtime-voice-provider.js";
import { buildGoogleSpeechProvider } from "./speech-provider.js";
import { createGeminiWebSearchProvider } from "./src/gemini-web-search-provider.js";
@@ -156,6 +157,7 @@ export default definePluginEntry({
api.registerImageGenerationProvider(createLazyGoogleImageGenerationProvider());
api.registerMediaUnderstandingProvider(createLazyGoogleMediaUnderstandingProvider());
api.registerMusicGenerationProvider(createLazyGoogleMusicGenerationProvider());
+ api.registerRealtimeVoiceProvider(buildGoogleRealtimeVoiceProvider());
api.registerSpeechProvider(buildGoogleSpeechProvider());
api.registerVideoGenerationProvider(createLazyGoogleVideoGenerationProvider());
api.registerWebSearchProvider(createGeminiWebSearchProvider());
diff --git a/extensions/google/openclaw.plugin.json b/extensions/google/openclaw.plugin.json
index 02834c11ee2..a1641758254 100644
--- a/extensions/google/openclaw.plugin.json
+++ b/extensions/google/openclaw.plugin.json
@@ -49,6 +49,7 @@
"memoryEmbeddingProviders": ["gemini"],
"imageGenerationProviders": ["google"],
"musicGenerationProviders": ["google"],
+ "realtimeVoiceProviders": ["google"],
"speechProviders": ["google"],
"videoGenerationProviders": ["google"],
"webSearchProviders": ["gemini"]
diff --git a/extensions/google/realtime-voice-provider.test.ts b/extensions/google/realtime-voice-provider.test.ts
new file mode 100644
index 00000000000..626a75eb03c
--- /dev/null
+++ b/extensions/google/realtime-voice-provider.test.ts
@@ -0,0 +1,354 @@
+import { beforeEach, describe, expect, it, vi } from "vitest";
+import { buildGoogleRealtimeVoiceProvider } from "./realtime-voice-provider.js";
+
+type MockGoogleLiveSession = {
+ close: ReturnType<typeof vi.fn>;
+ sendClientContent: ReturnType<typeof vi.fn>;
+ sendRealtimeInput: ReturnType<typeof vi.fn>;
+ sendToolResponse: ReturnType<typeof vi.fn>;
+};
+
+type MockGoogleLiveConnectParams = {
+ model: string;
+ config: Record<string, unknown>;
+ callbacks: {
+ onopen: () => void;
+ onmessage: (message: Record<string, unknown>) => void;
+ onerror: (event: { error?: unknown; message?: string }) => void;
+ onclose: () => void;
+ };
+};
+
+const { connectMock, session } = vi.hoisted(() => {
+ const session: MockGoogleLiveSession = {
+ close: vi.fn(),
+ sendClientContent: vi.fn(),
+ sendRealtimeInput: vi.fn(),
+ sendToolResponse: vi.fn(),
+ };
+ const connectMock = vi.fn(async (_params: MockGoogleLiveConnectParams) => session);
+ return { connectMock, session };
+});
+
+vi.mock("./google-genai-runtime.js", () => ({
+ createGoogleGenAI: vi.fn(() => ({
+ live: {
+ connect: connectMock,
+ },
+ })),
+}));
+
+function lastConnectParams(): MockGoogleLiveConnectParams {
+ const params = connectMock.mock.calls.at(-1)?.[0];
+ if (!params) {
+ throw new Error("expected google live connect call");
+ }
+ return params;
+}
+
+describe("buildGoogleRealtimeVoiceProvider", () => {
+ beforeEach(() => {
+ connectMock.mockClear();
+ session.close.mockClear();
+ session.sendClientContent.mockClear();
+ session.sendRealtimeInput.mockClear();
+ session.sendToolResponse.mockClear();
+ delete process.env.GEMINI_API_KEY;
+ delete process.env.GOOGLE_API_KEY;
+ });
+
+ it("normalizes provider config and cfg model-provider key fallback", () => {
+ const provider = buildGoogleRealtimeVoiceProvider();
+ const resolved = provider.resolveConfig?.({
+ cfg: {
+ models: {
+ providers: {
+ google: {
+ apiKey: "cfg-key",
+ },
+ },
+ },
+ } as never,
+ rawConfig: {
+ providers: {
+ google: {
+ model: "gemini-live-2.5-flash-preview",
+ voice: "Puck",
+ temperature: 0.4,
+ silenceDurationMs: 700,
+ startSensitivity: "high",
+ },
+ },
+ },
+ });
+
+ expect(resolved).toEqual({
+ apiKey: "cfg-key",
+ model: "gemini-live-2.5-flash-preview",
+ voice: "Puck",
+ temperature: 0.4,
+ apiVersion: undefined,
+ prefixPaddingMs: undefined,
+ silenceDurationMs: 700,
+ startSensitivity: "high",
+ endSensitivity: undefined,
+ enableAffectiveDialog: undefined,
+ thinkingLevel: undefined,
+ thinkingBudget: undefined,
+ });
+ });
+
+ it("connects with Google Live setup config and tool declarations", async () => {
+ const provider = buildGoogleRealtimeVoiceProvider();
+ const bridge = provider.createBridge({
+ providerConfig: {
+ apiKey: "gemini-key",
+ model: "gemini-live-2.5-flash-preview",
+ voice: "Kore",
+ temperature: 0.3,
+ startSensitivity: "low",
+ },
+ instructions: "Speak briefly.",
+ tools: [
+ {
+ type: "function",
+ name: "lookup",
+ description: "Look something up",
+ parameters: {
+ type: "object",
+ properties: {
+ query: { type: "string" },
+ },
+ required: ["query"],
+ },
+ },
+ ],
+ onAudio: vi.fn(),
+ onClearAudio: vi.fn(),
+ });
+
+ await bridge.connect();
+
+ expect(connectMock).toHaveBeenCalledTimes(1);
+ expect(lastConnectParams()).toMatchObject({
+ model: "gemini-live-2.5-flash-preview",
+ config: {
+ responseModalities: ["AUDIO"],
+ temperature: 0.3,
+ systemInstruction: "Speak briefly.",
+ speechConfig: {
+ voiceConfig: {
+ prebuiltVoiceConfig: {
+ voiceName: "Kore",
+ },
+ },
+ },
+ outputAudioTranscription: {},
+ tools: [
+ {
+ functionDeclarations: [
+ {
+ name: "lookup",
+ description: "Look something up",
+ parametersJsonSchema: {
+ type: "object",
+ properties: {
+ query: { type: "string" },
+ },
+ required: ["query"],
+ },
+ },
+ ],
+ },
+ ],
+ },
+ });
+ });
+
+ it("omits zero temperature for native audio responses", async () => {
+ const provider = buildGoogleRealtimeVoiceProvider();
+ const bridge = provider.createBridge({
+ providerConfig: {
+ apiKey: "gemini-key",
+ temperature: 0,
+ },
+ onAudio: vi.fn(),
+ onClearAudio: vi.fn(),
+ });
+
+ await bridge.connect();
+
+ expect(lastConnectParams().config).not.toHaveProperty("temperature");
+ });
+
+ it("waits for setup completion before draining audio and firing ready", async () => {
+ const provider = buildGoogleRealtimeVoiceProvider();
+ const onReady = vi.fn();
+ const bridge = provider.createBridge({
+ providerConfig: { apiKey: "gemini-key" },
+ onAudio: vi.fn(),
+ onClearAudio: vi.fn(),
+ onReady,
+ });
+
+ await bridge.connect();
+ lastConnectParams().callbacks.onopen();
+ bridge.sendAudio(Buffer.from([0xff, 0xff]));
+
+ expect(session.sendRealtimeInput).not.toHaveBeenCalled();
+ expect(onReady).not.toHaveBeenCalled();
+
+ lastConnectParams().callbacks.onmessage({ setupComplete: { sessionId: "session-1" } });
+
+ expect(onReady).toHaveBeenCalledTimes(1);
+ expect(session.sendRealtimeInput).toHaveBeenCalledTimes(1);
+ expect(session.sendRealtimeInput.mock.calls[0]?.[0].audio).toMatchObject({
+ data: expect.any(String),
+ mimeType: "audio/pcm;rate=16000",
+ });
+ });
+
+ it("marks the Google audio stream complete after sustained telephony silence", async () => {
+ const provider = buildGoogleRealtimeVoiceProvider();
+ const bridge = provider.createBridge({
+ providerConfig: { apiKey: "gemini-key", silenceDurationMs: 60 },
+ onAudio: vi.fn(),
+ onClearAudio: vi.fn(),
+ });
+
+ await bridge.connect();
+ lastConnectParams().callbacks.onopen();
+ lastConnectParams().callbacks.onmessage({ setupComplete: { sessionId: "session-1" } });
+
+ const silence20ms = Buffer.alloc(160, 0xff);
+ bridge.sendAudio(silence20ms);
+ bridge.sendAudio(silence20ms);
+ bridge.sendAudio(silence20ms);
+
+ expect(session.sendRealtimeInput).toHaveBeenCalledWith({ audioStreamEnd: true });
+
+ const callsAfterStreamEnd = session.sendRealtimeInput.mock.calls.length;
+ bridge.sendAudio(silence20ms);
+ expect(session.sendRealtimeInput).toHaveBeenCalledTimes(callsAfterStreamEnd);
+
+ session.sendRealtimeInput.mockClear();
+ bridge.sendAudio(Buffer.alloc(160, 0x7f));
+ bridge.sendAudio(silence20ms);
+ bridge.sendAudio(silence20ms);
+ bridge.sendAudio(silence20ms);
+
+ expect(session.sendRealtimeInput).toHaveBeenCalledWith({ audioStreamEnd: true });
+ });
+
+ it("sends text prompts as ordered client turns", async () => {
+ const provider = buildGoogleRealtimeVoiceProvider();
+ const bridge = provider.createBridge({
+ providerConfig: { apiKey: "gemini-key" },
+ onAudio: vi.fn(),
+ onClearAudio: vi.fn(),
+ });
+
+ await bridge.connect();
+ lastConnectParams().callbacks.onopen();
+ lastConnectParams().callbacks.onmessage({ setupComplete: { sessionId: "session-1" } });
+
+ bridge.sendUserMessage?.(" Say hello. ");
+
+ expect(session.sendClientContent).toHaveBeenCalledWith({
+ turns: [{ role: "user", parts: [{ text: "Say hello." }] }],
+ turnComplete: true,
+ });
+ });
+
+ it("converts Google PCM output to mu-law audio", async () => {
+ const provider = buildGoogleRealtimeVoiceProvider();
+ const onAudio = vi.fn();
+ const bridge = provider.createBridge({
+ providerConfig: { apiKey: "gemini-key" },
+ onAudio,
+ onClearAudio: vi.fn(),
+ });
+ const pcm24k = Buffer.alloc(480);
+
+ await bridge.connect();
+ lastConnectParams().callbacks.onmessage({
+ setupComplete: { sessionId: "session-1" },
+ serverContent: {
+ modelTurn: {
+ parts: [
+ {
+ inlineData: {
+ mimeType: "audio/L16;codec=pcm;rate=24000",
+ data: pcm24k.toString("base64"),
+ },
+ },
+ ],
+ },
+ },
+ });
+
+ expect(onAudio).toHaveBeenCalledTimes(1);
+ expect(onAudio.mock.calls[0]?.[0]).toBeInstanceOf(Buffer);
+ expect(onAudio.mock.calls[0]?.[0]).toHaveLength(80);
+ });
+
+ it("does not forward Google thought text as assistant transcript", async () => {
+ const provider = buildGoogleRealtimeVoiceProvider();
+ const onTranscript = vi.fn();
+ const bridge = provider.createBridge({
+ providerConfig: { apiKey: "gemini-key" },
+ onAudio: vi.fn(),
+ onClearAudio: vi.fn(),
+ onTranscript,
+ });
+
+ await bridge.connect();
+ lastConnectParams().callbacks.onmessage({
+ setupComplete: {},
+ serverContent: {
+ modelTurn: {
+ parts: [{ text: "internal reasoning", thought: true }],
+ },
+ },
+ });
+
+ expect(onTranscript).not.toHaveBeenCalled();
+ });
+
+ it("forwards Live API tool calls and submits matching function responses", async () => {
+ const provider = buildGoogleRealtimeVoiceProvider();
+ const onToolCall = vi.fn();
+ const bridge = provider.createBridge({
+ providerConfig: { apiKey: "gemini-key" },
+ onAudio: vi.fn(),
+ onClearAudio: vi.fn(),
+ onToolCall,
+ });
+
+ await bridge.connect();
+ lastConnectParams().callbacks.onmessage({
+ setupComplete: { sessionId: "session-1" },
+ toolCall: {
+ functionCalls: [{ id: "call-1", name: "lookup", args: { query: "hi" } }],
+ },
+ });
+
+ expect(onToolCall).toHaveBeenCalledWith({
+ itemId: "call-1",
+ callId: "call-1",
+ name: "lookup",
+ args: { query: "hi" },
+ });
+
+ bridge.submitToolResult("call-1", { result: "ok" });
+
+ expect(session.sendToolResponse).toHaveBeenCalledWith({
+ functionResponses: [
+ {
+ id: "call-1",
+ response: { result: "ok" },
+ },
+ ],
+ });
+ });
+});
diff --git a/extensions/google/realtime-voice-provider.ts b/extensions/google/realtime-voice-provider.ts
new file mode 100644
index 00000000000..269792a42c8
--- /dev/null
+++ b/extensions/google/realtime-voice-provider.ts
@@ -0,0 +1,535 @@
+import { randomUUID } from "node:crypto";
+import {
+ EndSensitivity,
+ Modality,
+ StartSensitivity,
+ type FunctionDeclaration,
+ type FunctionResponse,
+ type LiveServerContent,
+ type LiveServerMessage,
+ type LiveServerToolCall,
+ type RealtimeInputConfig,
+ type ThinkingConfig,
+} from "@google/genai";
+import type { OpenClawConfig } from "openclaw/plugin-sdk/provider-onboard";
+import type {
+ RealtimeVoiceBridge,
+ RealtimeVoiceBridgeCreateRequest,
+ RealtimeVoiceProviderConfig,
+ RealtimeVoiceProviderPlugin,
+ RealtimeVoiceTool,
+} from "openclaw/plugin-sdk/realtime-voice";
+import { convertPcmToMulaw8k, mulawToPcm, resamplePcm } from "openclaw/plugin-sdk/realtime-voice";
+import { normalizeResolvedSecretInputString } from "openclaw/plugin-sdk/secret-input";
+import { normalizeOptionalString } from "openclaw/plugin-sdk/text-runtime";
+import { createGoogleGenAI } from "./google-genai-runtime.js";
+
+const GOOGLE_REALTIME_DEFAULT_MODEL = "gemini-2.5-flash-native-audio-preview-12-2025";
+const GOOGLE_REALTIME_DEFAULT_VOICE = "Kore";
+const GOOGLE_REALTIME_DEFAULT_API_VERSION = "v1beta";
+const GOOGLE_REALTIME_INPUT_SAMPLE_RATE = 16_000;
+const TELEPHONY_SAMPLE_RATE = 8000;
+const MAX_PENDING_AUDIO_CHUNKS = 320;
+const DEFAULT_AUDIO_STREAM_END_SILENCE_MS = 700;
+
+type GoogleRealtimeSensitivity = "low" | "high";
+type GoogleRealtimeThinkingLevel = "minimal" | "low" | "medium" | "high";
+
+type GoogleRealtimeVoiceProviderConfig = {
+ apiKey?: string;
+ model?: string;
+ voice?: string;
+ temperature?: number;
+ apiVersion?: string;
+ prefixPaddingMs?: number;
+ silenceDurationMs?: number;
+ startSensitivity?: GoogleRealtimeSensitivity;
+ endSensitivity?: GoogleRealtimeSensitivity;
+ enableAffectiveDialog?: boolean;
+ thinkingLevel?: GoogleRealtimeThinkingLevel;
+ thinkingBudget?: number;
+};
+
+type GoogleRealtimeVoiceBridgeConfig = RealtimeVoiceBridgeCreateRequest & {
+ apiKey: string;
+ model?: string;
+ voice?: string;
+ temperature?: number;
+ apiVersion?: string;
+ prefixPaddingMs?: number;
+ silenceDurationMs?: number;
+ startSensitivity?: GoogleRealtimeSensitivity;
+ endSensitivity?: GoogleRealtimeSensitivity;
+ enableAffectiveDialog?: boolean;
+ thinkingLevel?: GoogleRealtimeThinkingLevel;
+ thinkingBudget?: number;
+};
+
+type GoogleLiveSession = {
+ sendClientContent: (params: {
+ turns?: Array<{ role: string; parts: Array<{ text: string }> }>;
+ turnComplete?: boolean;
+ }) => void;
+ sendRealtimeInput: (params: {
+ audio?: { data: string; mimeType: string };
+ audioStreamEnd?: boolean;
+ }) => void;
+ sendToolResponse: (params: { functionResponses: FunctionResponse[] | FunctionResponse }) => void;
+ close: () => void;
+};
+
+function trimToUndefined(value: unknown): string | undefined {
+ return normalizeOptionalString(value);
+}
+
+function asFiniteNumber(value: unknown): number | undefined {
+ return typeof value === "number" && Number.isFinite(value) ? value : undefined;
+}
+
+function asBoolean(value: unknown): boolean | undefined {
+ return typeof value === "boolean" ? value : undefined;
+}
+
+function asSensitivity(value: unknown): GoogleRealtimeSensitivity | undefined {
+ const normalized = normalizeOptionalString(value)?.toLowerCase();
+ return normalized === "low" || normalized === "high" ? normalized : undefined;
+}
+
+function asThinkingLevel(value: unknown): GoogleRealtimeThinkingLevel | undefined {
+ const normalized = normalizeOptionalString(value)?.toLowerCase();
+ return normalized === "minimal" ||
+ normalized === "low" ||
+ normalized === "medium" ||
+ normalized === "high"
+ ? normalized
+ : undefined;
+}
+
+function resolveGoogleRealtimeProviderConfigRecord(
+ config: Record<string, unknown>,
+): Record<string, unknown> | undefined {
+ const providers =
+ typeof config.providers === "object" &&
+ config.providers !== null &&
+ !Array.isArray(config.providers)
+ ? (config.providers as Record<string, unknown>)
+ : undefined;
+ const nested = providers?.google;
+ return typeof nested === "object" && nested !== null && !Array.isArray(nested)
+ ? (nested as Record<string, unknown>)
+ : typeof config.google === "object" && config.google !== null && !Array.isArray(config.google)
+ ? (config.google as Record<string, unknown>)
+ : config;
+}
+
+function normalizeProviderConfig(
+ config: RealtimeVoiceProviderConfig,
+ cfg?: OpenClawConfig,
+): GoogleRealtimeVoiceProviderConfig {
+ const raw = resolveGoogleRealtimeProviderConfigRecord(config);
+ return {
+ apiKey: normalizeResolvedSecretInputString({
+ value: raw?.apiKey ?? cfg?.models?.providers?.google?.apiKey,
+ path: "plugins.entries.voice-call.config.realtime.providers.google.apiKey",
+ }),
+ model: trimToUndefined(raw?.model),
+ voice: trimToUndefined(raw?.voice),
+ temperature: asFiniteNumber(raw?.temperature),
+ apiVersion: trimToUndefined(raw?.apiVersion),
+ prefixPaddingMs: asFiniteNumber(raw?.prefixPaddingMs),
+ silenceDurationMs: asFiniteNumber(raw?.silenceDurationMs),
+ startSensitivity: asSensitivity(raw?.startSensitivity),
+ endSensitivity: asSensitivity(raw?.endSensitivity),
+ enableAffectiveDialog: asBoolean(raw?.enableAffectiveDialog),
+ thinkingLevel: asThinkingLevel(raw?.thinkingLevel),
+ thinkingBudget: asFiniteNumber(raw?.thinkingBudget),
+ };
+}
+
+function resolveEnvApiKey(): string | undefined {
+ return trimToUndefined(process.env.GEMINI_API_KEY) ?? trimToUndefined(process.env.GOOGLE_API_KEY);
+}
+
+function mapStartSensitivity(
+ value: GoogleRealtimeSensitivity | undefined,
+): StartSensitivity | undefined {
+ switch (value) {
+ case "high":
+ return StartSensitivity.START_SENSITIVITY_HIGH;
+ case "low":
+ return StartSensitivity.START_SENSITIVITY_LOW;
+ default:
+ return undefined;
+ }
+}
+
+function mapEndSensitivity(
+ value: GoogleRealtimeSensitivity | undefined,
+): EndSensitivity | undefined {
+ switch (value) {
+ case "high":
+ return EndSensitivity.END_SENSITIVITY_HIGH;
+ case "low":
+ return EndSensitivity.END_SENSITIVITY_LOW;
+ default:
+ return undefined;
+ }
+}
+
+function buildThinkingConfig(config: GoogleRealtimeVoiceBridgeConfig): ThinkingConfig | undefined {
+ if (config.thinkingLevel) {
+ return { thinkingLevel: config.thinkingLevel.toUpperCase() as ThinkingConfig["thinkingLevel"] };
+ }
+ if (typeof config.thinkingBudget === "number") {
+ return { thinkingBudget: config.thinkingBudget };
+ }
+ return undefined;
+}
+
+function buildRealtimeInputConfig(
+ config: GoogleRealtimeVoiceBridgeConfig,
+): RealtimeInputConfig | undefined {
+ const startSensitivity = mapStartSensitivity(config.startSensitivity);
+ const endSensitivity = mapEndSensitivity(config.endSensitivity);
+ const automaticActivityDetection = {
+ ...(startSensitivity ? { startOfSpeechSensitivity: startSensitivity } : {}),
+ ...(endSensitivity ? { endOfSpeechSensitivity: endSensitivity } : {}),
+ ...(typeof config.prefixPaddingMs === "number"
+ ? { prefixPaddingMs: Math.max(0, Math.floor(config.prefixPaddingMs)) }
+ : {}),
+ ...(typeof config.silenceDurationMs === "number"
+ ? { silenceDurationMs: Math.max(0, Math.floor(config.silenceDurationMs)) }
+ : {}),
+ };
+ return Object.keys(automaticActivityDetection).length > 0
+ ? { automaticActivityDetection }
+ : undefined;
+}
+
+function buildFunctionDeclarations(tools: RealtimeVoiceTool[] | undefined): FunctionDeclaration[] {
+ return (tools ?? []).map((tool) => ({
+ name: tool.name,
+ description: tool.description,
+ parametersJsonSchema: tool.parameters,
+ }));
+}
+
+function parsePcmSampleRate(mimeType: string | undefined): number {
+ const match = mimeType?.match(/(?:^|[;,\s])rate=(\d+)/i);
+ const parsed = match ? Number.parseInt(match[1] ?? "", 10) : Number.NaN;
+ return Number.isFinite(parsed) && parsed > 0 ? parsed : 24_000;
+}
+
+function isMulawSilence(audio: Buffer): boolean {
+ return audio.length > 0 && audio.every((sample) => sample === 0xff);
+}
+
+class GoogleRealtimeVoiceBridge implements RealtimeVoiceBridge {
+ private session: GoogleLiveSession | null = null;
+ private connected = false;
+ private sessionConfigured = false;
+ private intentionallyClosed = false;
+ private pendingAudio: Buffer[] = [];
+ private sessionReadyFired = false;
+ private consecutiveSilenceMs = 0;
+ private audioStreamEnded = false;
+
+ constructor(private readonly config: GoogleRealtimeVoiceBridgeConfig) {}
+
+ async connect(): Promise<void> {
+ this.intentionallyClosed = false;
+ this.sessionConfigured = false;
+ this.sessionReadyFired = false;
+ this.consecutiveSilenceMs = 0;
+ this.audioStreamEnded = false;
+
+ const ai = createGoogleGenAI({
+ apiKey: this.config.apiKey,
+ httpOptions: {
+ apiVersion: this.config.apiVersion ?? GOOGLE_REALTIME_DEFAULT_API_VERSION,
+ },
+ });
+
+ const functionDeclarations = buildFunctionDeclarations(this.config.tools);
+ this.session = (await ai.live.connect({
+ model: this.config.model ?? GOOGLE_REALTIME_DEFAULT_MODEL,
+ config: {
+ responseModalities: [Modality.AUDIO],
+ ...(typeof this.config.temperature === "number" && this.config.temperature > 0
+ ? { temperature: this.config.temperature }
+ : {}),
+ speechConfig: {
+ voiceConfig: {
+ prebuiltVoiceConfig: {
+ voiceName: this.config.voice ?? GOOGLE_REALTIME_DEFAULT_VOICE,
+ },
+ },
+ },
+ systemInstruction: this.config.instructions,
+ ...(functionDeclarations.length > 0 ? { tools: [{ functionDeclarations }] } : {}),
+ ...(this.realtimeInputConfig ? { realtimeInputConfig: this.realtimeInputConfig } : {}),
+ inputAudioTranscription: {},
+ outputAudioTranscription: {},
+ ...(typeof this.config.enableAffectiveDialog === "boolean"
+ ? { enableAffectiveDialog: this.config.enableAffectiveDialog }
+ : {}),
+ ...(this.thinkingConfig ? { thinkingConfig: this.thinkingConfig } : {}),
+ },
+ callbacks: {
+ onopen: () => {
+ this.connected = true;
+ },
+ onmessage: (message) => {
+ this.handleMessage(message);
+ },
+ onerror: (event) => {
+ const error =
+ event.error instanceof Error
+ ? event.error
+ : new Error(
+ typeof event.message === "string" ? event.message : "Google Live API error",
+ );
+ this.config.onError?.(error);
+ },
+ onclose: () => {
+ this.connected = false;
+ this.sessionConfigured = false;
+ const reason = this.intentionallyClosed ? "completed" : "error";
+ this.session = null;
+ this.config.onClose?.(reason);
+ },
+ },
+ })) as GoogleLiveSession;
+ }
+
+ sendAudio(audio: Buffer): void {
+ if (!this.session || !this.connected || !this.sessionConfigured) {
+ if (this.pendingAudio.length < MAX_PENDING_AUDIO_CHUNKS) {
+ this.pendingAudio.push(audio);
+ }
+ return;
+ }
+ const silent = isMulawSilence(audio);
+ if (silent && this.audioStreamEnded) {
+ return;
+ }
+ if (!silent) {
+ this.consecutiveSilenceMs = 0;
+ this.audioStreamEnded = false;
+ }
+
+ const pcm16k = resamplePcm(
+ mulawToPcm(audio),
+ TELEPHONY_SAMPLE_RATE,
+ GOOGLE_REALTIME_INPUT_SAMPLE_RATE,
+ );
+ this.session.sendRealtimeInput({
+ audio: {
+ data: pcm16k.toString("base64"),
+ mimeType: `audio/pcm;rate=${GOOGLE_REALTIME_INPUT_SAMPLE_RATE}`,
+ },
+ });
+
+ if (!silent) {
+ return;
+ }
+
+ const silenceThresholdMs =
+ typeof this.config.silenceDurationMs === "number"
+ ? Math.max(0, Math.floor(this.config.silenceDurationMs))
+ : DEFAULT_AUDIO_STREAM_END_SILENCE_MS;
+ this.consecutiveSilenceMs += Math.round((audio.length / TELEPHONY_SAMPLE_RATE) * 1000);
+ if (!this.audioStreamEnded && this.consecutiveSilenceMs >= silenceThresholdMs) {
+ this.session.sendRealtimeInput({ audioStreamEnd: true });
+ this.audioStreamEnded = true;
+ }
+ }
+
+ setMediaTimestamp(_ts: number): void {}
+
+ sendUserMessage(text: string): void {
+ const normalized = text.trim();
+ if (!normalized || !this.session || !this.connected || !this.sessionConfigured) {
+ return;
+ }
+ this.session.sendClientContent({
+ turns: [{ role: "user", parts: [{ text: normalized }] }],
+ turnComplete: true,
+ });
+ }
+
+ triggerGreeting(instructions?: string): void {
+ const greetingPrompt =
+ instructions?.trim() || "Start the call now. Greet the caller naturally and keep it brief.";
+ this.sendUserMessage(greetingPrompt);
+ }
+
+ submitToolResult(callId: string, result: unknown): void {
+ if (!this.session) {
+ return;
+ }
+ this.session.sendToolResponse({
+ functionResponses: [
+ {
+ id: callId,
+ response:
+ result && typeof result === "object"
+ ? (result as Record<string, unknown>)
+ : { output: result },
+ },
+ ],
+ });
+ }
+
+ acknowledgeMark(): void {}
+
+ close(): void {
+ this.intentionallyClosed = true;
+ this.connected = false;
+ this.sessionConfigured = false;
+ this.pendingAudio = [];
+ this.consecutiveSilenceMs = 0;
+ this.audioStreamEnded = false;
+ const session = this.session;
+ this.session = null;
+ session?.close();
+ }
+
+ isConnected(): boolean {
+ return this.connected && this.sessionConfigured;
+ }
+
+ private handleMessage(message: LiveServerMessage): void {
+ if (message.setupComplete) {
+ this.handleSetupComplete();
+ }
+ if (message.serverContent) {
+ this.handleServerContent(message.serverContent);
+ }
+ if (message.toolCall) {
+ this.handleToolCall(message.toolCall);
+ }
+ }
+
+ private handleSetupComplete(): void {
+ this.sessionConfigured = true;
+ for (const chunk of this.pendingAudio.splice(0)) {
+ this.sendAudio(chunk);
+ }
+ if (!this.sessionReadyFired) {
+ this.sessionReadyFired = true;
+ this.config.onReady?.();
+ }
+ }
+
+ private handleServerContent(content: LiveServerContent): void {
+ if (content.interrupted) {
+ this.config.onClearAudio();
+ }
+
+ if (content.inputTranscription?.text) {
+ this.config.onTranscript?.(
+ "user",
+ content.inputTranscription.text,
+ content.inputTranscription.finished ?? false,
+ );
+ }
+
+ if (content.outputTranscription?.text) {
+ this.config.onTranscript?.(
+ "assistant",
+ content.outputTranscription.text,
+ content.outputTranscription.finished ?? false,
+ );
+ }
+
+ let emittedAssistantText = false;
+ for (const part of content.modelTurn?.parts ?? []) {
+ if (part.inlineData?.data) {
+ const pcm = Buffer.from(part.inlineData.data, "base64");
+ const sampleRate = parsePcmSampleRate(part.inlineData.mimeType);
+ const muLaw = convertPcmToMulaw8k(pcm, sampleRate);
+ if (muLaw.length > 0) {
+ this.config.onAudio(muLaw);
+ this.config.onMark?.(`audio-${randomUUID()}`);
+ }
+ continue;
+ }
+ if (part.thought) {
+ continue;
+ }
+ if (!content.outputTranscription?.text && typeof part.text === "string" && part.text.trim()) {
+ emittedAssistantText = true;
+ this.config.onTranscript?.("assistant", part.text, content.turnComplete ?? false);
+ }
+ }
+
+ if (!emittedAssistantText && content.turnComplete && content.waitingForInput === false) {
+ return;
+ }
+ }
+
+ private handleToolCall(toolCall: LiveServerToolCall): void {
+ for (const call of toolCall.functionCalls ?? []) {
+ const name = call.name?.trim();
+ if (!name) {
+ continue;
+ }
+ const callId = call.id?.trim() || `google-live-${randomUUID()}`;
+ this.config.onToolCall?.({
+ itemId: callId,
+ callId,
+ name,
+ args: call.args ?? {},
+ });
+ }
+ }
+
+ private get realtimeInputConfig(): RealtimeInputConfig | undefined {
+ return buildRealtimeInputConfig(this.config);
+ }
+
+ private get thinkingConfig(): ThinkingConfig | undefined {
+ return buildThinkingConfig(this.config);
+ }
+}
+
+export function buildGoogleRealtimeVoiceProvider(): RealtimeVoiceProviderPlugin {
+ return {
+ id: "google",
+ label: "Google Live Voice",
+ autoSelectOrder: 20,
+ resolveConfig: ({ cfg, rawConfig }) => normalizeProviderConfig(rawConfig, cfg),
+ isConfigured: ({ providerConfig }) =>
+ Boolean(normalizeProviderConfig(providerConfig).apiKey || resolveEnvApiKey()),
+ createBridge: (req) => {
+ const config = normalizeProviderConfig(req.providerConfig);
+ const apiKey = config.apiKey || resolveEnvApiKey();
+ if (!apiKey) {
+ throw new Error("Google Gemini API key missing");
+ }
+ return new GoogleRealtimeVoiceBridge({
+ ...req,
+ apiKey,
+ model: config.model,
+ voice: config.voice,
+ temperature: config.temperature,
+ apiVersion: config.apiVersion,
+ prefixPaddingMs: config.prefixPaddingMs,
+ silenceDurationMs: config.silenceDurationMs,
+ startSensitivity: config.startSensitivity,
+ endSensitivity: config.endSensitivity,
+ enableAffectiveDialog: config.enableAffectiveDialog,
+ thinkingLevel: config.thinkingLevel,
+ thinkingBudget: config.thinkingBudget,
+ });
+ },
+ };
+}
+
+export {
+ GOOGLE_REALTIME_DEFAULT_API_VERSION,
+ GOOGLE_REALTIME_DEFAULT_MODEL,
+ GOOGLE_REALTIME_DEFAULT_VOICE,
+};
+export type { GoogleRealtimeVoiceProviderConfig };
diff --git a/extensions/voice-call/src/telephony-audio.ts b/extensions/voice-call/src/telephony-audio.ts
index e87111c3880..ca128275b41 100644
--- a/extensions/voice-call/src/telephony-audio.ts
+++ b/extensions/voice-call/src/telephony-audio.ts
@@ -1,105 +1,8 @@
-const TELEPHONY_SAMPLE_RATE = 8000;
-const RESAMPLE_FILTER_TAPS = 31;
-const RESAMPLE_CUTOFF_GUARD = 0.94;
-
-function clamp16(value: number): number {
- return Math.max(-32768, Math.min(32767, value));
-}
-
-function sinc(x: number): number {
- if (x === 0) {
- return 1;
- }
- return Math.sin(Math.PI * x) / (Math.PI * x);
-}
-
-/**
- * Build a finite low-pass kernel centered on `srcPos`.
- * The kernel is windowed (Hann) to reduce ringing artifacts.
- */
-function sampleBandlimited(
- input: Buffer,
- inputSamples: number,
- srcPos: number,
- cutoffCyclesPerSample: number,
-): number {
- const half = Math.floor(RESAMPLE_FILTER_TAPS / 2);
- const center = Math.floor(srcPos);
- let weighted = 0;
- let weightSum = 0;
-
- for (let tap = -half; tap <= half; tap++) {
- const sampleIndex = center + tap;
- if (sampleIndex < 0 || sampleIndex >= inputSamples) {
- continue;
- }
-
- const distance = sampleIndex - srcPos;
- const lowPass = 2 * cutoffCyclesPerSample * sinc(2 * cutoffCyclesPerSample * distance);
- const tapIndex = tap + half;
- const window = 0.5 - 0.5 * Math.cos((2 * Math.PI * tapIndex) / (RESAMPLE_FILTER_TAPS - 1));
- const coeff = lowPass * window;
- weighted += input.readInt16LE(sampleIndex * 2) * coeff;
- weightSum += coeff;
- }
-
- if (weightSum === 0) {
- const nearest = Math.max(0, Math.min(inputSamples - 1, Math.round(srcPos)));
- return input.readInt16LE(nearest * 2);
- }
-
- return weighted / weightSum;
-}
-
-/**
- * Resample 16-bit PCM (little-endian mono) to 8kHz using a windowed low-pass kernel.
- */
-export function resamplePcmTo8k(input: Buffer, inputSampleRate: number): Buffer {
- if (inputSampleRate === TELEPHONY_SAMPLE_RATE) {
- return input;
- }
- const inputSamples = Math.floor(input.length / 2);
- if (inputSamples === 0) {
- return Buffer.alloc(0);
- }
-
- const ratio = inputSampleRate / TELEPHONY_SAMPLE_RATE;
- const outputSamples = Math.floor(inputSamples / ratio);
- const output = Buffer.alloc(outputSamples * 2);
- const maxCutoff = 0.5;
- const downsampleCutoff = ratio > 1 ? maxCutoff / ratio : maxCutoff;
- const cutoffCyclesPerSample = Math.max(0.01, downsampleCutoff * RESAMPLE_CUTOFF_GUARD);
-
- for (let i = 0; i < outputSamples; i++) {
- const srcPos = i * ratio;
- const sample = Math.round(
- sampleBandlimited(input, inputSamples, srcPos, cutoffCyclesPerSample),
- );
- output.writeInt16LE(clamp16(sample), i * 2);
- }
-
- return output;
-}
-
-/**
- * Convert 16-bit PCM to 8-bit mu-law (G.711).
- */
-export function pcmToMulaw(pcm: Buffer): Buffer {
- const samples = Math.floor(pcm.length / 2);
- const mulaw = Buffer.alloc(samples);
-
- for (let i = 0; i < samples; i++) {
- const sample = pcm.readInt16LE(i * 2);
- mulaw[i] = linearToMulaw(sample);
- }
-
- return mulaw;
-}
-
-export function convertPcmToMulaw8k(pcm: Buffer, inputSampleRate: number): Buffer {
- const pcm8k = resamplePcmTo8k(pcm, inputSampleRate);
- return pcmToMulaw(pcm8k);
-}
+export {
+ convertPcmToMulaw8k,
+ pcmToMulaw,
+ resamplePcmTo8k,
+} from "openclaw/plugin-sdk/realtime-voice";
/**
 * Chunk audio buffer into 20ms frames for streaming (8kHz mono mu-law).
 */
@@ -111,25 +14,3 @@ export function chunkAudio(audio: Buffer, chunkSize = 160): Generator<Buffer> {
     yield audio.subarray(offset, offset + chunkSize);
   }
 }
-
-function linearToMulaw(sample: number): number {
-  const BIAS = 132;
-  const CLIP = 32635;
-
-  const sign = sample < 0 ? 0x80 : 0;
-  if (sample < 0) {
-    sample = -sample;
-  }
-  if (sample > CLIP) {
- sample = CLIP;
- }
-
- sample += BIAS;
- let exponent = 7;
- for (let expMask = 0x4000; (sample & expMask) === 0 && exponent > 0; exponent--) {
- expMask >>= 1;
- }
-
- const mantissa = (sample >> (exponent + 3)) & 0x0f;
- return ~(sign | (exponent << 4) | mantissa) & 0xff;
-}
diff --git a/extensions/whatsapp/src/session.test.ts b/extensions/whatsapp/src/session.test.ts
index b24bf1e87c3..be3d70dfc41 100644
--- a/extensions/whatsapp/src/session.test.ts
+++ b/extensions/whatsapp/src/session.test.ts
@@ -341,10 +341,13 @@ describe("web session", () => {
sock.ev.emit("creds.update", {});
sock.ev.emit("creds.update", {});
- await flushCredsUpdate();
- expect(inFlight).toBe(1);
-
- (release as (() => void) | null)?.();
+ try {
+ await vi.waitFor(() => {
+ expect(inFlight).toBe(1);
+ });
+ } finally {
+ (release as (() => void) | null)?.();
+ }
await waitForCredsSaveQueue(authDir);
@@ -389,13 +392,16 @@ describe("web session", () => {
onError,
);
- await flushCredsUpdate();
+ try {
+ await vi.waitFor(() => {
+ expect(inFlightA).toBe(1);
+ expect(inFlightB).toBe(1);
+ });
+ } finally {
+ (releaseA as (() => void) | null)?.();
+ (releaseB as (() => void) | null)?.();
+ }
- expect(inFlightA).toBe(1);
- expect(inFlightB).toBe(1);
-
- (releaseA as (() => void) | null)?.();
- (releaseB as (() => void) | null)?.();
await Promise.all([waitForCredsSaveQueue(authDirA), waitForCredsSaveQueue(authDirB)]);
expect(inFlightA).toBe(0);
diff --git a/extensions/zalo/src/monitor.lifecycle.test.ts b/extensions/zalo/src/monitor.lifecycle.test.ts
index 7d17b9d9cc8..f526e5e81fd 100644
--- a/extensions/zalo/src/monitor.lifecycle.test.ts
+++ b/extensions/zalo/src/monitor.lifecycle.test.ts
@@ -152,7 +152,7 @@ describe("monitorZaloProvider lifecycle", () => {
abort.abort();
- await vi.waitFor(() => expect(deleteWebhookMock).toHaveBeenCalledTimes(1));
+ await vi.waitFor(() => expect(deleteWebhookMock).toHaveBeenCalledTimes(1), { timeout: 5000 });
expect(deleteWebhookMock).toHaveBeenCalledWith("test-token", undefined, 5000);
expect(settled).toBe(false);
expect(registry.httpRoutes).toHaveLength(2);
diff --git a/src/plugin-sdk/realtime-voice.ts b/src/plugin-sdk/realtime-voice.ts
index 07aedef6299..45ef379b472 100644
--- a/src/plugin-sdk/realtime-voice.ts
+++ b/src/plugin-sdk/realtime-voice.ts
@@ -36,3 +36,10 @@ export {
type RealtimeVoiceBridgeSessionParams,
type RealtimeVoiceMarkStrategy,
} from "../realtime-voice/session-runtime.js";
+export {
+ convertPcmToMulaw8k,
+ mulawToPcm,
+ pcmToMulaw,
+ resamplePcm,
+ resamplePcmTo8k,
+} from "../realtime-voice/audio-codec.js";
diff --git a/src/realtime-voice/audio-codec.ts b/src/realtime-voice/audio-codec.ts
new file mode 100644
index 00000000000..9151cecc9b6
--- /dev/null
+++ b/src/realtime-voice/audio-codec.ts
@@ -0,0 +1,138 @@
+const TELEPHONY_SAMPLE_RATE = 8000;
+const RESAMPLE_FILTER_TAPS = 31;
+const RESAMPLE_CUTOFF_GUARD = 0.94;
+
+function clamp16(value: number): number {
+ return Math.max(-32768, Math.min(32767, value));
+}
+
+function sinc(x: number): number {
+ if (x === 0) {
+ return 1;
+ }
+ return Math.sin(Math.PI * x) / (Math.PI * x);
+}
+
+function sampleBandlimited(
+ input: Buffer,
+ inputSamples: number,
+ srcPos: number,
+ cutoffCyclesPerSample: number,
+): number {
+ const half = Math.floor(RESAMPLE_FILTER_TAPS / 2);
+ const center = Math.floor(srcPos);
+ let weighted = 0;
+ let weightSum = 0;
+
+ for (let tap = -half; tap <= half; tap += 1) {
+ const sampleIndex = center + tap;
+ if (sampleIndex < 0 || sampleIndex >= inputSamples) {
+ continue;
+ }
+
+ const distance = sampleIndex - srcPos;
+ const lowPass = 2 * cutoffCyclesPerSample * sinc(2 * cutoffCyclesPerSample * distance);
+ const tapIndex = tap + half;
+ const window = 0.5 - 0.5 * Math.cos((2 * Math.PI * tapIndex) / (RESAMPLE_FILTER_TAPS - 1));
+ const coeff = lowPass * window;
+ weighted += input.readInt16LE(sampleIndex * 2) * coeff;
+ weightSum += coeff;
+ }
+
+ if (weightSum === 0) {
+ const nearest = Math.max(0, Math.min(inputSamples - 1, Math.round(srcPos)));
+ return input.readInt16LE(nearest * 2);
+ }
+
+ return weighted / weightSum;
+}
+
+export function resamplePcm(
+ input: Buffer,
+ inputSampleRate: number,
+ outputSampleRate: number,
+): Buffer {
+ if (inputSampleRate === outputSampleRate) {
+ return input;
+ }
+ const inputSamples = Math.floor(input.length / 2);
+ if (inputSamples === 0) {
+ return Buffer.alloc(0);
+ }
+
+ const ratio = inputSampleRate / outputSampleRate;
+ const outputSamples = Math.floor(inputSamples / ratio);
+ const output = Buffer.alloc(outputSamples * 2);
+ const maxCutoff = 0.5;
+ const downsampleCutoff = ratio > 1 ? maxCutoff / ratio : maxCutoff;
+ const cutoffCyclesPerSample = Math.max(0.01, downsampleCutoff * RESAMPLE_CUTOFF_GUARD);
+
+ for (let i = 0; i < outputSamples; i += 1) {
+ const sample = Math.round(
+ sampleBandlimited(input, inputSamples, i * ratio, cutoffCyclesPerSample),
+ );
+ output.writeInt16LE(clamp16(sample), i * 2);
+ }
+
+ return output;
+}
+
+export function resamplePcmTo8k(input: Buffer, inputSampleRate: number): Buffer {
+ return resamplePcm(input, inputSampleRate, TELEPHONY_SAMPLE_RATE);
+}
+
+export function pcmToMulaw(pcm: Buffer): Buffer {
+ const samples = Math.floor(pcm.length / 2);
+ const mulaw = Buffer.alloc(samples);
+
+ for (let i = 0; i < samples; i += 1) {
+ const sample = pcm.readInt16LE(i * 2);
+ mulaw[i] = linearToMulaw(sample);
+ }
+
+ return mulaw;
+}
+
+export function mulawToPcm(mulaw: Buffer): Buffer {
+ const pcm = Buffer.alloc(mulaw.length * 2);
+ for (let i = 0; i < mulaw.length; i += 1) {
+ pcm.writeInt16LE(clamp16(mulawToLinear(mulaw[i] ?? 0)), i * 2);
+ }
+ return pcm;
+}
+
+export function convertPcmToMulaw8k(pcm: Buffer, inputSampleRate: number): Buffer {
+ return pcmToMulaw(resamplePcmTo8k(pcm, inputSampleRate));
+}
+
+function linearToMulaw(sample: number): number {
+ const BIAS = 132;
+ const CLIP = 32635;
+
+ const sign = sample < 0 ? 0x80 : 0;
+ if (sample < 0) {
+ sample = -sample;
+ }
+ if (sample > CLIP) {
+ sample = CLIP;
+ }
+
+ sample += BIAS;
+ let exponent = 7;
+ for (let expMask = 0x4000; (sample & expMask) === 0 && exponent > 0; exponent -= 1) {
+ expMask >>= 1;
+ }
+
+ const mantissa = (sample >> (exponent + 3)) & 0x0f;
+ return ~(sign | (exponent << 4) | mantissa) & 0xff;
+}
+
+function mulawToLinear(value: number): number {
+ const muLaw = ~value & 0xff;
+ const sign = muLaw & 0x80;
+ const exponent = (muLaw >> 4) & 0x07;
+ const mantissa = muLaw & 0x0f;
+ let sample = ((mantissa << 3) + 132) << exponent;
+ sample -= 132;
+ return sign ? -sample : sample;
+}
diff --git a/test/helpers/plugins/plugin-registration-contract-cases.ts b/test/helpers/plugins/plugin-registration-contract-cases.ts
index 2c227cf8fbf..b2bdcc325f9 100644
--- a/test/helpers/plugins/plugin-registration-contract-cases.ts
+++ b/test/helpers/plugins/plugin-registration-contract-cases.ts
@@ -55,6 +55,7 @@ export const pluginRegistrationContractCases = {
pluginId: "google",
providerIds: ["google", "google-gemini-cli"],
webSearchProviderIds: ["gemini"],
+ realtimeVoiceProviderIds: ["google"],
speechProviderIds: ["google"],
mediaUnderstandingProviderIds: ["google"],
imageGenerationProviderIds: ["google"],