feat(google): add realtime voice provider

Peter Steinberger
2026-04-24 09:08:09 +01:00
parent c138368040
commit b5e5f2cede
13 changed files with 1127 additions and 141 deletions

View File

@@ -29,6 +29,7 @@ Docs: https://docs.openclaw.ai
- Plugins/Google Meet: default Chrome realtime sessions to OpenAI plus SoX `rec`/`play` audio bridge commands, so the usual setup only needs the plugin enabled and `OPENAI_API_KEY`.
- Plugins/Google Meet: add a `chrome-node` transport so a paired macOS node, such as a Parallels VM, can own Chrome, BlackHole, and SoX while the Gateway machine keeps the agent and model key.
- Plugins/Bonjour: move LAN Gateway discovery advertising into a default-enabled bundled plugin with its own `@homebridge/ciao` dependency, so users can disable Bonjour without cutting wide-area discovery. Thanks @vincentkoc.
- Providers/Google: add a Gemini Live realtime voice provider for backend Voice Call and Google Meet audio bridges, with bidirectional audio and function-call support.
- Providers/OpenAI: add image generation and reference-image editing through Codex OAuth, so `openai/gpt-image-2` works without an `OPENAI_API_KEY`. Fixes #70703.
- Providers/OpenRouter: add image generation and reference-image editing through `image_generate`, so OpenRouter image models work with `OPENROUTER_API_KEY`. Fixes #55066 via #67668. Thanks @notamicrodose.
- Image generation: let agents request provider-supported quality and output format hints, and pass OpenAI-specific background, moderation, compression, and user hints through the `image_generate` tool. (#70503) Thanks @ottodeng.

View File

@@ -132,6 +132,7 @@ Choose your preferred auth method and follow the setup steps.
| Image generation | Yes |
| Music generation | Yes |
| Text-to-speech | Yes |
| Realtime voice | Yes (Google Live API) |
| Image understanding | Yes |
| Audio transcription | Yes |
| Video understanding | Yes |
@@ -281,6 +282,63 @@ A Google Cloud Console API key restricted to the Gemini API is valid for this
provider. This is not the separate Cloud Text-to-Speech API path.
</Note>
## Realtime voice
The bundled `google` plugin registers a realtime voice provider backed by the
Gemini Live API for backend audio bridges such as Voice Call and Google Meet.
| Setting | Config path | Default |
| --------------------- | ------------------------------------------------------------------- | ------------------------------------------------------------------------------------- |
| Model | `plugins.entries.voice-call.config.realtime.providers.google.model` | `gemini-2.5-flash-native-audio-preview-12-2025` |
| Voice | `...google.voice` | `Kore` |
| Temperature | `...google.temperature` | (unset) |
| VAD start sensitivity | `...google.startSensitivity` | (unset) |
| VAD end sensitivity | `...google.endSensitivity` | (unset) |
| Silence duration | `...google.silenceDurationMs` | (unset) |
| API key | `...google.apiKey` | Falls back to `models.providers.google.apiKey`, `GEMINI_API_KEY`, or `GOOGLE_API_KEY` |
Example Voice Call realtime config:
```json5
{
plugins: {
entries: {
"voice-call": {
enabled: true,
config: {
realtime: {
enabled: true,
provider: "google",
providers: {
google: {
model: "gemini-2.5-flash-native-audio-preview-12-2025",
voice: "Kore",
},
},
},
},
},
},
},
}
```
<Note>
Google Live API uses bidirectional audio and function calling over a WebSocket.
OpenClaw adapts telephony/Meet bridge audio to the Gemini Live API's PCM stream
and keeps tool calls on the shared realtime voice contract. Leave `temperature`
unset unless you need sampling changes; OpenClaw omits non-positive values
because Google Live can return transcripts without audio for `temperature: 0`.
Gemini API transcription is enabled without `languageCodes`; the current Google
SDK rejects language-code hints on this API path.
</Note>
<Note>
Control UI Talk browser sessions still require a realtime voice provider with a
browser WebRTC session implementation. Today that path is OpenAI Realtime; the
Google provider is for backend realtime bridges.
</Note>
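For the browser Talk path specifically, a minimal sketch of the OpenAI fallback. The key paths (`talk.provider`, `talk.providers.openai.apiKey`) come from the Control UI docs; the top-level placement shown here is illustrative:
```json5
{
  talk: {
    provider: "openai",
    providers: {
      openai: {
        // Standard OpenAI API key; the browser only ever receives the
        // ephemeral Realtime client secret minted by the Gateway.
        apiKey: "...",
      },
    },
  },
}
```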
## Advanced configuration
<AccordionGroup>

View File

@@ -156,12 +156,14 @@ Cron jobs panel notes:
- `chat.history` also strips display-only inline directive tags from visible assistant text (for example `[[reply_to_*]]` and `[[audio_as_voice]]`), plain-text tool-call XML payloads (including `<tool_call>...</tool_call>`, `<function_call>...</function_call>`, `<tool_calls>...</tool_calls>`, `<function_calls>...</function_calls>`, and truncated tool-call blocks), and leaked ASCII/full-width model control tokens, and omits assistant entries whose whole visible text is only the exact silent token `NO_REPLY` / `no_reply`.
- `chat.inject` appends an assistant note to the session transcript and broadcasts a `chat` event for UI-only updates (no agent run, no channel delivery).
- The chat header model and thinking pickers patch the active session immediately through `sessions.patch`; they are persistent session overrides, not one-turn-only send options.
- Talk mode uses the registered realtime voice provider. Configure OpenAI with
`talk.provider: "openai"` plus `talk.providers.openai.apiKey`, or reuse the
Voice Call realtime provider config. The browser never receives the standard
OpenAI API key; it receives only the ephemeral Realtime client secret. The
Realtime session prompt is assembled by the Gateway; `talk.realtime.session`
does not accept caller-provided instruction overrides.
- Talk mode uses a registered realtime voice provider that supports browser
WebRTC sessions. Configure OpenAI with `talk.provider: "openai"` plus
`talk.providers.openai.apiKey`, or reuse the Voice Call realtime provider
config. The browser never receives the standard OpenAI API key; it receives
only the ephemeral Realtime client secret. Google Live realtime voice is
supported for backend Voice Call and Google Meet bridges, but not this browser
WebRTC path yet. The Realtime session prompt is assembled by the Gateway;
`talk.realtime.session` does not accept caller-provided instruction overrides.
- In the Chat composer, the Talk control is the waves button next to the
microphone dictation button. When Talk starts, the composer status row shows
`Connecting Talk...`, then `Talk live` while audio is connected, or

View File

@@ -11,6 +11,7 @@ import {
} from "./generation-provider-metadata.js";
import { geminiMemoryEmbeddingProviderAdapter } from "./memory-embedding-adapter.js";
import { registerGoogleProvider } from "./provider-registration.js";
import { buildGoogleRealtimeVoiceProvider } from "./realtime-voice-provider.js";
import { buildGoogleSpeechProvider } from "./speech-provider.js";
import { createGeminiWebSearchProvider } from "./src/gemini-web-search-provider.js";
@@ -156,6 +157,7 @@ export default definePluginEntry({
api.registerImageGenerationProvider(createLazyGoogleImageGenerationProvider());
api.registerMediaUnderstandingProvider(createLazyGoogleMediaUnderstandingProvider());
api.registerMusicGenerationProvider(createLazyGoogleMusicGenerationProvider());
api.registerRealtimeVoiceProvider(buildGoogleRealtimeVoiceProvider());
api.registerSpeechProvider(buildGoogleSpeechProvider());
api.registerVideoGenerationProvider(createLazyGoogleVideoGenerationProvider());
api.registerWebSearchProvider(createGeminiWebSearchProvider());

View File

@@ -49,6 +49,7 @@
"memoryEmbeddingProviders": ["gemini"],
"imageGenerationProviders": ["google"],
"musicGenerationProviders": ["google"],
"realtimeVoiceProviders": ["google"],
"speechProviders": ["google"],
"videoGenerationProviders": ["google"],
"webSearchProviders": ["gemini"]

View File

@@ -0,0 +1,354 @@
import { beforeEach, describe, expect, it, vi } from "vitest";
import { buildGoogleRealtimeVoiceProvider } from "./realtime-voice-provider.js";
type MockGoogleLiveSession = {
close: ReturnType<typeof vi.fn>;
sendClientContent: ReturnType<typeof vi.fn>;
sendRealtimeInput: ReturnType<typeof vi.fn>;
sendToolResponse: ReturnType<typeof vi.fn>;
};
type MockGoogleLiveConnectParams = {
model: string;
config: Record<string, unknown>;
callbacks: {
onopen: () => void;
onmessage: (message: Record<string, unknown>) => void;
onerror: (event: { error?: unknown; message?: string }) => void;
onclose: () => void;
};
};
const { connectMock, session } = vi.hoisted(() => {
const session: MockGoogleLiveSession = {
close: vi.fn(),
sendClientContent: vi.fn(),
sendRealtimeInput: vi.fn(),
sendToolResponse: vi.fn(),
};
const connectMock = vi.fn(async (_params: MockGoogleLiveConnectParams) => session);
return { connectMock, session };
});
vi.mock("./google-genai-runtime.js", () => ({
createGoogleGenAI: vi.fn(() => ({
live: {
connect: connectMock,
},
})),
}));
function lastConnectParams(): MockGoogleLiveConnectParams {
const params = connectMock.mock.calls.at(-1)?.[0];
if (!params) {
throw new Error("expected google live connect call");
}
return params;
}
describe("buildGoogleRealtimeVoiceProvider", () => {
beforeEach(() => {
connectMock.mockClear();
session.close.mockClear();
session.sendClientContent.mockClear();
session.sendRealtimeInput.mockClear();
session.sendToolResponse.mockClear();
delete process.env.GEMINI_API_KEY;
delete process.env.GOOGLE_API_KEY;
});
it("normalizes provider config and cfg model-provider key fallback", () => {
const provider = buildGoogleRealtimeVoiceProvider();
const resolved = provider.resolveConfig?.({
cfg: {
models: {
providers: {
google: {
apiKey: "cfg-key",
},
},
},
} as never,
rawConfig: {
providers: {
google: {
model: "gemini-live-2.5-flash-preview",
voice: "Puck",
temperature: 0.4,
silenceDurationMs: 700,
startSensitivity: "high",
},
},
},
});
expect(resolved).toEqual({
apiKey: "cfg-key",
model: "gemini-live-2.5-flash-preview",
voice: "Puck",
temperature: 0.4,
apiVersion: undefined,
prefixPaddingMs: undefined,
silenceDurationMs: 700,
startSensitivity: "high",
endSensitivity: undefined,
enableAffectiveDialog: undefined,
thinkingLevel: undefined,
thinkingBudget: undefined,
});
});
it("connects with Google Live setup config and tool declarations", async () => {
const provider = buildGoogleRealtimeVoiceProvider();
const bridge = provider.createBridge({
providerConfig: {
apiKey: "gemini-key",
model: "gemini-live-2.5-flash-preview",
voice: "Kore",
temperature: 0.3,
startSensitivity: "low",
},
instructions: "Speak briefly.",
tools: [
{
type: "function",
name: "lookup",
description: "Look something up",
parameters: {
type: "object",
properties: {
query: { type: "string" },
},
required: ["query"],
},
},
],
onAudio: vi.fn(),
onClearAudio: vi.fn(),
});
await bridge.connect();
expect(connectMock).toHaveBeenCalledTimes(1);
expect(lastConnectParams()).toMatchObject({
model: "gemini-live-2.5-flash-preview",
config: {
responseModalities: ["AUDIO"],
temperature: 0.3,
systemInstruction: "Speak briefly.",
speechConfig: {
voiceConfig: {
prebuiltVoiceConfig: {
voiceName: "Kore",
},
},
},
outputAudioTranscription: {},
tools: [
{
functionDeclarations: [
{
name: "lookup",
description: "Look something up",
parametersJsonSchema: {
type: "object",
properties: {
query: { type: "string" },
},
required: ["query"],
},
},
],
},
],
},
});
});
it("omits zero temperature for native audio responses", async () => {
const provider = buildGoogleRealtimeVoiceProvider();
const bridge = provider.createBridge({
providerConfig: {
apiKey: "gemini-key",
temperature: 0,
},
onAudio: vi.fn(),
onClearAudio: vi.fn(),
});
await bridge.connect();
expect(lastConnectParams().config).not.toHaveProperty("temperature");
});
it("waits for setup completion before draining audio and firing ready", async () => {
const provider = buildGoogleRealtimeVoiceProvider();
const onReady = vi.fn();
const bridge = provider.createBridge({
providerConfig: { apiKey: "gemini-key" },
onAudio: vi.fn(),
onClearAudio: vi.fn(),
onReady,
});
await bridge.connect();
lastConnectParams().callbacks.onopen();
bridge.sendAudio(Buffer.from([0xff, 0xff]));
expect(session.sendRealtimeInput).not.toHaveBeenCalled();
expect(onReady).not.toHaveBeenCalled();
lastConnectParams().callbacks.onmessage({ setupComplete: { sessionId: "session-1" } });
expect(onReady).toHaveBeenCalledTimes(1);
expect(session.sendRealtimeInput).toHaveBeenCalledTimes(1);
expect(session.sendRealtimeInput.mock.calls[0]?.[0].audio).toMatchObject({
data: expect.any(String),
mimeType: "audio/pcm;rate=16000",
});
});
it("marks the Google audio stream complete after sustained telephony silence", async () => {
const provider = buildGoogleRealtimeVoiceProvider();
const bridge = provider.createBridge({
providerConfig: { apiKey: "gemini-key", silenceDurationMs: 60 },
onAudio: vi.fn(),
onClearAudio: vi.fn(),
});
await bridge.connect();
lastConnectParams().callbacks.onopen();
lastConnectParams().callbacks.onmessage({ setupComplete: { sessionId: "session-1" } });
const silence20ms = Buffer.alloc(160, 0xff);
bridge.sendAudio(silence20ms);
bridge.sendAudio(silence20ms);
bridge.sendAudio(silence20ms);
expect(session.sendRealtimeInput).toHaveBeenCalledWith({ audioStreamEnd: true });
const callsAfterStreamEnd = session.sendRealtimeInput.mock.calls.length;
bridge.sendAudio(silence20ms);
expect(session.sendRealtimeInput).toHaveBeenCalledTimes(callsAfterStreamEnd);
session.sendRealtimeInput.mockClear();
bridge.sendAudio(Buffer.alloc(160, 0x7f));
bridge.sendAudio(silence20ms);
bridge.sendAudio(silence20ms);
bridge.sendAudio(silence20ms);
expect(session.sendRealtimeInput).toHaveBeenCalledWith({ audioStreamEnd: true });
});
it("sends text prompts as ordered client turns", async () => {
const provider = buildGoogleRealtimeVoiceProvider();
const bridge = provider.createBridge({
providerConfig: { apiKey: "gemini-key" },
onAudio: vi.fn(),
onClearAudio: vi.fn(),
});
await bridge.connect();
lastConnectParams().callbacks.onopen();
lastConnectParams().callbacks.onmessage({ setupComplete: { sessionId: "session-1" } });
bridge.sendUserMessage?.(" Say hello. ");
expect(session.sendClientContent).toHaveBeenCalledWith({
turns: [{ role: "user", parts: [{ text: "Say hello." }] }],
turnComplete: true,
});
});
it("converts Google PCM output to mu-law audio", async () => {
const provider = buildGoogleRealtimeVoiceProvider();
const onAudio = vi.fn();
const bridge = provider.createBridge({
providerConfig: { apiKey: "gemini-key" },
onAudio,
onClearAudio: vi.fn(),
});
const pcm24k = Buffer.alloc(480);
await bridge.connect();
lastConnectParams().callbacks.onmessage({
setupComplete: { sessionId: "session-1" },
serverContent: {
modelTurn: {
parts: [
{
inlineData: {
mimeType: "audio/L16;codec=pcm;rate=24000",
data: pcm24k.toString("base64"),
},
},
],
},
},
});
expect(onAudio).toHaveBeenCalledTimes(1);
expect(onAudio.mock.calls[0]?.[0]).toBeInstanceOf(Buffer);
expect(onAudio.mock.calls[0]?.[0]).toHaveLength(80);
});
it("does not forward Google thought text as assistant transcript", async () => {
const provider = buildGoogleRealtimeVoiceProvider();
const onTranscript = vi.fn();
const bridge = provider.createBridge({
providerConfig: { apiKey: "gemini-key" },
onAudio: vi.fn(),
onClearAudio: vi.fn(),
onTranscript,
});
await bridge.connect();
lastConnectParams().callbacks.onmessage({
setupComplete: {},
serverContent: {
modelTurn: {
parts: [{ text: "internal reasoning", thought: true }],
},
},
});
expect(onTranscript).not.toHaveBeenCalled();
});
it("forwards Live API tool calls and submits matching function responses", async () => {
const provider = buildGoogleRealtimeVoiceProvider();
const onToolCall = vi.fn();
const bridge = provider.createBridge({
providerConfig: { apiKey: "gemini-key" },
onAudio: vi.fn(),
onClearAudio: vi.fn(),
onToolCall,
});
await bridge.connect();
lastConnectParams().callbacks.onmessage({
setupComplete: { sessionId: "session-1" },
toolCall: {
functionCalls: [{ id: "call-1", name: "lookup", args: { query: "hi" } }],
},
});
expect(onToolCall).toHaveBeenCalledWith({
itemId: "call-1",
callId: "call-1",
name: "lookup",
args: { query: "hi" },
});
bridge.submitToolResult("call-1", { result: "ok" });
expect(session.sendToolResponse).toHaveBeenCalledWith({
functionResponses: [
{
id: "call-1",
response: { result: "ok" },
},
],
});
});
});

View File

@@ -0,0 +1,535 @@
import { randomUUID } from "node:crypto";
import {
EndSensitivity,
Modality,
StartSensitivity,
type FunctionDeclaration,
type FunctionResponse,
type LiveServerContent,
type LiveServerMessage,
type LiveServerToolCall,
type RealtimeInputConfig,
type ThinkingConfig,
} from "@google/genai";
import type { OpenClawConfig } from "openclaw/plugin-sdk/provider-onboard";
import type {
RealtimeVoiceBridge,
RealtimeVoiceBridgeCreateRequest,
RealtimeVoiceProviderConfig,
RealtimeVoiceProviderPlugin,
RealtimeVoiceTool,
} from "openclaw/plugin-sdk/realtime-voice";
import { convertPcmToMulaw8k, mulawToPcm, resamplePcm } from "openclaw/plugin-sdk/realtime-voice";
import { normalizeResolvedSecretInputString } from "openclaw/plugin-sdk/secret-input";
import { normalizeOptionalString } from "openclaw/plugin-sdk/text-runtime";
import { createGoogleGenAI } from "./google-genai-runtime.js";
const GOOGLE_REALTIME_DEFAULT_MODEL = "gemini-2.5-flash-native-audio-preview-12-2025";
const GOOGLE_REALTIME_DEFAULT_VOICE = "Kore";
const GOOGLE_REALTIME_DEFAULT_API_VERSION = "v1beta";
const GOOGLE_REALTIME_INPUT_SAMPLE_RATE = 16_000;
const TELEPHONY_SAMPLE_RATE = 8000;
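// Bound the audio buffered before Live setup completes (~6 s, assuming 20 ms
// telephony frames) and default the sustained-silence window that triggers
// an explicit audioStreamEnd signal.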
const MAX_PENDING_AUDIO_CHUNKS = 320;
const DEFAULT_AUDIO_STREAM_END_SILENCE_MS = 700;
type GoogleRealtimeSensitivity = "low" | "high";
type GoogleRealtimeThinkingLevel = "minimal" | "low" | "medium" | "high";
type GoogleRealtimeVoiceProviderConfig = {
apiKey?: string;
model?: string;
voice?: string;
temperature?: number;
apiVersion?: string;
prefixPaddingMs?: number;
silenceDurationMs?: number;
startSensitivity?: GoogleRealtimeSensitivity;
endSensitivity?: GoogleRealtimeSensitivity;
enableAffectiveDialog?: boolean;
thinkingLevel?: GoogleRealtimeThinkingLevel;
thinkingBudget?: number;
};
type GoogleRealtimeVoiceBridgeConfig = RealtimeVoiceBridgeCreateRequest & {
apiKey: string;
model?: string;
voice?: string;
temperature?: number;
apiVersion?: string;
prefixPaddingMs?: number;
silenceDurationMs?: number;
startSensitivity?: GoogleRealtimeSensitivity;
endSensitivity?: GoogleRealtimeSensitivity;
enableAffectiveDialog?: boolean;
thinkingLevel?: GoogleRealtimeThinkingLevel;
thinkingBudget?: number;
};
type GoogleLiveSession = {
sendClientContent: (params: {
turns?: Array<{ role: string; parts: Array<{ text: string }> }>;
turnComplete?: boolean;
}) => void;
sendRealtimeInput: (params: {
audio?: { data: string; mimeType: string };
audioStreamEnd?: boolean;
}) => void;
sendToolResponse: (params: { functionResponses: FunctionResponse[] | FunctionResponse }) => void;
close: () => void;
};
function trimToUndefined(value: unknown): string | undefined {
return normalizeOptionalString(value);
}
function asFiniteNumber(value: unknown): number | undefined {
return typeof value === "number" && Number.isFinite(value) ? value : undefined;
}
function asBoolean(value: unknown): boolean | undefined {
return typeof value === "boolean" ? value : undefined;
}
function asSensitivity(value: unknown): GoogleRealtimeSensitivity | undefined {
const normalized = normalizeOptionalString(value)?.toLowerCase();
return normalized === "low" || normalized === "high" ? normalized : undefined;
}
function asThinkingLevel(value: unknown): GoogleRealtimeThinkingLevel | undefined {
const normalized = normalizeOptionalString(value)?.toLowerCase();
return normalized === "minimal" ||
normalized === "low" ||
normalized === "medium" ||
normalized === "high"
? normalized
: undefined;
}
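// Accept three config shapes: { providers: { google: {...} } }, a top-level
// { google: {...} } record, or an already-flattened provider record.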
function resolveGoogleRealtimeProviderConfigRecord(
config: Record<string, unknown>,
): Record<string, unknown> | undefined {
const providers =
typeof config.providers === "object" &&
config.providers !== null &&
!Array.isArray(config.providers)
? (config.providers as Record<string, unknown>)
: undefined;
const nested = providers?.google;
return typeof nested === "object" && nested !== null && !Array.isArray(nested)
? (nested as Record<string, unknown>)
: typeof config.google === "object" && config.google !== null && !Array.isArray(config.google)
? (config.google as Record<string, unknown>)
: config;
}
function normalizeProviderConfig(
config: RealtimeVoiceProviderConfig,
cfg?: OpenClawConfig,
): GoogleRealtimeVoiceProviderConfig {
const raw = resolveGoogleRealtimeProviderConfigRecord(config);
return {
apiKey: normalizeResolvedSecretInputString({
value: raw?.apiKey ?? cfg?.models?.providers?.google?.apiKey,
path: "plugins.entries.voice-call.config.realtime.providers.google.apiKey",
}),
model: trimToUndefined(raw?.model),
voice: trimToUndefined(raw?.voice),
temperature: asFiniteNumber(raw?.temperature),
apiVersion: trimToUndefined(raw?.apiVersion),
prefixPaddingMs: asFiniteNumber(raw?.prefixPaddingMs),
silenceDurationMs: asFiniteNumber(raw?.silenceDurationMs),
startSensitivity: asSensitivity(raw?.startSensitivity),
endSensitivity: asSensitivity(raw?.endSensitivity),
enableAffectiveDialog: asBoolean(raw?.enableAffectiveDialog),
thinkingLevel: asThinkingLevel(raw?.thinkingLevel),
thinkingBudget: asFiniteNumber(raw?.thinkingBudget),
};
}
function resolveEnvApiKey(): string | undefined {
return trimToUndefined(process.env.GEMINI_API_KEY) ?? trimToUndefined(process.env.GOOGLE_API_KEY);
}
function mapStartSensitivity(
value: GoogleRealtimeSensitivity | undefined,
): StartSensitivity | undefined {
switch (value) {
case "high":
return StartSensitivity.START_SENSITIVITY_HIGH;
case "low":
return StartSensitivity.START_SENSITIVITY_LOW;
default:
return undefined;
}
}
function mapEndSensitivity(
value: GoogleRealtimeSensitivity | undefined,
): EndSensitivity | undefined {
switch (value) {
case "high":
return EndSensitivity.END_SENSITIVITY_HIGH;
case "low":
return EndSensitivity.END_SENSITIVITY_LOW;
default:
return undefined;
}
}
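// An explicit thinking level wins over a numeric token budget; the cast
// assumes the SDK accepts the uppercase spelling of the configured level.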
function buildThinkingConfig(config: GoogleRealtimeVoiceBridgeConfig): ThinkingConfig | undefined {
if (config.thinkingLevel) {
return { thinkingLevel: config.thinkingLevel.toUpperCase() as ThinkingConfig["thinkingLevel"] };
}
if (typeof config.thinkingBudget === "number") {
return { thinkingBudget: config.thinkingBudget };
}
return undefined;
}
function buildRealtimeInputConfig(
config: GoogleRealtimeVoiceBridgeConfig,
): RealtimeInputConfig | undefined {
const startSensitivity = mapStartSensitivity(config.startSensitivity);
const endSensitivity = mapEndSensitivity(config.endSensitivity);
const automaticActivityDetection = {
...(startSensitivity ? { startOfSpeechSensitivity: startSensitivity } : {}),
...(endSensitivity ? { endOfSpeechSensitivity: endSensitivity } : {}),
...(typeof config.prefixPaddingMs === "number"
? { prefixPaddingMs: Math.max(0, Math.floor(config.prefixPaddingMs)) }
: {}),
...(typeof config.silenceDurationMs === "number"
? { silenceDurationMs: Math.max(0, Math.floor(config.silenceDurationMs)) }
: {}),
};
return Object.keys(automaticActivityDetection).length > 0
? { automaticActivityDetection }
: undefined;
}
function buildFunctionDeclarations(tools: RealtimeVoiceTool[] | undefined): FunctionDeclaration[] {
return (tools ?? []).map((tool) => ({
name: tool.name,
description: tool.description,
parametersJsonSchema: tool.parameters,
}));
}
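// Extract the sample rate from a mime type such as "audio/pcm;rate=24000";
// Gemini Live output defaults to 24 kHz when no rate is present.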
function parsePcmSampleRate(mimeType: string | undefined): number {
const match = mimeType?.match(/(?:^|[;,\s])rate=(\d+)/i);
const parsed = match ? Number.parseInt(match[1] ?? "", 10) : Number.NaN;
return Number.isFinite(parsed) && parsed > 0 ? parsed : 24_000;
}
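// 0xff is the mu-law code for a zero-amplitude sample, so an all-0xff buffer
// is digital silence on the telephony leg.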
function isMulawSilence(audio: Buffer): boolean {
return audio.length > 0 && audio.every((sample) => sample === 0xff);
}
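/**
 * Bridges 8 kHz mu-law telephony audio to the Gemini Live API: caller audio is
 * decoded and upsampled to 16 kHz PCM on the way in, and the model's PCM reply
 * (typically 24 kHz) is downsampled back to mu-law on the way out.
 */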
class GoogleRealtimeVoiceBridge implements RealtimeVoiceBridge {
private session: GoogleLiveSession | null = null;
private connected = false;
private sessionConfigured = false;
private intentionallyClosed = false;
private pendingAudio: Buffer[] = [];
private sessionReadyFired = false;
private consecutiveSilenceMs = 0;
private audioStreamEnded = false;
constructor(private readonly config: GoogleRealtimeVoiceBridgeConfig) {}
async connect(): Promise<void> {
this.intentionallyClosed = false;
this.sessionConfigured = false;
this.sessionReadyFired = false;
this.consecutiveSilenceMs = 0;
this.audioStreamEnded = false;
const ai = createGoogleGenAI({
apiKey: this.config.apiKey,
httpOptions: {
apiVersion: this.config.apiVersion ?? GOOGLE_REALTIME_DEFAULT_API_VERSION,
},
});
const functionDeclarations = buildFunctionDeclarations(this.config.tools);
this.session = (await ai.live.connect({
model: this.config.model ?? GOOGLE_REALTIME_DEFAULT_MODEL,
config: {
responseModalities: [Modality.AUDIO],
...(typeof this.config.temperature === "number" && this.config.temperature > 0
? { temperature: this.config.temperature }
: {}),
speechConfig: {
voiceConfig: {
prebuiltVoiceConfig: {
voiceName: this.config.voice ?? GOOGLE_REALTIME_DEFAULT_VOICE,
},
},
},
systemInstruction: this.config.instructions,
...(functionDeclarations.length > 0 ? { tools: [{ functionDeclarations }] } : {}),
...(this.realtimeInputConfig ? { realtimeInputConfig: this.realtimeInputConfig } : {}),
inputAudioTranscription: {},
outputAudioTranscription: {},
...(typeof this.config.enableAffectiveDialog === "boolean"
? { enableAffectiveDialog: this.config.enableAffectiveDialog }
: {}),
...(this.thinkingConfig ? { thinkingConfig: this.thinkingConfig } : {}),
},
callbacks: {
onopen: () => {
this.connected = true;
},
onmessage: (message) => {
this.handleMessage(message);
},
onerror: (event) => {
const error =
event.error instanceof Error
? event.error
: new Error(
typeof event.message === "string" ? event.message : "Google Live API error",
);
this.config.onError?.(error);
},
onclose: () => {
this.connected = false;
this.sessionConfigured = false;
const reason = this.intentionallyClosed ? "completed" : "error";
this.session = null;
this.config.onClose?.(reason);
},
},
})) as GoogleLiveSession;
}
sendAudio(audio: Buffer): void {
if (!this.session || !this.connected || !this.sessionConfigured) {
if (this.pendingAudio.length < MAX_PENDING_AUDIO_CHUNKS) {
this.pendingAudio.push(audio);
}
return;
}
const silent = isMulawSilence(audio);
if (silent && this.audioStreamEnded) {
return;
}
if (!silent) {
this.consecutiveSilenceMs = 0;
this.audioStreamEnded = false;
}
const pcm16k = resamplePcm(
mulawToPcm(audio),
TELEPHONY_SAMPLE_RATE,
GOOGLE_REALTIME_INPUT_SAMPLE_RATE,
);
this.session.sendRealtimeInput({
audio: {
data: pcm16k.toString("base64"),
mimeType: `audio/pcm;rate=${GOOGLE_REALTIME_INPUT_SAMPLE_RATE}`,
},
});
if (!silent) {
return;
}
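    // mu-law is one byte per sample at 8 kHz, so buffer length in bytes maps
    // directly to elapsed milliseconds of caller silence.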
const silenceThresholdMs =
typeof this.config.silenceDurationMs === "number"
? Math.max(0, Math.floor(this.config.silenceDurationMs))
: DEFAULT_AUDIO_STREAM_END_SILENCE_MS;
this.consecutiveSilenceMs += Math.round((audio.length / TELEPHONY_SAMPLE_RATE) * 1000);
if (!this.audioStreamEnded && this.consecutiveSilenceMs >= silenceThresholdMs) {
this.session.sendRealtimeInput({ audioStreamEnd: true });
this.audioStreamEnded = true;
}
}
setMediaTimestamp(_ts: number): void {}
sendUserMessage(text: string): void {
const normalized = text.trim();
if (!normalized || !this.session || !this.connected || !this.sessionConfigured) {
return;
}
this.session.sendClientContent({
turns: [{ role: "user", parts: [{ text: normalized }] }],
turnComplete: true,
});
}
triggerGreeting(instructions?: string): void {
const greetingPrompt =
instructions?.trim() || "Start the call now. Greet the caller naturally and keep it brief.";
this.sendUserMessage(greetingPrompt);
}
submitToolResult(callId: string, result: unknown): void {
if (!this.session) {
return;
}
this.session.sendToolResponse({
functionResponses: [
{
id: callId,
response:
result && typeof result === "object"
? (result as Record<string, unknown>)
: { output: result },
},
],
});
}
acknowledgeMark(): void {}
close(): void {
this.intentionallyClosed = true;
this.connected = false;
this.sessionConfigured = false;
this.pendingAudio = [];
this.consecutiveSilenceMs = 0;
this.audioStreamEnded = false;
const session = this.session;
this.session = null;
session?.close();
}
isConnected(): boolean {
return this.connected && this.sessionConfigured;
}
private handleMessage(message: LiveServerMessage): void {
if (message.setupComplete) {
this.handleSetupComplete();
}
if (message.serverContent) {
this.handleServerContent(message.serverContent);
}
if (message.toolCall) {
this.handleToolCall(message.toolCall);
}
}
private handleSetupComplete(): void {
this.sessionConfigured = true;
for (const chunk of this.pendingAudio.splice(0)) {
this.sendAudio(chunk);
}
if (!this.sessionReadyFired) {
this.sessionReadyFired = true;
this.config.onReady?.();
}
}
private handleServerContent(content: LiveServerContent): void {
if (content.interrupted) {
this.config.onClearAudio();
}
if (content.inputTranscription?.text) {
this.config.onTranscript?.(
"user",
content.inputTranscription.text,
content.inputTranscription.finished ?? false,
);
}
if (content.outputTranscription?.text) {
this.config.onTranscript?.(
"assistant",
content.outputTranscription.text,
content.outputTranscription.finished ?? false,
);
}
let emittedAssistantText = false;
for (const part of content.modelTurn?.parts ?? []) {
if (part.inlineData?.data) {
const pcm = Buffer.from(part.inlineData.data, "base64");
const sampleRate = parsePcmSampleRate(part.inlineData.mimeType);
const muLaw = convertPcmToMulaw8k(pcm, sampleRate);
if (muLaw.length > 0) {
this.config.onAudio(muLaw);
this.config.onMark?.(`audio-${randomUUID()}`);
}
continue;
}
if (part.thought) {
continue;
}
if (!content.outputTranscription?.text && typeof part.text === "string" && part.text.trim()) {
emittedAssistantText = true;
this.config.onTranscript?.("assistant", part.text, content.turnComplete ?? false);
}
}
if (!emittedAssistantText && content.turnComplete && content.waitingForInput === false) {
return;
}
}
private handleToolCall(toolCall: LiveServerToolCall): void {
for (const call of toolCall.functionCalls ?? []) {
const name = call.name?.trim();
if (!name) {
continue;
}
const callId = call.id?.trim() || `google-live-${randomUUID()}`;
this.config.onToolCall?.({
itemId: callId,
callId,
name,
args: call.args ?? {},
});
}
}
private get realtimeInputConfig(): RealtimeInputConfig | undefined {
return buildRealtimeInputConfig(this.config);
}
private get thinkingConfig(): ThinkingConfig | undefined {
return buildThinkingConfig(this.config);
}
}
export function buildGoogleRealtimeVoiceProvider(): RealtimeVoiceProviderPlugin {
return {
id: "google",
label: "Google Live Voice",
autoSelectOrder: 20,
resolveConfig: ({ cfg, rawConfig }) => normalizeProviderConfig(rawConfig, cfg),
isConfigured: ({ providerConfig }) =>
Boolean(normalizeProviderConfig(providerConfig).apiKey || resolveEnvApiKey()),
createBridge: (req) => {
const config = normalizeProviderConfig(req.providerConfig);
const apiKey = config.apiKey || resolveEnvApiKey();
if (!apiKey) {
throw new Error("Google Gemini API key missing");
}
return new GoogleRealtimeVoiceBridge({
...req,
apiKey,
model: config.model,
voice: config.voice,
temperature: config.temperature,
apiVersion: config.apiVersion,
prefixPaddingMs: config.prefixPaddingMs,
silenceDurationMs: config.silenceDurationMs,
startSensitivity: config.startSensitivity,
endSensitivity: config.endSensitivity,
enableAffectiveDialog: config.enableAffectiveDialog,
thinkingLevel: config.thinkingLevel,
thinkingBudget: config.thinkingBudget,
});
},
};
}
export {
GOOGLE_REALTIME_DEFAULT_API_VERSION,
GOOGLE_REALTIME_DEFAULT_MODEL,
GOOGLE_REALTIME_DEFAULT_VOICE,
};
export type { GoogleRealtimeVoiceProviderConfig };

View File

@@ -1,105 +1,8 @@
const TELEPHONY_SAMPLE_RATE = 8000;
const RESAMPLE_FILTER_TAPS = 31;
const RESAMPLE_CUTOFF_GUARD = 0.94;
function clamp16(value: number): number {
return Math.max(-32768, Math.min(32767, value));
}
function sinc(x: number): number {
if (x === 0) {
return 1;
}
return Math.sin(Math.PI * x) / (Math.PI * x);
}
/**
* Build a finite low-pass kernel centered on `srcPos`.
* The kernel is windowed (Hann) to reduce ringing artifacts.
*/
function sampleBandlimited(
input: Buffer,
inputSamples: number,
srcPos: number,
cutoffCyclesPerSample: number,
): number {
const half = Math.floor(RESAMPLE_FILTER_TAPS / 2);
const center = Math.floor(srcPos);
let weighted = 0;
let weightSum = 0;
for (let tap = -half; tap <= half; tap++) {
const sampleIndex = center + tap;
if (sampleIndex < 0 || sampleIndex >= inputSamples) {
continue;
}
const distance = sampleIndex - srcPos;
const lowPass = 2 * cutoffCyclesPerSample * sinc(2 * cutoffCyclesPerSample * distance);
const tapIndex = tap + half;
const window = 0.5 - 0.5 * Math.cos((2 * Math.PI * tapIndex) / (RESAMPLE_FILTER_TAPS - 1));
const coeff = lowPass * window;
weighted += input.readInt16LE(sampleIndex * 2) * coeff;
weightSum += coeff;
}
if (weightSum === 0) {
const nearest = Math.max(0, Math.min(inputSamples - 1, Math.round(srcPos)));
return input.readInt16LE(nearest * 2);
}
return weighted / weightSum;
}
/**
* Resample 16-bit PCM (little-endian mono) to 8kHz using a windowed low-pass kernel.
*/
export function resamplePcmTo8k(input: Buffer, inputSampleRate: number): Buffer {
if (inputSampleRate === TELEPHONY_SAMPLE_RATE) {
return input;
}
const inputSamples = Math.floor(input.length / 2);
if (inputSamples === 0) {
return Buffer.alloc(0);
}
const ratio = inputSampleRate / TELEPHONY_SAMPLE_RATE;
const outputSamples = Math.floor(inputSamples / ratio);
const output = Buffer.alloc(outputSamples * 2);
const maxCutoff = 0.5;
const downsampleCutoff = ratio > 1 ? maxCutoff / ratio : maxCutoff;
const cutoffCyclesPerSample = Math.max(0.01, downsampleCutoff * RESAMPLE_CUTOFF_GUARD);
for (let i = 0; i < outputSamples; i++) {
const srcPos = i * ratio;
const sample = Math.round(
sampleBandlimited(input, inputSamples, srcPos, cutoffCyclesPerSample),
);
output.writeInt16LE(clamp16(sample), i * 2);
}
return output;
}
/**
* Convert 16-bit PCM to 8-bit mu-law (G.711).
*/
export function pcmToMulaw(pcm: Buffer): Buffer {
const samples = Math.floor(pcm.length / 2);
const mulaw = Buffer.alloc(samples);
for (let i = 0; i < samples; i++) {
const sample = pcm.readInt16LE(i * 2);
mulaw[i] = linearToMulaw(sample);
}
return mulaw;
}
export function convertPcmToMulaw8k(pcm: Buffer, inputSampleRate: number): Buffer {
const pcm8k = resamplePcmTo8k(pcm, inputSampleRate);
return pcmToMulaw(pcm8k);
}
export {
convertPcmToMulaw8k,
pcmToMulaw,
resamplePcmTo8k,
} from "openclaw/plugin-sdk/realtime-voice";
/**
* Chunk audio buffer into 20ms frames for streaming (8kHz mono mu-law).
@@ -111,25 +14,3 @@ export function chunkAudio(audio: Buffer, chunkSize = 160): Generator<Buffer, vo
}
})();
}
function linearToMulaw(sample: number): number {
const BIAS = 132;
const CLIP = 32635;
const sign = sample < 0 ? 0x80 : 0;
if (sample < 0) {
sample = -sample;
}
if (sample > CLIP) {
sample = CLIP;
}
sample += BIAS;
let exponent = 7;
for (let expMask = 0x4000; (sample & expMask) === 0 && exponent > 0; exponent--) {
expMask >>= 1;
}
const mantissa = (sample >> (exponent + 3)) & 0x0f;
return ~(sign | (exponent << 4) | mantissa) & 0xff;
}

View File

@@ -341,10 +341,13 @@ describe("web session", () => {
sock.ev.emit("creds.update", {});
sock.ev.emit("creds.update", {});
await flushCredsUpdate();
expect(inFlight).toBe(1);
(release as (() => void) | null)?.();
try {
await vi.waitFor(() => {
expect(inFlight).toBe(1);
});
} finally {
(release as (() => void) | null)?.();
}
await waitForCredsSaveQueue(authDir);
@@ -389,13 +392,16 @@ describe("web session", () => {
onError,
);
await flushCredsUpdate();
try {
await vi.waitFor(() => {
expect(inFlightA).toBe(1);
expect(inFlightB).toBe(1);
});
} finally {
(releaseA as (() => void) | null)?.();
(releaseB as (() => void) | null)?.();
}
expect(inFlightA).toBe(1);
expect(inFlightB).toBe(1);
(releaseA as (() => void) | null)?.();
(releaseB as (() => void) | null)?.();
await Promise.all([waitForCredsSaveQueue(authDirA), waitForCredsSaveQueue(authDirB)]);
expect(inFlightA).toBe(0);

View File

@@ -152,7 +152,7 @@ describe("monitorZaloProvider lifecycle", () => {
abort.abort();
await vi.waitFor(() => expect(deleteWebhookMock).toHaveBeenCalledTimes(1));
await vi.waitFor(() => expect(deleteWebhookMock).toHaveBeenCalledTimes(1), { timeout: 5000 });
expect(deleteWebhookMock).toHaveBeenCalledWith("test-token", undefined, 5000);
expect(settled).toBe(false);
expect(registry.httpRoutes).toHaveLength(2);

View File

@@ -36,3 +36,10 @@ export {
type RealtimeVoiceBridgeSessionParams,
type RealtimeVoiceMarkStrategy,
} from "../realtime-voice/session-runtime.js";
export {
convertPcmToMulaw8k,
mulawToPcm,
pcmToMulaw,
resamplePcm,
resamplePcmTo8k,
} from "../realtime-voice/audio-codec.js";

View File

@@ -0,0 +1,138 @@
const TELEPHONY_SAMPLE_RATE = 8000;
const RESAMPLE_FILTER_TAPS = 31;
const RESAMPLE_CUTOFF_GUARD = 0.94;
function clamp16(value: number): number {
return Math.max(-32768, Math.min(32767, value));
}
function sinc(x: number): number {
if (x === 0) {
return 1;
}
return Math.sin(Math.PI * x) / (Math.PI * x);
}
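/**
 * Build a finite low-pass kernel centered on `srcPos`.
 * The kernel is windowed (Hann) to reduce ringing artifacts.
 */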
function sampleBandlimited(
input: Buffer,
inputSamples: number,
srcPos: number,
cutoffCyclesPerSample: number,
): number {
const half = Math.floor(RESAMPLE_FILTER_TAPS / 2);
const center = Math.floor(srcPos);
let weighted = 0;
let weightSum = 0;
for (let tap = -half; tap <= half; tap += 1) {
const sampleIndex = center + tap;
if (sampleIndex < 0 || sampleIndex >= inputSamples) {
continue;
}
const distance = sampleIndex - srcPos;
const lowPass = 2 * cutoffCyclesPerSample * sinc(2 * cutoffCyclesPerSample * distance);
const tapIndex = tap + half;
const window = 0.5 - 0.5 * Math.cos((2 * Math.PI * tapIndex) / (RESAMPLE_FILTER_TAPS - 1));
const coeff = lowPass * window;
weighted += input.readInt16LE(sampleIndex * 2) * coeff;
weightSum += coeff;
}
if (weightSum === 0) {
const nearest = Math.max(0, Math.min(inputSamples - 1, Math.round(srcPos)));
return input.readInt16LE(nearest * 2);
}
return weighted / weightSum;
}
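/**
 * Resample 16-bit PCM (little-endian mono) between sample rates using a
 * windowed low-pass kernel.
 */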
export function resamplePcm(
input: Buffer,
inputSampleRate: number,
outputSampleRate: number,
): Buffer {
if (inputSampleRate === outputSampleRate) {
return input;
}
const inputSamples = Math.floor(input.length / 2);
if (inputSamples === 0) {
return Buffer.alloc(0);
}
const ratio = inputSampleRate / outputSampleRate;
const outputSamples = Math.floor(inputSamples / ratio);
const output = Buffer.alloc(outputSamples * 2);
const maxCutoff = 0.5;
const downsampleCutoff = ratio > 1 ? maxCutoff / ratio : maxCutoff;
const cutoffCyclesPerSample = Math.max(0.01, downsampleCutoff * RESAMPLE_CUTOFF_GUARD);
for (let i = 0; i < outputSamples; i += 1) {
const sample = Math.round(
sampleBandlimited(input, inputSamples, i * ratio, cutoffCyclesPerSample),
);
output.writeInt16LE(clamp16(sample), i * 2);
}
return output;
}
export function resamplePcmTo8k(input: Buffer, inputSampleRate: number): Buffer {
return resamplePcm(input, inputSampleRate, TELEPHONY_SAMPLE_RATE);
}
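/**
 * Convert 16-bit PCM to 8-bit mu-law (G.711).
 */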
export function pcmToMulaw(pcm: Buffer): Buffer {
const samples = Math.floor(pcm.length / 2);
const mulaw = Buffer.alloc(samples);
for (let i = 0; i < samples; i += 1) {
const sample = pcm.readInt16LE(i * 2);
mulaw[i] = linearToMulaw(sample);
}
return mulaw;
}
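/**
 * Convert 8-bit mu-law (G.711) back to 16-bit little-endian PCM.
 */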
export function mulawToPcm(mulaw: Buffer): Buffer {
const pcm = Buffer.alloc(mulaw.length * 2);
for (let i = 0; i < mulaw.length; i += 1) {
pcm.writeInt16LE(clamp16(mulawToLinear(mulaw[i] ?? 0)), i * 2);
}
return pcm;
}
export function convertPcmToMulaw8k(pcm: Buffer, inputSampleRate: number): Buffer {
return pcmToMulaw(resamplePcmTo8k(pcm, inputSampleRate));
}
function linearToMulaw(sample: number): number {
const BIAS = 132;
const CLIP = 32635;
const sign = sample < 0 ? 0x80 : 0;
if (sample < 0) {
sample = -sample;
}
if (sample > CLIP) {
sample = CLIP;
}
sample += BIAS;
let exponent = 7;
for (let expMask = 0x4000; (sample & expMask) === 0 && exponent > 0; exponent -= 1) {
expMask >>= 1;
}
const mantissa = (sample >> (exponent + 3)) & 0x0f;
return ~(sign | (exponent << 4) | mantissa) & 0xff;
}
function mulawToLinear(value: number): number {
const muLaw = ~value & 0xff;
const sign = muLaw & 0x80;
const exponent = (muLaw >> 4) & 0x07;
const mantissa = muLaw & 0x0f;
let sample = ((mantissa << 3) + 132) << exponent;
sample -= 132;
return sign ? -sample : sample;
}
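For orientation, here is a minimal round-trip sketch using the codec helpers above; buffer sizes are illustrative, and the import path is the plugin-SDK re-export added in this commit:
```ts
import {
  convertPcmToMulaw8k,
  mulawToPcm,
  resamplePcm,
} from "openclaw/plugin-sdk/realtime-voice";

// Caller leg: one 20 ms telephony frame (160 mu-law bytes at 8 kHz)...
const muLawFrame = Buffer.alloc(160, 0xff); // all 0xff is digital silence
const pcm8k = mulawToPcm(muLawFrame); // 320 bytes of 16-bit PCM
// ...upsampled to the 16 kHz PCM that the Gemini Live input expects.
const pcm16k = resamplePcm(pcm8k, 8000, 16000); // 640 bytes

// Model leg: 10 ms of 24 kHz PCM output collapses back to 80 mu-law bytes.
const reply24k = Buffer.alloc(480);
const muLawReply = convertPcmToMulaw8k(reply24k, 24000);
console.log(pcm16k.length, muLawReply.length); // 640 80
```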

View File

@@ -55,6 +55,7 @@ export const pluginRegistrationContractCases = {
pluginId: "google",
providerIds: ["google", "google-gemini-cli"],
webSearchProviderIds: ["gemini"],
realtimeVoiceProviderIds: ["google"],
speechProviderIds: ["google"],
mediaUnderstandingProviderIds: ["google"],
imageGenerationProviderIds: ["google"],