feat(google): add realtime voice provider

Peter Steinberger
2026-04-24 09:08:09 +01:00
parent c138368040
commit b5e5f2cede
13 changed files with 1127 additions and 141 deletions

View File

@@ -29,6 +29,7 @@ Docs: https://docs.openclaw.ai
- Plugins/Google Meet: default Chrome realtime sessions to OpenAI plus SoX `rec`/`play` audio bridge commands, so the usual setup only needs the plugin enabled and `OPENAI_API_KEY`.
- Plugins/Google Meet: add a `chrome-node` transport so a paired macOS node, such as a Parallels VM, can own Chrome, BlackHole, and SoX while the Gateway machine keeps the agent and model key.
- Plugins/Bonjour: move LAN Gateway discovery advertising into a default-enabled bundled plugin with its own `@homebridge/ciao` dependency, so users can disable Bonjour without cutting wide-area discovery. Thanks @vincentkoc.
- Providers/Google: add a Gemini Live realtime voice provider for backend Voice Call and Google Meet audio bridges, with bidirectional audio and function-call support.
- Providers/OpenAI: add image generation and reference-image editing through Codex OAuth, so `openai/gpt-image-2` works without an `OPENAI_API_KEY`. Fixes #70703.
- Providers/OpenRouter: add image generation and reference-image editing through `image_generate`, so OpenRouter image models work with `OPENROUTER_API_KEY`. Fixes #55066 via #67668. Thanks @notamicrodose.
- Image generation: let agents request provider-supported quality and output format hints, and pass OpenAI-specific background, moderation, compression, and user hints through the `image_generate` tool. (#70503) Thanks @ottodeng.

View File

@@ -132,6 +132,7 @@ Choose your preferred auth method and follow the setup steps.
| Image generation | Yes |
| Music generation | Yes |
| Text-to-speech | Yes |
| Realtime voice | Yes (Google Live API) |
| Image understanding | Yes |
| Audio transcription | Yes |
| Video understanding | Yes |
@@ -281,6 +282,63 @@ A Google Cloud Console API key restricted to the Gemini API is valid for this
provider. This is not the separate Cloud Text-to-Speech API path.
</Note>
## Realtime voice
The bundled `google` plugin registers a realtime voice provider backed by the
Gemini Live API for backend audio bridges such as Voice Call and Google Meet.
| Setting | Config path | Default |
| --------------------- | ------------------------------------------------------------------- | ------------------------------------------------------------------------------------- |
| Model | `plugins.entries.voice-call.config.realtime.providers.google.model` | `gemini-2.5-flash-native-audio-preview-12-2025` |
| Voice | `...google.voice` | `Kore` |
| Temperature | `...google.temperature` | (unset) |
| VAD start sensitivity | `...google.startSensitivity` | (unset) |
| VAD end sensitivity | `...google.endSensitivity` | (unset) |
| Silence duration | `...google.silenceDurationMs` | (unset) |
| API key | `...google.apiKey` | Falls back to `models.providers.google.apiKey`, `GEMINI_API_KEY`, or `GOOGLE_API_KEY` |
Example Voice Call realtime config:
```json5
{
plugins: {
entries: {
"voice-call": {
enabled: true,
config: {
realtime: {
enabled: true,
provider: "google",
providers: {
google: {
model: "gemini-2.5-flash-native-audio-preview-12-2025",
voice: "Kore",
},
},
},
},
},
},
},
}
```
<Note>
Google Live API uses bidirectional audio and function calling over a WebSocket.
OpenClaw adapts telephony/Meet bridge audio to the Gemini Live API's PCM stream
and keeps tool calls on the shared realtime voice contract. Leave `temperature`
unset unless you need sampling changes; OpenClaw omits non-positive values
because Google Live can return transcripts without audio for `temperature: 0`.
Gemini API transcription is enabled without `languageCodes`; the current Google
SDK rejects language-code hints on this API path.
</Note>
<Note>
Control UI Talk browser sessions still require a realtime voice provider with a
browser WebRTC session implementation. Today that path is OpenAI Realtime; the
Google provider is for backend realtime bridges.
</Note>
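For the browser Talk path specifically, a minimal sketch of the OpenAI fallback. The key paths (`talk.provider`, `talk.providers.openai.apiKey`) come from the Control UI docs; the top-level placement shown here is illustrative:
```json5
{
  talk: {
    provider: "openai",
    providers: {
      openai: {
        // Standard OpenAI API key; the browser only ever receives the
        // ephemeral Realtime client secret minted by the Gateway.
        apiKey: "...",
      },
    },
  },
}
```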
## Advanced configuration
<AccordionGroup>

View File

@@ -156,12 +156,14 @@ Cron jobs panel notes:
- `chat.history` also strips display-only inline directive tags from visible assistant text (for example `[[reply_to_*]]` and `[[audio_as_voice]]`), plain-text tool-call XML payloads (including `<tool_call>...</tool_call>`, `<function_call>...</function_call>`, `<tool_calls>...</tool_calls>`, `<function_calls>...</function_calls>`, and truncated tool-call blocks), and leaked ASCII/full-width model control tokens, and omits assistant entries whose whole visible text is only the exact silent token `NO_REPLY` / `no_reply`.
- `chat.inject` appends an assistant note to the session transcript and broadcasts a `chat` event for UI-only updates (no agent run, no channel delivery).
- The chat header model and thinking pickers patch the active session immediately through `sessions.patch`; they are persistent session overrides, not one-turn-only send options.
- Talk mode uses the registered realtime voice provider. Configure OpenAI with
`talk.provider: "openai"` plus `talk.providers.openai.apiKey`, or reuse the
Voice Call realtime provider config. The browser never receives the standard
OpenAI API key; it receives only the ephemeral Realtime client secret. The
Realtime session prompt is assembled by the Gateway; `talk.realtime.session`
does not accept caller-provided instruction overrides.
- Talk mode uses a registered realtime voice provider that supports browser
WebRTC sessions. Configure OpenAI with `talk.provider: "openai"` plus
`talk.providers.openai.apiKey`, or reuse the Voice Call realtime provider
config. The browser never receives the standard OpenAI API key; it receives
only the ephemeral Realtime client secret. Google Live realtime voice is
supported for backend Voice Call and Google Meet bridges, but not this browser
WebRTC path yet. The Realtime session prompt is assembled by the Gateway;
`talk.realtime.session` does not accept caller-provided instruction overrides.
- In the Chat composer, the Talk control is the waves button next to the
microphone dictation button. When Talk starts, the composer status row shows
`Connecting Talk...`, then `Talk live` while audio is connected, or

View File

@@ -11,6 +11,7 @@ import {
} from "./generation-provider-metadata.js";
import { geminiMemoryEmbeddingProviderAdapter } from "./memory-embedding-adapter.js";
import { registerGoogleProvider } from "./provider-registration.js";
import { buildGoogleRealtimeVoiceProvider } from "./realtime-voice-provider.js";
import { buildGoogleSpeechProvider } from "./speech-provider.js";
import { createGeminiWebSearchProvider } from "./src/gemini-web-search-provider.js";
@@ -156,6 +157,7 @@ export default definePluginEntry({
api.registerImageGenerationProvider(createLazyGoogleImageGenerationProvider());
api.registerMediaUnderstandingProvider(createLazyGoogleMediaUnderstandingProvider());
api.registerMusicGenerationProvider(createLazyGoogleMusicGenerationProvider());
api.registerRealtimeVoiceProvider(buildGoogleRealtimeVoiceProvider());
api.registerSpeechProvider(buildGoogleSpeechProvider());
api.registerVideoGenerationProvider(createLazyGoogleVideoGenerationProvider());
api.registerWebSearchProvider(createGeminiWebSearchProvider());

View File

@@ -49,6 +49,7 @@
"memoryEmbeddingProviders": ["gemini"],
"imageGenerationProviders": ["google"],
"musicGenerationProviders": ["google"],
"realtimeVoiceProviders": ["google"],
"speechProviders": ["google"],
"videoGenerationProviders": ["google"],
"webSearchProviders": ["gemini"]

View File

@@ -0,0 +1,354 @@
import { beforeEach, describe, expect, it, vi } from "vitest";
import { buildGoogleRealtimeVoiceProvider } from "./realtime-voice-provider.js";
type MockGoogleLiveSession = {
close: ReturnType<typeof vi.fn>;
sendClientContent: ReturnType<typeof vi.fn>;
sendRealtimeInput: ReturnType<typeof vi.fn>;
sendToolResponse: ReturnType<typeof vi.fn>;
};
type MockGoogleLiveConnectParams = {
model: string;
config: Record<string, unknown>;
callbacks: {
onopen: () => void;
onmessage: (message: Record<string, unknown>) => void;
onerror: (event: { error?: unknown; message?: string }) => void;
onclose: () => void;
};
};
const { connectMock, session } = vi.hoisted(() => {
const session: MockGoogleLiveSession = {
close: vi.fn(),
sendClientContent: vi.fn(),
sendRealtimeInput: vi.fn(),
sendToolResponse: vi.fn(),
};
const connectMock = vi.fn(async (_params: MockGoogleLiveConnectParams) => session);
return { connectMock, session };
});
vi.mock("./google-genai-runtime.js", () => ({
createGoogleGenAI: vi.fn(() => ({
live: {
connect: connectMock,
},
})),
}));
function lastConnectParams(): MockGoogleLiveConnectParams {
const params = connectMock.mock.calls.at(-1)?.[0];
if (!params) {
throw new Error("expected google live connect call");
}
return params;
}
describe("buildGoogleRealtimeVoiceProvider", () => {
beforeEach(() => {
connectMock.mockClear();
session.close.mockClear();
session.sendClientContent.mockClear();
session.sendRealtimeInput.mockClear();
session.sendToolResponse.mockClear();
delete process.env.GEMINI_API_KEY;
delete process.env.GOOGLE_API_KEY;
});
it("normalizes provider config and cfg model-provider key fallback", () => {
const provider = buildGoogleRealtimeVoiceProvider();
const resolved = provider.resolveConfig?.({
cfg: {
models: {
providers: {
google: {
apiKey: "cfg-key",
},
},
},
} as never,
rawConfig: {
providers: {
google: {
model: "gemini-live-2.5-flash-preview",
voice: "Puck",
temperature: 0.4,
silenceDurationMs: 700,
startSensitivity: "high",
},
},
},
});
expect(resolved).toEqual({
apiKey: "cfg-key",
model: "gemini-live-2.5-flash-preview",
voice: "Puck",
temperature: 0.4,
apiVersion: undefined,
prefixPaddingMs: undefined,
silenceDurationMs: 700,
startSensitivity: "high",
endSensitivity: undefined,
enableAffectiveDialog: undefined,
thinkingLevel: undefined,
thinkingBudget: undefined,
});
});
it("connects with Google Live setup config and tool declarations", async () => {
const provider = buildGoogleRealtimeVoiceProvider();
const bridge = provider.createBridge({
providerConfig: {
apiKey: "gemini-key",
model: "gemini-live-2.5-flash-preview",
voice: "Kore",
temperature: 0.3,
startSensitivity: "low",
},
instructions: "Speak briefly.",
tools: [
{
type: "function",
name: "lookup",
description: "Look something up",
parameters: {
type: "object",
properties: {
query: { type: "string" },
},
required: ["query"],
},
},
],
onAudio: vi.fn(),
onClearAudio: vi.fn(),
});
await bridge.connect();
expect(connectMock).toHaveBeenCalledTimes(1);
expect(lastConnectParams()).toMatchObject({
model: "gemini-live-2.5-flash-preview",
config: {
responseModalities: ["AUDIO"],
temperature: 0.3,
systemInstruction: "Speak briefly.",
speechConfig: {
voiceConfig: {
prebuiltVoiceConfig: {
voiceName: "Kore",
},
},
},
outputAudioTranscription: {},
tools: [
{
functionDeclarations: [
{
name: "lookup",
description: "Look something up",
parametersJsonSchema: {
type: "object",
properties: {
query: { type: "string" },
},
required: ["query"],
},
},
],
},
],
},
});
});
it("omits zero temperature for native audio responses", async () => {
const provider = buildGoogleRealtimeVoiceProvider();
const bridge = provider.createBridge({
providerConfig: {
apiKey: "gemini-key",
temperature: 0,
},
onAudio: vi.fn(),
onClearAudio: vi.fn(),
});
await bridge.connect();
expect(lastConnectParams().config).not.toHaveProperty("temperature");
});
it("waits for setup completion before draining audio and firing ready", async () => {
const provider = buildGoogleRealtimeVoiceProvider();
const onReady = vi.fn();
const bridge = provider.createBridge({
providerConfig: { apiKey: "gemini-key" },
onAudio: vi.fn(),
onClearAudio: vi.fn(),
onReady,
});
await bridge.connect();
lastConnectParams().callbacks.onopen();
bridge.sendAudio(Buffer.from([0xff, 0xff]));
expect(session.sendRealtimeInput).not.toHaveBeenCalled();
expect(onReady).not.toHaveBeenCalled();
lastConnectParams().callbacks.onmessage({ setupComplete: { sessionId: "session-1" } });
expect(onReady).toHaveBeenCalledTimes(1);
expect(session.sendRealtimeInput).toHaveBeenCalledTimes(1);
expect(session.sendRealtimeInput.mock.calls[0]?.[0].audio).toMatchObject({
data: expect.any(String),
mimeType: "audio/pcm;rate=16000",
});
});
it("marks the Google audio stream complete after sustained telephony silence", async () => {
const provider = buildGoogleRealtimeVoiceProvider();
const bridge = provider.createBridge({
providerConfig: { apiKey: "gemini-key", silenceDurationMs: 60 },
onAudio: vi.fn(),
onClearAudio: vi.fn(),
});
await bridge.connect();
lastConnectParams().callbacks.onopen();
lastConnectParams().callbacks.onmessage({ setupComplete: { sessionId: "session-1" } });
const silence20ms = Buffer.alloc(160, 0xff);
bridge.sendAudio(silence20ms);
bridge.sendAudio(silence20ms);
bridge.sendAudio(silence20ms);
expect(session.sendRealtimeInput).toHaveBeenCalledWith({ audioStreamEnd: true });
const callsAfterStreamEnd = session.sendRealtimeInput.mock.calls.length;
bridge.sendAudio(silence20ms);
expect(session.sendRealtimeInput).toHaveBeenCalledTimes(callsAfterStreamEnd);
session.sendRealtimeInput.mockClear();
bridge.sendAudio(Buffer.alloc(160, 0x7f));
bridge.sendAudio(silence20ms);
bridge.sendAudio(silence20ms);
bridge.sendAudio(silence20ms);
expect(session.sendRealtimeInput).toHaveBeenCalledWith({ audioStreamEnd: true });
});
it("sends text prompts as ordered client turns", async () => {
const provider = buildGoogleRealtimeVoiceProvider();
const bridge = provider.createBridge({
providerConfig: { apiKey: "gemini-key" },
onAudio: vi.fn(),
onClearAudio: vi.fn(),
});
await bridge.connect();
lastConnectParams().callbacks.onopen();
lastConnectParams().callbacks.onmessage({ setupComplete: { sessionId: "session-1" } });
bridge.sendUserMessage?.(" Say hello. ");
expect(session.sendClientContent).toHaveBeenCalledWith({
turns: [{ role: "user", parts: [{ text: "Say hello." }] }],
turnComplete: true,
});
});
it("converts Google PCM output to mu-law audio", async () => {
const provider = buildGoogleRealtimeVoiceProvider();
const onAudio = vi.fn();
const bridge = provider.createBridge({
providerConfig: { apiKey: "gemini-key" },
onAudio,
onClearAudio: vi.fn(),
});
const pcm24k = Buffer.alloc(480);
await bridge.connect();
lastConnectParams().callbacks.onmessage({
setupComplete: { sessionId: "session-1" },
serverContent: {
modelTurn: {
parts: [
{
inlineData: {
mimeType: "audio/L16;codec=pcm;rate=24000",
data: pcm24k.toString("base64"),
},
},
],
},
},
});
expect(onAudio).toHaveBeenCalledTimes(1);
expect(onAudio.mock.calls[0]?.[0]).toBeInstanceOf(Buffer);
expect(onAudio.mock.calls[0]?.[0]).toHaveLength(80);
});
it("does not forward Google thought text as assistant transcript", async () => {
const provider = buildGoogleRealtimeVoiceProvider();
const onTranscript = vi.fn();
const bridge = provider.createBridge({
providerConfig: { apiKey: "gemini-key" },
onAudio: vi.fn(),
onClearAudio: vi.fn(),
onTranscript,
});
await bridge.connect();
lastConnectParams().callbacks.onmessage({
setupComplete: {},
serverContent: {
modelTurn: {
parts: [{ text: "internal reasoning", thought: true }],
},
},
});
expect(onTranscript).not.toHaveBeenCalled();
});
it("forwards Live API tool calls and submits matching function responses", async () => {
const provider = buildGoogleRealtimeVoiceProvider();
const onToolCall = vi.fn();
const bridge = provider.createBridge({
providerConfig: { apiKey: "gemini-key" },
onAudio: vi.fn(),
onClearAudio: vi.fn(),
onToolCall,
});
await bridge.connect();
lastConnectParams().callbacks.onmessage({
setupComplete: { sessionId: "session-1" },
toolCall: {
functionCalls: [{ id: "call-1", name: "lookup", args: { query: "hi" } }],
},
});
expect(onToolCall).toHaveBeenCalledWith({
itemId: "call-1",
callId: "call-1",
name: "lookup",
args: { query: "hi" },
});
bridge.submitToolResult("call-1", { result: "ok" });
expect(session.sendToolResponse).toHaveBeenCalledWith({
functionResponses: [
{
id: "call-1",
response: { result: "ok" },
},
],
});
});
});

View File

@@ -0,0 +1,535 @@
import { randomUUID } from "node:crypto";
import {
EndSensitivity,
Modality,
StartSensitivity,
type FunctionDeclaration,
type FunctionResponse,
type LiveServerContent,
type LiveServerMessage,
type LiveServerToolCall,
type RealtimeInputConfig,
type ThinkingConfig,
} from "@google/genai";
import type { OpenClawConfig } from "openclaw/plugin-sdk/provider-onboard";
import type {
RealtimeVoiceBridge,
RealtimeVoiceBridgeCreateRequest,
RealtimeVoiceProviderConfig,
RealtimeVoiceProviderPlugin,
RealtimeVoiceTool,
} from "openclaw/plugin-sdk/realtime-voice";
import { convertPcmToMulaw8k, mulawToPcm, resamplePcm } from "openclaw/plugin-sdk/realtime-voice";
import { normalizeResolvedSecretInputString } from "openclaw/plugin-sdk/secret-input";
import { normalizeOptionalString } from "openclaw/plugin-sdk/text-runtime";
import { createGoogleGenAI } from "./google-genai-runtime.js";
const GOOGLE_REALTIME_DEFAULT_MODEL = "gemini-2.5-flash-native-audio-preview-12-2025";
const GOOGLE_REALTIME_DEFAULT_VOICE = "Kore";
const GOOGLE_REALTIME_DEFAULT_API_VERSION = "v1beta";
const GOOGLE_REALTIME_INPUT_SAMPLE_RATE = 16_000;
const TELEPHONY_SAMPLE_RATE = 8000;
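// Bound the audio buffered before Live setup completes (~6 s, assuming 20 ms
// telephony frames) and default the sustained-silence window that triggers
// an explicit audioStreamEnd signal.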
const MAX_PENDING_AUDIO_CHUNKS = 320;
const DEFAULT_AUDIO_STREAM_END_SILENCE_MS = 700;
type GoogleRealtimeSensitivity = "low" | "high";
type GoogleRealtimeThinkingLevel = "minimal" | "low" | "medium" | "high";
type GoogleRealtimeVoiceProviderConfig = {
apiKey?: string;
model?: string;
voice?: string;
temperature?: number;
apiVersion?: string;
prefixPaddingMs?: number;
silenceDurationMs?: number;
startSensitivity?: GoogleRealtimeSensitivity;
endSensitivity?: GoogleRealtimeSensitivity;
enableAffectiveDialog?: boolean;
thinkingLevel?: GoogleRealtimeThinkingLevel;
thinkingBudget?: number;
};
type GoogleRealtimeVoiceBridgeConfig = RealtimeVoiceBridgeCreateRequest & {
apiKey: string;
model?: string;
voice?: string;
temperature?: number;
apiVersion?: string;
prefixPaddingMs?: number;
silenceDurationMs?: number;
startSensitivity?: GoogleRealtimeSensitivity;
endSensitivity?: GoogleRealtimeSensitivity;
enableAffectiveDialog?: boolean;
thinkingLevel?: GoogleRealtimeThinkingLevel;
thinkingBudget?: number;
};
type GoogleLiveSession = {
sendClientContent: (params: {
turns?: Array<{ role: string; parts: Array<{ text: string }> }>;
turnComplete?: boolean;
}) => void;
sendRealtimeInput: (params: {
audio?: { data: string; mimeType: string };
audioStreamEnd?: boolean;
}) => void;
sendToolResponse: (params: { functionResponses: FunctionResponse[] | FunctionResponse }) => void;
close: () => void;
};
function trimToUndefined(value: unknown): string | undefined {
return normalizeOptionalString(value);
}
function asFiniteNumber(value: unknown): number | undefined {
return typeof value === "number" && Number.isFinite(value) ? value : undefined;
}
function asBoolean(value: unknown): boolean | undefined {
return typeof value === "boolean" ? value : undefined;
}
function asSensitivity(value: unknown): GoogleRealtimeSensitivity | undefined {
const normalized = normalizeOptionalString(value)?.toLowerCase();
return normalized === "low" || normalized === "high" ? normalized : undefined;
}
function asThinkingLevel(value: unknown): GoogleRealtimeThinkingLevel | undefined {
const normalized = normalizeOptionalString(value)?.toLowerCase();
return normalized === "minimal" ||
normalized === "low" ||
normalized === "medium" ||
normalized === "high"
? normalized
: undefined;
}
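// Accept three config shapes: { providers: { google: {...} } }, a top-level
// { google: {...} } record, or an already-flattened provider record.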
function resolveGoogleRealtimeProviderConfigRecord(
config: Record<string, unknown>,
): Record<string, unknown> | undefined {
const providers =
typeof config.providers === "object" &&
config.providers !== null &&
!Array.isArray(config.providers)
? (config.providers as Record<string, unknown>)
: undefined;
const nested = providers?.google;
return typeof nested === "object" && nested !== null && !Array.isArray(nested)
? (nested as Record<string, unknown>)
: typeof config.google === "object" && config.google !== null && !Array.isArray(config.google)
? (config.google as Record<string, unknown>)
: config;
}
function normalizeProviderConfig(
config: RealtimeVoiceProviderConfig,
cfg?: OpenClawConfig,
): GoogleRealtimeVoiceProviderConfig {
const raw = resolveGoogleRealtimeProviderConfigRecord(config);
return {
apiKey: normalizeResolvedSecretInputString({
value: raw?.apiKey ?? cfg?.models?.providers?.google?.apiKey,
path: "plugins.entries.voice-call.config.realtime.providers.google.apiKey",
}),
model: trimToUndefined(raw?.model),
voice: trimToUndefined(raw?.voice),
temperature: asFiniteNumber(raw?.temperature),
apiVersion: trimToUndefined(raw?.apiVersion),
prefixPaddingMs: asFiniteNumber(raw?.prefixPaddingMs),
silenceDurationMs: asFiniteNumber(raw?.silenceDurationMs),
startSensitivity: asSensitivity(raw?.startSensitivity),
endSensitivity: asSensitivity(raw?.endSensitivity),
enableAffectiveDialog: asBoolean(raw?.enableAffectiveDialog),
thinkingLevel: asThinkingLevel(raw?.thinkingLevel),
thinkingBudget: asFiniteNumber(raw?.thinkingBudget),
};
}
function resolveEnvApiKey(): string | undefined {
return trimToUndefined(process.env.GEMINI_API_KEY) ?? trimToUndefined(process.env.GOOGLE_API_KEY);
}
function mapStartSensitivity(
value: GoogleRealtimeSensitivity | undefined,
): StartSensitivity | undefined {
switch (value) {
case "high":
return StartSensitivity.START_SENSITIVITY_HIGH;
case "low":
return StartSensitivity.START_SENSITIVITY_LOW;
default:
return undefined;
}
}
function mapEndSensitivity(
value: GoogleRealtimeSensitivity | undefined,
): EndSensitivity | undefined {
switch (value) {
case "high":
return EndSensitivity.END_SENSITIVITY_HIGH;
case "low":
return EndSensitivity.END_SENSITIVITY_LOW;
default:
return undefined;
}
}
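// An explicit thinking level wins over a numeric token budget; the cast
// assumes the SDK accepts the uppercase spelling of the configured level.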
function buildThinkingConfig(config: GoogleRealtimeVoiceBridgeConfig): ThinkingConfig | undefined {
if (config.thinkingLevel) {
return { thinkingLevel: config.thinkingLevel.toUpperCase() as ThinkingConfig["thinkingLevel"] };
}
if (typeof config.thinkingBudget === "number") {
return { thinkingBudget: config.thinkingBudget };
}
return undefined;
}
function buildRealtimeInputConfig(
config: GoogleRealtimeVoiceBridgeConfig,
): RealtimeInputConfig | undefined {
const startSensitivity = mapStartSensitivity(config.startSensitivity);
const endSensitivity = mapEndSensitivity(config.endSensitivity);
const automaticActivityDetection = {
...(startSensitivity ? { startOfSpeechSensitivity: startSensitivity } : {}),
...(endSensitivity ? { endOfSpeechSensitivity: endSensitivity } : {}),
...(typeof config.prefixPaddingMs === "number"
? { prefixPaddingMs: Math.max(0, Math.floor(config.prefixPaddingMs)) }
: {}),
...(typeof config.silenceDurationMs === "number"
? { silenceDurationMs: Math.max(0, Math.floor(config.silenceDurationMs)) }
: {}),
};
return Object.keys(automaticActivityDetection).length > 0
? { automaticActivityDetection }
: undefined;
}
function buildFunctionDeclarations(tools: RealtimeVoiceTool[] | undefined): FunctionDeclaration[] {
return (tools ?? []).map((tool) => ({
name: tool.name,
description: tool.description,
parametersJsonSchema: tool.parameters,
}));
}
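// Extract the sample rate from a mime type such as "audio/pcm;rate=24000";
// Gemini Live output defaults to 24 kHz when no rate is present.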
function parsePcmSampleRate(mimeType: string | undefined): number {
const match = mimeType?.match(/(?:^|[;,\s])rate=(\d+)/i);
const parsed = match ? Number.parseInt(match[1] ?? "", 10) : Number.NaN;
return Number.isFinite(parsed) && parsed > 0 ? parsed : 24_000;
}
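// 0xff is the mu-law code for a zero-amplitude sample, so an all-0xff buffer
// is digital silence on the telephony leg.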
function isMulawSilence(audio: Buffer): boolean {
return audio.length > 0 && audio.every((sample) => sample === 0xff);
}
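/**
 * Bridges 8 kHz mu-law telephony audio to the Gemini Live API: caller audio is
 * decoded and upsampled to 16 kHz PCM on the way in, and the model's PCM reply
 * (typically 24 kHz) is downsampled back to mu-law on the way out.
 */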
class GoogleRealtimeVoiceBridge implements RealtimeVoiceBridge {
private session: GoogleLiveSession | null = null;
private connected = false;
private sessionConfigured = false;
private intentionallyClosed = false;
private pendingAudio: Buffer[] = [];
private sessionReadyFired = false;
private consecutiveSilenceMs = 0;
private audioStreamEnded = false;
constructor(private readonly config: GoogleRealtimeVoiceBridgeConfig) {}
async connect(): Promise<void> {
this.intentionallyClosed = false;
this.sessionConfigured = false;
this.sessionReadyFired = false;
this.consecutiveSilenceMs = 0;
this.audioStreamEnded = false;
const ai = createGoogleGenAI({
apiKey: this.config.apiKey,
httpOptions: {
apiVersion: this.config.apiVersion ?? GOOGLE_REALTIME_DEFAULT_API_VERSION,
},
});
const functionDeclarations = buildFunctionDeclarations(this.config.tools);
this.session = (await ai.live.connect({
model: this.config.model ?? GOOGLE_REALTIME_DEFAULT_MODEL,
config: {
responseModalities: [Modality.AUDIO],
...(typeof this.config.temperature === "number" && this.config.temperature > 0
? { temperature: this.config.temperature }
: {}),
speechConfig: {
voiceConfig: {
prebuiltVoiceConfig: {
voiceName: this.config.voice ?? GOOGLE_REALTIME_DEFAULT_VOICE,
},
},
},
systemInstruction: this.config.instructions,
...(functionDeclarations.length > 0 ? { tools: [{ functionDeclarations }] } : {}),
...(this.realtimeInputConfig ? { realtimeInputConfig: this.realtimeInputConfig } : {}),
inputAudioTranscription: {},
outputAudioTranscription: {},
...(typeof this.config.enableAffectiveDialog === "boolean"
? { enableAffectiveDialog: this.config.enableAffectiveDialog }
: {}),
...(this.thinkingConfig ? { thinkingConfig: this.thinkingConfig } : {}),
},
callbacks: {
onopen: () => {
this.connected = true;
},
onmessage: (message) => {
this.handleMessage(message);
},
onerror: (event) => {
const error =
event.error instanceof Error
? event.error
: new Error(
typeof event.message === "string" ? event.message : "Google Live API error",
);
this.config.onError?.(error);
},
onclose: () => {
this.connected = false;
this.sessionConfigured = false;
const reason = this.intentionallyClosed ? "completed" : "error";
this.session = null;
this.config.onClose?.(reason);
},
},
})) as GoogleLiveSession;
}
sendAudio(audio: Buffer): void {
if (!this.session || !this.connected || !this.sessionConfigured) {
if (this.pendingAudio.length < MAX_PENDING_AUDIO_CHUNKS) {
this.pendingAudio.push(audio);
}
return;
}
const silent = isMulawSilence(audio);
if (silent && this.audioStreamEnded) {
return;
}
if (!silent) {
this.consecutiveSilenceMs = 0;
this.audioStreamEnded = false;
}
const pcm16k = resamplePcm(
mulawToPcm(audio),
TELEPHONY_SAMPLE_RATE,
GOOGLE_REALTIME_INPUT_SAMPLE_RATE,
);
this.session.sendRealtimeInput({
audio: {
data: pcm16k.toString("base64"),
mimeType: `audio/pcm;rate=${GOOGLE_REALTIME_INPUT_SAMPLE_RATE}`,
},
});
if (!silent) {
return;
}
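    // mu-law is one byte per sample at 8 kHz, so buffer length in bytes maps
    // directly to elapsed milliseconds of caller silence.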
const silenceThresholdMs =
typeof this.config.silenceDurationMs === "number"
? Math.max(0, Math.floor(this.config.silenceDurationMs))
: DEFAULT_AUDIO_STREAM_END_SILENCE_MS;
this.consecutiveSilenceMs += Math.round((audio.length / TELEPHONY_SAMPLE_RATE) * 1000);
if (!this.audioStreamEnded && this.consecutiveSilenceMs >= silenceThresholdMs) {
this.session.sendRealtimeInput({ audioStreamEnd: true });
this.audioStreamEnded = true;
}
}
setMediaTimestamp(_ts: number): void {}
sendUserMessage(text: string): void {
const normalized = text.trim();
if (!normalized || !this.session || !this.connected || !this.sessionConfigured) {
return;
}
this.session.sendClientContent({
turns: [{ role: "user", parts: [{ text: normalized }] }],
turnComplete: true,
});
}
triggerGreeting(instructions?: string): void {
const greetingPrompt =
instructions?.trim() || "Start the call now. Greet the caller naturally and keep it brief.";
this.sendUserMessage(greetingPrompt);
}
submitToolResult(callId: string, result: unknown): void {
if (!this.session) {
return;
}
this.session.sendToolResponse({
functionResponses: [
{
id: callId,
response:
result && typeof result === "object"
? (result as Record<string, unknown>)
: { output: result },
},
],
});
}
acknowledgeMark(): void {}
close(): void {
this.intentionallyClosed = true;
this.connected = false;
this.sessionConfigured = false;
this.pendingAudio = [];
this.consecutiveSilenceMs = 0;
this.audioStreamEnded = false;
const session = this.session;
this.session = null;
session?.close();
}
isConnected(): boolean {
return this.connected && this.sessionConfigured;
}
private handleMessage(message: LiveServerMessage): void {
if (message.setupComplete) {
this.handleSetupComplete();
}
if (message.serverContent) {
this.handleServerContent(message.serverContent);
}
if (message.toolCall) {
this.handleToolCall(message.toolCall);
}
}
private handleSetupComplete(): void {
this.sessionConfigured = true;
for (const chunk of this.pendingAudio.splice(0)) {
this.sendAudio(chunk);
}
if (!this.sessionReadyFired) {
this.sessionReadyFired = true;
this.config.onReady?.();
}
}
private handleServerContent(content: LiveServerContent): void {
if (content.interrupted) {
this.config.onClearAudio();
}
if (content.inputTranscription?.text) {
this.config.onTranscript?.(
"user",
content.inputTranscription.text,
content.inputTranscription.finished ?? false,
);
}
if (content.outputTranscription?.text) {
this.config.onTranscript?.(
"assistant",
content.outputTranscription.text,
content.outputTranscription.finished ?? false,
);
}
let emittedAssistantText = false;
for (const part of content.modelTurn?.parts ?? []) {
if (part.inlineData?.data) {
const pcm = Buffer.from(part.inlineData.data, "base64");
const sampleRate = parsePcmSampleRate(part.inlineData.mimeType);
const muLaw = convertPcmToMulaw8k(pcm, sampleRate);
if (muLaw.length > 0) {
this.config.onAudio(muLaw);
this.config.onMark?.(`audio-${randomUUID()}`);
}
continue;
}
if (part.thought) {
continue;
}
if (!content.outputTranscription?.text && typeof part.text === "string" && part.text.trim()) {
emittedAssistantText = true;
this.config.onTranscript?.("assistant", part.text, content.turnComplete ?? false);
}
}
if (!emittedAssistantText && content.turnComplete && content.waitingForInput === false) {
return;
}
}
private handleToolCall(toolCall: LiveServerToolCall): void {
for (const call of toolCall.functionCalls ?? []) {
const name = call.name?.trim();
if (!name) {
continue;
}
const callId = call.id?.trim() || `google-live-${randomUUID()}`;
this.config.onToolCall?.({
itemId: callId,
callId,
name,
args: call.args ?? {},
});
}
}
private get realtimeInputConfig(): RealtimeInputConfig | undefined {
return buildRealtimeInputConfig(this.config);
}
private get thinkingConfig(): ThinkingConfig | undefined {
return buildThinkingConfig(this.config);
}
}
export function buildGoogleRealtimeVoiceProvider(): RealtimeVoiceProviderPlugin {
return {
id: "google",
label: "Google Live Voice",
autoSelectOrder: 20,
resolveConfig: ({ cfg, rawConfig }) => normalizeProviderConfig(rawConfig, cfg),
isConfigured: ({ providerConfig }) =>
Boolean(normalizeProviderConfig(providerConfig).apiKey || resolveEnvApiKey()),
createBridge: (req) => {
const config = normalizeProviderConfig(req.providerConfig);
const apiKey = config.apiKey || resolveEnvApiKey();
if (!apiKey) {
throw new Error("Google Gemini API key missing");
}
return new GoogleRealtimeVoiceBridge({
...req,
apiKey,
model: config.model,
voice: config.voice,
temperature: config.temperature,
apiVersion: config.apiVersion,
prefixPaddingMs: config.prefixPaddingMs,
silenceDurationMs: config.silenceDurationMs,
startSensitivity: config.startSensitivity,
endSensitivity: config.endSensitivity,
enableAffectiveDialog: config.enableAffectiveDialog,
thinkingLevel: config.thinkingLevel,
thinkingBudget: config.thinkingBudget,
});
},
};
}
export {
GOOGLE_REALTIME_DEFAULT_API_VERSION,
GOOGLE_REALTIME_DEFAULT_MODEL,
GOOGLE_REALTIME_DEFAULT_VOICE,
};
export type { GoogleRealtimeVoiceProviderConfig };

View File

@@ -1,105 +1,8 @@
const TELEPHONY_SAMPLE_RATE = 8000;
const RESAMPLE_FILTER_TAPS = 31;
const RESAMPLE_CUTOFF_GUARD = 0.94;
function clamp16(value: number): number {
return Math.max(-32768, Math.min(32767, value));
}
function sinc(x: number): number {
if (x === 0) {
return 1;
}
return Math.sin(Math.PI * x) / (Math.PI * x);
}
/**
* Build a finite low-pass kernel centered on `srcPos`.
* The kernel is windowed (Hann) to reduce ringing artifacts.
*/
function sampleBandlimited(
input: Buffer,
inputSamples: number,
srcPos: number,
cutoffCyclesPerSample: number,
): number {
const half = Math.floor(RESAMPLE_FILTER_TAPS / 2);
const center = Math.floor(srcPos);
let weighted = 0;
let weightSum = 0;
for (let tap = -half; tap <= half; tap++) {
const sampleIndex = center + tap;
if (sampleIndex < 0 || sampleIndex >= inputSamples) {
continue;
}
const distance = sampleIndex - srcPos;
const lowPass = 2 * cutoffCyclesPerSample * sinc(2 * cutoffCyclesPerSample * distance);
const tapIndex = tap + half;
const window = 0.5 - 0.5 * Math.cos((2 * Math.PI * tapIndex) / (RESAMPLE_FILTER_TAPS - 1));
const coeff = lowPass * window;
weighted += input.readInt16LE(sampleIndex * 2) * coeff;
weightSum += coeff;
}
if (weightSum === 0) {
const nearest = Math.max(0, Math.min(inputSamples - 1, Math.round(srcPos)));
return input.readInt16LE(nearest * 2);
}
return weighted / weightSum;
}
/**
* Resample 16-bit PCM (little-endian mono) to 8kHz using a windowed low-pass kernel.
*/
export function resamplePcmTo8k(input: Buffer, inputSampleRate: number): Buffer {
if (inputSampleRate === TELEPHONY_SAMPLE_RATE) {
return input;
}
const inputSamples = Math.floor(input.length / 2);
if (inputSamples === 0) {
return Buffer.alloc(0);
}
const ratio = inputSampleRate / TELEPHONY_SAMPLE_RATE;
const outputSamples = Math.floor(inputSamples / ratio);
const output = Buffer.alloc(outputSamples * 2);
const maxCutoff = 0.5;
const downsampleCutoff = ratio > 1 ? maxCutoff / ratio : maxCutoff;
const cutoffCyclesPerSample = Math.max(0.01, downsampleCutoff * RESAMPLE_CUTOFF_GUARD);
for (let i = 0; i < outputSamples; i++) {
const srcPos = i * ratio;
const sample = Math.round(
sampleBandlimited(input, inputSamples, srcPos, cutoffCyclesPerSample),
);
output.writeInt16LE(clamp16(sample), i * 2);
}
return output;
}
/**
* Convert 16-bit PCM to 8-bit mu-law (G.711).
*/
export function pcmToMulaw(pcm: Buffer): Buffer {
const samples = Math.floor(pcm.length / 2);
const mulaw = Buffer.alloc(samples);
for (let i = 0; i < samples; i++) {
const sample = pcm.readInt16LE(i * 2);
mulaw[i] = linearToMulaw(sample);
}
return mulaw;
}
export function convertPcmToMulaw8k(pcm: Buffer, inputSampleRate: number): Buffer {
const pcm8k = resamplePcmTo8k(pcm, inputSampleRate);
return pcmToMulaw(pcm8k);
}
export {
convertPcmToMulaw8k,
pcmToMulaw,
resamplePcmTo8k,
} from "openclaw/plugin-sdk/realtime-voice";
/**
* Chunk audio buffer into 20ms frames for streaming (8kHz mono mu-law).
@@ -111,25 +14,3 @@ export function chunkAudio(audio: Buffer, chunkSize = 160): Generator<Buffer, vo
}
})();
}
function linearToMulaw(sample: number): number {
const BIAS = 132;
const CLIP = 32635;
const sign = sample < 0 ? 0x80 : 0;
if (sample < 0) {
sample = -sample;
}
if (sample > CLIP) {
sample = CLIP;
}
sample += BIAS;
let exponent = 7;
for (let expMask = 0x4000; (sample & expMask) === 0 && exponent > 0; exponent--) {
expMask >>= 1;
}
const mantissa = (sample >> (exponent + 3)) & 0x0f;
return ~(sign | (exponent << 4) | mantissa) & 0xff;
}

View File

@@ -341,10 +341,13 @@ describe("web session", () => {
sock.ev.emit("creds.update", {});
sock.ev.emit("creds.update", {});
await flushCredsUpdate();
expect(inFlight).toBe(1);
(release as (() => void) | null)?.();
try {
await vi.waitFor(() => {
expect(inFlight).toBe(1);
});
} finally {
(release as (() => void) | null)?.();
}
await waitForCredsSaveQueue(authDir);
@@ -389,13 +392,16 @@ describe("web session", () => {
onError,
);
await flushCredsUpdate();
try {
await vi.waitFor(() => {
expect(inFlightA).toBe(1);
expect(inFlightB).toBe(1);
});
} finally {
(releaseA as (() => void) | null)?.();
(releaseB as (() => void) | null)?.();
}
expect(inFlightA).toBe(1);
expect(inFlightB).toBe(1);
(releaseA as (() => void) | null)?.();
(releaseB as (() => void) | null)?.();
await Promise.all([waitForCredsSaveQueue(authDirA), waitForCredsSaveQueue(authDirB)]);
expect(inFlightA).toBe(0);

View File

@@ -152,7 +152,7 @@ describe("monitorZaloProvider lifecycle", () => {
abort.abort();
await vi.waitFor(() => expect(deleteWebhookMock).toHaveBeenCalledTimes(1));
await vi.waitFor(() => expect(deleteWebhookMock).toHaveBeenCalledTimes(1), { timeout: 5000 });
expect(deleteWebhookMock).toHaveBeenCalledWith("test-token", undefined, 5000);
expect(settled).toBe(false);
expect(registry.httpRoutes).toHaveLength(2);

View File

@@ -36,3 +36,10 @@ export {
type RealtimeVoiceBridgeSessionParams,
type RealtimeVoiceMarkStrategy,
} from "../realtime-voice/session-runtime.js";
export {
convertPcmToMulaw8k,
mulawToPcm,
pcmToMulaw,
resamplePcm,
resamplePcmTo8k,
} from "../realtime-voice/audio-codec.js";

View File

@@ -0,0 +1,138 @@
const TELEPHONY_SAMPLE_RATE = 8000;
const RESAMPLE_FILTER_TAPS = 31;
const RESAMPLE_CUTOFF_GUARD = 0.94;
function clamp16(value: number): number {
return Math.max(-32768, Math.min(32767, value));
}
function sinc(x: number): number {
if (x === 0) {
return 1;
}
return Math.sin(Math.PI * x) / (Math.PI * x);
}
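/**
 * Build a finite low-pass kernel centered on `srcPos`.
 * The kernel is windowed (Hann) to reduce ringing artifacts.
 */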
function sampleBandlimited(
input: Buffer,
inputSamples: number,
srcPos: number,
cutoffCyclesPerSample: number,
): number {
const half = Math.floor(RESAMPLE_FILTER_TAPS / 2);
const center = Math.floor(srcPos);
let weighted = 0;
let weightSum = 0;
for (let tap = -half; tap <= half; tap += 1) {
const sampleIndex = center + tap;
if (sampleIndex < 0 || sampleIndex >= inputSamples) {
continue;
}
const distance = sampleIndex - srcPos;
const lowPass = 2 * cutoffCyclesPerSample * sinc(2 * cutoffCyclesPerSample * distance);
const tapIndex = tap + half;
const window = 0.5 - 0.5 * Math.cos((2 * Math.PI * tapIndex) / (RESAMPLE_FILTER_TAPS - 1));
const coeff = lowPass * window;
weighted += input.readInt16LE(sampleIndex * 2) * coeff;
weightSum += coeff;
}
if (weightSum === 0) {
const nearest = Math.max(0, Math.min(inputSamples - 1, Math.round(srcPos)));
return input.readInt16LE(nearest * 2);
}
return weighted / weightSum;
}
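/**
 * Resample 16-bit PCM (little-endian mono) between sample rates using a
 * windowed low-pass kernel.
 */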
export function resamplePcm(
input: Buffer,
inputSampleRate: number,
outputSampleRate: number,
): Buffer {
if (inputSampleRate === outputSampleRate) {
return input;
}
const inputSamples = Math.floor(input.length / 2);
if (inputSamples === 0) {
return Buffer.alloc(0);
}
const ratio = inputSampleRate / outputSampleRate;
const outputSamples = Math.floor(inputSamples / ratio);
const output = Buffer.alloc(outputSamples * 2);
const maxCutoff = 0.5;
const downsampleCutoff = ratio > 1 ? maxCutoff / ratio : maxCutoff;
const cutoffCyclesPerSample = Math.max(0.01, downsampleCutoff * RESAMPLE_CUTOFF_GUARD);
for (let i = 0; i < outputSamples; i += 1) {
const sample = Math.round(
sampleBandlimited(input, inputSamples, i * ratio, cutoffCyclesPerSample),
);
output.writeInt16LE(clamp16(sample), i * 2);
}
return output;
}
export function resamplePcmTo8k(input: Buffer, inputSampleRate: number): Buffer {
return resamplePcm(input, inputSampleRate, TELEPHONY_SAMPLE_RATE);
}
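/**
 * Convert 16-bit PCM to 8-bit mu-law (G.711).
 */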
export function pcmToMulaw(pcm: Buffer): Buffer {
const samples = Math.floor(pcm.length / 2);
const mulaw = Buffer.alloc(samples);
for (let i = 0; i < samples; i += 1) {
const sample = pcm.readInt16LE(i * 2);
mulaw[i] = linearToMulaw(sample);
}
return mulaw;
}
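/**
 * Convert 8-bit mu-law (G.711) back to 16-bit little-endian PCM.
 */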
export function mulawToPcm(mulaw: Buffer): Buffer {
const pcm = Buffer.alloc(mulaw.length * 2);
for (let i = 0; i < mulaw.length; i += 1) {
pcm.writeInt16LE(clamp16(mulawToLinear(mulaw[i] ?? 0)), i * 2);
}
return pcm;
}
export function convertPcmToMulaw8k(pcm: Buffer, inputSampleRate: number): Buffer {
return pcmToMulaw(resamplePcmTo8k(pcm, inputSampleRate));
}
function linearToMulaw(sample: number): number {
const BIAS = 132;
const CLIP = 32635;
const sign = sample < 0 ? 0x80 : 0;
if (sample < 0) {
sample = -sample;
}
if (sample > CLIP) {
sample = CLIP;
}
sample += BIAS;
let exponent = 7;
for (let expMask = 0x4000; (sample & expMask) === 0 && exponent > 0; exponent -= 1) {
expMask >>= 1;
}
const mantissa = (sample >> (exponent + 3)) & 0x0f;
return ~(sign | (exponent << 4) | mantissa) & 0xff;
}
function mulawToLinear(value: number): number {
const muLaw = ~value & 0xff;
const sign = muLaw & 0x80;
const exponent = (muLaw >> 4) & 0x07;
const mantissa = muLaw & 0x0f;
let sample = ((mantissa << 3) + 132) << exponent;
sample -= 132;
return sign ? -sample : sample;
}
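For orientation, here is a minimal round-trip sketch using the codec helpers above; buffer sizes are illustrative, and the import path is the plugin-SDK re-export added in this commit:
```ts
import {
  convertPcmToMulaw8k,
  mulawToPcm,
  resamplePcm,
} from "openclaw/plugin-sdk/realtime-voice";

// Caller leg: one 20 ms telephony frame (160 mu-law bytes at 8 kHz)...
const muLawFrame = Buffer.alloc(160, 0xff); // all 0xff is digital silence
const pcm8k = mulawToPcm(muLawFrame); // 320 bytes of 16-bit PCM
// ...upsampled to the 16 kHz PCM that the Gemini Live input expects.
const pcm16k = resamplePcm(pcm8k, 8000, 16000); // 640 bytes

// Model leg: 10 ms of 24 kHz PCM output collapses back to 80 mu-law bytes.
const reply24k = Buffer.alloc(480);
const muLawReply = convertPcmToMulaw8k(reply24k, 24000);
console.log(pcm16k.length, muLawReply.length); // 640 80
```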

View File

@@ -55,6 +55,7 @@ export const pluginRegistrationContractCases = {
pluginId: "google",
providerIds: ["google", "google-gemini-cli"],
webSearchProviderIds: ["gemini"],
realtimeVoiceProviderIds: ["google"],
speechProviderIds: ["google"],
mediaUnderstandingProviderIds: ["google"],
imageGenerationProviderIds: ["google"],