diff --git a/CHANGELOG.md b/CHANGELOG.md index 09d59976580..b9c10facbc7 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -29,6 +29,7 @@ Docs: https://docs.openclaw.ai - Plugins/Google Meet: default Chrome realtime sessions to OpenAI plus SoX `rec`/`play` audio bridge commands, so the usual setup only needs the plugin enabled and `OPENAI_API_KEY`. - Plugins/Google Meet: add a `chrome-node` transport so a paired macOS node, such as a Parallels VM, can own Chrome, BlackHole, and SoX while the Gateway machine keeps the agent and model key. - Plugins/Bonjour: move LAN Gateway discovery advertising into a default-enabled bundled plugin with its own `@homebridge/ciao` dependency, so users can disable Bonjour without cutting wide-area discovery. Thanks @vincentkoc. +- Providers/Google: add a Gemini Live realtime voice provider for backend Voice Call and Google Meet audio bridges, with bidirectional audio and function-call support. - Providers/OpenAI: add image generation and reference-image editing through Codex OAuth, so `openai/gpt-image-2` works without an `OPENAI_API_KEY`. Fixes #70703. - Providers/OpenRouter: add image generation and reference-image editing through `image_generate`, so OpenRouter image models work with `OPENROUTER_API_KEY`. Fixes #55066 via #67668. Thanks @notamicrodose. - Image generation: let agents request provider-supported quality and output format hints, and pass OpenAI-specific background, moderation, compression, and user hints through the `image_generate` tool. (#70503) Thanks @ottodeng. diff --git a/docs/providers/google.md b/docs/providers/google.md index 97aacb9e59d..d805a1bf6f2 100644 --- a/docs/providers/google.md +++ b/docs/providers/google.md @@ -132,6 +132,7 @@ Choose your preferred auth method and follow the setup steps. | Image generation | Yes | | Music generation | Yes | | Text-to-speech | Yes | +| Realtime voice | Yes (Google Live API) | | Image understanding | Yes | | Audio transcription | Yes | | Video understanding | Yes | @@ -281,6 +282,63 @@ A Google Cloud Console API key restricted to the Gemini API is valid for this provider. This is not the separate Cloud Text-to-Speech API path. +## Realtime voice + +The bundled `google` plugin registers a realtime voice provider backed by the +Gemini Live API for backend audio bridges such as Voice Call and Google Meet. + +| Setting | Config path | Default | +| --------------------- | ------------------------------------------------------------------- | ------------------------------------------------------------------------------------- | +| Model | `plugins.entries.voice-call.config.realtime.providers.google.model` | `gemini-2.5-flash-native-audio-preview-12-2025` | +| Voice | `...google.voice` | `Kore` | +| Temperature | `...google.temperature` | (unset) | +| VAD start sensitivity | `...google.startSensitivity` | (unset) | +| VAD end sensitivity | `...google.endSensitivity` | (unset) | +| Silence duration | `...google.silenceDurationMs` | (unset) | +| API key | `...google.apiKey` | Falls back to `models.providers.google.apiKey`, `GEMINI_API_KEY`, or `GOOGLE_API_KEY` | + +Example Voice Call realtime config: + +```json5 +{ + plugins: { + entries: { + "voice-call": { + enabled: true, + config: { + realtime: { + enabled: true, + provider: "google", + providers: { + google: { + model: "gemini-2.5-flash-native-audio-preview-12-2025", + voice: "Kore", + }, + }, + }, + }, + }, + }, + }, +} +``` + + +Google Live API uses bidirectional audio and function calling over a WebSocket. +OpenClaw adapts telephony/Meet bridge audio to Gemini's PCM Live API stream and +keeps tool calls on the shared realtime voice contract. Leave `temperature` +unset unless you need sampling changes; OpenClaw omits non-positive values +because Google Live can return transcripts without audio for `temperature: 0`. +Gemini API transcription is enabled without `languageCodes`; the current Google +SDK rejects language-code hints on this API path. + + + +Control UI Talk browser sessions still require a realtime voice provider with a +browser WebRTC session implementation. Today that path is OpenAI Realtime; the +Google provider is for backend realtime bridges. + + ## Advanced configuration diff --git a/docs/web/control-ui.md b/docs/web/control-ui.md index 0520c1b0ce7..5aa4e101708 100644 --- a/docs/web/control-ui.md +++ b/docs/web/control-ui.md @@ -156,12 +156,14 @@ Cron jobs panel notes: - `chat.history` also strips display-only inline directive tags from visible assistant text (for example `[[reply_to_*]]` and `[[audio_as_voice]]`), plain-text tool-call XML payloads (including `...`, `...`, `...`, `...`, and truncated tool-call blocks), and leaked ASCII/full-width model control tokens, and omits assistant entries whose whole visible text is only the exact silent token `NO_REPLY` / `no_reply`. - `chat.inject` appends an assistant note to the session transcript and broadcasts a `chat` event for UI-only updates (no agent run, no channel delivery). - The chat header model and thinking pickers patch the active session immediately through `sessions.patch`; they are persistent session overrides, not one-turn-only send options. -- Talk mode uses the registered realtime voice provider. Configure OpenAI with - `talk.provider: "openai"` plus `talk.providers.openai.apiKey`, or reuse the - Voice Call realtime provider config. The browser never receives the standard - OpenAI API key; it receives only the ephemeral Realtime client secret. The - Realtime session prompt is assembled by the Gateway; `talk.realtime.session` - does not accept caller-provided instruction overrides. +- Talk mode uses a registered realtime voice provider that supports browser + WebRTC sessions. Configure OpenAI with `talk.provider: "openai"` plus + `talk.providers.openai.apiKey`, or reuse the Voice Call realtime provider + config. The browser never receives the standard OpenAI API key; it receives + only the ephemeral Realtime client secret. Google Live realtime voice is + supported for backend Voice Call and Google Meet bridges, but not this browser + WebRTC path yet. The Realtime session prompt is assembled by the Gateway; + `talk.realtime.session` does not accept caller-provided instruction overrides. - In the Chat composer, the Talk control is the waves button next to the microphone dictation button. When Talk starts, the composer status row shows `Connecting Talk...`, then `Talk live` while audio is connected, or diff --git a/extensions/google/index.ts b/extensions/google/index.ts index c022b655c70..8474ef7bd61 100644 --- a/extensions/google/index.ts +++ b/extensions/google/index.ts @@ -11,6 +11,7 @@ import { } from "./generation-provider-metadata.js"; import { geminiMemoryEmbeddingProviderAdapter } from "./memory-embedding-adapter.js"; import { registerGoogleProvider } from "./provider-registration.js"; +import { buildGoogleRealtimeVoiceProvider } from "./realtime-voice-provider.js"; import { buildGoogleSpeechProvider } from "./speech-provider.js"; import { createGeminiWebSearchProvider } from "./src/gemini-web-search-provider.js"; @@ -156,6 +157,7 @@ export default definePluginEntry({ api.registerImageGenerationProvider(createLazyGoogleImageGenerationProvider()); api.registerMediaUnderstandingProvider(createLazyGoogleMediaUnderstandingProvider()); api.registerMusicGenerationProvider(createLazyGoogleMusicGenerationProvider()); + api.registerRealtimeVoiceProvider(buildGoogleRealtimeVoiceProvider()); api.registerSpeechProvider(buildGoogleSpeechProvider()); api.registerVideoGenerationProvider(createLazyGoogleVideoGenerationProvider()); api.registerWebSearchProvider(createGeminiWebSearchProvider()); diff --git a/extensions/google/openclaw.plugin.json b/extensions/google/openclaw.plugin.json index 02834c11ee2..a1641758254 100644 --- a/extensions/google/openclaw.plugin.json +++ b/extensions/google/openclaw.plugin.json @@ -49,6 +49,7 @@ "memoryEmbeddingProviders": ["gemini"], "imageGenerationProviders": ["google"], "musicGenerationProviders": ["google"], + "realtimeVoiceProviders": ["google"], "speechProviders": ["google"], "videoGenerationProviders": ["google"], "webSearchProviders": ["gemini"] diff --git a/extensions/google/realtime-voice-provider.test.ts b/extensions/google/realtime-voice-provider.test.ts new file mode 100644 index 00000000000..626a75eb03c --- /dev/null +++ b/extensions/google/realtime-voice-provider.test.ts @@ -0,0 +1,354 @@ +import { beforeEach, describe, expect, it, vi } from "vitest"; +import { buildGoogleRealtimeVoiceProvider } from "./realtime-voice-provider.js"; + +type MockGoogleLiveSession = { + close: ReturnType; + sendClientContent: ReturnType; + sendRealtimeInput: ReturnType; + sendToolResponse: ReturnType; +}; + +type MockGoogleLiveConnectParams = { + model: string; + config: Record; + callbacks: { + onopen: () => void; + onmessage: (message: Record) => void; + onerror: (event: { error?: unknown; message?: string }) => void; + onclose: () => void; + }; +}; + +const { connectMock, session } = vi.hoisted(() => { + const session: MockGoogleLiveSession = { + close: vi.fn(), + sendClientContent: vi.fn(), + sendRealtimeInput: vi.fn(), + sendToolResponse: vi.fn(), + }; + const connectMock = vi.fn(async (_params: MockGoogleLiveConnectParams) => session); + return { connectMock, session }; +}); + +vi.mock("./google-genai-runtime.js", () => ({ + createGoogleGenAI: vi.fn(() => ({ + live: { + connect: connectMock, + }, + })), +})); + +function lastConnectParams(): MockGoogleLiveConnectParams { + const params = connectMock.mock.calls.at(-1)?.[0]; + if (!params) { + throw new Error("expected google live connect call"); + } + return params; +} + +describe("buildGoogleRealtimeVoiceProvider", () => { + beforeEach(() => { + connectMock.mockClear(); + session.close.mockClear(); + session.sendClientContent.mockClear(); + session.sendRealtimeInput.mockClear(); + session.sendToolResponse.mockClear(); + delete process.env.GEMINI_API_KEY; + delete process.env.GOOGLE_API_KEY; + }); + + it("normalizes provider config and cfg model-provider key fallback", () => { + const provider = buildGoogleRealtimeVoiceProvider(); + const resolved = provider.resolveConfig?.({ + cfg: { + models: { + providers: { + google: { + apiKey: "cfg-key", + }, + }, + }, + } as never, + rawConfig: { + providers: { + google: { + model: "gemini-live-2.5-flash-preview", + voice: "Puck", + temperature: 0.4, + silenceDurationMs: 700, + startSensitivity: "high", + }, + }, + }, + }); + + expect(resolved).toEqual({ + apiKey: "cfg-key", + model: "gemini-live-2.5-flash-preview", + voice: "Puck", + temperature: 0.4, + apiVersion: undefined, + prefixPaddingMs: undefined, + silenceDurationMs: 700, + startSensitivity: "high", + endSensitivity: undefined, + enableAffectiveDialog: undefined, + thinkingLevel: undefined, + thinkingBudget: undefined, + }); + }); + + it("connects with Google Live setup config and tool declarations", async () => { + const provider = buildGoogleRealtimeVoiceProvider(); + const bridge = provider.createBridge({ + providerConfig: { + apiKey: "gemini-key", + model: "gemini-live-2.5-flash-preview", + voice: "Kore", + temperature: 0.3, + startSensitivity: "low", + }, + instructions: "Speak briefly.", + tools: [ + { + type: "function", + name: "lookup", + description: "Look something up", + parameters: { + type: "object", + properties: { + query: { type: "string" }, + }, + required: ["query"], + }, + }, + ], + onAudio: vi.fn(), + onClearAudio: vi.fn(), + }); + + await bridge.connect(); + + expect(connectMock).toHaveBeenCalledTimes(1); + expect(lastConnectParams()).toMatchObject({ + model: "gemini-live-2.5-flash-preview", + config: { + responseModalities: ["AUDIO"], + temperature: 0.3, + systemInstruction: "Speak briefly.", + speechConfig: { + voiceConfig: { + prebuiltVoiceConfig: { + voiceName: "Kore", + }, + }, + }, + outputAudioTranscription: {}, + tools: [ + { + functionDeclarations: [ + { + name: "lookup", + description: "Look something up", + parametersJsonSchema: { + type: "object", + properties: { + query: { type: "string" }, + }, + required: ["query"], + }, + }, + ], + }, + ], + }, + }); + }); + + it("omits zero temperature for native audio responses", async () => { + const provider = buildGoogleRealtimeVoiceProvider(); + const bridge = provider.createBridge({ + providerConfig: { + apiKey: "gemini-key", + temperature: 0, + }, + onAudio: vi.fn(), + onClearAudio: vi.fn(), + }); + + await bridge.connect(); + + expect(lastConnectParams().config).not.toHaveProperty("temperature"); + }); + + it("waits for setup completion before draining audio and firing ready", async () => { + const provider = buildGoogleRealtimeVoiceProvider(); + const onReady = vi.fn(); + const bridge = provider.createBridge({ + providerConfig: { apiKey: "gemini-key" }, + onAudio: vi.fn(), + onClearAudio: vi.fn(), + onReady, + }); + + await bridge.connect(); + lastConnectParams().callbacks.onopen(); + bridge.sendAudio(Buffer.from([0xff, 0xff])); + + expect(session.sendRealtimeInput).not.toHaveBeenCalled(); + expect(onReady).not.toHaveBeenCalled(); + + lastConnectParams().callbacks.onmessage({ setupComplete: { sessionId: "session-1" } }); + + expect(onReady).toHaveBeenCalledTimes(1); + expect(session.sendRealtimeInput).toHaveBeenCalledTimes(1); + expect(session.sendRealtimeInput.mock.calls[0]?.[0].audio).toMatchObject({ + data: expect.any(String), + mimeType: "audio/pcm;rate=16000", + }); + }); + + it("marks the Google audio stream complete after sustained telephony silence", async () => { + const provider = buildGoogleRealtimeVoiceProvider(); + const bridge = provider.createBridge({ + providerConfig: { apiKey: "gemini-key", silenceDurationMs: 60 }, + onAudio: vi.fn(), + onClearAudio: vi.fn(), + }); + + await bridge.connect(); + lastConnectParams().callbacks.onopen(); + lastConnectParams().callbacks.onmessage({ setupComplete: { sessionId: "session-1" } }); + + const silence20ms = Buffer.alloc(160, 0xff); + bridge.sendAudio(silence20ms); + bridge.sendAudio(silence20ms); + bridge.sendAudio(silence20ms); + + expect(session.sendRealtimeInput).toHaveBeenCalledWith({ audioStreamEnd: true }); + + const callsAfterStreamEnd = session.sendRealtimeInput.mock.calls.length; + bridge.sendAudio(silence20ms); + expect(session.sendRealtimeInput).toHaveBeenCalledTimes(callsAfterStreamEnd); + + session.sendRealtimeInput.mockClear(); + bridge.sendAudio(Buffer.alloc(160, 0x7f)); + bridge.sendAudio(silence20ms); + bridge.sendAudio(silence20ms); + bridge.sendAudio(silence20ms); + + expect(session.sendRealtimeInput).toHaveBeenCalledWith({ audioStreamEnd: true }); + }); + + it("sends text prompts as ordered client turns", async () => { + const provider = buildGoogleRealtimeVoiceProvider(); + const bridge = provider.createBridge({ + providerConfig: { apiKey: "gemini-key" }, + onAudio: vi.fn(), + onClearAudio: vi.fn(), + }); + + await bridge.connect(); + lastConnectParams().callbacks.onopen(); + lastConnectParams().callbacks.onmessage({ setupComplete: { sessionId: "session-1" } }); + + bridge.sendUserMessage?.(" Say hello. "); + + expect(session.sendClientContent).toHaveBeenCalledWith({ + turns: [{ role: "user", parts: [{ text: "Say hello." }] }], + turnComplete: true, + }); + }); + + it("converts Google PCM output to mu-law audio", async () => { + const provider = buildGoogleRealtimeVoiceProvider(); + const onAudio = vi.fn(); + const bridge = provider.createBridge({ + providerConfig: { apiKey: "gemini-key" }, + onAudio, + onClearAudio: vi.fn(), + }); + const pcm24k = Buffer.alloc(480); + + await bridge.connect(); + lastConnectParams().callbacks.onmessage({ + setupComplete: { sessionId: "session-1" }, + serverContent: { + modelTurn: { + parts: [ + { + inlineData: { + mimeType: "audio/L16;codec=pcm;rate=24000", + data: pcm24k.toString("base64"), + }, + }, + ], + }, + }, + }); + + expect(onAudio).toHaveBeenCalledTimes(1); + expect(onAudio.mock.calls[0]?.[0]).toBeInstanceOf(Buffer); + expect(onAudio.mock.calls[0]?.[0]).toHaveLength(80); + }); + + it("does not forward Google thought text as assistant transcript", async () => { + const provider = buildGoogleRealtimeVoiceProvider(); + const onTranscript = vi.fn(); + const bridge = provider.createBridge({ + providerConfig: { apiKey: "gemini-key" }, + onAudio: vi.fn(), + onClearAudio: vi.fn(), + onTranscript, + }); + + await bridge.connect(); + lastConnectParams().callbacks.onmessage({ + setupComplete: {}, + serverContent: { + modelTurn: { + parts: [{ text: "internal reasoning", thought: true }], + }, + }, + }); + + expect(onTranscript).not.toHaveBeenCalled(); + }); + + it("forwards Live API tool calls and submits matching function responses", async () => { + const provider = buildGoogleRealtimeVoiceProvider(); + const onToolCall = vi.fn(); + const bridge = provider.createBridge({ + providerConfig: { apiKey: "gemini-key" }, + onAudio: vi.fn(), + onClearAudio: vi.fn(), + onToolCall, + }); + + await bridge.connect(); + lastConnectParams().callbacks.onmessage({ + setupComplete: { sessionId: "session-1" }, + toolCall: { + functionCalls: [{ id: "call-1", name: "lookup", args: { query: "hi" } }], + }, + }); + + expect(onToolCall).toHaveBeenCalledWith({ + itemId: "call-1", + callId: "call-1", + name: "lookup", + args: { query: "hi" }, + }); + + bridge.submitToolResult("call-1", { result: "ok" }); + + expect(session.sendToolResponse).toHaveBeenCalledWith({ + functionResponses: [ + { + id: "call-1", + response: { result: "ok" }, + }, + ], + }); + }); +}); diff --git a/extensions/google/realtime-voice-provider.ts b/extensions/google/realtime-voice-provider.ts new file mode 100644 index 00000000000..269792a42c8 --- /dev/null +++ b/extensions/google/realtime-voice-provider.ts @@ -0,0 +1,535 @@ +import { randomUUID } from "node:crypto"; +import { + EndSensitivity, + Modality, + StartSensitivity, + type FunctionDeclaration, + type FunctionResponse, + type LiveServerContent, + type LiveServerMessage, + type LiveServerToolCall, + type RealtimeInputConfig, + type ThinkingConfig, +} from "@google/genai"; +import type { OpenClawConfig } from "openclaw/plugin-sdk/provider-onboard"; +import type { + RealtimeVoiceBridge, + RealtimeVoiceBridgeCreateRequest, + RealtimeVoiceProviderConfig, + RealtimeVoiceProviderPlugin, + RealtimeVoiceTool, +} from "openclaw/plugin-sdk/realtime-voice"; +import { convertPcmToMulaw8k, mulawToPcm, resamplePcm } from "openclaw/plugin-sdk/realtime-voice"; +import { normalizeResolvedSecretInputString } from "openclaw/plugin-sdk/secret-input"; +import { normalizeOptionalString } from "openclaw/plugin-sdk/text-runtime"; +import { createGoogleGenAI } from "./google-genai-runtime.js"; + +const GOOGLE_REALTIME_DEFAULT_MODEL = "gemini-2.5-flash-native-audio-preview-12-2025"; +const GOOGLE_REALTIME_DEFAULT_VOICE = "Kore"; +const GOOGLE_REALTIME_DEFAULT_API_VERSION = "v1beta"; +const GOOGLE_REALTIME_INPUT_SAMPLE_RATE = 16_000; +const TELEPHONY_SAMPLE_RATE = 8000; +const MAX_PENDING_AUDIO_CHUNKS = 320; +const DEFAULT_AUDIO_STREAM_END_SILENCE_MS = 700; + +type GoogleRealtimeSensitivity = "low" | "high"; +type GoogleRealtimeThinkingLevel = "minimal" | "low" | "medium" | "high"; + +type GoogleRealtimeVoiceProviderConfig = { + apiKey?: string; + model?: string; + voice?: string; + temperature?: number; + apiVersion?: string; + prefixPaddingMs?: number; + silenceDurationMs?: number; + startSensitivity?: GoogleRealtimeSensitivity; + endSensitivity?: GoogleRealtimeSensitivity; + enableAffectiveDialog?: boolean; + thinkingLevel?: GoogleRealtimeThinkingLevel; + thinkingBudget?: number; +}; + +type GoogleRealtimeVoiceBridgeConfig = RealtimeVoiceBridgeCreateRequest & { + apiKey: string; + model?: string; + voice?: string; + temperature?: number; + apiVersion?: string; + prefixPaddingMs?: number; + silenceDurationMs?: number; + startSensitivity?: GoogleRealtimeSensitivity; + endSensitivity?: GoogleRealtimeSensitivity; + enableAffectiveDialog?: boolean; + thinkingLevel?: GoogleRealtimeThinkingLevel; + thinkingBudget?: number; +}; + +type GoogleLiveSession = { + sendClientContent: (params: { + turns?: Array<{ role: string; parts: Array<{ text: string }> }>; + turnComplete?: boolean; + }) => void; + sendRealtimeInput: (params: { + audio?: { data: string; mimeType: string }; + audioStreamEnd?: boolean; + }) => void; + sendToolResponse: (params: { functionResponses: FunctionResponse[] | FunctionResponse }) => void; + close: () => void; +}; + +function trimToUndefined(value: unknown): string | undefined { + return normalizeOptionalString(value); +} + +function asFiniteNumber(value: unknown): number | undefined { + return typeof value === "number" && Number.isFinite(value) ? value : undefined; +} + +function asBoolean(value: unknown): boolean | undefined { + return typeof value === "boolean" ? value : undefined; +} + +function asSensitivity(value: unknown): GoogleRealtimeSensitivity | undefined { + const normalized = normalizeOptionalString(value)?.toLowerCase(); + return normalized === "low" || normalized === "high" ? normalized : undefined; +} + +function asThinkingLevel(value: unknown): GoogleRealtimeThinkingLevel | undefined { + const normalized = normalizeOptionalString(value)?.toLowerCase(); + return normalized === "minimal" || + normalized === "low" || + normalized === "medium" || + normalized === "high" + ? normalized + : undefined; +} + +function resolveGoogleRealtimeProviderConfigRecord( + config: Record, +): Record | undefined { + const providers = + typeof config.providers === "object" && + config.providers !== null && + !Array.isArray(config.providers) + ? (config.providers as Record) + : undefined; + const nested = providers?.google; + return typeof nested === "object" && nested !== null && !Array.isArray(nested) + ? (nested as Record) + : typeof config.google === "object" && config.google !== null && !Array.isArray(config.google) + ? (config.google as Record) + : config; +} + +function normalizeProviderConfig( + config: RealtimeVoiceProviderConfig, + cfg?: OpenClawConfig, +): GoogleRealtimeVoiceProviderConfig { + const raw = resolveGoogleRealtimeProviderConfigRecord(config); + return { + apiKey: normalizeResolvedSecretInputString({ + value: raw?.apiKey ?? cfg?.models?.providers?.google?.apiKey, + path: "plugins.entries.voice-call.config.realtime.providers.google.apiKey", + }), + model: trimToUndefined(raw?.model), + voice: trimToUndefined(raw?.voice), + temperature: asFiniteNumber(raw?.temperature), + apiVersion: trimToUndefined(raw?.apiVersion), + prefixPaddingMs: asFiniteNumber(raw?.prefixPaddingMs), + silenceDurationMs: asFiniteNumber(raw?.silenceDurationMs), + startSensitivity: asSensitivity(raw?.startSensitivity), + endSensitivity: asSensitivity(raw?.endSensitivity), + enableAffectiveDialog: asBoolean(raw?.enableAffectiveDialog), + thinkingLevel: asThinkingLevel(raw?.thinkingLevel), + thinkingBudget: asFiniteNumber(raw?.thinkingBudget), + }; +} + +function resolveEnvApiKey(): string | undefined { + return trimToUndefined(process.env.GEMINI_API_KEY) ?? trimToUndefined(process.env.GOOGLE_API_KEY); +} + +function mapStartSensitivity( + value: GoogleRealtimeSensitivity | undefined, +): StartSensitivity | undefined { + switch (value) { + case "high": + return StartSensitivity.START_SENSITIVITY_HIGH; + case "low": + return StartSensitivity.START_SENSITIVITY_LOW; + default: + return undefined; + } +} + +function mapEndSensitivity( + value: GoogleRealtimeSensitivity | undefined, +): EndSensitivity | undefined { + switch (value) { + case "high": + return EndSensitivity.END_SENSITIVITY_HIGH; + case "low": + return EndSensitivity.END_SENSITIVITY_LOW; + default: + return undefined; + } +} + +function buildThinkingConfig(config: GoogleRealtimeVoiceBridgeConfig): ThinkingConfig | undefined { + if (config.thinkingLevel) { + return { thinkingLevel: config.thinkingLevel.toUpperCase() as ThinkingConfig["thinkingLevel"] }; + } + if (typeof config.thinkingBudget === "number") { + return { thinkingBudget: config.thinkingBudget }; + } + return undefined; +} + +function buildRealtimeInputConfig( + config: GoogleRealtimeVoiceBridgeConfig, +): RealtimeInputConfig | undefined { + const startSensitivity = mapStartSensitivity(config.startSensitivity); + const endSensitivity = mapEndSensitivity(config.endSensitivity); + const automaticActivityDetection = { + ...(startSensitivity ? { startOfSpeechSensitivity: startSensitivity } : {}), + ...(endSensitivity ? { endOfSpeechSensitivity: endSensitivity } : {}), + ...(typeof config.prefixPaddingMs === "number" + ? { prefixPaddingMs: Math.max(0, Math.floor(config.prefixPaddingMs)) } + : {}), + ...(typeof config.silenceDurationMs === "number" + ? { silenceDurationMs: Math.max(0, Math.floor(config.silenceDurationMs)) } + : {}), + }; + return Object.keys(automaticActivityDetection).length > 0 + ? { automaticActivityDetection } + : undefined; +} + +function buildFunctionDeclarations(tools: RealtimeVoiceTool[] | undefined): FunctionDeclaration[] { + return (tools ?? []).map((tool) => ({ + name: tool.name, + description: tool.description, + parametersJsonSchema: tool.parameters, + })); +} + +function parsePcmSampleRate(mimeType: string | undefined): number { + const match = mimeType?.match(/(?:^|[;,\s])rate=(\d+)/i); + const parsed = match ? Number.parseInt(match[1] ?? "", 10) : Number.NaN; + return Number.isFinite(parsed) && parsed > 0 ? parsed : 24_000; +} + +function isMulawSilence(audio: Buffer): boolean { + return audio.length > 0 && audio.every((sample) => sample === 0xff); +} + +class GoogleRealtimeVoiceBridge implements RealtimeVoiceBridge { + private session: GoogleLiveSession | null = null; + private connected = false; + private sessionConfigured = false; + private intentionallyClosed = false; + private pendingAudio: Buffer[] = []; + private sessionReadyFired = false; + private consecutiveSilenceMs = 0; + private audioStreamEnded = false; + + constructor(private readonly config: GoogleRealtimeVoiceBridgeConfig) {} + + async connect(): Promise { + this.intentionallyClosed = false; + this.sessionConfigured = false; + this.sessionReadyFired = false; + this.consecutiveSilenceMs = 0; + this.audioStreamEnded = false; + + const ai = createGoogleGenAI({ + apiKey: this.config.apiKey, + httpOptions: { + apiVersion: this.config.apiVersion ?? GOOGLE_REALTIME_DEFAULT_API_VERSION, + }, + }); + + const functionDeclarations = buildFunctionDeclarations(this.config.tools); + this.session = (await ai.live.connect({ + model: this.config.model ?? GOOGLE_REALTIME_DEFAULT_MODEL, + config: { + responseModalities: [Modality.AUDIO], + ...(typeof this.config.temperature === "number" && this.config.temperature > 0 + ? { temperature: this.config.temperature } + : {}), + speechConfig: { + voiceConfig: { + prebuiltVoiceConfig: { + voiceName: this.config.voice ?? GOOGLE_REALTIME_DEFAULT_VOICE, + }, + }, + }, + systemInstruction: this.config.instructions, + ...(functionDeclarations.length > 0 ? { tools: [{ functionDeclarations }] } : {}), + ...(this.realtimeInputConfig ? { realtimeInputConfig: this.realtimeInputConfig } : {}), + inputAudioTranscription: {}, + outputAudioTranscription: {}, + ...(typeof this.config.enableAffectiveDialog === "boolean" + ? { enableAffectiveDialog: this.config.enableAffectiveDialog } + : {}), + ...(this.thinkingConfig ? { thinkingConfig: this.thinkingConfig } : {}), + }, + callbacks: { + onopen: () => { + this.connected = true; + }, + onmessage: (message) => { + this.handleMessage(message); + }, + onerror: (event) => { + const error = + event.error instanceof Error + ? event.error + : new Error( + typeof event.message === "string" ? event.message : "Google Live API error", + ); + this.config.onError?.(error); + }, + onclose: () => { + this.connected = false; + this.sessionConfigured = false; + const reason = this.intentionallyClosed ? "completed" : "error"; + this.session = null; + this.config.onClose?.(reason); + }, + }, + })) as GoogleLiveSession; + } + + sendAudio(audio: Buffer): void { + if (!this.session || !this.connected || !this.sessionConfigured) { + if (this.pendingAudio.length < MAX_PENDING_AUDIO_CHUNKS) { + this.pendingAudio.push(audio); + } + return; + } + const silent = isMulawSilence(audio); + if (silent && this.audioStreamEnded) { + return; + } + if (!silent) { + this.consecutiveSilenceMs = 0; + this.audioStreamEnded = false; + } + + const pcm16k = resamplePcm( + mulawToPcm(audio), + TELEPHONY_SAMPLE_RATE, + GOOGLE_REALTIME_INPUT_SAMPLE_RATE, + ); + this.session.sendRealtimeInput({ + audio: { + data: pcm16k.toString("base64"), + mimeType: `audio/pcm;rate=${GOOGLE_REALTIME_INPUT_SAMPLE_RATE}`, + }, + }); + + if (!silent) { + return; + } + + const silenceThresholdMs = + typeof this.config.silenceDurationMs === "number" + ? Math.max(0, Math.floor(this.config.silenceDurationMs)) + : DEFAULT_AUDIO_STREAM_END_SILENCE_MS; + this.consecutiveSilenceMs += Math.round((audio.length / TELEPHONY_SAMPLE_RATE) * 1000); + if (!this.audioStreamEnded && this.consecutiveSilenceMs >= silenceThresholdMs) { + this.session.sendRealtimeInput({ audioStreamEnd: true }); + this.audioStreamEnded = true; + } + } + + setMediaTimestamp(_ts: number): void {} + + sendUserMessage(text: string): void { + const normalized = text.trim(); + if (!normalized || !this.session || !this.connected || !this.sessionConfigured) { + return; + } + this.session.sendClientContent({ + turns: [{ role: "user", parts: [{ text: normalized }] }], + turnComplete: true, + }); + } + + triggerGreeting(instructions?: string): void { + const greetingPrompt = + instructions?.trim() || "Start the call now. Greet the caller naturally and keep it brief."; + this.sendUserMessage(greetingPrompt); + } + + submitToolResult(callId: string, result: unknown): void { + if (!this.session) { + return; + } + this.session.sendToolResponse({ + functionResponses: [ + { + id: callId, + response: + result && typeof result === "object" + ? (result as Record) + : { output: result }, + }, + ], + }); + } + + acknowledgeMark(): void {} + + close(): void { + this.intentionallyClosed = true; + this.connected = false; + this.sessionConfigured = false; + this.pendingAudio = []; + this.consecutiveSilenceMs = 0; + this.audioStreamEnded = false; + const session = this.session; + this.session = null; + session?.close(); + } + + isConnected(): boolean { + return this.connected && this.sessionConfigured; + } + + private handleMessage(message: LiveServerMessage): void { + if (message.setupComplete) { + this.handleSetupComplete(); + } + if (message.serverContent) { + this.handleServerContent(message.serverContent); + } + if (message.toolCall) { + this.handleToolCall(message.toolCall); + } + } + + private handleSetupComplete(): void { + this.sessionConfigured = true; + for (const chunk of this.pendingAudio.splice(0)) { + this.sendAudio(chunk); + } + if (!this.sessionReadyFired) { + this.sessionReadyFired = true; + this.config.onReady?.(); + } + } + + private handleServerContent(content: LiveServerContent): void { + if (content.interrupted) { + this.config.onClearAudio(); + } + + if (content.inputTranscription?.text) { + this.config.onTranscript?.( + "user", + content.inputTranscription.text, + content.inputTranscription.finished ?? false, + ); + } + + if (content.outputTranscription?.text) { + this.config.onTranscript?.( + "assistant", + content.outputTranscription.text, + content.outputTranscription.finished ?? false, + ); + } + + let emittedAssistantText = false; + for (const part of content.modelTurn?.parts ?? []) { + if (part.inlineData?.data) { + const pcm = Buffer.from(part.inlineData.data, "base64"); + const sampleRate = parsePcmSampleRate(part.inlineData.mimeType); + const muLaw = convertPcmToMulaw8k(pcm, sampleRate); + if (muLaw.length > 0) { + this.config.onAudio(muLaw); + this.config.onMark?.(`audio-${randomUUID()}`); + } + continue; + } + if (part.thought) { + continue; + } + if (!content.outputTranscription?.text && typeof part.text === "string" && part.text.trim()) { + emittedAssistantText = true; + this.config.onTranscript?.("assistant", part.text, content.turnComplete ?? false); + } + } + + if (!emittedAssistantText && content.turnComplete && content.waitingForInput === false) { + return; + } + } + + private handleToolCall(toolCall: LiveServerToolCall): void { + for (const call of toolCall.functionCalls ?? []) { + const name = call.name?.trim(); + if (!name) { + continue; + } + const callId = call.id?.trim() || `google-live-${randomUUID()}`; + this.config.onToolCall?.({ + itemId: callId, + callId, + name, + args: call.args ?? {}, + }); + } + } + + private get realtimeInputConfig(): RealtimeInputConfig | undefined { + return buildRealtimeInputConfig(this.config); + } + + private get thinkingConfig(): ThinkingConfig | undefined { + return buildThinkingConfig(this.config); + } +} + +export function buildGoogleRealtimeVoiceProvider(): RealtimeVoiceProviderPlugin { + return { + id: "google", + label: "Google Live Voice", + autoSelectOrder: 20, + resolveConfig: ({ cfg, rawConfig }) => normalizeProviderConfig(rawConfig, cfg), + isConfigured: ({ providerConfig }) => + Boolean(normalizeProviderConfig(providerConfig).apiKey || resolveEnvApiKey()), + createBridge: (req) => { + const config = normalizeProviderConfig(req.providerConfig); + const apiKey = config.apiKey || resolveEnvApiKey(); + if (!apiKey) { + throw new Error("Google Gemini API key missing"); + } + return new GoogleRealtimeVoiceBridge({ + ...req, + apiKey, + model: config.model, + voice: config.voice, + temperature: config.temperature, + apiVersion: config.apiVersion, + prefixPaddingMs: config.prefixPaddingMs, + silenceDurationMs: config.silenceDurationMs, + startSensitivity: config.startSensitivity, + endSensitivity: config.endSensitivity, + enableAffectiveDialog: config.enableAffectiveDialog, + thinkingLevel: config.thinkingLevel, + thinkingBudget: config.thinkingBudget, + }); + }, + }; +} + +export { + GOOGLE_REALTIME_DEFAULT_API_VERSION, + GOOGLE_REALTIME_DEFAULT_MODEL, + GOOGLE_REALTIME_DEFAULT_VOICE, +}; +export type { GoogleRealtimeVoiceProviderConfig }; diff --git a/extensions/voice-call/src/telephony-audio.ts b/extensions/voice-call/src/telephony-audio.ts index e87111c3880..ca128275b41 100644 --- a/extensions/voice-call/src/telephony-audio.ts +++ b/extensions/voice-call/src/telephony-audio.ts @@ -1,105 +1,8 @@ -const TELEPHONY_SAMPLE_RATE = 8000; -const RESAMPLE_FILTER_TAPS = 31; -const RESAMPLE_CUTOFF_GUARD = 0.94; - -function clamp16(value: number): number { - return Math.max(-32768, Math.min(32767, value)); -} - -function sinc(x: number): number { - if (x === 0) { - return 1; - } - return Math.sin(Math.PI * x) / (Math.PI * x); -} - -/** - * Build a finite low-pass kernel centered on `srcPos`. - * The kernel is windowed (Hann) to reduce ringing artifacts. - */ -function sampleBandlimited( - input: Buffer, - inputSamples: number, - srcPos: number, - cutoffCyclesPerSample: number, -): number { - const half = Math.floor(RESAMPLE_FILTER_TAPS / 2); - const center = Math.floor(srcPos); - let weighted = 0; - let weightSum = 0; - - for (let tap = -half; tap <= half; tap++) { - const sampleIndex = center + tap; - if (sampleIndex < 0 || sampleIndex >= inputSamples) { - continue; - } - - const distance = sampleIndex - srcPos; - const lowPass = 2 * cutoffCyclesPerSample * sinc(2 * cutoffCyclesPerSample * distance); - const tapIndex = tap + half; - const window = 0.5 - 0.5 * Math.cos((2 * Math.PI * tapIndex) / (RESAMPLE_FILTER_TAPS - 1)); - const coeff = lowPass * window; - weighted += input.readInt16LE(sampleIndex * 2) * coeff; - weightSum += coeff; - } - - if (weightSum === 0) { - const nearest = Math.max(0, Math.min(inputSamples - 1, Math.round(srcPos))); - return input.readInt16LE(nearest * 2); - } - - return weighted / weightSum; -} - -/** - * Resample 16-bit PCM (little-endian mono) to 8kHz using a windowed low-pass kernel. - */ -export function resamplePcmTo8k(input: Buffer, inputSampleRate: number): Buffer { - if (inputSampleRate === TELEPHONY_SAMPLE_RATE) { - return input; - } - const inputSamples = Math.floor(input.length / 2); - if (inputSamples === 0) { - return Buffer.alloc(0); - } - - const ratio = inputSampleRate / TELEPHONY_SAMPLE_RATE; - const outputSamples = Math.floor(inputSamples / ratio); - const output = Buffer.alloc(outputSamples * 2); - const maxCutoff = 0.5; - const downsampleCutoff = ratio > 1 ? maxCutoff / ratio : maxCutoff; - const cutoffCyclesPerSample = Math.max(0.01, downsampleCutoff * RESAMPLE_CUTOFF_GUARD); - - for (let i = 0; i < outputSamples; i++) { - const srcPos = i * ratio; - const sample = Math.round( - sampleBandlimited(input, inputSamples, srcPos, cutoffCyclesPerSample), - ); - output.writeInt16LE(clamp16(sample), i * 2); - } - - return output; -} - -/** - * Convert 16-bit PCM to 8-bit mu-law (G.711). - */ -export function pcmToMulaw(pcm: Buffer): Buffer { - const samples = Math.floor(pcm.length / 2); - const mulaw = Buffer.alloc(samples); - - for (let i = 0; i < samples; i++) { - const sample = pcm.readInt16LE(i * 2); - mulaw[i] = linearToMulaw(sample); - } - - return mulaw; -} - -export function convertPcmToMulaw8k(pcm: Buffer, inputSampleRate: number): Buffer { - const pcm8k = resamplePcmTo8k(pcm, inputSampleRate); - return pcmToMulaw(pcm8k); -} +export { + convertPcmToMulaw8k, + pcmToMulaw, + resamplePcmTo8k, +} from "openclaw/plugin-sdk/realtime-voice"; /** * Chunk audio buffer into 20ms frames for streaming (8kHz mono mu-law). @@ -111,25 +14,3 @@ export function chunkAudio(audio: Buffer, chunkSize = 160): Generator CLIP) { - sample = CLIP; - } - - sample += BIAS; - let exponent = 7; - for (let expMask = 0x4000; (sample & expMask) === 0 && exponent > 0; exponent--) { - expMask >>= 1; - } - - const mantissa = (sample >> (exponent + 3)) & 0x0f; - return ~(sign | (exponent << 4) | mantissa) & 0xff; -} diff --git a/extensions/whatsapp/src/session.test.ts b/extensions/whatsapp/src/session.test.ts index b24bf1e87c3..be3d70dfc41 100644 --- a/extensions/whatsapp/src/session.test.ts +++ b/extensions/whatsapp/src/session.test.ts @@ -341,10 +341,13 @@ describe("web session", () => { sock.ev.emit("creds.update", {}); sock.ev.emit("creds.update", {}); - await flushCredsUpdate(); - expect(inFlight).toBe(1); - - (release as (() => void) | null)?.(); + try { + await vi.waitFor(() => { + expect(inFlight).toBe(1); + }); + } finally { + (release as (() => void) | null)?.(); + } await waitForCredsSaveQueue(authDir); @@ -389,13 +392,16 @@ describe("web session", () => { onError, ); - await flushCredsUpdate(); + try { + await vi.waitFor(() => { + expect(inFlightA).toBe(1); + expect(inFlightB).toBe(1); + }); + } finally { + (releaseA as (() => void) | null)?.(); + (releaseB as (() => void) | null)?.(); + } - expect(inFlightA).toBe(1); - expect(inFlightB).toBe(1); - - (releaseA as (() => void) | null)?.(); - (releaseB as (() => void) | null)?.(); await Promise.all([waitForCredsSaveQueue(authDirA), waitForCredsSaveQueue(authDirB)]); expect(inFlightA).toBe(0); diff --git a/extensions/zalo/src/monitor.lifecycle.test.ts b/extensions/zalo/src/monitor.lifecycle.test.ts index 7d17b9d9cc8..f526e5e81fd 100644 --- a/extensions/zalo/src/monitor.lifecycle.test.ts +++ b/extensions/zalo/src/monitor.lifecycle.test.ts @@ -152,7 +152,7 @@ describe("monitorZaloProvider lifecycle", () => { abort.abort(); - await vi.waitFor(() => expect(deleteWebhookMock).toHaveBeenCalledTimes(1)); + await vi.waitFor(() => expect(deleteWebhookMock).toHaveBeenCalledTimes(1), { timeout: 5000 }); expect(deleteWebhookMock).toHaveBeenCalledWith("test-token", undefined, 5000); expect(settled).toBe(false); expect(registry.httpRoutes).toHaveLength(2); diff --git a/src/plugin-sdk/realtime-voice.ts b/src/plugin-sdk/realtime-voice.ts index 07aedef6299..45ef379b472 100644 --- a/src/plugin-sdk/realtime-voice.ts +++ b/src/plugin-sdk/realtime-voice.ts @@ -36,3 +36,10 @@ export { type RealtimeVoiceBridgeSessionParams, type RealtimeVoiceMarkStrategy, } from "../realtime-voice/session-runtime.js"; +export { + convertPcmToMulaw8k, + mulawToPcm, + pcmToMulaw, + resamplePcm, + resamplePcmTo8k, +} from "../realtime-voice/audio-codec.js"; diff --git a/src/realtime-voice/audio-codec.ts b/src/realtime-voice/audio-codec.ts new file mode 100644 index 00000000000..9151cecc9b6 --- /dev/null +++ b/src/realtime-voice/audio-codec.ts @@ -0,0 +1,138 @@ +const TELEPHONY_SAMPLE_RATE = 8000; +const RESAMPLE_FILTER_TAPS = 31; +const RESAMPLE_CUTOFF_GUARD = 0.94; + +function clamp16(value: number): number { + return Math.max(-32768, Math.min(32767, value)); +} + +function sinc(x: number): number { + if (x === 0) { + return 1; + } + return Math.sin(Math.PI * x) / (Math.PI * x); +} + +function sampleBandlimited( + input: Buffer, + inputSamples: number, + srcPos: number, + cutoffCyclesPerSample: number, +): number { + const half = Math.floor(RESAMPLE_FILTER_TAPS / 2); + const center = Math.floor(srcPos); + let weighted = 0; + let weightSum = 0; + + for (let tap = -half; tap <= half; tap += 1) { + const sampleIndex = center + tap; + if (sampleIndex < 0 || sampleIndex >= inputSamples) { + continue; + } + + const distance = sampleIndex - srcPos; + const lowPass = 2 * cutoffCyclesPerSample * sinc(2 * cutoffCyclesPerSample * distance); + const tapIndex = tap + half; + const window = 0.5 - 0.5 * Math.cos((2 * Math.PI * tapIndex) / (RESAMPLE_FILTER_TAPS - 1)); + const coeff = lowPass * window; + weighted += input.readInt16LE(sampleIndex * 2) * coeff; + weightSum += coeff; + } + + if (weightSum === 0) { + const nearest = Math.max(0, Math.min(inputSamples - 1, Math.round(srcPos))); + return input.readInt16LE(nearest * 2); + } + + return weighted / weightSum; +} + +export function resamplePcm( + input: Buffer, + inputSampleRate: number, + outputSampleRate: number, +): Buffer { + if (inputSampleRate === outputSampleRate) { + return input; + } + const inputSamples = Math.floor(input.length / 2); + if (inputSamples === 0) { + return Buffer.alloc(0); + } + + const ratio = inputSampleRate / outputSampleRate; + const outputSamples = Math.floor(inputSamples / ratio); + const output = Buffer.alloc(outputSamples * 2); + const maxCutoff = 0.5; + const downsampleCutoff = ratio > 1 ? maxCutoff / ratio : maxCutoff; + const cutoffCyclesPerSample = Math.max(0.01, downsampleCutoff * RESAMPLE_CUTOFF_GUARD); + + for (let i = 0; i < outputSamples; i += 1) { + const sample = Math.round( + sampleBandlimited(input, inputSamples, i * ratio, cutoffCyclesPerSample), + ); + output.writeInt16LE(clamp16(sample), i * 2); + } + + return output; +} + +export function resamplePcmTo8k(input: Buffer, inputSampleRate: number): Buffer { + return resamplePcm(input, inputSampleRate, TELEPHONY_SAMPLE_RATE); +} + +export function pcmToMulaw(pcm: Buffer): Buffer { + const samples = Math.floor(pcm.length / 2); + const mulaw = Buffer.alloc(samples); + + for (let i = 0; i < samples; i += 1) { + const sample = pcm.readInt16LE(i * 2); + mulaw[i] = linearToMulaw(sample); + } + + return mulaw; +} + +export function mulawToPcm(mulaw: Buffer): Buffer { + const pcm = Buffer.alloc(mulaw.length * 2); + for (let i = 0; i < mulaw.length; i += 1) { + pcm.writeInt16LE(clamp16(mulawToLinear(mulaw[i] ?? 0)), i * 2); + } + return pcm; +} + +export function convertPcmToMulaw8k(pcm: Buffer, inputSampleRate: number): Buffer { + return pcmToMulaw(resamplePcmTo8k(pcm, inputSampleRate)); +} + +function linearToMulaw(sample: number): number { + const BIAS = 132; + const CLIP = 32635; + + const sign = sample < 0 ? 0x80 : 0; + if (sample < 0) { + sample = -sample; + } + if (sample > CLIP) { + sample = CLIP; + } + + sample += BIAS; + let exponent = 7; + for (let expMask = 0x4000; (sample & expMask) === 0 && exponent > 0; exponent -= 1) { + expMask >>= 1; + } + + const mantissa = (sample >> (exponent + 3)) & 0x0f; + return ~(sign | (exponent << 4) | mantissa) & 0xff; +} + +function mulawToLinear(value: number): number { + const muLaw = ~value & 0xff; + const sign = muLaw & 0x80; + const exponent = (muLaw >> 4) & 0x07; + const mantissa = muLaw & 0x0f; + let sample = ((mantissa << 3) + 132) << exponent; + sample -= 132; + return sign ? -sample : sample; +} diff --git a/test/helpers/plugins/plugin-registration-contract-cases.ts b/test/helpers/plugins/plugin-registration-contract-cases.ts index 2c227cf8fbf..b2bdcc325f9 100644 --- a/test/helpers/plugins/plugin-registration-contract-cases.ts +++ b/test/helpers/plugins/plugin-registration-contract-cases.ts @@ -55,6 +55,7 @@ export const pluginRegistrationContractCases = { pluginId: "google", providerIds: ["google", "google-gemini-cli"], webSearchProviderIds: ["gemini"], + realtimeVoiceProviderIds: ["google"], speechProviderIds: ["google"], mediaUnderstandingProviderIds: ["google"], imageGenerationProviderIds: ["google"],