mirror of
https://fastgit.cc/github.com/openclaw/openclaw
synced 2026-04-30 14:02:56 +08:00
feat(google): add realtime voice provider
This commit is contained in:
@@ -29,6 +29,7 @@ Docs: https://docs.openclaw.ai
|
||||
- Plugins/Google Meet: default Chrome realtime sessions to OpenAI plus SoX `rec`/`play` audio bridge commands, so the usual setup only needs the plugin enabled and `OPENAI_API_KEY`.
|
||||
- Plugins/Google Meet: add a `chrome-node` transport so a paired macOS node, such as a Parallels VM, can own Chrome, BlackHole, and SoX while the Gateway machine keeps the agent and model key.
|
||||
- Plugins/Bonjour: move LAN Gateway discovery advertising into a default-enabled bundled plugin with its own `@homebridge/ciao` dependency, so users can disable Bonjour without cutting wide-area discovery. Thanks @vincentkoc.
|
||||
- Providers/Google: add a Gemini Live realtime voice provider for backend Voice Call and Google Meet audio bridges, with bidirectional audio and function-call support.
|
||||
- Providers/OpenAI: add image generation and reference-image editing through Codex OAuth, so `openai/gpt-image-2` works without an `OPENAI_API_KEY`. Fixes #70703.
|
||||
- Providers/OpenRouter: add image generation and reference-image editing through `image_generate`, so OpenRouter image models work with `OPENROUTER_API_KEY`. Fixes #55066 via #67668. Thanks @notamicrodose.
|
||||
- Image generation: let agents request provider-supported quality and output format hints, and pass OpenAI-specific background, moderation, compression, and user hints through the `image_generate` tool. (#70503) Thanks @ottodeng.
|
||||
|
||||
@@ -132,6 +132,7 @@ Choose your preferred auth method and follow the setup steps.
|
||||
| Image generation | Yes |
|
||||
| Music generation | Yes |
|
||||
| Text-to-speech | Yes |
|
||||
| Realtime voice | Yes (Google Live API) |
|
||||
| Image understanding | Yes |
|
||||
| Audio transcription | Yes |
|
||||
| Video understanding | Yes |
|
||||
@@ -281,6 +282,63 @@ A Google Cloud Console API key restricted to the Gemini API is valid for this
|
||||
provider. This is not the separate Cloud Text-to-Speech API path.
|
||||
</Note>
|
||||
|
||||
## Realtime voice
|
||||
|
||||
The bundled `google` plugin registers a realtime voice provider backed by the
|
||||
Gemini Live API for backend audio bridges such as Voice Call and Google Meet.
|
||||
|
||||
| Setting | Config path | Default |
|
||||
| --------------------- | ------------------------------------------------------------------- | ------------------------------------------------------------------------------------- |
|
||||
| Model | `plugins.entries.voice-call.config.realtime.providers.google.model` | `gemini-2.5-flash-native-audio-preview-12-2025` |
|
||||
| Voice | `...google.voice` | `Kore` |
|
||||
| Temperature | `...google.temperature` | (unset) |
|
||||
| VAD start sensitivity | `...google.startSensitivity` | (unset) |
|
||||
| VAD end sensitivity | `...google.endSensitivity` | (unset) |
|
||||
| Silence duration | `...google.silenceDurationMs` | (unset) |
|
||||
| API key | `...google.apiKey` | Falls back to `models.providers.google.apiKey`, `GEMINI_API_KEY`, or `GOOGLE_API_KEY` |
|
||||
|
||||
Example Voice Call realtime config:
|
||||
|
||||
```json5
|
||||
{
|
||||
plugins: {
|
||||
entries: {
|
||||
"voice-call": {
|
||||
enabled: true,
|
||||
config: {
|
||||
realtime: {
|
||||
enabled: true,
|
||||
provider: "google",
|
||||
providers: {
|
||||
google: {
|
||||
model: "gemini-2.5-flash-native-audio-preview-12-2025",
|
||||
voice: "Kore",
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
}
|
||||
```
|
||||
|
||||
<Note>
|
||||
Google Live API uses bidirectional audio and function calling over a WebSocket.
|
||||
OpenClaw adapts telephony/Meet bridge audio to Gemini's PCM Live API stream and
|
||||
keeps tool calls on the shared realtime voice contract. Leave `temperature`
|
||||
unset unless you need sampling changes; OpenClaw omits non-positive values
|
||||
because Google Live can return transcripts without audio for `temperature: 0`.
|
||||
Gemini API transcription is enabled without `languageCodes`; the current Google
|
||||
SDK rejects language-code hints on this API path.
|
||||
</Note>
|
||||
|
||||
<Note>
|
||||
Control UI Talk browser sessions still require a realtime voice provider with a
|
||||
browser WebRTC session implementation. Today that path is OpenAI Realtime; the
|
||||
Google provider is for backend realtime bridges.
|
||||
</Note>
|
||||
|
||||
## Advanced configuration
|
||||
|
||||
<AccordionGroup>
|
||||
|
||||
@@ -156,12 +156,14 @@ Cron jobs panel notes:
|
||||
- `chat.history` also strips display-only inline directive tags from visible assistant text (for example `[[reply_to_*]]` and `[[audio_as_voice]]`), plain-text tool-call XML payloads (including `<tool_call>...</tool_call>`, `<function_call>...</function_call>`, `<tool_calls>...</tool_calls>`, `<function_calls>...</function_calls>`, and truncated tool-call blocks), and leaked ASCII/full-width model control tokens, and omits assistant entries whose whole visible text is only the exact silent token `NO_REPLY` / `no_reply`.
|
||||
- `chat.inject` appends an assistant note to the session transcript and broadcasts a `chat` event for UI-only updates (no agent run, no channel delivery).
|
||||
- The chat header model and thinking pickers patch the active session immediately through `sessions.patch`; they are persistent session overrides, not one-turn-only send options.
|
||||
- Talk mode uses the registered realtime voice provider. Configure OpenAI with
|
||||
`talk.provider: "openai"` plus `talk.providers.openai.apiKey`, or reuse the
|
||||
Voice Call realtime provider config. The browser never receives the standard
|
||||
OpenAI API key; it receives only the ephemeral Realtime client secret. The
|
||||
Realtime session prompt is assembled by the Gateway; `talk.realtime.session`
|
||||
does not accept caller-provided instruction overrides.
|
||||
- Talk mode uses a registered realtime voice provider that supports browser
|
||||
WebRTC sessions. Configure OpenAI with `talk.provider: "openai"` plus
|
||||
`talk.providers.openai.apiKey`, or reuse the Voice Call realtime provider
|
||||
config. The browser never receives the standard OpenAI API key; it receives
|
||||
only the ephemeral Realtime client secret. Google Live realtime voice is
|
||||
supported for backend Voice Call and Google Meet bridges, but not this browser
|
||||
WebRTC path yet. The Realtime session prompt is assembled by the Gateway;
|
||||
`talk.realtime.session` does not accept caller-provided instruction overrides.
|
||||
- In the Chat composer, the Talk control is the waves button next to the
|
||||
microphone dictation button. When Talk starts, the composer status row shows
|
||||
`Connecting Talk...`, then `Talk live` while audio is connected, or
|
||||
|
||||
@@ -11,6 +11,7 @@ import {
|
||||
} from "./generation-provider-metadata.js";
|
||||
import { geminiMemoryEmbeddingProviderAdapter } from "./memory-embedding-adapter.js";
|
||||
import { registerGoogleProvider } from "./provider-registration.js";
|
||||
import { buildGoogleRealtimeVoiceProvider } from "./realtime-voice-provider.js";
|
||||
import { buildGoogleSpeechProvider } from "./speech-provider.js";
|
||||
import { createGeminiWebSearchProvider } from "./src/gemini-web-search-provider.js";
|
||||
|
||||
@@ -156,6 +157,7 @@ export default definePluginEntry({
|
||||
api.registerImageGenerationProvider(createLazyGoogleImageGenerationProvider());
|
||||
api.registerMediaUnderstandingProvider(createLazyGoogleMediaUnderstandingProvider());
|
||||
api.registerMusicGenerationProvider(createLazyGoogleMusicGenerationProvider());
|
||||
api.registerRealtimeVoiceProvider(buildGoogleRealtimeVoiceProvider());
|
||||
api.registerSpeechProvider(buildGoogleSpeechProvider());
|
||||
api.registerVideoGenerationProvider(createLazyGoogleVideoGenerationProvider());
|
||||
api.registerWebSearchProvider(createGeminiWebSearchProvider());
|
||||
|
||||
@@ -49,6 +49,7 @@
|
||||
"memoryEmbeddingProviders": ["gemini"],
|
||||
"imageGenerationProviders": ["google"],
|
||||
"musicGenerationProviders": ["google"],
|
||||
"realtimeVoiceProviders": ["google"],
|
||||
"speechProviders": ["google"],
|
||||
"videoGenerationProviders": ["google"],
|
||||
"webSearchProviders": ["gemini"]
|
||||
|
||||
354
extensions/google/realtime-voice-provider.test.ts
Normal file
354
extensions/google/realtime-voice-provider.test.ts
Normal file
@@ -0,0 +1,354 @@
|
||||
import { beforeEach, describe, expect, it, vi } from "vitest";
|
||||
import { buildGoogleRealtimeVoiceProvider } from "./realtime-voice-provider.js";
|
||||
|
||||
// Shape of the mocked Live session object returned by the mocked connect().
type MockGoogleLiveSession = {
  close: ReturnType<typeof vi.fn>;
  sendClientContent: ReturnType<typeof vi.fn>;
  sendRealtimeInput: ReturnType<typeof vi.fn>;
  sendToolResponse: ReturnType<typeof vi.fn>;
};

// Parameters the provider passes to ai.live.connect(), captured for assertions.
type MockGoogleLiveConnectParams = {
  model: string;
  config: Record<string, unknown>;
  callbacks: {
    onopen: () => void;
    onmessage: (message: Record<string, unknown>) => void;
    onerror: (event: { error?: unknown; message?: string }) => void;
    onclose: () => void;
  };
};

// vi.hoisted so these mocks exist before the hoisted vi.mock factory below runs.
const { connectMock, session } = vi.hoisted(() => {
  const session: MockGoogleLiveSession = {
    close: vi.fn(),
    sendClientContent: vi.fn(),
    sendRealtimeInput: vi.fn(),
    sendToolResponse: vi.fn(),
  };
  const connectMock = vi.fn(async (_params: MockGoogleLiveConnectParams) => session);
  return { connectMock, session };
});

// Stub the GenAI runtime so live.connect is observable and never hits the network.
vi.mock("./google-genai-runtime.js", () => ({
  createGoogleGenAI: vi.fn(() => ({
    live: {
      connect: connectMock,
    },
  })),
}));
|
||||
|
||||
function lastConnectParams(): MockGoogleLiveConnectParams {
|
||||
const params = connectMock.mock.calls.at(-1)?.[0];
|
||||
if (!params) {
|
||||
throw new Error("expected google live connect call");
|
||||
}
|
||||
return params;
|
||||
}
|
||||
|
||||
describe("buildGoogleRealtimeVoiceProvider", () => {
  beforeEach(() => {
    // Reset shared mocks and env-var fallbacks so each test starts clean.
    connectMock.mockClear();
    session.close.mockClear();
    session.sendClientContent.mockClear();
    session.sendRealtimeInput.mockClear();
    session.sendToolResponse.mockClear();
    delete process.env.GEMINI_API_KEY;
    delete process.env.GOOGLE_API_KEY;
  });

  // resolveConfig reads the nested providers.google record and falls back to
  // the model provider API key from the main config when none is set locally.
  it("normalizes provider config and cfg model-provider key fallback", () => {
    const provider = buildGoogleRealtimeVoiceProvider();
    const resolved = provider.resolveConfig?.({
      cfg: {
        models: {
          providers: {
            google: {
              apiKey: "cfg-key",
            },
          },
        },
      } as never,
      rawConfig: {
        providers: {
          google: {
            model: "gemini-live-2.5-flash-preview",
            voice: "Puck",
            temperature: 0.4,
            silenceDurationMs: 700,
            startSensitivity: "high",
          },
        },
      },
    });

    expect(resolved).toEqual({
      apiKey: "cfg-key",
      model: "gemini-live-2.5-flash-preview",
      voice: "Puck",
      temperature: 0.4,
      apiVersion: undefined,
      prefixPaddingMs: undefined,
      silenceDurationMs: 700,
      startSensitivity: "high",
      endSensitivity: undefined,
      enableAffectiveDialog: undefined,
      thinkingLevel: undefined,
      thinkingBudget: undefined,
    });
  });

  // The bridge forwards model, voice, temperature, system instruction, and
  // function declarations into the Live API setup config.
  it("connects with Google Live setup config and tool declarations", async () => {
    const provider = buildGoogleRealtimeVoiceProvider();
    const bridge = provider.createBridge({
      providerConfig: {
        apiKey: "gemini-key",
        model: "gemini-live-2.5-flash-preview",
        voice: "Kore",
        temperature: 0.3,
        startSensitivity: "low",
      },
      instructions: "Speak briefly.",
      tools: [
        {
          type: "function",
          name: "lookup",
          description: "Look something up",
          parameters: {
            type: "object",
            properties: {
              query: { type: "string" },
            },
            required: ["query"],
          },
        },
      ],
      onAudio: vi.fn(),
      onClearAudio: vi.fn(),
    });

    await bridge.connect();

    expect(connectMock).toHaveBeenCalledTimes(1);
    expect(lastConnectParams()).toMatchObject({
      model: "gemini-live-2.5-flash-preview",
      config: {
        responseModalities: ["AUDIO"],
        temperature: 0.3,
        systemInstruction: "Speak briefly.",
        speechConfig: {
          voiceConfig: {
            prebuiltVoiceConfig: {
              voiceName: "Kore",
            },
          },
        },
        outputAudioTranscription: {},
        tools: [
          {
            functionDeclarations: [
              {
                name: "lookup",
                description: "Look something up",
                parametersJsonSchema: {
                  type: "object",
                  properties: {
                    query: { type: "string" },
                  },
                  required: ["query"],
                },
              },
            ],
          },
        ],
      },
    });
  });

  // temperature: 0 must be omitted — Google Live can return transcripts
  // without audio when a zero temperature is sent.
  it("omits zero temperature for native audio responses", async () => {
    const provider = buildGoogleRealtimeVoiceProvider();
    const bridge = provider.createBridge({
      providerConfig: {
        apiKey: "gemini-key",
        temperature: 0,
      },
      onAudio: vi.fn(),
      onClearAudio: vi.fn(),
    });

    await bridge.connect();

    expect(lastConnectParams().config).not.toHaveProperty("temperature");
  });

  // Audio sent before the server's setupComplete must be buffered, and
  // onReady must not fire until setup is acknowledged.
  it("waits for setup completion before draining audio and firing ready", async () => {
    const provider = buildGoogleRealtimeVoiceProvider();
    const onReady = vi.fn();
    const bridge = provider.createBridge({
      providerConfig: { apiKey: "gemini-key" },
      onAudio: vi.fn(),
      onClearAudio: vi.fn(),
      onReady,
    });

    await bridge.connect();
    lastConnectParams().callbacks.onopen();
    bridge.sendAudio(Buffer.from([0xff, 0xff]));

    expect(session.sendRealtimeInput).not.toHaveBeenCalled();
    expect(onReady).not.toHaveBeenCalled();

    lastConnectParams().callbacks.onmessage({ setupComplete: { sessionId: "session-1" } });

    expect(onReady).toHaveBeenCalledTimes(1);
    expect(session.sendRealtimeInput).toHaveBeenCalledTimes(1);
    expect(session.sendRealtimeInput.mock.calls[0]?.[0].audio).toMatchObject({
      data: expect.any(String),
      mimeType: "audio/pcm;rate=16000",
    });
  });

  // Sustained all-0xff mu-law silence should trigger a single audioStreamEnd,
  // suppress further silence sends, and re-arm once speech resumes.
  it("marks the Google audio stream complete after sustained telephony silence", async () => {
    const provider = buildGoogleRealtimeVoiceProvider();
    const bridge = provider.createBridge({
      providerConfig: { apiKey: "gemini-key", silenceDurationMs: 60 },
      onAudio: vi.fn(),
      onClearAudio: vi.fn(),
    });

    await bridge.connect();
    lastConnectParams().callbacks.onopen();
    lastConnectParams().callbacks.onmessage({ setupComplete: { sessionId: "session-1" } });

    // 160 bytes of mu-law at 8 kHz = 20 ms per chunk; three chunks pass 60 ms.
    const silence20ms = Buffer.alloc(160, 0xff);
    bridge.sendAudio(silence20ms);
    bridge.sendAudio(silence20ms);
    bridge.sendAudio(silence20ms);

    expect(session.sendRealtimeInput).toHaveBeenCalledWith({ audioStreamEnd: true });

    // Further silence after the stream ended must not produce more sends.
    const callsAfterStreamEnd = session.sendRealtimeInput.mock.calls.length;
    bridge.sendAudio(silence20ms);
    expect(session.sendRealtimeInput).toHaveBeenCalledTimes(callsAfterStreamEnd);

    // Non-silent audio re-opens the stream; silence then ends it again.
    session.sendRealtimeInput.mockClear();
    bridge.sendAudio(Buffer.alloc(160, 0x7f));
    bridge.sendAudio(silence20ms);
    bridge.sendAudio(silence20ms);
    bridge.sendAudio(silence20ms);

    expect(session.sendRealtimeInput).toHaveBeenCalledWith({ audioStreamEnd: true });
  });

  // Text prompts are trimmed and delivered as a complete user turn.
  it("sends text prompts as ordered client turns", async () => {
    const provider = buildGoogleRealtimeVoiceProvider();
    const bridge = provider.createBridge({
      providerConfig: { apiKey: "gemini-key" },
      onAudio: vi.fn(),
      onClearAudio: vi.fn(),
    });

    await bridge.connect();
    lastConnectParams().callbacks.onopen();
    lastConnectParams().callbacks.onmessage({ setupComplete: { sessionId: "session-1" } });

    bridge.sendUserMessage?.(" Say hello. ");

    expect(session.sendClientContent).toHaveBeenCalledWith({
      turns: [{ role: "user", parts: [{ text: "Say hello." }] }],
      turnComplete: true,
    });
  });

  // 480 bytes of 24 kHz 16-bit PCM = 240 samples -> 80 samples at 8 kHz,
  // i.e. 80 mu-law bytes delivered to onAudio.
  it("converts Google PCM output to mu-law audio", async () => {
    const provider = buildGoogleRealtimeVoiceProvider();
    const onAudio = vi.fn();
    const bridge = provider.createBridge({
      providerConfig: { apiKey: "gemini-key" },
      onAudio,
      onClearAudio: vi.fn(),
    });
    const pcm24k = Buffer.alloc(480);

    await bridge.connect();
    lastConnectParams().callbacks.onmessage({
      setupComplete: { sessionId: "session-1" },
      serverContent: {
        modelTurn: {
          parts: [
            {
              inlineData: {
                mimeType: "audio/L16;codec=pcm;rate=24000",
                data: pcm24k.toString("base64"),
              },
            },
          ],
        },
      },
    });

    expect(onAudio).toHaveBeenCalledTimes(1);
    expect(onAudio.mock.calls[0]?.[0]).toBeInstanceOf(Buffer);
    expect(onAudio.mock.calls[0]?.[0]).toHaveLength(80);
  });

  // Parts flagged thought: true are model reasoning, not user-visible output.
  it("does not forward Google thought text as assistant transcript", async () => {
    const provider = buildGoogleRealtimeVoiceProvider();
    const onTranscript = vi.fn();
    const bridge = provider.createBridge({
      providerConfig: { apiKey: "gemini-key" },
      onAudio: vi.fn(),
      onClearAudio: vi.fn(),
      onTranscript,
    });

    await bridge.connect();
    lastConnectParams().callbacks.onmessage({
      setupComplete: {},
      serverContent: {
        modelTurn: {
          parts: [{ text: "internal reasoning", thought: true }],
        },
      },
    });

    expect(onTranscript).not.toHaveBeenCalled();
  });

  // Tool calls surface through onToolCall and tool results flow back as
  // functionResponses keyed by the original call id.
  it("forwards Live API tool calls and submits matching function responses", async () => {
    const provider = buildGoogleRealtimeVoiceProvider();
    const onToolCall = vi.fn();
    const bridge = provider.createBridge({
      providerConfig: { apiKey: "gemini-key" },
      onAudio: vi.fn(),
      onClearAudio: vi.fn(),
      onToolCall,
    });

    await bridge.connect();
    lastConnectParams().callbacks.onmessage({
      setupComplete: { sessionId: "session-1" },
      toolCall: {
        functionCalls: [{ id: "call-1", name: "lookup", args: { query: "hi" } }],
      },
    });

    expect(onToolCall).toHaveBeenCalledWith({
      itemId: "call-1",
      callId: "call-1",
      name: "lookup",
      args: { query: "hi" },
    });

    bridge.submitToolResult("call-1", { result: "ok" });

    expect(session.sendToolResponse).toHaveBeenCalledWith({
      functionResponses: [
        {
          id: "call-1",
          response: { result: "ok" },
        },
      ],
    });
  });
});
|
||||
535
extensions/google/realtime-voice-provider.ts
Normal file
535
extensions/google/realtime-voice-provider.ts
Normal file
@@ -0,0 +1,535 @@
|
||||
import { randomUUID } from "node:crypto";
|
||||
import {
|
||||
EndSensitivity,
|
||||
Modality,
|
||||
StartSensitivity,
|
||||
type FunctionDeclaration,
|
||||
type FunctionResponse,
|
||||
type LiveServerContent,
|
||||
type LiveServerMessage,
|
||||
type LiveServerToolCall,
|
||||
type RealtimeInputConfig,
|
||||
type ThinkingConfig,
|
||||
} from "@google/genai";
|
||||
import type { OpenClawConfig } from "openclaw/plugin-sdk/provider-onboard";
|
||||
import type {
|
||||
RealtimeVoiceBridge,
|
||||
RealtimeVoiceBridgeCreateRequest,
|
||||
RealtimeVoiceProviderConfig,
|
||||
RealtimeVoiceProviderPlugin,
|
||||
RealtimeVoiceTool,
|
||||
} from "openclaw/plugin-sdk/realtime-voice";
|
||||
import { convertPcmToMulaw8k, mulawToPcm, resamplePcm } from "openclaw/plugin-sdk/realtime-voice";
|
||||
import { normalizeResolvedSecretInputString } from "openclaw/plugin-sdk/secret-input";
|
||||
import { normalizeOptionalString } from "openclaw/plugin-sdk/text-runtime";
|
||||
import { createGoogleGenAI } from "./google-genai-runtime.js";
|
||||
|
||||
// Defaults for the Gemini Live realtime voice integration.
const GOOGLE_REALTIME_DEFAULT_MODEL = "gemini-2.5-flash-native-audio-preview-12-2025";
const GOOGLE_REALTIME_DEFAULT_VOICE = "Kore";
// API version passed to the SDK's httpOptions when none is configured.
const GOOGLE_REALTIME_DEFAULT_API_VERSION = "v1beta";
// Input is upsampled to 16 kHz PCM; telephony bridges deliver 8 kHz mu-law.
const GOOGLE_REALTIME_INPUT_SAMPLE_RATE = 16_000;
const TELEPHONY_SAMPLE_RATE = 8000;
// Bound on audio chunks buffered while the session is still being set up.
const MAX_PENDING_AUDIO_CHUNKS = 320;
// Sustained silence before audioStreamEnd is sent when silenceDurationMs is unset.
const DEFAULT_AUDIO_STREAM_END_SILENCE_MS = 700;
|
||||
|
||||
// Normalized VAD sensitivity values accepted from user config.
type GoogleRealtimeSensitivity = "low" | "high";
// Normalized thinking-level values accepted from user config.
type GoogleRealtimeThinkingLevel = "minimal" | "low" | "medium" | "high";

// User-facing provider settings after normalization (all optional; defaults
// are applied when the bridge connects).
type GoogleRealtimeVoiceProviderConfig = {
  apiKey?: string;
  model?: string;
  voice?: string;
  temperature?: number;
  apiVersion?: string;
  prefixPaddingMs?: number;
  silenceDurationMs?: number;
  startSensitivity?: GoogleRealtimeSensitivity;
  endSensitivity?: GoogleRealtimeSensitivity;
  enableAffectiveDialog?: boolean;
  thinkingLevel?: GoogleRealtimeThinkingLevel;
  thinkingBudget?: number;
};

// Bridge construction input: the generic bridge create request plus the
// resolved Google settings. apiKey is required by the time a bridge is built.
type GoogleRealtimeVoiceBridgeConfig = RealtimeVoiceBridgeCreateRequest & {
  apiKey: string;
  model?: string;
  voice?: string;
  temperature?: number;
  apiVersion?: string;
  prefixPaddingMs?: number;
  silenceDurationMs?: number;
  startSensitivity?: GoogleRealtimeSensitivity;
  endSensitivity?: GoogleRealtimeSensitivity;
  enableAffectiveDialog?: boolean;
  thinkingLevel?: GoogleRealtimeThinkingLevel;
  thinkingBudget?: number;
};

// Minimal structural view of the SDK live session used by this bridge,
// so the rest of the file does not depend on the SDK's session class.
type GoogleLiveSession = {
  sendClientContent: (params: {
    turns?: Array<{ role: string; parts: Array<{ text: string }> }>;
    turnComplete?: boolean;
  }) => void;
  sendRealtimeInput: (params: {
    audio?: { data: string; mimeType: string };
    audioStreamEnd?: boolean;
  }) => void;
  sendToolResponse: (params: { functionResponses: FunctionResponse[] | FunctionResponse }) => void;
  close: () => void;
};
|
||||
|
||||
// Local alias for the SDK helper; the name documents intent at call sites
// (presumably trims and maps blank/non-string input to undefined — see
// normalizeOptionalString in the plugin SDK).
function trimToUndefined(value: unknown): string | undefined {
  return normalizeOptionalString(value);
}
|
||||
|
||||
function asFiniteNumber(value: unknown): number | undefined {
|
||||
return typeof value === "number" && Number.isFinite(value) ? value : undefined;
|
||||
}
|
||||
|
||||
function asBoolean(value: unknown): boolean | undefined {
|
||||
return typeof value === "boolean" ? value : undefined;
|
||||
}
|
||||
|
||||
function asSensitivity(value: unknown): GoogleRealtimeSensitivity | undefined {
|
||||
const normalized = normalizeOptionalString(value)?.toLowerCase();
|
||||
return normalized === "low" || normalized === "high" ? normalized : undefined;
|
||||
}
|
||||
|
||||
function asThinkingLevel(value: unknown): GoogleRealtimeThinkingLevel | undefined {
|
||||
const normalized = normalizeOptionalString(value)?.toLowerCase();
|
||||
return normalized === "minimal" ||
|
||||
normalized === "low" ||
|
||||
normalized === "medium" ||
|
||||
normalized === "high"
|
||||
? normalized
|
||||
: undefined;
|
||||
}
|
||||
|
||||
function resolveGoogleRealtimeProviderConfigRecord(
|
||||
config: Record<string, unknown>,
|
||||
): Record<string, unknown> | undefined {
|
||||
const providers =
|
||||
typeof config.providers === "object" &&
|
||||
config.providers !== null &&
|
||||
!Array.isArray(config.providers)
|
||||
? (config.providers as Record<string, unknown>)
|
||||
: undefined;
|
||||
const nested = providers?.google;
|
||||
return typeof nested === "object" && nested !== null && !Array.isArray(nested)
|
||||
? (nested as Record<string, unknown>)
|
||||
: typeof config.google === "object" && config.google !== null && !Array.isArray(config.google)
|
||||
? (config.google as Record<string, unknown>)
|
||||
: config;
|
||||
}
|
||||
|
||||
/**
 * Normalize a raw realtime provider config into a typed Google config.
 *
 * The API key falls back to the model provider key at
 * `models.providers.google.apiKey` when the realtime config has none.
 * All other fields are coerced independently and left undefined when invalid.
 */
function normalizeProviderConfig(
  config: RealtimeVoiceProviderConfig,
  cfg?: OpenClawConfig,
): GoogleRealtimeVoiceProviderConfig {
  const raw = resolveGoogleRealtimeProviderConfigRecord(config);
  return {
    // NOTE(review): assumed to resolve secret-reference syntax and report
    // errors against the given config path — confirm in the secret-input SDK.
    apiKey: normalizeResolvedSecretInputString({
      value: raw?.apiKey ?? cfg?.models?.providers?.google?.apiKey,
      path: "plugins.entries.voice-call.config.realtime.providers.google.apiKey",
    }),
    model: trimToUndefined(raw?.model),
    voice: trimToUndefined(raw?.voice),
    temperature: asFiniteNumber(raw?.temperature),
    apiVersion: trimToUndefined(raw?.apiVersion),
    prefixPaddingMs: asFiniteNumber(raw?.prefixPaddingMs),
    silenceDurationMs: asFiniteNumber(raw?.silenceDurationMs),
    startSensitivity: asSensitivity(raw?.startSensitivity),
    endSensitivity: asSensitivity(raw?.endSensitivity),
    enableAffectiveDialog: asBoolean(raw?.enableAffectiveDialog),
    thinkingLevel: asThinkingLevel(raw?.thinkingLevel),
    thinkingBudget: asFiniteNumber(raw?.thinkingBudget),
  };
}
|
||||
|
||||
function resolveEnvApiKey(): string | undefined {
|
||||
return trimToUndefined(process.env.GEMINI_API_KEY) ?? trimToUndefined(process.env.GOOGLE_API_KEY);
|
||||
}
|
||||
|
||||
function mapStartSensitivity(
|
||||
value: GoogleRealtimeSensitivity | undefined,
|
||||
): StartSensitivity | undefined {
|
||||
switch (value) {
|
||||
case "high":
|
||||
return StartSensitivity.START_SENSITIVITY_HIGH;
|
||||
case "low":
|
||||
return StartSensitivity.START_SENSITIVITY_LOW;
|
||||
default:
|
||||
return undefined;
|
||||
}
|
||||
}
|
||||
|
||||
function mapEndSensitivity(
|
||||
value: GoogleRealtimeSensitivity | undefined,
|
||||
): EndSensitivity | undefined {
|
||||
switch (value) {
|
||||
case "high":
|
||||
return EndSensitivity.END_SENSITIVITY_HIGH;
|
||||
case "low":
|
||||
return EndSensitivity.END_SENSITIVITY_LOW;
|
||||
default:
|
||||
return undefined;
|
||||
}
|
||||
}
|
||||
|
||||
function buildThinkingConfig(config: GoogleRealtimeVoiceBridgeConfig): ThinkingConfig | undefined {
|
||||
if (config.thinkingLevel) {
|
||||
return { thinkingLevel: config.thinkingLevel.toUpperCase() as ThinkingConfig["thinkingLevel"] };
|
||||
}
|
||||
if (typeof config.thinkingBudget === "number") {
|
||||
return { thinkingBudget: config.thinkingBudget };
|
||||
}
|
||||
return undefined;
|
||||
}
|
||||
|
||||
function buildRealtimeInputConfig(
|
||||
config: GoogleRealtimeVoiceBridgeConfig,
|
||||
): RealtimeInputConfig | undefined {
|
||||
const startSensitivity = mapStartSensitivity(config.startSensitivity);
|
||||
const endSensitivity = mapEndSensitivity(config.endSensitivity);
|
||||
const automaticActivityDetection = {
|
||||
...(startSensitivity ? { startOfSpeechSensitivity: startSensitivity } : {}),
|
||||
...(endSensitivity ? { endOfSpeechSensitivity: endSensitivity } : {}),
|
||||
...(typeof config.prefixPaddingMs === "number"
|
||||
? { prefixPaddingMs: Math.max(0, Math.floor(config.prefixPaddingMs)) }
|
||||
: {}),
|
||||
...(typeof config.silenceDurationMs === "number"
|
||||
? { silenceDurationMs: Math.max(0, Math.floor(config.silenceDurationMs)) }
|
||||
: {}),
|
||||
};
|
||||
return Object.keys(automaticActivityDetection).length > 0
|
||||
? { automaticActivityDetection }
|
||||
: undefined;
|
||||
}
|
||||
|
||||
function buildFunctionDeclarations(tools: RealtimeVoiceTool[] | undefined): FunctionDeclaration[] {
|
||||
return (tools ?? []).map((tool) => ({
|
||||
name: tool.name,
|
||||
description: tool.description,
|
||||
parametersJsonSchema: tool.parameters,
|
||||
}));
|
||||
}
|
||||
|
||||
function parsePcmSampleRate(mimeType: string | undefined): number {
|
||||
const match = mimeType?.match(/(?:^|[;,\s])rate=(\d+)/i);
|
||||
const parsed = match ? Number.parseInt(match[1] ?? "", 10) : Number.NaN;
|
||||
return Number.isFinite(parsed) && parsed > 0 ? parsed : 24_000;
|
||||
}
|
||||
|
||||
function isMulawSilence(audio: Buffer): boolean {
|
||||
return audio.length > 0 && audio.every((sample) => sample === 0xff);
|
||||
}
|
||||
|
||||
/**
 * Realtime voice bridge backed by the Gemini Live API.
 * Adapts 8 kHz mu-law telephony audio to the Live API's 16 kHz PCM stream.
 */
class GoogleRealtimeVoiceBridge implements RealtimeVoiceBridge {
  private session: GoogleLiveSession | null = null;
  // True once the WebSocket reported open.
  private connected = false;
  // True once the server acknowledged setup — presumably flipped when the
  // setupComplete message arrives in handleMessage (defined past this view).
  private sessionConfigured = false;
  // Distinguishes deliberate close() from unexpected disconnects in onclose.
  private intentionallyClosed = false;
  // Audio received before setup completes, drained later (bounded buffer).
  private pendingAudio: Buffer[] = [];
  // Guards onReady so it fires at most once per connection.
  private sessionReadyFired = false;
  // Running total of uninterrupted silence, in milliseconds.
  private consecutiveSilenceMs = 0;
  // True after audioStreamEnd was sent; reset when speech resumes.
  private audioStreamEnded = false;

  constructor(private readonly config: GoogleRealtimeVoiceBridgeConfig) {}
|
||||
|
||||
/** Open a Gemini Live session configured for audio-only responses. */
async connect(): Promise<void> {
  // Reset per-connection state so a bridge instance can reconnect cleanly.
  this.intentionallyClosed = false;
  this.sessionConfigured = false;
  this.sessionReadyFired = false;
  this.consecutiveSilenceMs = 0;
  this.audioStreamEnded = false;

  const ai = createGoogleGenAI({
    apiKey: this.config.apiKey,
    httpOptions: {
      apiVersion: this.config.apiVersion ?? GOOGLE_REALTIME_DEFAULT_API_VERSION,
    },
  });

  const functionDeclarations = buildFunctionDeclarations(this.config.tools);
  this.session = (await ai.live.connect({
    model: this.config.model ?? GOOGLE_REALTIME_DEFAULT_MODEL,
    config: {
      responseModalities: [Modality.AUDIO],
      // Non-positive temperatures are omitted: Google Live can return
      // transcripts without audio when temperature 0 is sent.
      ...(typeof this.config.temperature === "number" && this.config.temperature > 0
        ? { temperature: this.config.temperature }
        : {}),
      speechConfig: {
        voiceConfig: {
          prebuiltVoiceConfig: {
            voiceName: this.config.voice ?? GOOGLE_REALTIME_DEFAULT_VOICE,
          },
        },
      },
      systemInstruction: this.config.instructions,
      ...(functionDeclarations.length > 0 ? { tools: [{ functionDeclarations }] } : {}),
      // NOTE(review): realtimeInputConfig/thinkingConfig accessors are defined
      // outside this view — presumably derived from this.config; confirm.
      ...(this.realtimeInputConfig ? { realtimeInputConfig: this.realtimeInputConfig } : {}),
      inputAudioTranscription: {},
      outputAudioTranscription: {},
      ...(typeof this.config.enableAffectiveDialog === "boolean"
        ? { enableAffectiveDialog: this.config.enableAffectiveDialog }
        : {}),
      ...(this.thinkingConfig ? { thinkingConfig: this.thinkingConfig } : {}),
    },
    callbacks: {
      onopen: () => {
        this.connected = true;
      },
      onmessage: (message) => {
        this.handleMessage(message);
      },
      onerror: (event) => {
        // Wrap non-Error payloads so onError always receives an Error.
        const error =
          event.error instanceof Error
            ? event.error
            : new Error(
                typeof event.message === "string" ? event.message : "Google Live API error",
              );
        this.config.onError?.(error);
      },
      onclose: () => {
        this.connected = false;
        this.sessionConfigured = false;
        // A close we initiated reports "completed"; anything else is an error.
        const reason = this.intentionallyClosed ? "completed" : "error";
        this.session = null;
        this.config.onClose?.(reason);
      },
    },
  })) as GoogleLiveSession;
}
|
||||
|
||||
/**
 * Forward one mu-law telephony chunk to the Live session.
 * Buffers (bounded) until the session is configured, converts mu-law 8 kHz
 * to 16 kHz PCM, and signals audioStreamEnd after sustained silence.
 */
sendAudio(audio: Buffer): void {
  if (!this.session || !this.connected || !this.sessionConfigured) {
    // Session not ready yet: buffer up to the cap; excess chunks are dropped.
    if (this.pendingAudio.length < MAX_PENDING_AUDIO_CHUNKS) {
      this.pendingAudio.push(audio);
    }
    return;
  }
  const silent = isMulawSilence(audio);
  if (silent && this.audioStreamEnded) {
    // Stream already marked ended; skip further silence entirely.
    return;
  }
  if (!silent) {
    // Speech resumed: reset the silence tracker and re-open the stream.
    this.consecutiveSilenceMs = 0;
    this.audioStreamEnded = false;
  }

  // 8 kHz mu-law -> 16-bit PCM -> 16 kHz PCM as expected by Gemini Live.
  const pcm16k = resamplePcm(
    mulawToPcm(audio),
    TELEPHONY_SAMPLE_RATE,
    GOOGLE_REALTIME_INPUT_SAMPLE_RATE,
  );
  this.session.sendRealtimeInput({
    audio: {
      data: pcm16k.toString("base64"),
      mimeType: `audio/pcm;rate=${GOOGLE_REALTIME_INPUT_SAMPLE_RATE}`,
    },
  });

  if (!silent) {
    return;
  }

  // Accumulate silence duration (mu-law is one byte per 8 kHz sample) and
  // emit audioStreamEnd once it crosses the configured threshold.
  const silenceThresholdMs =
    typeof this.config.silenceDurationMs === "number"
      ? Math.max(0, Math.floor(this.config.silenceDurationMs))
      : DEFAULT_AUDIO_STREAM_END_SILENCE_MS;
  this.consecutiveSilenceMs += Math.round((audio.length / TELEPHONY_SAMPLE_RATE) * 1000);
  if (!this.audioStreamEnded && this.consecutiveSilenceMs >= silenceThresholdMs) {
    this.session.sendRealtimeInput({ audioStreamEnd: true });
    this.audioStreamEnded = true;
  }
}
|
||||
|
||||
setMediaTimestamp(_ts: number): void {}
|
||||
|
||||
sendUserMessage(text: string): void {
|
||||
const normalized = text.trim();
|
||||
if (!normalized || !this.session || !this.connected || !this.sessionConfigured) {
|
||||
return;
|
||||
}
|
||||
this.session.sendClientContent({
|
||||
turns: [{ role: "user", parts: [{ text: normalized }] }],
|
||||
turnComplete: true,
|
||||
});
|
||||
}
|
||||
|
||||
triggerGreeting(instructions?: string): void {
|
||||
const greetingPrompt =
|
||||
instructions?.trim() || "Start the call now. Greet the caller naturally and keep it brief.";
|
||||
this.sendUserMessage(greetingPrompt);
|
||||
}
|
||||
|
||||
submitToolResult(callId: string, result: unknown): void {
|
||||
if (!this.session) {
|
||||
return;
|
||||
}
|
||||
this.session.sendToolResponse({
|
||||
functionResponses: [
|
||||
{
|
||||
id: callId,
|
||||
response:
|
||||
result && typeof result === "object"
|
||||
? (result as Record<string, unknown>)
|
||||
: { output: result },
|
||||
},
|
||||
],
|
||||
});
|
||||
}
|
||||
|
||||
acknowledgeMark(): void {}
|
||||
|
||||
close(): void {
|
||||
this.intentionallyClosed = true;
|
||||
this.connected = false;
|
||||
this.sessionConfigured = false;
|
||||
this.pendingAudio = [];
|
||||
this.consecutiveSilenceMs = 0;
|
||||
this.audioStreamEnded = false;
|
||||
const session = this.session;
|
||||
this.session = null;
|
||||
session?.close();
|
||||
}
|
||||
|
||||
/** True only after the socket opened AND the Live setup handshake completed. */
isConnected(): boolean {
  if (!this.connected) {
    return false;
  }
  return this.sessionConfigured;
}
|
||||
|
||||
private handleMessage(message: LiveServerMessage): void {
|
||||
if (message.setupComplete) {
|
||||
this.handleSetupComplete();
|
||||
}
|
||||
if (message.serverContent) {
|
||||
this.handleServerContent(message.serverContent);
|
||||
}
|
||||
if (message.toolCall) {
|
||||
this.handleToolCall(message.toolCall);
|
||||
}
|
||||
}
|
||||
|
||||
private handleSetupComplete(): void {
|
||||
this.sessionConfigured = true;
|
||||
for (const chunk of this.pendingAudio.splice(0)) {
|
||||
this.sendAudio(chunk);
|
||||
}
|
||||
if (!this.sessionReadyFired) {
|
||||
this.sessionReadyFired = true;
|
||||
this.config.onReady?.();
|
||||
}
|
||||
}
|
||||
|
||||
/**
 * Process streamed server content: barge-in interrupts, user/assistant
 * transcripts, model audio parts (converted to 8kHz mu-law), and fallback
 * text parts when no output transcription is present.
 */
private handleServerContent(content: LiveServerContent): void {
  // Barge-in: the model was interrupted, so drop any queued playback audio.
  if (content.interrupted) {
    this.config.onClearAudio();
  }

  if (content.inputTranscription?.text) {
    this.config.onTranscript?.(
      "user",
      content.inputTranscription.text,
      content.inputTranscription.finished ?? false,
    );
  }

  if (content.outputTranscription?.text) {
    this.config.onTranscript?.(
      "assistant",
      content.outputTranscription.text,
      content.outputTranscription.finished ?? false,
    );
  }

  for (const part of content.modelTurn?.parts ?? []) {
    if (part.inlineData?.data) {
      // Model audio arrives as base64 PCM; convert to 8kHz mu-law for telephony.
      const pcm = Buffer.from(part.inlineData.data, "base64");
      const sampleRate = parsePcmSampleRate(part.inlineData.mimeType);
      const muLaw = convertPcmToMulaw8k(pcm, sampleRate);
      if (muLaw.length > 0) {
        this.config.onAudio(muLaw);
        this.config.onMark?.(`audio-${randomUUID()}`);
      }
      continue;
    }
    if (part.thought) {
      // Internal reasoning parts are not user-facing output; skip.
      continue;
    }
    // Emit raw text parts only when no output transcription covers this
    // content, so the assistant text is not delivered twice.
    if (!content.outputTranscription?.text && typeof part.text === "string" && part.text.trim()) {
      this.config.onTranscript?.("assistant", part.text, content.turnComplete ?? false);
    }
  }
  // NOTE(review): a trailing `if (!emittedAssistantText && turnComplete &&
  // waitingForInput === false) return;` guard was removed — the function
  // returned immediately afterwards regardless, so the branch and the
  // `emittedAssistantText` flag it read were dead code.
}
|
||||
|
||||
private handleToolCall(toolCall: LiveServerToolCall): void {
|
||||
for (const call of toolCall.functionCalls ?? []) {
|
||||
const name = call.name?.trim();
|
||||
if (!name) {
|
||||
continue;
|
||||
}
|
||||
const callId = call.id?.trim() || `google-live-${randomUUID()}`;
|
||||
this.config.onToolCall?.({
|
||||
itemId: callId,
|
||||
callId,
|
||||
name,
|
||||
args: call.args ?? {},
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
// Realtime-input settings derived from the provider config, if any.
private get realtimeInputConfig(): RealtimeInputConfig | undefined {
  return buildRealtimeInputConfig(this.config);
}
|
||||
|
||||
// Thinking settings derived from the provider config, if any.
private get thinkingConfig(): ThinkingConfig | undefined {
  return buildThinkingConfig(this.config);
}
|
||||
}
|
||||
|
||||
export function buildGoogleRealtimeVoiceProvider(): RealtimeVoiceProviderPlugin {
|
||||
return {
|
||||
id: "google",
|
||||
label: "Google Live Voice",
|
||||
autoSelectOrder: 20,
|
||||
resolveConfig: ({ cfg, rawConfig }) => normalizeProviderConfig(rawConfig, cfg),
|
||||
isConfigured: ({ providerConfig }) =>
|
||||
Boolean(normalizeProviderConfig(providerConfig).apiKey || resolveEnvApiKey()),
|
||||
createBridge: (req) => {
|
||||
const config = normalizeProviderConfig(req.providerConfig);
|
||||
const apiKey = config.apiKey || resolveEnvApiKey();
|
||||
if (!apiKey) {
|
||||
throw new Error("Google Gemini API key missing");
|
||||
}
|
||||
return new GoogleRealtimeVoiceBridge({
|
||||
...req,
|
||||
apiKey,
|
||||
model: config.model,
|
||||
voice: config.voice,
|
||||
temperature: config.temperature,
|
||||
apiVersion: config.apiVersion,
|
||||
prefixPaddingMs: config.prefixPaddingMs,
|
||||
silenceDurationMs: config.silenceDurationMs,
|
||||
startSensitivity: config.startSensitivity,
|
||||
endSensitivity: config.endSensitivity,
|
||||
enableAffectiveDialog: config.enableAffectiveDialog,
|
||||
thinkingLevel: config.thinkingLevel,
|
||||
thinkingBudget: config.thinkingBudget,
|
||||
});
|
||||
},
|
||||
};
|
||||
}
|
||||
|
||||
export {
|
||||
GOOGLE_REALTIME_DEFAULT_API_VERSION,
|
||||
GOOGLE_REALTIME_DEFAULT_MODEL,
|
||||
GOOGLE_REALTIME_DEFAULT_VOICE,
|
||||
};
|
||||
export type { GoogleRealtimeVoiceProviderConfig };
|
||||
@@ -1,105 +1,8 @@
|
||||
const TELEPHONY_SAMPLE_RATE = 8000;
|
||||
const RESAMPLE_FILTER_TAPS = 31;
|
||||
const RESAMPLE_CUTOFF_GUARD = 0.94;
|
||||
|
||||
// Clamp a value to the signed 16-bit PCM sample range.
function clamp16(value: number): number {
  return Math.max(-32768, Math.min(32767, value));
}
|
||||
|
||||
// Normalized sinc: sin(pi*x)/(pi*x), with the removable singularity at 0.
function sinc(x: number): number {
  if (x === 0) {
    return 1;
  }
  return Math.sin(Math.PI * x) / (Math.PI * x);
}
|
||||
|
||||
/**
 * Evaluate the input signal at fractional position `srcPos` through a finite
 * low-pass kernel (windowed sinc, Hann-tapered to reduce ringing artifacts).
 * `input` is 16-bit little-endian PCM (assumed mono — TODO confirm);
 * `inputSamples` is the valid sample count; `cutoffCyclesPerSample` is the
 * low-pass cutoff in cycles per input sample.
 */
function sampleBandlimited(
  input: Buffer,
  inputSamples: number,
  srcPos: number,
  cutoffCyclesPerSample: number,
): number {
  const half = Math.floor(RESAMPLE_FILTER_TAPS / 2);
  const center = Math.floor(srcPos);
  let weighted = 0;
  let weightSum = 0;

  for (let tap = -half; tap <= half; tap++) {
    const sampleIndex = center + tap;
    // Taps outside the buffer are skipped; dividing by weightSum below
    // renormalizes so edges do not lose amplitude.
    if (sampleIndex < 0 || sampleIndex >= inputSamples) {
      continue;
    }

    const distance = sampleIndex - srcPos;
    // Ideal low-pass impulse response at this tap.
    const lowPass = 2 * cutoffCyclesPerSample * sinc(2 * cutoffCyclesPerSample * distance);
    const tapIndex = tap + half;
    // Hann window coefficient tapering the truncated kernel.
    const window = 0.5 - 0.5 * Math.cos((2 * Math.PI * tapIndex) / (RESAMPLE_FILTER_TAPS - 1));
    const coeff = lowPass * window;
    weighted += input.readInt16LE(sampleIndex * 2) * coeff;
    weightSum += coeff;
  }

  // Every tap fell outside the buffer (or weights cancelled): fall back to
  // the nearest raw sample.
  if (weightSum === 0) {
    const nearest = Math.max(0, Math.min(inputSamples - 1, Math.round(srcPos)));
    return input.readInt16LE(nearest * 2);
  }

  return weighted / weightSum;
}
|
||||
|
||||
/**
 * Resample 16-bit PCM (little-endian mono) to 8kHz using a windowed low-pass kernel.
 */
export function resamplePcmTo8k(input: Buffer, inputSampleRate: number): Buffer {
  // Already at the telephony rate: return the buffer unchanged.
  if (inputSampleRate === TELEPHONY_SAMPLE_RATE) {
    return input;
  }
  const inputSamples = Math.floor(input.length / 2);
  if (inputSamples === 0) {
    return Buffer.alloc(0);
  }

  const ratio = inputSampleRate / TELEPHONY_SAMPLE_RATE;
  const outputSamples = Math.floor(inputSamples / ratio);
  const output = Buffer.alloc(outputSamples * 2);
  // When downsampling, lower the cutoff below the output Nyquist (with a
  // small guard factor) so high frequencies do not alias into the result.
  const maxCutoff = 0.5;
  const downsampleCutoff = ratio > 1 ? maxCutoff / ratio : maxCutoff;
  const cutoffCyclesPerSample = Math.max(0.01, downsampleCutoff * RESAMPLE_CUTOFF_GUARD);

  for (let i = 0; i < outputSamples; i++) {
    const srcPos = i * ratio;
    const sample = Math.round(
      sampleBandlimited(input, inputSamples, srcPos, cutoffCyclesPerSample),
    );
    output.writeInt16LE(clamp16(sample), i * 2);
  }

  return output;
}
|
||||
|
||||
/**
 * Convert 16-bit PCM to 8-bit mu-law (G.711).
 */
export function pcmToMulaw(pcm: Buffer): Buffer {
  // One mu-law byte per 16-bit sample; a trailing odd byte is ignored.
  const samples = Math.floor(pcm.length / 2);
  const mulaw = Buffer.alloc(samples);

  for (let i = 0; i < samples; i++) {
    const sample = pcm.readInt16LE(i * 2);
    mulaw[i] = linearToMulaw(sample);
  }

  return mulaw;
}
|
||||
|
||||
// Downsample PCM to the 8kHz telephony rate, then mu-law encode it.
export function convertPcmToMulaw8k(pcm: Buffer, inputSampleRate: number): Buffer {
  const pcm8k = resamplePcmTo8k(pcm, inputSampleRate);
  return pcmToMulaw(pcm8k);
}
|
||||
export {
|
||||
convertPcmToMulaw8k,
|
||||
pcmToMulaw,
|
||||
resamplePcmTo8k,
|
||||
} from "openclaw/plugin-sdk/realtime-voice";
|
||||
|
||||
/**
|
||||
* Chunk audio buffer into 20ms frames for streaming (8kHz mono mu-law).
|
||||
@@ -111,25 +14,3 @@ export function chunkAudio(audio: Buffer, chunkSize = 160): Generator<Buffer, vo
|
||||
}
|
||||
})();
|
||||
}
|
||||
|
||||
// Encode one signed 16-bit sample as an 8-bit mu-law (G.711) code.
function linearToMulaw(sample: number): number {
  const BIAS = 132; // encoder bias added before the segment search
  const CLIP = 32635; // max magnitude so sample + BIAS stays within 15 bits

  const sign = sample < 0 ? 0x80 : 0;
  if (sample < 0) {
    sample = -sample;
  }
  if (sample > CLIP) {
    sample = CLIP;
  }

  sample += BIAS;
  // Segment (exponent) = position of the highest set bit, scanning downward.
  let exponent = 7;
  for (let expMask = 0x4000; (sample & expMask) === 0 && exponent > 0; exponent--) {
    expMask >>= 1;
  }

  const mantissa = (sample >> (exponent + 3)) & 0x0f;
  // mu-law codes are transmitted bit-inverted.
  return ~(sign | (exponent << 4) | mantissa) & 0xff;
}
|
||||
|
||||
@@ -341,10 +341,13 @@ describe("web session", () => {
|
||||
sock.ev.emit("creds.update", {});
|
||||
sock.ev.emit("creds.update", {});
|
||||
|
||||
await flushCredsUpdate();
|
||||
expect(inFlight).toBe(1);
|
||||
|
||||
(release as (() => void) | null)?.();
|
||||
try {
|
||||
await vi.waitFor(() => {
|
||||
expect(inFlight).toBe(1);
|
||||
});
|
||||
} finally {
|
||||
(release as (() => void) | null)?.();
|
||||
}
|
||||
|
||||
await waitForCredsSaveQueue(authDir);
|
||||
|
||||
@@ -389,13 +392,16 @@ describe("web session", () => {
|
||||
onError,
|
||||
);
|
||||
|
||||
await flushCredsUpdate();
|
||||
try {
|
||||
await vi.waitFor(() => {
|
||||
expect(inFlightA).toBe(1);
|
||||
expect(inFlightB).toBe(1);
|
||||
});
|
||||
} finally {
|
||||
(releaseA as (() => void) | null)?.();
|
||||
(releaseB as (() => void) | null)?.();
|
||||
}
|
||||
|
||||
expect(inFlightA).toBe(1);
|
||||
expect(inFlightB).toBe(1);
|
||||
|
||||
(releaseA as (() => void) | null)?.();
|
||||
(releaseB as (() => void) | null)?.();
|
||||
await Promise.all([waitForCredsSaveQueue(authDirA), waitForCredsSaveQueue(authDirB)]);
|
||||
|
||||
expect(inFlightA).toBe(0);
|
||||
|
||||
@@ -152,7 +152,7 @@ describe("monitorZaloProvider lifecycle", () => {
|
||||
|
||||
abort.abort();
|
||||
|
||||
await vi.waitFor(() => expect(deleteWebhookMock).toHaveBeenCalledTimes(1));
|
||||
await vi.waitFor(() => expect(deleteWebhookMock).toHaveBeenCalledTimes(1), { timeout: 5000 });
|
||||
expect(deleteWebhookMock).toHaveBeenCalledWith("test-token", undefined, 5000);
|
||||
expect(settled).toBe(false);
|
||||
expect(registry.httpRoutes).toHaveLength(2);
|
||||
|
||||
@@ -36,3 +36,10 @@ export {
|
||||
type RealtimeVoiceBridgeSessionParams,
|
||||
type RealtimeVoiceMarkStrategy,
|
||||
} from "../realtime-voice/session-runtime.js";
|
||||
export {
|
||||
convertPcmToMulaw8k,
|
||||
mulawToPcm,
|
||||
pcmToMulaw,
|
||||
resamplePcm,
|
||||
resamplePcmTo8k,
|
||||
} from "../realtime-voice/audio-codec.js";
|
||||
|
||||
138
src/realtime-voice/audio-codec.ts
Normal file
138
src/realtime-voice/audio-codec.ts
Normal file
@@ -0,0 +1,138 @@
|
||||
const TELEPHONY_SAMPLE_RATE = 8000;
|
||||
const RESAMPLE_FILTER_TAPS = 31;
|
||||
const RESAMPLE_CUTOFF_GUARD = 0.94;
|
||||
|
||||
function clamp16(value: number): number {
|
||||
return Math.max(-32768, Math.min(32767, value));
|
||||
}
|
||||
|
||||
function sinc(x: number): number {
|
||||
if (x === 0) {
|
||||
return 1;
|
||||
}
|
||||
return Math.sin(Math.PI * x) / (Math.PI * x);
|
||||
}
|
||||
|
||||
// Evaluate the signal at fractional position `srcPos` through a finite
// Hann-windowed sinc low-pass kernel. `input` is 16-bit little-endian PCM
// (assumed mono — TODO confirm); out-of-range taps are skipped and the
// weight sum renormalizes edge output.
function sampleBandlimited(
  input: Buffer,
  inputSamples: number,
  srcPos: number,
  cutoffCyclesPerSample: number,
): number {
  const half = Math.floor(RESAMPLE_FILTER_TAPS / 2);
  const center = Math.floor(srcPos);
  let weighted = 0;
  let weightSum = 0;

  for (let tap = -half; tap <= half; tap += 1) {
    const sampleIndex = center + tap;
    if (sampleIndex < 0 || sampleIndex >= inputSamples) {
      continue;
    }

    const distance = sampleIndex - srcPos;
    // Ideal low-pass impulse response at this tap.
    const lowPass = 2 * cutoffCyclesPerSample * sinc(2 * cutoffCyclesPerSample * distance);
    const tapIndex = tap + half;
    // Hann window coefficient tapering the truncated kernel.
    const window = 0.5 - 0.5 * Math.cos((2 * Math.PI * tapIndex) / (RESAMPLE_FILTER_TAPS - 1));
    const coeff = lowPass * window;
    weighted += input.readInt16LE(sampleIndex * 2) * coeff;
    weightSum += coeff;
  }

  // Every tap fell outside the buffer: fall back to the nearest raw sample.
  if (weightSum === 0) {
    const nearest = Math.max(0, Math.min(inputSamples - 1, Math.round(srcPos)));
    return input.readInt16LE(nearest * 2);
  }

  return weighted / weightSum;
}
|
||||
|
||||
export function resamplePcm(
|
||||
input: Buffer,
|
||||
inputSampleRate: number,
|
||||
outputSampleRate: number,
|
||||
): Buffer {
|
||||
if (inputSampleRate === outputSampleRate) {
|
||||
return input;
|
||||
}
|
||||
const inputSamples = Math.floor(input.length / 2);
|
||||
if (inputSamples === 0) {
|
||||
return Buffer.alloc(0);
|
||||
}
|
||||
|
||||
const ratio = inputSampleRate / outputSampleRate;
|
||||
const outputSamples = Math.floor(inputSamples / ratio);
|
||||
const output = Buffer.alloc(outputSamples * 2);
|
||||
const maxCutoff = 0.5;
|
||||
const downsampleCutoff = ratio > 1 ? maxCutoff / ratio : maxCutoff;
|
||||
const cutoffCyclesPerSample = Math.max(0.01, downsampleCutoff * RESAMPLE_CUTOFF_GUARD);
|
||||
|
||||
for (let i = 0; i < outputSamples; i += 1) {
|
||||
const sample = Math.round(
|
||||
sampleBandlimited(input, inputSamples, i * ratio, cutoffCyclesPerSample),
|
||||
);
|
||||
output.writeInt16LE(clamp16(sample), i * 2);
|
||||
}
|
||||
|
||||
return output;
|
||||
}
|
||||
|
||||
// Convenience wrapper: resample PCM from any rate to the 8kHz telephony rate.
export function resamplePcmTo8k(input: Buffer, inputSampleRate: number): Buffer {
  return resamplePcm(input, inputSampleRate, TELEPHONY_SAMPLE_RATE);
}
|
||||
|
||||
export function pcmToMulaw(pcm: Buffer): Buffer {
|
||||
const samples = Math.floor(pcm.length / 2);
|
||||
const mulaw = Buffer.alloc(samples);
|
||||
|
||||
for (let i = 0; i < samples; i += 1) {
|
||||
const sample = pcm.readInt16LE(i * 2);
|
||||
mulaw[i] = linearToMulaw(sample);
|
||||
}
|
||||
|
||||
return mulaw;
|
||||
}
|
||||
|
||||
export function mulawToPcm(mulaw: Buffer): Buffer {
|
||||
const pcm = Buffer.alloc(mulaw.length * 2);
|
||||
for (let i = 0; i < mulaw.length; i += 1) {
|
||||
pcm.writeInt16LE(clamp16(mulawToLinear(mulaw[i] ?? 0)), i * 2);
|
||||
}
|
||||
return pcm;
|
||||
}
|
||||
|
||||
// Downsample PCM to the 8kHz telephony rate, then mu-law encode it.
export function convertPcmToMulaw8k(pcm: Buffer, inputSampleRate: number): Buffer {
  return pcmToMulaw(resamplePcmTo8k(pcm, inputSampleRate));
}
|
||||
|
||||
function linearToMulaw(sample: number): number {
|
||||
const BIAS = 132;
|
||||
const CLIP = 32635;
|
||||
|
||||
const sign = sample < 0 ? 0x80 : 0;
|
||||
if (sample < 0) {
|
||||
sample = -sample;
|
||||
}
|
||||
if (sample > CLIP) {
|
||||
sample = CLIP;
|
||||
}
|
||||
|
||||
sample += BIAS;
|
||||
let exponent = 7;
|
||||
for (let expMask = 0x4000; (sample & expMask) === 0 && exponent > 0; exponent -= 1) {
|
||||
expMask >>= 1;
|
||||
}
|
||||
|
||||
const mantissa = (sample >> (exponent + 3)) & 0x0f;
|
||||
return ~(sign | (exponent << 4) | mantissa) & 0xff;
|
||||
}
|
||||
|
||||
function mulawToLinear(value: number): number {
|
||||
const muLaw = ~value & 0xff;
|
||||
const sign = muLaw & 0x80;
|
||||
const exponent = (muLaw >> 4) & 0x07;
|
||||
const mantissa = muLaw & 0x0f;
|
||||
let sample = ((mantissa << 3) + 132) << exponent;
|
||||
sample -= 132;
|
||||
return sign ? -sample : sample;
|
||||
}
|
||||
@@ -55,6 +55,7 @@ export const pluginRegistrationContractCases = {
|
||||
pluginId: "google",
|
||||
providerIds: ["google", "google-gemini-cli"],
|
||||
webSearchProviderIds: ["gemini"],
|
||||
realtimeVoiceProviderIds: ["google"],
|
||||
speechProviderIds: ["google"],
|
||||
mediaUnderstandingProviderIds: ["google"],
|
||||
imageGenerationProviderIds: ["google"],
|
||||
|
||||
Reference in New Issue
Block a user