From f94d970cee3f2fadeaf307f6c12d60093cec21c4 Mon Sep 17 00:00:00 2001 From: Peter Steinberger Date: Thu, 30 Apr 2026 02:45:44 +0100 Subject: [PATCH] fix: refresh Google Meet speech retry readiness --- CHANGELOG.md | 1 + docs/plugins/google-meet.md | 8 +- extensions/google-meet/index.test.ts | 143 ++++++++++++++++++ extensions/google-meet/index.ts | 2 +- extensions/google-meet/src/cli.ts | 12 +- extensions/google-meet/src/runtime.ts | 132 +++++++++++++++- .../google-meet/src/transports/types.ts | 9 ++ 7 files changed, 299 insertions(+), 8 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index d010466dd5d..c81854ae8ba 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -32,6 +32,7 @@ Docs: https://docs.openclaw.ai ### Fixes - CLI/status: resolve read-only channel setup runtime fallback from the packaged OpenClaw dist root, so `status --all`, `status --deep`, channel, and doctor paths do not crash when an external channel plugin needs setup metadata. Fixes #74693. Thanks @giangthb. +- Google Meet: block managed Chrome intro/test speech until browser health proves the participant is in-call, and expose `speechReady` diagnostics so login, admission, permission, and audio-bridge blockers no longer look like successful speech. Refs #72478. Thanks @DougButdorf. - CLI/update: scope packaged Node compile caches by OpenClaw version and install metadata, so global installs no longer reuse stale compiled chunks after package updates. Thanks @pashpashpash. - Channels/Voice call: keep pre-auth webhook in-flight limiting active when socket remote address metadata is missing, so slow-body requests from stripped-IP proxy paths still share the fallback bucket. (#74453) Thanks @davidangularme. - Plugin SDK/testing: lazy-load TypeScript from the plugin test-contract runtime and add release checks for critical SDK contract entrypoint imports and bundle size, so published packages fail preflight before shipping ESM-incompatible or oversized contract helpers. Thanks @vincentkoc. diff --git a/docs/plugins/google-meet.md b/docs/plugins/google-meet.md index 8c798783f3b..81abf6b2fd8 100644 --- a/docs/plugins/google-meet.md +++ b/docs/plugins/google-meet.md @@ -166,7 +166,10 @@ health such as `inCall`, `manualActionRequired`, `providerConnected`, timestamps, byte counters, and bridge closed state. If a safe Meet page prompt appears, browser automation handles it when it can. Login, host admission, and browser/OS permission prompts are reported as manual action with a reason and -message for the agent to relay. +message for the agent to relay. Managed Chrome sessions only emit the intro or +test phrase after browser health reports `inCall: true`; otherwise status reports +`speechReady: false` and the speech attempt is blocked instead of pretending the +agent spoke into the meeting. Local Chrome joins through the signed-in OpenClaw browser profile. Realtime mode requires `BlackHole 2ch` for the microphone/speaker path used by OpenClaw. For @@ -1006,6 +1009,9 @@ a session ended. - `manualActionRequired` / `manualActionReason` / `manualActionMessage`: the browser profile needs manual login, Meet host admission, permissions, or browser-control repair before speech can work +- `speechReady` / `speechBlockedReason` / `speechBlockedMessage`: whether + managed Chrome speech is allowed now. `speechReady: false` means OpenClaw did + not send the intro/test phrase into the audio bridge. - `providerConnected` / `realtimeReady`: realtime voice bridge state - `lastInputAt` / `lastOutputAt`: last audio seen from or sent to the bridge diff --git a/extensions/google-meet/index.test.ts b/extensions/google-meet/index.test.ts index d9b6692aaf9..4def88467fd 100644 --- a/extensions/google-meet/index.test.ts +++ b/extensions/google-meet/index.test.ts @@ -1998,6 +1998,9 @@ describe("google-meet plugin", () => { details: { manualActionRequired?: boolean; manualActionReason?: string; + speechReady?: boolean; + speechBlockedReason?: string; + spoken?: boolean; session?: { chrome?: { health?: { manualActionRequired?: boolean } } }; }; }>; @@ -2012,17 +2015,157 @@ describe("google-meet plugin", () => { expect(result.details).toMatchObject({ manualActionRequired: true, manualActionReason: "google-login-required", + spoken: false, + speechReady: false, + speechBlockedReason: "google-login-required", session: { chrome: { health: { manualActionRequired: true, manualActionReason: "google-login-required", + speechReady: false, + speechBlockedReason: "google-login-required", }, }, }, }); }); + it("refreshes browser health before blocking an explicit speech retry", async () => { + let openedTab = false; + let browserReady = false; + const { methods, nodesInvoke } = setup( + { + defaultTransport: "chrome-node", + defaultMode: "realtime", + }, + { + nodesInvokeHandler: async ({ command, params }) => { + const raw = params as { path?: string; body?: { url?: string; targetId?: string } }; + if (command === "browser.proxy") { + if (raw.path === "/tabs") { + return { + payload: { + result: { + running: true, + tabs: openedTab + ? [ + { + targetId: "tab-1", + title: "Meet", + url: "https://meet.google.com/abc-defg-hij", + }, + ] + : [], + }, + }, + }; + } + if (raw.path === "/tabs/open") { + openedTab = true; + return { + payload: { + result: { + targetId: "tab-1", + title: "Meet", + url: raw.body?.url ?? "https://meet.google.com/abc-defg-hij", + }, + }, + }; + } + if (raw.path === "/tabs/focus" || raw.path === "/permissions/grant") { + return { payload: { result: { ok: true } } }; + } + if (raw.path === "/act") { + return { + payload: { + result: { + ok: true, + targetId: raw.body?.targetId ?? "tab-1", + result: JSON.stringify( + browserReady + ? { + inCall: true, + micMuted: false, + manualActionRequired: false, + title: "Meet call", + url: "https://meet.google.com/abc-defg-hij", + } + : { + inCall: false, + manualActionRequired: true, + manualActionReason: "google-login-required", + manualActionMessage: + "Sign in to Google in the OpenClaw browser profile, then retry the Meet join.", + title: "Sign in - Google Accounts", + url: "https://accounts.google.com/signin", + }, + ), + }, + }, + }; + } + } + if (command === "googlemeet.chrome") { + return { payload: { launched: true } }; + } + throw new Error(`unexpected invoke ${command}`); + }, + }, + ); + + const join = (await invokeGoogleMeetGatewayMethodForTest(methods, "googlemeet.join", { + url: "https://meet.google.com/abc-defg-hij", + message: "Say exactly: hello.", + })) as { + session: { id: string; chrome?: { health?: { speechBlockedReason?: string } } }; + spoken: boolean; + }; + expect(join.spoken).toBe(false); + expect(join.session.chrome?.health?.speechBlockedReason).toBe("google-login-required"); + + browserReady = true; + const retry = (await invokeGoogleMeetGatewayMethodForTest(methods, "googlemeet.speak", { + sessionId: join.session.id, + message: "Say exactly: hello again.", + })) as { + found: boolean; + spoken: boolean; + session?: { + chrome?: { + health?: { + inCall?: boolean; + manualActionRequired?: boolean; + speechBlockedReason?: string; + }; + }; + }; + }; + + expect(retry).toMatchObject({ + found: true, + spoken: false, + session: { + chrome: { + health: { + inCall: true, + manualActionRequired: false, + speechBlockedReason: "audio-bridge-unavailable", + }, + }, + }, + }); + expect(nodesInvoke).toHaveBeenCalledWith( + expect.objectContaining({ + command: "browser.proxy", + params: expect.objectContaining({ + path: "/tabs/focus", + body: { targetId: "tab-1" }, + }), + }), + ); + }); + it("explains when chrome-node has no capable paired node", async () => { const { tools } = setup( { diff --git a/extensions/google-meet/index.ts b/extensions/google-meet/index.ts index 4d2a12cd418..b42e1d6c81e 100644 --- a/extensions/google-meet/index.ts +++ b/extensions/google-meet/index.ts @@ -823,7 +823,7 @@ export default definePluginEntry({ return; } const rt = await ensureRuntime(); - respond(true, rt.speak(sessionId, normalizeOptionalString(params?.message))); + respond(true, await rt.speak(sessionId, normalizeOptionalString(params?.message))); } catch (err) { sendError(respond, err); } diff --git a/extensions/google-meet/src/cli.ts b/extensions/google-meet/src/cli.ts index 73e45467db6..28ff9a73184 100644 --- a/extensions/google-meet/src/cli.ts +++ b/extensions/google-meet/src/cli.ts @@ -268,6 +268,11 @@ function writeDoctorStatus(status: ReturnType): voi writeStdoutLine("manual reason: %s", formatOptional(health.manualActionReason)); writeStdoutLine("manual message: %s", formatOptional(health.manualActionMessage)); } + writeStdoutLine("speech ready: %s", formatBoolean(health?.speechReady)); + if (health?.speechReady === false) { + writeStdoutLine("speech blocked reason: %s", formatOptional(health.speechBlockedReason)); + writeStdoutLine("speech blocked message: %s", formatOptional(health.speechBlockedMessage)); + } writeStdoutLine("provider connected: %s", formatBoolean(health?.providerConnected)); writeStdoutLine("realtime ready: %s", formatBoolean(health?.realtimeReady)); writeStdoutLine("audio input active: %s", formatBoolean(health?.audioInputActive)); @@ -2017,12 +2022,15 @@ export function registerGoogleMeetCli(params: { .argument("[message]", "Realtime instructions to speak now") .action(async (sessionId: string, message?: string) => { const rt = await params.ensureRuntime(); - const result = rt.speak(sessionId, message); + const result = await rt.speak(sessionId, message); if (!result.found) { throw new Error("session not found"); } if (!result.spoken) { - throw new Error("session has no active realtime audio bridge"); + throw new Error( + result.session?.chrome?.health?.speechBlockedMessage ?? + "session has no active realtime audio bridge", + ); } writeStdoutLine("speaking on %s", sessionId); }); diff --git a/extensions/google-meet/src/runtime.ts b/extensions/google-meet/src/runtime.ts index a64f22e6465..548554d78cf 100644 --- a/extensions/google-meet/src/runtime.ts +++ b/extensions/google-meet/src/runtime.ts @@ -66,6 +66,66 @@ function sleep(ms: number): Promise { return new Promise((resolve) => setTimeout(resolve, ms)); } +function isManagedChromeBrowserSession(session: GoogleMeetSession): boolean { + return Boolean( + (session.transport === "chrome" || session.transport === "chrome-node") && + session.chrome && + session.chrome.launched, + ); +} + +function evaluateSpeechReadiness(session: GoogleMeetSession): { + ready: boolean; + reason?: NonNullable; + message?: string; +} { + if (session.mode !== "realtime" || !session.chrome) { + return { ready: true }; + } + if (!isManagedChromeBrowserSession(session)) { + if (session.chrome.audioBridge) { + return { ready: true }; + } + return { + ready: false, + reason: "audio-bridge-unavailable", + message: "Realtime speech requires an active Chrome audio bridge.", + }; + } + const health = session.chrome.health; + if (health?.manualActionRequired) { + return { + ready: false, + reason: health.manualActionReason ?? "browser-unverified", + message: + health.manualActionMessage ?? + "Resolve the Google Meet browser prompt before asking OpenClaw to speak.", + }; + } + if (health?.inCall === true) { + if (session.chrome.audioBridge) { + return { ready: true }; + } + return { + ready: false, + reason: "audio-bridge-unavailable", + message: "Realtime speech requires an active Chrome audio bridge.", + }; + } + if (health?.inCall === false) { + return { + ready: false, + reason: "not-in-call", + message: "Google Meet has not reported that the browser participant is in the call.", + }; + } + return { + ready: false, + reason: "browser-unverified", + message: "Google Meet browser state has not been verified yet.", + }; +} + function collectChromeAudioCommands(config: GoogleMeetConfig): string[] { const commands = config.chrome.audioBridgeCommand ? [config.chrome.audioBridgeCommand[0]] @@ -228,6 +288,7 @@ export class GoogleMeetRuntime { ); const speechInstructions = request.message ?? this.params.config.realtime.introMessage; if (reusable) { + await this.#refreshBrowserHealthForChromeSession(reusable); reusable.notes = [ ...reusable.notes.filter((note) => note !== "Reused existing active Meet session."), "Reused existing active Meet session.", @@ -235,7 +296,7 @@ export class GoogleMeetRuntime { reusable.updatedAt = nowIso(); const spoken = mode === "realtime" && speechInstructions - ? this.speak(reusable.id, speechInstructions).spoken + ? (await this.speak(reusable.id, speechInstructions)).spoken : false; return { session: reusable, spoken }; } @@ -320,6 +381,7 @@ export class GoogleMeetRuntime { ? "Chrome transport joins as the signed-in Google profile and expects BlackHole 2ch audio routing." : "Chrome transport joins as the signed-in Google profile without starting the realtime audio bridge.", ); + this.#refreshSpeechReadiness(session); } else { const dialInNumber = normalizeDialInNumber( request.dialInNumber ?? this.params.config.twilio.defaultDialInNumber, @@ -367,7 +429,7 @@ export class GoogleMeetRuntime { this.#sessions.set(session.id, session); const spoken = mode === "realtime" && speechInstructions - ? this.speak(session.id, speechInstructions).spoken + ? (await this.speak(session.id, speechInstructions)).spoken : false; return { session, spoken }; } @@ -389,18 +451,28 @@ export class GoogleMeetRuntime { return { found: true, session }; } - speak( + async speak( sessionId: string, instructions?: string, - ): { found: boolean; spoken: boolean; session?: GoogleMeetSession } { + ): Promise<{ found: boolean; spoken: boolean; session?: GoogleMeetSession }> { const session = this.#sessions.get(sessionId); if (!session) { return { found: false, spoken: false }; } + await this.#refreshBrowserHealthForChromeSession(session); const speak = this.#sessionSpeakers.get(sessionId); if (!speak || session.state !== "active") { return { found: true, spoken: false, session }; } + const readiness = this.#refreshSpeechReadiness(session); + if (!readiness.ready) { + const note = readiness.message + ? `Realtime speech blocked: ${readiness.message}` + : "Realtime speech blocked until Google Meet is ready."; + session.notes = [...session.notes.filter((item) => item !== note), note]; + session.updatedAt = nowIso(); + return { found: true, spoken: false, session }; + } speak(instructions || this.params.config.realtime.introMessage); session.updatedAt = nowIso(); this.#refreshHealth(sessionId); @@ -416,6 +488,9 @@ export class GoogleMeetRuntime { spoken: boolean; speechOutputVerified: boolean; speechOutputTimedOut: boolean; + speechReady?: boolean; + speechBlockedReason?: GoogleMeetChromeHealth["speechBlockedReason"]; + speechBlockedMessage?: string; audioOutputActive?: boolean; lastOutputBytes?: number; session: GoogleMeetSession; @@ -470,12 +545,60 @@ export class GoogleMeetRuntime { spoken: result.spoken ?? false, speechOutputVerified, speechOutputTimedOut: shouldWaitForOutput && !speechOutputVerified, + speechReady: health?.speechReady, + speechBlockedReason: health?.speechBlockedReason, + speechBlockedMessage: health?.speechBlockedMessage, audioOutputActive: health?.audioOutputActive, lastOutputBytes: health?.lastOutputBytes, session: result.session, }; } + async #refreshBrowserHealthForChromeSession(session: GoogleMeetSession) { + if (!isManagedChromeBrowserSession(session) || evaluateSpeechReadiness(session).ready) { + this.#refreshSpeechReadiness(session); + return; + } + try { + const result = + session.transport === "chrome-node" + ? await recoverCurrentMeetTabOnNode({ + runtime: this.params.runtime, + config: this.params.config, + url: session.url, + }) + : await recoverCurrentMeetTab({ + config: this.params.config, + url: session.url, + }); + if (result.found && result.browser && session.chrome) { + session.chrome.health = { + ...session.chrome.health, + ...result.browser, + }; + session.updatedAt = nowIso(); + } + } catch (error) { + this.params.logger.debug?.( + `[google-meet] browser readiness refresh ignored: ${formatErrorMessage(error)}`, + ); + } + this.#refreshSpeechReadiness(session); + } + + #refreshSpeechReadiness(session: GoogleMeetSession) { + const readiness = evaluateSpeechReadiness(session); + if (session.chrome) { + session.chrome.health = { + ...session.chrome.health, + speechReady: readiness.ready, + speechBlockedReason: readiness.reason, + speechBlockedMessage: readiness.message, + }; + } + return readiness; + } + #refreshHealth(sessionId?: string) { const ids = sessionId ? [sessionId] : [...this.#sessionHealth.keys()]; for (const id of ids) { @@ -488,6 +611,7 @@ export class GoogleMeetRuntime { ...session.chrome.health, ...getHealth(), }; + this.#refreshSpeechReadiness(session); } } } diff --git a/extensions/google-meet/src/transports/types.ts b/extensions/google-meet/src/transports/types.ts index 30f9738ce31..d2a08aa3706 100644 --- a/extensions/google-meet/src/transports/types.ts +++ b/extensions/google-meet/src/transports/types.ts @@ -19,12 +19,21 @@ export type GoogleMeetManualActionReason = | "meet-audio-choice-required" | "browser-control-unavailable"; +export type GoogleMeetSpeechBlockedReason = + | GoogleMeetManualActionReason + | "not-in-call" + | "browser-unverified" + | "audio-bridge-unavailable"; + export type GoogleMeetChromeHealth = { inCall?: boolean; micMuted?: boolean; manualActionRequired?: boolean; manualActionReason?: GoogleMeetManualActionReason; manualActionMessage?: string; + speechReady?: boolean; + speechBlockedReason?: GoogleMeetSpeechBlockedReason; + speechBlockedMessage?: string; providerConnected?: boolean; realtimeReady?: boolean; audioInputActive?: boolean;