fix(gateway): reuse paired auth for probes

This commit is contained in:
Peter Steinberger
2026-04-28 21:52:44 +01:00
parent 969cb8b4c0
commit 193c7432e3
9 changed files with 126 additions and 26 deletions

View File

@@ -26,6 +26,7 @@ Docs: https://docs.openclaw.ai
- Providers/Bedrock: omit deprecated `temperature` for Claude Opus 4.7 Bedrock model ids, named and application inference profiles, including dotted `opus-4.7` refs, and classify the nested validation response for failover. Fixes #73663. Thanks @bstanbury.
- Gateway: raise the preauth/connect-challenge timeout to 15s so cold CLI starts on slower hosts have more time to process the WebSocket challenge before the Gateway closes the connection. Fixes #51469; refs #73592 and #62060. Thanks @GothicFox and @jackychen-png.
- CLI/status: fall back to a bounded local `status` RPC when loopback detail probes time out or report unknown capability, so reachable local gateways are no longer marked unreachable by slow read diagnostics. Fixes #73535; refs #48360, #62762, #51357, and #42019. Thanks @RacecarGuy, @justinschille, @DJBlackhawk, @tianyaqpzm, and @0xrsydn.
- CLI/gateway: reuse cached paired-device auth during `gateway probe` and report post-connect diagnostic failures as degraded reachability, so healthy local gateways are no longer marked unreachable after loopback auth or read timeouts. Fixes #48360. Thanks @RacecarGuy.
- Channels/Discord: give Discord Gateway WebSocket handshakes a 30s timeout so stalled TLS/network transitions emit an error and Carbon can continue its reconnect loop instead of leaving the bot silent until restart. Refs #50046. Thanks @codexGW.
- NVIDIA/NIM: persist the `NVIDIA_API_KEY` provider marker and mark bundled NVIDIA Chat Completions models as string-content compatible, so NIM models load from `models.json` and OpenAI-compatible subagent calls send plain text content. Fixes #73013 and #50107; refs #73014. Thanks @bautrey, @iot2edge, @ifearghal, and @futhgar.
- Channels/Discord: let text-only configs drop the `GuildVoiceStates` gateway intent and expose a bounded `/gateway/bot` metadata timeout with rate-limited fallback logs, reducing idle CPU and warning floods. Fixes #73709 and #73585. Thanks @sanchezm86 and @trac3r00.

View File

@@ -323,6 +323,7 @@ openclaw gateway probe --json
- `Capability: read-only|write-capable|admin-capable|pairing-pending|connect-only` reports what the probe could prove about auth. It is separate from reachability.
- `Read probe: ok` means read-scope detail RPC calls (`health`/`status`/`system-presence`/`config.get`) also succeeded.
- `Read probe: limited - missing scope: operator.read` means connect succeeded but read-scope RPC is limited. This is reported as **degraded** reachability, not full failure.
- `Read probe: failed` after `Connect: ok` means the Gateway accepted the WebSocket connection, but follow-up read diagnostics timed out or failed. This is also **degraded** reachability, not an unreachable Gateway.
- Like `gateway status`, probe reuses existing cached device auth but does not create first-time device identity or pairing state.
- Exit code is non-zero only when no probed target is reachable.
@@ -331,7 +332,7 @@ openclaw gateway probe --json
Top level:
- `ok`: at least one target is reachable.
- `degraded`: at least one target accepted a connection but did not complete full detail RPC diagnostics (this includes targets whose detail RPC was scope-limited).
- `capability`: best capability seen across reachable targets (`read_only`, `write_capable`, `admin_capable`, `pairing_pending`, `connected_no_operator_scope`, or `unknown`).
- `primaryTargetId`: best target to treat as the active winner in this order: explicit URL, SSH tunnel, configured remote, then local loopback.
- `warnings[]`: best-effort warning records with `code`, `message`, and optional `targetIds`.

View File

@@ -380,6 +380,7 @@ Common signatures:
- `SSH tunnel failed to start; falling back to direct probes.` → SSH setup failed, but the command still tried direct configured/loopback targets.
- `multiple reachable gateways detected` → more than one target answered. Usually this means an intentional multi-gateway setup or stale/duplicate listeners.
- `Read-probe diagnostics are limited by gateway scopes (missing operator.read)` → connect worked, but detail RPC is scope-limited; pair device identity or use credentials with `operator.read`.
- `Gateway accepted the WebSocket connection, but follow-up read diagnostics failed` → connect worked, but the full diagnostic RPC set timed out or failed. Treat this as a reachable Gateway with degraded diagnostics; compare `connect.ok` and `connect.rpcOk` in `--json` output.
- `Capability: pairing-pending` or `gateway closed (1008): pairing required` → the gateway answered, but this client still needs pairing/approval before normal operator access.
- unresolved `gateway.auth.*` / `gateway.remote.*` SecretRef warning text → auth material was unavailable in this command path for the failed target.

View File

@@ -4,6 +4,7 @@ import {
buildNetworkHints,
extractConfigSummary,
isProbeReachable,
isPostConnectProbeFailure,
isScopeLimitedProbeFailure,
renderProbeSummaryLine,
resolveAuthForTarget,
@@ -250,7 +251,7 @@ describe("probe reachability classification", () => {
expect(renderProbeSummaryLine(probe, false)).toContain("Read probe: limited");
});
it("keeps non-scope RPC failures as unreachable", () => {
it("treats post-connect read failures as reachable with failed diagnostics", () => {
const probe = {
ok: false,
url: "ws://127.0.0.1:18789",
@@ -269,10 +270,33 @@ describe("probe reachability classification", () => {
};
expect(isScopeLimitedProbeFailure(probe)).toBe(false);
expect(isProbeReachable(probe)).toBe(false);
expect(isPostConnectProbeFailure(probe)).toBe(true);
expect(isProbeReachable(probe)).toBe(true);
expect(renderProbeSummaryLine(probe, false)).toContain("Capability: connect-only");
expect(renderProbeSummaryLine(probe, false)).toContain("Read probe: failed");
});
it("keeps failed-before-connect probes unreachable", () => {
  // A probe that timed out without ever recording a connect latency: nothing
  // suggests the gateway accepted the WebSocket connection.
  const unknownAuth = {
    role: null,
    scopes: [],
    capability: "unknown" as const,
  };
  const neverConnected = {
    ok: false,
    url: "ws://127.0.0.1:18789",
    connectLatencyMs: null,
    error: "timeout",
    close: null,
    auth: unknownAuth,
    health: null,
    status: null,
    presence: null,
    configSnapshot: null,
  };

  // It must be classified neither as a post-connect failure nor as reachable.
  expect(isPostConnectProbeFailure(neverConnected)).toBe(false);
  expect(isProbeReachable(neverConnected)).toBe(false);
});
});
describe("gateway-status local target scheme", () => {
it("uses wss for local loopback targets and network hints when gateway TLS is enabled", () => {

View File

@@ -276,8 +276,12 @@ export function isScopeLimitedProbeFailure(probe: GatewayProbeResult): boolean {
return MISSING_SCOPE_PATTERN.test(probe.error ?? "");
}
/**
 * Tells whether a probe failed only after the connect step had completed.
 *
 * A probe counts as a post-connect failure when it is not `ok` overall but
 * still carries a connect latency (`connectLatencyMs` is neither `null` nor
 * `undefined`).
 *
 * @param probe - Probe result to classify.
 * @returns `true` for a failed probe that recorded a connect latency.
 */
export function isPostConnectProbeFailure(probe: GatewayProbeResult): boolean {
  if (probe.ok) {
    return false;
  }
  const recordedConnectLatency =
    probe.connectLatencyMs !== null && probe.connectLatencyMs !== undefined;
  return recordedConnectLatency;
}
/**
 * Tells whether a probed gateway should be treated as reachable.
 *
 * A target is reachable when the probe succeeded outright, or when it recorded
 * a connect latency (`connectLatencyMs` is neither `null` nor `undefined`) even
 * though follow-up diagnostics failed. Callers report that second case as
 * degraded rather than unreachable.
 *
 * The earlier rule (`probe.ok || isScopeLimitedProbeFailure(probe)`) was left
 * in front of this one, which made the connect-based rule unreachable and kept
 * post-connect timeouts classified as unreachable. Only the connect-based rule
 * is kept.
 * NOTE(review): this assumes scope-limited failures always record a connect
 * latency, since they occur on RPC calls made after connect. Confirm against
 * the probe implementation.
 *
 * @param probe - Probe result to classify.
 * @returns `true` when the probe succeeded or at least completed the connect step.
 */
export function isProbeReachable(probe: GatewayProbeResult): boolean {
  return probe.ok || probe.connectLatencyMs != null;
}
function getGatewayProbeCapability(probe: GatewayProbeResult): GatewayProbeCapability {

View File

@@ -95,11 +95,11 @@ describe("gateway status output", () => {
discovery: [],
probed: [
createTarget(
"unreachable-admin",
"unreachable-before-connect",
createProbe("admin_capable", {
ok: false,
connectLatencyMs: 40,
error: "unknown method: status",
connectLatencyMs: null,
error: "timeout",
}),
),
createTarget(
@@ -132,11 +132,11 @@ describe("gateway status output", () => {
discovery: [],
probed: [
createTarget(
"unreachable-admin",
"unreachable-before-connect",
createProbe("admin_capable", {
ok: false,
connectLatencyMs: 40,
error: "unknown method: status",
connectLatencyMs: null,
error: "timeout",
}),
),
createTarget(
@@ -153,4 +153,57 @@ describe("gateway status output", () => {
expect(runtime.log).toHaveBeenCalledWith("Capability: read-only");
});
it("reports post-connect detail failures as reachable but degraded in json output", () => {
  // Single target: the connect step recorded a latency, but the probe as a
  // whole failed with a timeout during follow-up diagnostics.
  const targetId = "detail-timeout";
  const runtime = createRuntimeCapture();

  writeGatewayStatusJson({
    runtime,
    startedAt: Date.now() - 50,
    overallTimeoutMs: 5_000,
    discoveryTimeoutMs: 500,
    network: {
      localLoopbackUrl: "ws://127.0.0.1:18789",
      localTailnetUrl: null,
      tailnetIPv4: null,
    },
    discovery: [],
    probed: [
      createTarget(
        targetId,
        createProbe("read_only", {
          ok: false,
          connectLatencyMs: 40,
          error: "timeout",
        }),
      ),
    ],
    warnings: [
      {
        code: "probe_detail_failed",
        message:
          "Gateway accepted the WebSocket connection, but follow-up read diagnostics failed: timeout",
        targetIds: [targetId],
      },
    ],
    primaryTargetId: targetId,
  });

  // The connect section must report a successful connect with failed RPC details.
  const expectedConnect = expect.objectContaining({
    ok: true,
    rpcOk: false,
    error: "timeout",
  });
  // Overall result: reachable (`ok`) yet flagged as degraded.
  const expectedPayload = expect.objectContaining({
    ok: true,
    degraded: true,
    primaryTargetId: targetId,
    targets: [expect.objectContaining({ connect: expectedConnect })],
  });

  expect(writeRuntimeJson).toHaveBeenCalledWith(runtime, expectedPayload);
});
});

View File

@@ -4,6 +4,7 @@ import { colorize, theme } from "../../terminal/theme.js";
import { serializeGatewayDiscoveryBeacon } from "./discovery.js";
import {
isProbeReachable,
isPostConnectProbeFailure,
isScopeLimitedProbeFailure,
summarizeGatewayProbeCapability,
renderProbeSummaryLine,
@@ -39,6 +40,9 @@ export function buildGatewayStatusWarnings(params: {
const degradedScopeLimited = params.probed.filter((entry) =>
isScopeLimitedProbeFailure(entry.probe),
);
const degradedDetailFailed = params.probed.filter(
(entry) => isPostConnectProbeFailure(entry.probe) && !isScopeLimitedProbeFailure(entry.probe),
);
const warnings: GatewayStatusWarning[] = [];
if (params.sshTarget && !params.sshTunnelStarted) {
warnings.push({
@@ -83,6 +87,14 @@ export function buildGatewayStatusWarnings(params: {
targetIds: [result.target.id],
});
}
for (const result of degradedDetailFailed) {
const detail = result.probe.error ? `: ${result.probe.error}` : ".";
warnings.push({
code: "probe_detail_failed",
message: `Gateway accepted the WebSocket connection, but follow-up read diagnostics failed${detail}`,
targetIds: [result.target.id],
});
}
return warnings;
}
@@ -98,7 +110,7 @@ export function writeGatewayStatusJson(params: {
primaryTargetId: string | null;
}) {
const reachable = params.probed.filter((entry) => isProbeReachable(entry.probe));
const degraded = params.probed.some((entry) => isScopeLimitedProbeFailure(entry.probe));
const degraded = params.probed.some((entry) => isPostConnectProbeFailure(entry.probe));
const capability = summarizeGatewayProbeCapability(reachable.map((entry) => entry.probe));
writeRuntimeJson(params.runtime, {
ok: reachable.length > 0,

View File

@@ -203,7 +203,18 @@ describe("probeGateway", () => {
expect(gatewayClientState.options?.scopes).toEqual(["operator.read"]);
});
it("keeps device identity disabled for unauthenticated loopback probes", async () => {
it("reuses cached device identity for unauthenticated loopback probes", async () => {
  // Loopback URL with no token/password supplied by the caller.
  const loopbackProbeOptions = {
    url: "ws://127.0.0.1:18789",
    timeoutMs: 1_000,
  };

  await probeGateway(loopbackProbeOptions);

  // The client must have been handed the device identity held by the test state.
  const attachedIdentity = gatewayClientState.options?.deviceIdentity;
  expect(attachedIdentity).toEqual(deviceIdentityState.value);
});
it("keeps device identity disabled for first-time unauthenticated loopback probes", async () => {
deviceIdentityState.cachedToken = null;
await probeGateway({
url: "ws://127.0.0.1:18789",
timeoutMs: 1_000,
@@ -220,7 +231,7 @@ describe("probeGateway", () => {
});
expect(result.ok).toBe(true);
expect(gatewayClientState.options?.deviceIdentity).toBeNull();
expect(gatewayClientState.options?.deviceIdentity).toEqual(deviceIdentityState.value);
expect(gatewayClientState.requests).toEqual([]);
});

View File

@@ -5,7 +5,6 @@ import type { SystemPresence } from "../infra/system-presence.js";
import { MAX_SAFE_TIMEOUT_DELAY_MS, resolveSafeTimeoutDelayMs } from "../utils/timer-delay.js";
import { GatewayClient, GatewayClientRequestError } from "./client.js";
import { READ_SCOPE } from "./method-scopes.js";
import { isLoopbackHost } from "./net.js";
import { GATEWAY_CLIENT_MODES, GATEWAY_CLIENT_NAMES } from "./protocol/client-info.js";
export type GatewayProbeAuth = {
@@ -162,24 +161,18 @@ export async function probeGateway(opts: {
const detailLevel = opts.includeDetails === false ? "none" : (opts.detailLevel ?? "full");
const deviceIdentity = await (async () => {
let hostname: string;
try {
hostname = new URL(opts.url).hostname;
} catch {
return null;
}
// Keep probes non-mutating: only attach a device identity when this CLI
// already has a cached operator device token. Fresh diagnostics should not
// create a read-only pairing baseline that later blocks admin commands.
if (isLoopbackHost(hostname) && !(opts.auth?.token || opts.auth?.password)) {
return null;
}
try {
if (!URL.canParse(opts.url)) {
return null;
}
const { loadDeviceIdentityIfPresent } = await import("../infra/device-identity.js");
const identity = loadDeviceIdentityIfPresent();
if (!identity) {
return null;
}
// Keep probes non-mutating: only attach a device identity when this CLI
// already has a cached operator device token. Fresh diagnostics should not
// create a read-only pairing baseline that later blocks admin commands.
const cachedOperatorToken = loadDeviceAuthToken({
deviceId: identity.deviceId,
role: "operator",