diff --git a/CHANGELOG.md b/CHANGELOG.md index 5362f205a43..aae6a215a75 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -26,6 +26,7 @@ Docs: https://docs.openclaw.ai - Providers/Bedrock: omit deprecated `temperature` for Claude Opus 4.7 Bedrock model ids, named and application inference profiles, including dotted `opus-4.7` refs, and classify the nested validation response for failover. Fixes #73663. Thanks @bstanbury. - Gateway: raise the preauth/connect-challenge timeout to 15s so cold CLI starts on slower hosts have more time to process the WebSocket challenge before the Gateway closes the connection. Fixes #51469; refs #73592 and #62060. Thanks @GothicFox and @jackychen-png. - CLI/status: fall back to a bounded local `status` RPC when loopback detail probes time out or report unknown capability, so reachable local gateways are no longer marked unreachable by slow read diagnostics. Fixes #73535; refs #48360, #62762, #51357, and #42019. Thanks @RacecarGuy, @justinschille, @DJBlackhawk, @tianyaqpzm, and @0xrsydn. +- CLI/gateway: reuse cached paired-device auth during `gateway probe` and report post-connect diagnostic failures as degraded reachability, so healthy local gateways are no longer marked unreachable after loopback auth or read timeouts. Fixes #48360. Thanks @RacecarGuy. - Channels/Discord: give Discord Gateway WebSocket handshakes a 30s timeout so stalled TLS/network transitions emit an error and Carbon can continue its reconnect loop instead of leaving the bot silent until restart. Refs #50046. Thanks @codexGW. - NVIDIA/NIM: persist the `NVIDIA_API_KEY` provider marker and mark bundled NVIDIA Chat Completions models as string-content compatible, so NIM models load from `models.json` and OpenAI-compatible subagent calls send plain text content. Fixes #73013 and #50107; refs #73014. Thanks @bautrey, @iot2edge, @ifearghal, and @futhgar. - Channels/Discord: let text-only configs drop the `GuildVoiceStates` gateway intent and expose a bounded `/gateway/bot` metadata timeout with rate-limited fallback logs, reducing idle CPU and warning floods. Fixes #73709 and #73585. Thanks @sanchezm86 and @trac3r00. diff --git a/docs/cli/gateway.md b/docs/cli/gateway.md index 4da4db1b67f..636d8a416d7 100644 --- a/docs/cli/gateway.md +++ b/docs/cli/gateway.md @@ -323,6 +323,7 @@ openclaw gateway probe --json - `Capability: read-only|write-capable|admin-capable|pairing-pending|connect-only` reports what the probe could prove about auth. It is separate from reachability. - `Read probe: ok` means read-scope detail RPC calls (`health`/`status`/`system-presence`/`config.get`) also succeeded. - `Read probe: limited - missing scope: operator.read` means connect succeeded but read-scope RPC is limited. This is reported as **degraded** reachability, not full failure. + - `Read probe: failed` after `Connect: ok` means the Gateway accepted the WebSocket connection, but follow-up read diagnostics timed out or failed. This is also **degraded** reachability, not an unreachable Gateway. - Like `gateway status`, probe reuses existing cached device auth but does not create first-time device identity or pairing state. - Exit code is non-zero only when no probed target is reachable. @@ -331,7 +332,7 @@ openclaw gateway probe --json Top level: - `ok`: at least one target is reachable. - - `degraded`: at least one target had scope-limited detail RPC. + - `degraded`: at least one target accepted a connection but did not complete full detail RPC diagnostics. - `capability`: best capability seen across reachable targets (`read_only`, `write_capable`, `admin_capable`, `pairing_pending`, `connected_no_operator_scope`, or `unknown`). - `primaryTargetId`: best target to treat as the active winner in this order: explicit URL, SSH tunnel, configured remote, then local loopback. - `warnings[]`: best-effort warning records with `code`, `message`, and optional `targetIds`. diff --git a/docs/gateway/troubleshooting.md b/docs/gateway/troubleshooting.md index 4d623b1bf81..8ef31c8825c 100644 --- a/docs/gateway/troubleshooting.md +++ b/docs/gateway/troubleshooting.md @@ -380,6 +380,7 @@ Common signatures: - `SSH tunnel failed to start; falling back to direct probes.` → SSH setup failed, but the command still tried direct configured/loopback targets. - `multiple reachable gateways detected` → more than one target answered. Usually this means an intentional multi-gateway setup or stale/duplicate listeners. - `Read-probe diagnostics are limited by gateway scopes (missing operator.read)` → connect worked, but detail RPC is scope-limited; pair device identity or use credentials with `operator.read`. +- `Gateway accepted the WebSocket connection, but follow-up read diagnostics failed` → connect worked, but the full diagnostic RPC set timed out or failed. Treat this as a reachable Gateway with degraded diagnostics; compare `connect.ok` and `connect.rpcOk` in `--json` output. - `Capability: pairing-pending` or `gateway closed (1008): pairing required` → the gateway answered, but this client still needs pairing/approval before normal operator access. - unresolved `gateway.auth.*` / `gateway.remote.*` SecretRef warning text → auth material was unavailable in this command path for the failed target. diff --git a/src/commands/gateway-status/helpers.test.ts b/src/commands/gateway-status/helpers.test.ts index 0083fc91d47..0b50ba8756b 100644 --- a/src/commands/gateway-status/helpers.test.ts +++ b/src/commands/gateway-status/helpers.test.ts @@ -4,6 +4,7 @@ import { buildNetworkHints, extractConfigSummary, isProbeReachable, + isPostConnectProbeFailure, isScopeLimitedProbeFailure, renderProbeSummaryLine, resolveAuthForTarget, @@ -250,7 +251,7 @@ describe("probe reachability classification", () => { expect(renderProbeSummaryLine(probe, false)).toContain("Read probe: limited"); }); - it("keeps non-scope RPC failures as unreachable", () => { + it("treats post-connect read failures as reachable with failed diagnostics", () => { const probe = { ok: false, url: "ws://127.0.0.1:18789", @@ -269,10 +270,33 @@ describe("probe reachability classification", () => { }; expect(isScopeLimitedProbeFailure(probe)).toBe(false); - expect(isProbeReachable(probe)).toBe(false); + expect(isPostConnectProbeFailure(probe)).toBe(true); + expect(isProbeReachable(probe)).toBe(true); expect(renderProbeSummaryLine(probe, false)).toContain("Capability: connect-only"); expect(renderProbeSummaryLine(probe, false)).toContain("Read probe: failed"); }); + + it("keeps failed-before-connect probes unreachable", () => { + const probe = { + ok: false, + url: "ws://127.0.0.1:18789", + connectLatencyMs: null, + error: "timeout", + close: null, + auth: { + role: null, + scopes: [], + capability: "unknown" as const, + }, + health: null, + status: null, + presence: null, + configSnapshot: null, + }; + + expect(isPostConnectProbeFailure(probe)).toBe(false); + expect(isProbeReachable(probe)).toBe(false); + }); }); describe("gateway-status local target scheme", () => { it("uses wss for local loopback targets and network hints when gateway TLS is enabled", () => { diff --git a/src/commands/gateway-status/helpers.ts b/src/commands/gateway-status/helpers.ts index 6ec6c1db615..9923e6803d7 100644 --- a/src/commands/gateway-status/helpers.ts +++ b/src/commands/gateway-status/helpers.ts @@ -276,8 +276,12 @@ export function isScopeLimitedProbeFailure(probe: GatewayProbeResult): boolean { return MISSING_SCOPE_PATTERN.test(probe.error ?? ""); } +export function isPostConnectProbeFailure(probe: GatewayProbeResult): boolean { + return !probe.ok && probe.connectLatencyMs != null; +} + export function isProbeReachable(probe: GatewayProbeResult): boolean { - return probe.ok || isScopeLimitedProbeFailure(probe); + return probe.ok || probe.connectLatencyMs != null; } function getGatewayProbeCapability(probe: GatewayProbeResult): GatewayProbeCapability { diff --git a/src/commands/gateway-status/output.test.ts b/src/commands/gateway-status/output.test.ts index 27f2e8a8875..87b3164ed2a 100644 --- a/src/commands/gateway-status/output.test.ts +++ b/src/commands/gateway-status/output.test.ts @@ -95,11 +95,11 @@ describe("gateway status output", () => { discovery: [], probed: [ createTarget( - "unreachable-admin", + "unreachable-before-connect", createProbe("admin_capable", { ok: false, - connectLatencyMs: 40, - error: "unknown method: status", + connectLatencyMs: null, + error: "timeout", }), ), createTarget( @@ -132,11 +132,11 @@ describe("gateway status output", () => { discovery: [], probed: [ createTarget( - "unreachable-admin", + "unreachable-before-connect", createProbe("admin_capable", { ok: false, - connectLatencyMs: 40, - error: "unknown method: status", + connectLatencyMs: null, + error: "timeout", }), ), createTarget( @@ -153,4 +153,57 @@ describe("gateway status output", () => { expect(runtime.log).toHaveBeenCalledWith("Capability: read-only"); }); + + it("reports post-connect detail failures as reachable but degraded in json output", () => { + const runtime = createRuntimeCapture(); + writeGatewayStatusJson({ + runtime, + startedAt: Date.now() - 50, + overallTimeoutMs: 5_000, + discoveryTimeoutMs: 500, + network: { + localLoopbackUrl: "ws://127.0.0.1:18789", + localTailnetUrl: null, + tailnetIPv4: null, + }, + discovery: [], + probed: [ + createTarget( + "detail-timeout", + createProbe("read_only", { + ok: false, + connectLatencyMs: 40, + error: "timeout", + }), + ), + ], + warnings: [ + { + code: "probe_detail_failed", + message: + "Gateway accepted the WebSocket connection, but follow-up read diagnostics failed: timeout", + targetIds: ["detail-timeout"], + }, + ], + primaryTargetId: "detail-timeout", + }); + + expect(writeRuntimeJson).toHaveBeenCalledWith( + runtime, + expect.objectContaining({ + ok: true, + degraded: true, + primaryTargetId: "detail-timeout", + targets: [ + expect.objectContaining({ + connect: expect.objectContaining({ + ok: true, + rpcOk: false, + error: "timeout", + }), + }), + ], + }), + ); + }); }); diff --git a/src/commands/gateway-status/output.ts b/src/commands/gateway-status/output.ts index 8ce7d38aa88..a17c31d3377 100644 --- a/src/commands/gateway-status/output.ts +++ b/src/commands/gateway-status/output.ts @@ -4,6 +4,7 @@ import { colorize, theme } from "../../terminal/theme.js"; import { serializeGatewayDiscoveryBeacon } from "./discovery.js"; import { isProbeReachable, + isPostConnectProbeFailure, isScopeLimitedProbeFailure, summarizeGatewayProbeCapability, renderProbeSummaryLine, @@ -39,6 +40,9 @@ export function buildGatewayStatusWarnings(params: { const degradedScopeLimited = params.probed.filter((entry) => isScopeLimitedProbeFailure(entry.probe), ); + const degradedDetailFailed = params.probed.filter( + (entry) => isPostConnectProbeFailure(entry.probe) && !isScopeLimitedProbeFailure(entry.probe), + ); const warnings: GatewayStatusWarning[] = []; if (params.sshTarget && !params.sshTunnelStarted) { warnings.push({ @@ -83,6 +87,14 @@ export function buildGatewayStatusWarnings(params: { targetIds: [result.target.id], }); } + for (const result of degradedDetailFailed) { + const detail = result.probe.error ? `: ${result.probe.error}` : "."; + warnings.push({ + code: "probe_detail_failed", + message: `Gateway accepted the WebSocket connection, but follow-up read diagnostics failed${detail}`, + targetIds: [result.target.id], + }); + } return warnings; } @@ -98,7 +110,7 @@ export function writeGatewayStatusJson(params: { primaryTargetId: string | null; }) { const reachable = params.probed.filter((entry) => isProbeReachable(entry.probe)); - const degraded = params.probed.some((entry) => isScopeLimitedProbeFailure(entry.probe)); + const degraded = params.probed.some((entry) => isPostConnectProbeFailure(entry.probe)); const capability = summarizeGatewayProbeCapability(reachable.map((entry) => entry.probe)); writeRuntimeJson(params.runtime, { ok: reachable.length > 0, diff --git a/src/gateway/probe.test.ts b/src/gateway/probe.test.ts index 9ba82e5cdae..7469d2de368 100644 --- a/src/gateway/probe.test.ts +++ b/src/gateway/probe.test.ts @@ -203,7 +203,18 @@ describe("probeGateway", () => { expect(gatewayClientState.options?.scopes).toEqual(["operator.read"]); }); - it("keeps device identity disabled for unauthenticated loopback probes", async () => { + it("reuses cached device identity for unauthenticated loopback probes", async () => { + await probeGateway({ + url: "ws://127.0.0.1:18789", + timeoutMs: 1_000, + }); + + expect(gatewayClientState.options?.deviceIdentity).toEqual(deviceIdentityState.value); + }); + + it("keeps device identity disabled for first-time unauthenticated loopback probes", async () => { + deviceIdentityState.cachedToken = null; + await probeGateway({ url: "ws://127.0.0.1:18789", timeoutMs: 1_000, @@ -220,7 +231,7 @@ describe("probeGateway", () => { }); expect(result.ok).toBe(true); - expect(gatewayClientState.options?.deviceIdentity).toBeNull(); + expect(gatewayClientState.options?.deviceIdentity).toEqual(deviceIdentityState.value); expect(gatewayClientState.requests).toEqual([]); }); diff --git a/src/gateway/probe.ts b/src/gateway/probe.ts index 574f58f931b..991687bdea7 100644 --- a/src/gateway/probe.ts +++ b/src/gateway/probe.ts @@ -5,7 +5,6 @@ import type { SystemPresence } from "../infra/system-presence.js"; import { MAX_SAFE_TIMEOUT_DELAY_MS, resolveSafeTimeoutDelayMs } from "../utils/timer-delay.js"; import { GatewayClient, GatewayClientRequestError } from "./client.js"; import { READ_SCOPE } from "./method-scopes.js"; -import { isLoopbackHost } from "./net.js"; import { GATEWAY_CLIENT_MODES, GATEWAY_CLIENT_NAMES } from "./protocol/client-info.js"; export type GatewayProbeAuth = { @@ -162,24 +161,18 @@ export async function probeGateway(opts: { const detailLevel = opts.includeDetails === false ? "none" : (opts.detailLevel ?? "full"); const deviceIdentity = await (async () => { - let hostname: string; - try { - hostname = new URL(opts.url).hostname; - } catch { - return null; - } - // Keep probes non-mutating: only attach a device identity when this CLI - // already has a cached operator device token. Fresh diagnostics should not - // create a read-only pairing baseline that later blocks admin commands. - if (isLoopbackHost(hostname) && !(opts.auth?.token || opts.auth?.password)) { - return null; - } try { + if (!URL.canParse(opts.url)) { + return null; + } const { loadDeviceIdentityIfPresent } = await import("../infra/device-identity.js"); const identity = loadDeviceIdentityIfPresent(); if (!identity) { return null; } + // Keep probes non-mutating: only attach a device identity when this CLI + // already has a cached operator device token. Fresh diagnostics should not + // create a read-only pairing baseline that later blocks admin commands. const cachedOperatorToken = loadDeviceAuthToken({ deviceId: identity.deviceId, role: "operator",