diff --git a/CHANGELOG.md b/CHANGELOG.md index 0f916c447ef..265e97779c8 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -57,6 +57,7 @@ Docs: https://docs.openclaw.ai - Agents/attribution: send OpenClaw attribution headers on native OpenAI and Codex traffic, including SDK transports, realtime voice and TTS, device-code auth, WHAM usage, and remote embeddings, so PI-origin defaults no longer leak into provider requests. Thanks @vincentkoc. - Agents/auth: keep OAuth auth profiles inherited from the main agent read-through instead of copying refresh tokens into secondary agents, and refresh Codex app-server tokens against the owning store so multi-agent swarms avoid reused refresh-token failures. Fixes #74055. Thanks @ClarityInvest. - Channels/Telegram: honor `ALL_PROXY` / `all_proxy` and service-level `OPENCLAW_PROXY_URL` when constructing the HTTP/1-only Telegram Bot API transport, so Windows and service installs that rely on those proxy settings no longer fall back to direct egress. Fixes #74014; refs #74086. Thanks @SymbolStar. +- Channels/Telegram: keep raw host/network-unreachable Bot API connect failures non-fatal and route tagged polling uncaught exceptions through the Telegram restart path, so transient reachability failures no longer kill the Gateway or leave long polling stuck. Fixes #60515; refs #74540. Thanks @HemantSudarshan, @thacid22, and @ewimsatt. - Channels/Telegram: continue polling when `deleteWebhook` hits a transient network failure but `getWebhookInfo` confirms no webhook is configured, so startup does not retry cleanup forever after the webhook was already removed. Refs #74086; carries forward #47384. Thanks @clovericbot. - Channels/Telegram: apply strict safe-send retry to inbound final replies when grammY wraps a pre-connect failure, while leaving ambiguous plain network envelopes single-shot to avoid duplicate visible messages. Fixes #74203. Thanks @nanli2000cn. - Channels/Telegram: surface polling liveness warnings in channel status and doctor when a running long-poller has not completed `getUpdates` after startup grace or its transport activity is stale, so silent polling failures no longer look clean. Refs #74299. Thanks @lolaopenclaw. diff --git a/extensions/telegram/src/monitor.test.ts b/extensions/telegram/src/monitor.test.ts index 4674cb2c30d..757540a833d 100644 --- a/extensions/telegram/src/monitor.test.ts +++ b/extensions/telegram/src/monitor.test.ts @@ -44,24 +44,40 @@ const { initSpy, runSpy, getRuntimeConfigMock } = vi.hoisted(() => ({ })), })); -const { registerUnhandledRejectionHandlerMock, emitUnhandledRejection, resetUnhandledRejection } = - vi.hoisted(() => { - let handler: ((reason: unknown) => boolean) | undefined; - return { - registerUnhandledRejectionHandlerMock: vi.fn((next: (reason: unknown) => boolean) => { - handler = next; - return () => { - if (handler === next) { - handler = undefined; - } - }; - }), - emitUnhandledRejection: (reason: unknown) => handler?.(reason) ?? false, - resetUnhandledRejection: () => { - handler = undefined; - }, - }; - }); +const { + registerUnhandledRejectionHandlerMock, + registerUncaughtExceptionHandlerMock, + emitUnhandledRejection, + emitUncaughtException, + resetProcessErrorHandlers, +} = vi.hoisted(() => { + let unhandledRejectionHandler: ((reason: unknown) => boolean) | undefined; + let uncaughtExceptionHandler: ((error: unknown) => boolean) | undefined; + return { + registerUnhandledRejectionHandlerMock: vi.fn((next: (reason: unknown) => boolean) => { + unhandledRejectionHandler = next; + return () => { + if (unhandledRejectionHandler === next) { + unhandledRejectionHandler = undefined; + } + }; + }), + registerUncaughtExceptionHandlerMock: vi.fn((next: (error: unknown) => boolean) => { + uncaughtExceptionHandler = next; + return () => { + if (uncaughtExceptionHandler === next) { + uncaughtExceptionHandler = undefined; + } + }; + }), + emitUnhandledRejection: (reason: unknown) => unhandledRejectionHandler?.(reason) ?? false, + emitUncaughtException: (error: unknown) => uncaughtExceptionHandler?.(error) ?? false, + resetProcessErrorHandlers: () => { + unhandledRejectionHandler = undefined; + uncaughtExceptionHandler = undefined; + }, + }; +}); const { createTelegramBotErrors } = vi.hoisted(() => ({ createTelegramBotErrors: [] as unknown[], @@ -113,6 +129,16 @@ function makeRecoverableFetchError() { }); } +class MockHttpError extends Error { + constructor( + message: string, + public readonly error: unknown, + ) { + super(message); + this.name = "HttpError"; + } +} + async function makeTaggedPollingFetchError() { const { tagTelegramNetworkError } = await import("./network-errors.js"); const err = makeRecoverableFetchError(); @@ -123,6 +149,13 @@ async function makeTaggedPollingFetchError() { return err; } +async function makeTaggedPollingHttpError() { + return new MockHttpError( + "Network request for 'getUpdates' failed!", + await makeTaggedPollingFetchError(), + ); +} + const createAbortTask = ( abort: AbortController, beforeAbort?: () => void, @@ -316,6 +349,7 @@ vi.mock("openclaw/plugin-sdk/runtime-env", async () => { computeBackoff, sleepWithAbort, registerUnhandledRejectionHandler: registerUnhandledRejectionHandlerMock, + registerUncaughtExceptionHandler: registerUncaughtExceptionHandlerMock, }; }); @@ -364,7 +398,8 @@ describe("monitorTelegramProvider (grammY)", () => { close: vi.fn(async () => undefined), })); registerUnhandledRejectionHandlerMock.mockClear(); - resetUnhandledRejection(); + registerUncaughtExceptionHandlerMock.mockClear(); + resetProcessErrorHandlers(); createTelegramBotErrors.length = 0; createdBotStops.length = 0; consoleErrorSpy = vi.spyOn(console, "error").mockImplementation(() => {}); @@ -659,6 +694,38 @@ describe("monitorTelegramProvider (grammY)", () => { expectRecoverableRetryState(2); }); + it("force-restarts polling when uncaught network exception stalls runner", async () => { + const abort = new AbortController(); + const firstCycle = mockRunOnceWithStalledPollingRunner(); + const secondCycle = mockRunOnceWithStalledPollingRunner(); + + const monitor = monitorTelegramProvider({ token: "tok", abortSignal: abort.signal }); + await firstCycle.waitForRunStart(); + + expect(emitUncaughtException(await makeTaggedPollingFetchError())).toBe(true); + expect(firstCycle.stop).toHaveBeenCalledTimes(1); + await secondCycle.waitForRunStart(); + abort.abort(); + await monitor; + expectRecoverableRetryState(2); + }); + + it("force-restarts polling when uncaught polling HttpError stalls runner", async () => { + const abort = new AbortController(); + const firstCycle = mockRunOnceWithStalledPollingRunner(); + const secondCycle = mockRunOnceWithStalledPollingRunner(); + + const monitor = monitorTelegramProvider({ token: "tok", abortSignal: abort.signal }); + await firstCycle.waitForRunStart(); + + expect(emitUncaughtException(await makeTaggedPollingHttpError())).toBe(true); + expect(firstCycle.stop).toHaveBeenCalledTimes(1); + await secondCycle.waitForRunStart(); + abort.abort(); + await monitor; + expectRecoverableRetryState(2); + }); + it("rebuilds the resolved transport after a stalled polling restart", async () => { vi.useFakeTimers({ shouldAdvanceTime: true }); try { diff --git a/extensions/telegram/src/monitor.ts b/extensions/telegram/src/monitor.ts index 00d7ed97d6c..baeff5ed50a 100644 --- a/extensions/telegram/src/monitor.ts +++ b/extensions/telegram/src/monitor.ts @@ -4,8 +4,11 @@ import { registerChannelRuntimeContext } from "openclaw/plugin-sdk/channel-runti import type { OpenClawConfig } from "openclaw/plugin-sdk/config-types"; import { resolveAgentMaxConcurrent } from "openclaw/plugin-sdk/model-session-runtime"; import { getRuntimeConfig } from "openclaw/plugin-sdk/runtime-config-snapshot"; -import { waitForAbortSignal } from "openclaw/plugin-sdk/runtime-env"; -import { registerUnhandledRejectionHandler } from "openclaw/plugin-sdk/runtime-env"; +import { + registerUncaughtExceptionHandler, + registerUnhandledRejectionHandler, + waitForAbortSignal, +} from "openclaw/plugin-sdk/runtime-env"; import type { RuntimeEnv } from "openclaw/plugin-sdk/runtime-env"; import { formatErrorMessage } from "openclaw/plugin-sdk/ssrf-runtime"; import { resolveTelegramAccount } from "./accounts.js"; @@ -89,13 +92,9 @@ export async function monitorTelegramProvider(opts: MonitorTelegramOpts = {}) { const log = opts.runtime?.error ?? console.error; let pollingSession: TelegramPollingSessionInstance | undefined; - const unregisterHandler = registerUnhandledRejectionHandler((err) => { + const handlePollingNetworkFailure = (err: unknown, label: string) => { const isNetworkError = isRecoverableTelegramNetworkError(err, { context: "polling" }); const isTelegramPollingError = isTelegramPollingNetworkError(err); - if (isGrammyHttpError(err) && isNetworkError && isTelegramPollingError) { - log(`[telegram] Suppressed network error: ${formatErrorMessage(err)}`); - return true; - } const activeRunner = pollingSession?.activeRunner; if (isNetworkError && isTelegramPollingError && activeRunner && activeRunner.isRunning()) { @@ -104,14 +103,24 @@ export async function monitorTelegramProvider(opts: MonitorTelegramOpts = {}) { pollingSession?.abortActiveFetch(); void activeRunner.stop().catch(() => {}); log("[telegram][diag] marking transport dirty after polling network failure"); - log( - `[telegram] Restarting polling after unhandled network error: ${formatErrorMessage(err)}`, - ); + log(`[telegram] Restarting polling after ${label}: ${formatErrorMessage(err)}`); + return true; + } + + if (isGrammyHttpError(err) && isNetworkError && isTelegramPollingError) { + log(`[telegram] Suppressed network error: ${formatErrorMessage(err)}`); return true; } return false; - }); + }; + + const unregisterUnhandledRejectionHandler = registerUnhandledRejectionHandler((err) => + handlePollingNetworkFailure(err, "unhandled network error"), + ); + const unregisterUncaughtExceptionHandler = registerUncaughtExceptionHandler((err) => + handlePollingNetworkFailure(err, "uncaught network error"), + ); try { const cfg = opts.config ?? getRuntimeConfig(); @@ -254,6 +263,7 @@ export async function monitorTelegramProvider(opts: MonitorTelegramOpts = {}) { pollingLease.release(); } } finally { - unregisterHandler(); + unregisterUnhandledRejectionHandler(); + unregisterUncaughtExceptionHandler(); } } diff --git a/src/cli/run-main.exit.test.ts b/src/cli/run-main.exit.test.ts index fef4ae8f560..09bcf647d20 100644 --- a/src/cli/run-main.exit.test.ts +++ b/src/cli/run-main.exit.test.ts @@ -711,11 +711,13 @@ describe("runCli exit behavior", () => { expect(typeof handler).toBe("function"); try { - const epipe = Object.assign(new Error("write EPIPE"), { code: "EPIPE" }); - expect(() => (handler as (error: unknown) => void)(epipe)).not.toThrow(); + const hostUnreachable = Object.assign(new Error("connect EHOSTUNREACH 149.154.167.220:443"), { + code: "EHOSTUNREACH", + }); + expect(() => (handler as (error: unknown) => void)(hostUnreachable)).not.toThrow(); expect(consoleWarnSpy).toHaveBeenCalledWith( "[openclaw] Non-fatal uncaught exception (continuing):", - expect.stringContaining("write EPIPE"), + expect.stringContaining("EHOSTUNREACH"), ); expect(restoreTerminalStateMock).not.toHaveBeenCalled(); expect(exitSpy).not.toHaveBeenCalled(); diff --git a/src/infra/unhandled-rejections.test.ts b/src/infra/unhandled-rejections.test.ts index eb8e9f3cb1b..c3731d89b5b 100644 --- a/src/infra/unhandled-rejections.test.ts +++ b/src/infra/unhandled-rejections.test.ts @@ -259,15 +259,23 @@ describe("isTransientSqliteError", () => { }); describe("isTransientUnhandledRejectionError", () => { - it("keeps uncaught exception suppression scoped to broken pipes", () => { + it("treats raw pre-connect network uncaught exceptions as benign", () => { const epipe = Object.assign(new Error("write EPIPE"), { code: "EPIPE" }); const sqlite = Object.assign(new Error("database is locked"), { code: "SQLITE_BUSY" }); const network = Object.assign(new Error("connection reset"), { code: "ECONNRESET" }); + const hostUnreachable = Object.assign(new Error("connect EHOSTUNREACH"), { + code: "EHOSTUNREACH", + }); + const rawHostUnreachable = new Error( + "connect EHOSTUNREACH 149.154.167.220:443 - Local (10.0.10.40:50017)", + ); const generic = new Error("boom"); expect(isBenignUncaughtExceptionError(epipe)).toBe(true); expect(isBenignUncaughtExceptionError(sqlite)).toBe(false); expect(isBenignUncaughtExceptionError(network)).toBe(false); + expect(isBenignUncaughtExceptionError(hostUnreachable)).toBe(true); + expect(isBenignUncaughtExceptionError(rawHostUnreachable)).toBe(true); expect(isBenignUncaughtExceptionError(generic)).toBe(false); }); it("returns true for transient SQLite errors", () => { diff --git a/src/infra/unhandled-rejections.ts b/src/infra/unhandled-rejections.ts index 593a9fdd563..81f57a05af3 100644 --- a/src/infra/unhandled-rejections.ts +++ b/src/infra/unhandled-rejections.ts @@ -89,9 +89,22 @@ const TRANSIENT_SQLITE_CODES = new Set([ const TRANSIENT_SQLITE_ERRCODES = new Set([5, 6, 10, 14]); const BENIGN_UNCAUGHT_EXCEPTION_CODES = new Set(["EPIPE", "EIO"]); +const BENIGN_UNCAUGHT_EXCEPTION_NETWORK_CODES = new Set([ + "ECONNREFUSED", + "EHOSTUNREACH", + "ENETUNREACH", + "EAI_AGAIN", + "ENOTFOUND", + "ETIMEDOUT", + "UND_ERR_CONNECT_TIMEOUT", + "UND_ERR_DNS_RESOLVE_FAILED", + "UND_ERR_CONNECT", +]); const TRANSIENT_NETWORK_MESSAGE_CODE_RE = /\b(ECONNRESET|ECONNREFUSED|ENOTFOUND|ETIMEDOUT|ESOCKETTIMEDOUT|ECONNABORTED|EPIPE|EHOSTUNREACH|ENETUNREACH|EAI_AGAIN|EPROTO|UND_ERR_CONNECT_TIMEOUT|UND_ERR_DNS_RESOLVE_FAILED|UND_ERR_CONNECT|UND_ERR_SOCKET|UND_ERR_HEADERS_TIMEOUT|UND_ERR_BODY_TIMEOUT)\b/i; +const BENIGN_UNCAUGHT_EXCEPTION_NETWORK_MESSAGE_CODE_RE = + /\b(ECONNREFUSED|EHOSTUNREACH|ENETUNREACH|EAI_AGAIN|ENOTFOUND|ETIMEDOUT|UND_ERR_CONNECT_TIMEOUT|UND_ERR_DNS_RESOLVE_FAILED|UND_ERR_CONNECT)\b/i; const TRANSIENT_SQLITE_MESSAGE_CODE_RE = /\b(SQLITE_BUSY|SQLITE_CANTOPEN|SQLITE_IOERR|SQLITE_LOCKED)\b/i; @@ -341,7 +354,27 @@ export function isTransientUnhandledRejectionError(err: unknown): boolean { return isTransientNetworkError(err) || isTransientSqliteError(err); } +function isBenignUncaughtNetworkException(err: unknown): boolean { + for (const candidate of collectNestedUnhandledErrorCandidates(err)) { + const code = extractErrorCodeOrErrno(candidate); + if (code && BENIGN_UNCAUGHT_EXCEPTION_NETWORK_CODES.has(code)) { + return true; + } + if (!candidate || typeof candidate !== "object") { + continue; + } + const message = normalizeLowercaseStringOrEmpty((candidate as { message?: unknown }).message); + if (message && BENIGN_UNCAUGHT_EXCEPTION_NETWORK_MESSAGE_CODE_RE.test(message)) { + return true; + } + } + return false; +} + export function isBenignUncaughtExceptionError(err: unknown): boolean { + if (isBenignUncaughtNetworkException(err)) { + return true; + } for (const candidate of collectNestedUnhandledErrorCandidates(err)) { const code = extractErrorCodeOrErrno(candidate); if (code && BENIGN_UNCAUGHT_EXCEPTION_CODES.has(code)) {