fix(telegram): keep unreachable polling sockets non-fatal

* Runtime: suppress transient network uncaught exceptions

* fix(telegram): keep unreachable polling sockets non-fatal

---------

Co-authored-by: Peter Steinberger <steipete@gmail.com>
This commit is contained in:
Hemant Sudarshan
2026-04-30 00:23:43 +05:30
committed by GitHub
parent dabf76b3de
commit db6951088a
6 changed files with 156 additions and 35 deletions

View File

@@ -57,6 +57,7 @@ Docs: https://docs.openclaw.ai
- Agents/attribution: send OpenClaw attribution headers on native OpenAI and Codex traffic, including SDK transports, realtime voice and TTS, device-code auth, WHAM usage, and remote embeddings, so PI-origin defaults no longer leak into provider requests. Thanks @vincentkoc.
- Agents/auth: keep OAuth auth profiles inherited from the main agent read-through instead of copying refresh tokens into secondary agents, and refresh Codex app-server tokens against the owning store so multi-agent swarms avoid reused refresh-token failures. Fixes #74055. Thanks @ClarityInvest.
- Channels/Telegram: honor `ALL_PROXY` / `all_proxy` and service-level `OPENCLAW_PROXY_URL` when constructing the HTTP/1-only Telegram Bot API transport, so Windows and service installs that rely on those proxy settings no longer fall back to direct egress. Fixes #74014; refs #74086. Thanks @SymbolStar.
- Channels/Telegram: keep raw host/network-unreachable Bot API connect failures non-fatal and route tagged polling uncaught exceptions through the Telegram restart path, so transient reachability failures no longer kill the Gateway or leave long polling stuck. Fixes #60515; refs #74540. Thanks @HemantSudarshan, @thacid22, and @ewimsatt.
- Channels/Telegram: continue polling when `deleteWebhook` hits a transient network failure but `getWebhookInfo` confirms no webhook is configured, so startup does not retry cleanup forever after the webhook was already removed. Refs #74086; carries forward #47384. Thanks @clovericbot.
- Channels/Telegram: apply strict safe-send retry to inbound final replies when grammY wraps a pre-connect failure, while leaving ambiguous plain network envelopes single-shot to avoid duplicate visible messages. Fixes #74203. Thanks @nanli2000cn.
- Channels/Telegram: surface polling liveness warnings in channel status and doctor when a running long-poller has not completed `getUpdates` after startup grace or its transport activity is stale, so silent polling failures no longer look clean. Refs #74299. Thanks @lolaopenclaw.

View File

@@ -44,24 +44,40 @@ const { initSpy, runSpy, getRuntimeConfigMock } = vi.hoisted(() => ({
})),
}));
// Test harness for the unhandled-rejection hook: a single handler slot held in the
// hoisted closure, plus helpers to fire and reset it.
const { registerUnhandledRejectionHandlerMock, emitUnhandledRejection, resetUnhandledRejection } =
vi.hoisted(() => {
// The most recently registered handler, or undefined when none is registered.
let handler: ((reason: unknown) => boolean) | undefined;
return {
// Stores `next` as the active handler and returns an unregister callback.
registerUnhandledRejectionHandlerMock: vi.fn((next: (reason: unknown) => boolean) => {
handler = next;
return () => {
// Only clear the slot if it still holds this registration's handler.
if (handler === next) {
handler = undefined;
}
};
}),
// Invokes the active handler; yields false when no handler is registered.
emitUnhandledRejection: (reason: unknown) => handler?.(reason) ?? false,
// Drops the active handler unconditionally.
resetUnhandledRejection: () => {
handler = undefined;
},
};
});
// Test harness for the process-level error hooks. Each hook kind (unhandled rejection,
// uncaught exception) has one handler slot held in the hoisted closure; the register mocks
// fill the slots, the emit helpers fire them, and the reset helper clears both.
const {
registerUnhandledRejectionHandlerMock,
registerUncaughtExceptionHandlerMock,
emitUnhandledRejection,
emitUncaughtException,
resetProcessErrorHandlers,
} = vi.hoisted(() => {
// Most recently registered handler of each kind, or undefined when none is registered.
let unhandledRejectionHandler: ((reason: unknown) => boolean) | undefined;
let uncaughtExceptionHandler: ((error: unknown) => boolean) | undefined;
return {
// Stores `next` as the active unhandled-rejection handler and returns an unregister callback.
registerUnhandledRejectionHandlerMock: vi.fn((next: (reason: unknown) => boolean) => {
unhandledRejectionHandler = next;
return () => {
// Only clear the slot if it still holds this registration's handler.
if (unhandledRejectionHandler === next) {
unhandledRejectionHandler = undefined;
}
};
}),
// Stores `next` as the active uncaught-exception handler and returns an unregister callback.
registerUncaughtExceptionHandlerMock: vi.fn((next: (error: unknown) => boolean) => {
uncaughtExceptionHandler = next;
return () => {
// Only clear the slot if it still holds this registration's handler.
if (uncaughtExceptionHandler === next) {
uncaughtExceptionHandler = undefined;
}
};
}),
// Fire the active handler of each kind; both yield false when no handler is registered.
emitUnhandledRejection: (reason: unknown) => unhandledRejectionHandler?.(reason) ?? false,
emitUncaughtException: (error: unknown) => uncaughtExceptionHandler?.(error) ?? false,
// Drops both active handlers unconditionally.
resetProcessErrorHandlers: () => {
unhandledRejectionHandler = undefined;
uncaughtExceptionHandler = undefined;
},
};
});
const { createTelegramBotErrors } = vi.hoisted(() => ({
createTelegramBotErrors: [] as unknown[],
@@ -113,6 +129,16 @@ function makeRecoverableFetchError() {
});
}
/**
 * Test double for an HTTP-layer error wrapper: an `Error` whose `name` is
 * "HttpError" and which carries the underlying failure on `error`.
 */
class MockHttpError extends Error {
  /** The wrapped underlying failure, stored as given. */
  public readonly error: unknown;

  /**
   * @param message Text used as the error's `message`.
   * @param error The underlying failure to expose on `error`.
   */
  constructor(message: string, error: unknown) {
    super(message);
    this.error = error;
    this.name = "HttpError";
  }
}
async function makeTaggedPollingFetchError() {
const { tagTelegramNetworkError } = await import("./network-errors.js");
const err = makeRecoverableFetchError();
@@ -123,6 +149,13 @@ async function makeTaggedPollingFetchError() {
return err;
}
/**
 * Builds a `MockHttpError` for a failed `getUpdates` request whose wrapped
 * `error` is the value produced by `makeTaggedPollingFetchError()`.
 */
async function makeTaggedPollingHttpError() {
  const taggedCause = await makeTaggedPollingFetchError();
  return new MockHttpError("Network request for 'getUpdates' failed!", taggedCause);
}
const createAbortTask = (
abort: AbortController,
beforeAbort?: () => void,
@@ -316,6 +349,7 @@ vi.mock("openclaw/plugin-sdk/runtime-env", async () => {
computeBackoff,
sleepWithAbort,
registerUnhandledRejectionHandler: registerUnhandledRejectionHandlerMock,
registerUncaughtExceptionHandler: registerUncaughtExceptionHandlerMock,
};
});
@@ -364,7 +398,8 @@ describe("monitorTelegramProvider (grammY)", () => {
close: vi.fn(async () => undefined),
}));
registerUnhandledRejectionHandlerMock.mockClear();
resetUnhandledRejection();
registerUncaughtExceptionHandlerMock.mockClear();
resetProcessErrorHandlers();
createTelegramBotErrors.length = 0;
createdBotStops.length = 0;
consoleErrorSpy = vi.spyOn(console, "error").mockImplementation(() => {});
@@ -659,6 +694,38 @@ describe("monitorTelegramProvider (grammY)", () => {
expectRecoverableRetryState(2);
});
// Delivers a tagged polling fetch error through the uncaught-exception hook while the first
// runner cycle is active, then checks that the handler claims it, the first runner is stopped
// once, and a second run cycle starts.
it("force-restarts polling when uncaught network exception stalls runner", async () => {
const abort = new AbortController();
// Two cycles are queued up front: the one that gets interrupted and the one expected after
// the restart (presumably each call arms a single stalled runner — confirm against the helper).
const firstCycle = mockRunOnceWithStalledPollingRunner();
const secondCycle = mockRunOnceWithStalledPollingRunner();
const monitor = monitorTelegramProvider({ token: "tok", abortSignal: abort.signal });
await firstCycle.waitForRunStart();
// The handler must report the error as handled (true) so it is not treated as fatal.
expect(emitUncaughtException(await makeTaggedPollingFetchError())).toBe(true);
expect(firstCycle.stop).toHaveBeenCalledTimes(1);
// Reaching the second run start shows polling was restarted after the stop.
await secondCycle.waitForRunStart();
// Abort so the monitor promise settles and the test can finish.
abort.abort();
await monitor;
expectRecoverableRetryState(2);
});
// Same scenario as the raw fetch-error case, but the tagged polling error arrives wrapped in
// an HttpError-shaped object (see makeTaggedPollingHttpError); the restart must still happen.
it("force-restarts polling when uncaught polling HttpError stalls runner", async () => {
const abort = new AbortController();
// Two cycles are queued up front: the one that gets interrupted and the one expected after
// the restart (presumably each call arms a single stalled runner — confirm against the helper).
const firstCycle = mockRunOnceWithStalledPollingRunner();
const secondCycle = mockRunOnceWithStalledPollingRunner();
const monitor = monitorTelegramProvider({ token: "tok", abortSignal: abort.signal });
await firstCycle.waitForRunStart();
// The handler must report the wrapped error as handled (true) and stop the active runner.
expect(emitUncaughtException(await makeTaggedPollingHttpError())).toBe(true);
expect(firstCycle.stop).toHaveBeenCalledTimes(1);
// Reaching the second run start shows polling was restarted after the stop.
await secondCycle.waitForRunStart();
// Abort so the monitor promise settles and the test can finish.
abort.abort();
await monitor;
expectRecoverableRetryState(2);
});
it("rebuilds the resolved transport after a stalled polling restart", async () => {
vi.useFakeTimers({ shouldAdvanceTime: true });
try {

View File

@@ -4,8 +4,11 @@ import { registerChannelRuntimeContext } from "openclaw/plugin-sdk/channel-runti
import type { OpenClawConfig } from "openclaw/plugin-sdk/config-types";
import { resolveAgentMaxConcurrent } from "openclaw/plugin-sdk/model-session-runtime";
import { getRuntimeConfig } from "openclaw/plugin-sdk/runtime-config-snapshot";
import { waitForAbortSignal } from "openclaw/plugin-sdk/runtime-env";
import { registerUnhandledRejectionHandler } from "openclaw/plugin-sdk/runtime-env";
import {
registerUncaughtExceptionHandler,
registerUnhandledRejectionHandler,
waitForAbortSignal,
} from "openclaw/plugin-sdk/runtime-env";
import type { RuntimeEnv } from "openclaw/plugin-sdk/runtime-env";
import { formatErrorMessage } from "openclaw/plugin-sdk/ssrf-runtime";
import { resolveTelegramAccount } from "./accounts.js";
@@ -89,13 +92,9 @@ export async function monitorTelegramProvider(opts: MonitorTelegramOpts = {}) {
const log = opts.runtime?.error ?? console.error;
let pollingSession: TelegramPollingSessionInstance | undefined;
const unregisterHandler = registerUnhandledRejectionHandler((err) => {
const handlePollingNetworkFailure = (err: unknown, label: string) => {
const isNetworkError = isRecoverableTelegramNetworkError(err, { context: "polling" });
const isTelegramPollingError = isTelegramPollingNetworkError(err);
if (isGrammyHttpError(err) && isNetworkError && isTelegramPollingError) {
log(`[telegram] Suppressed network error: ${formatErrorMessage(err)}`);
return true;
}
const activeRunner = pollingSession?.activeRunner;
if (isNetworkError && isTelegramPollingError && activeRunner && activeRunner.isRunning()) {
@@ -104,14 +103,24 @@ export async function monitorTelegramProvider(opts: MonitorTelegramOpts = {}) {
pollingSession?.abortActiveFetch();
void activeRunner.stop().catch(() => {});
log("[telegram][diag] marking transport dirty after polling network failure");
log(
`[telegram] Restarting polling after unhandled network error: ${formatErrorMessage(err)}`,
);
log(`[telegram] Restarting polling after ${label}: ${formatErrorMessage(err)}`);
return true;
}
if (isGrammyHttpError(err) && isNetworkError && isTelegramPollingError) {
log(`[telegram] Suppressed network error: ${formatErrorMessage(err)}`);
return true;
}
return false;
});
};
const unregisterUnhandledRejectionHandler = registerUnhandledRejectionHandler((err) =>
handlePollingNetworkFailure(err, "unhandled network error"),
);
const unregisterUncaughtExceptionHandler = registerUncaughtExceptionHandler((err) =>
handlePollingNetworkFailure(err, "uncaught network error"),
);
try {
const cfg = opts.config ?? getRuntimeConfig();
@@ -254,6 +263,7 @@ export async function monitorTelegramProvider(opts: MonitorTelegramOpts = {}) {
pollingLease.release();
}
} finally {
unregisterHandler();
unregisterUnhandledRejectionHandler();
unregisterUncaughtExceptionHandler();
}
}

View File

@@ -711,11 +711,13 @@ describe("runCli exit behavior", () => {
expect(typeof handler).toBe("function");
try {
const epipe = Object.assign(new Error("write EPIPE"), { code: "EPIPE" });
expect(() => (handler as (error: unknown) => void)(epipe)).not.toThrow();
const hostUnreachable = Object.assign(new Error("connect EHOSTUNREACH 149.154.167.220:443"), {
code: "EHOSTUNREACH",
});
expect(() => (handler as (error: unknown) => void)(hostUnreachable)).not.toThrow();
expect(consoleWarnSpy).toHaveBeenCalledWith(
"[openclaw] Non-fatal uncaught exception (continuing):",
expect.stringContaining("write EPIPE"),
expect.stringContaining("EHOSTUNREACH"),
);
expect(restoreTerminalStateMock).not.toHaveBeenCalled();
expect(exitSpy).not.toHaveBeenCalled();

View File

@@ -259,15 +259,23 @@ describe("isTransientSqliteError", () => {
});
describe("isTransientUnhandledRejectionError", () => {
it("keeps uncaught exception suppression scoped to broken pipes", () => {
// Pins which errors isBenignUncaughtExceptionError accepts: broken pipes and host-unreachable
// connect failures are benign; SQLite contention, connection resets, and generic errors are not.
it("treats raw pre-connect network uncaught exceptions as benign", () => {
const epipe = Object.assign(new Error("write EPIPE"), { code: "EPIPE" });
const sqlite = Object.assign(new Error("database is locked"), { code: "SQLITE_BUSY" });
const network = Object.assign(new Error("connection reset"), { code: "ECONNRESET" });
// Host-unreachable failure identified by its `code` property.
const hostUnreachable = Object.assign(new Error("connect EHOSTUNREACH"), {
code: "EHOSTUNREACH",
});
// Host-unreachable failure with no `code` property; the code appears only in the message text.
const rawHostUnreachable = new Error(
"connect EHOSTUNREACH 149.154.167.220:443 - Local (10.0.10.40:50017)",
);
const generic = new Error("boom");
expect(isBenignUncaughtExceptionError(epipe)).toBe(true);
expect(isBenignUncaughtExceptionError(sqlite)).toBe(false);
// ECONNRESET is not accepted as benign for uncaught exceptions.
expect(isBenignUncaughtExceptionError(network)).toBe(false);
expect(isBenignUncaughtExceptionError(hostUnreachable)).toBe(true);
expect(isBenignUncaughtExceptionError(rawHostUnreachable)).toBe(true);
expect(isBenignUncaughtExceptionError(generic)).toBe(false);
});
it("returns true for transient SQLite errors", () => {

View File

@@ -89,9 +89,22 @@ const TRANSIENT_SQLITE_CODES = new Set([
const TRANSIENT_SQLITE_ERRCODES = new Set([5, 6, 10, 14]);
// Error codes that isBenignUncaughtExceptionError accepts as benign for uncaught exceptions.
const BENIGN_UNCAUGHT_EXCEPTION_CODES = new Set(["EPIPE", "EIO"]);
// Network error codes that isBenignUncaughtNetworkException accepts as benign when found on an
// error candidate. This list is narrower than TRANSIENT_NETWORK_MESSAGE_CODE_RE below: it leaves
// out, for example, ECONNRESET, EPIPE, and UND_ERR_SOCKET.
const BENIGN_UNCAUGHT_EXCEPTION_NETWORK_CODES = new Set([
"ECONNREFUSED",
"EHOSTUNREACH",
"ENETUNREACH",
"EAI_AGAIN",
"ENOTFOUND",
"ETIMEDOUT",
"UND_ERR_CONNECT_TIMEOUT",
"UND_ERR_DNS_RESOLVE_FAILED",
"UND_ERR_CONNECT",
]);
// Matches any transient network error code appearing as a whole word in a message
// (case-insensitive).
const TRANSIENT_NETWORK_MESSAGE_CODE_RE =
/\b(ECONNRESET|ECONNREFUSED|ENOTFOUND|ETIMEDOUT|ESOCKETTIMEDOUT|ECONNABORTED|EPIPE|EHOSTUNREACH|ENETUNREACH|EAI_AGAIN|EPROTO|UND_ERR_CONNECT_TIMEOUT|UND_ERR_DNS_RESOLVE_FAILED|UND_ERR_CONNECT|UND_ERR_SOCKET|UND_ERR_HEADERS_TIMEOUT|UND_ERR_BODY_TIMEOUT)\b/i;
// Message-text counterpart of BENIGN_UNCAUGHT_EXCEPTION_NETWORK_CODES: the same nine codes,
// matched as whole words (case-insensitive) for errors that carry the code only in their message.
const BENIGN_UNCAUGHT_EXCEPTION_NETWORK_MESSAGE_CODE_RE =
/\b(ECONNREFUSED|EHOSTUNREACH|ENETUNREACH|EAI_AGAIN|ENOTFOUND|ETIMEDOUT|UND_ERR_CONNECT_TIMEOUT|UND_ERR_DNS_RESOLVE_FAILED|UND_ERR_CONNECT)\b/i;
const TRANSIENT_SQLITE_MESSAGE_CODE_RE =
/\b(SQLITE_BUSY|SQLITE_CANTOPEN|SQLITE_IOERR|SQLITE_LOCKED)\b/i;
@@ -341,7 +354,27 @@ export function isTransientUnhandledRejectionError(err: unknown): boolean {
return isTransientNetworkError(err) || isTransientSqliteError(err);
}
/**
 * Reports whether an uncaught exception looks like a benign network failure.
 *
 * Every candidate yielded by `collectNestedUnhandledErrorCandidates(err)` is checked twice:
 * first by its extracted code/errno against `BENIGN_UNCAUGHT_EXCEPTION_NETWORK_CODES`, then,
 * for object candidates, by its normalized `message` against
 * `BENIGN_UNCAUGHT_EXCEPTION_NETWORK_MESSAGE_CODE_RE`.
 *
 * @param err The thrown value to inspect.
 * @returns `true` as soon as any candidate matches by code or by message; otherwise `false`.
 */
function isBenignUncaughtNetworkException(err: unknown): boolean {
  for (const nested of collectNestedUnhandledErrorCandidates(err)) {
    const errorCode = extractErrorCodeOrErrno(nested);
    if (errorCode && BENIGN_UNCAUGHT_EXCEPTION_NETWORK_CODES.has(errorCode)) {
      return true;
    }
    // Only object candidates can carry a `message`; anything else has nothing further to check.
    if (nested && typeof nested === "object") {
      const messageText = normalizeLowercaseStringOrEmpty(
        (nested as { message?: unknown }).message,
      );
      if (messageText && BENIGN_UNCAUGHT_EXCEPTION_NETWORK_MESSAGE_CODE_RE.test(messageText)) {
        return true;
      }
    }
  }
  return false;
}
export function isBenignUncaughtExceptionError(err: unknown): boolean {
if (isBenignUncaughtNetworkException(err)) {
return true;
}
for (const candidate of collectNestedUnhandledErrorCandidates(err)) {
const code = extractErrorCodeOrErrno(candidate);
if (code && BENIGN_UNCAUGHT_EXCEPTION_CODES.has(code)) {