mirror of
https://fastgit.cc/github.com/openclaw/openclaw
synced 2026-05-01 06:36:23 +08:00
fix(telegram): keep unreachable polling sockets non-fatal
* Runtime: suppress transient network uncaught exceptions * fix(telegram): keep unreachable polling sockets non-fatal --------- Co-authored-by: Peter Steinberger <steipete@gmail.com>
This commit is contained in:
@@ -57,6 +57,7 @@ Docs: https://docs.openclaw.ai
|
||||
- Agents/attribution: send OpenClaw attribution headers on native OpenAI and Codex traffic, including SDK transports, realtime voice and TTS, device-code auth, WHAM usage, and remote embeddings, so PI-origin defaults no longer leak into provider requests. Thanks @vincentkoc.
|
||||
- Agents/auth: keep OAuth auth profiles inherited from the main agent read-through instead of copying refresh tokens into secondary agents, and refresh Codex app-server tokens against the owning store so multi-agent swarms avoid reused refresh-token failures. Fixes #74055. Thanks @ClarityInvest.
|
||||
- Channels/Telegram: honor `ALL_PROXY` / `all_proxy` and service-level `OPENCLAW_PROXY_URL` when constructing the HTTP/1-only Telegram Bot API transport, so Windows and service installs that rely on those proxy settings no longer fall back to direct egress. Fixes #74014; refs #74086. Thanks @SymbolStar.
|
||||
- Channels/Telegram: keep raw host/network-unreachable Bot API connect failures non-fatal and route tagged polling uncaught exceptions through the Telegram restart path, so transient reachability failures no longer kill the Gateway or leave long polling stuck. Fixes #60515; refs #74540. Thanks @HemantSudarshan, @thacid22, and @ewimsatt.
|
||||
- Channels/Telegram: continue polling when `deleteWebhook` hits a transient network failure but `getWebhookInfo` confirms no webhook is configured, so startup does not retry cleanup forever after the webhook was already removed. Refs #74086; carries forward #47384. Thanks @clovericbot.
|
||||
- Channels/Telegram: apply strict safe-send retry to inbound final replies when grammY wraps a pre-connect failure, while leaving ambiguous plain network envelopes single-shot to avoid duplicate visible messages. Fixes #74203. Thanks @nanli2000cn.
|
||||
- Channels/Telegram: surface polling liveness warnings in channel status and doctor when a running long-poller has not completed `getUpdates` after startup grace or its transport activity is stale, so silent polling failures no longer look clean. Refs #74299. Thanks @lolaopenclaw.
|
||||
|
||||
@@ -44,24 +44,40 @@ const { initSpy, runSpy, getRuntimeConfigMock } = vi.hoisted(() => ({
|
||||
})),
|
||||
}));
|
||||
|
||||
/**
 * Hoisted test doubles for the unhandled-rejection handler registry.
 *
 * A single `handler` slot records the most recently registered callback so a
 * test can invoke it directly through `emitUnhandledRejection`.
 *
 * NOTE(review): the destructured names overlap with the hoisted block that
 * follows in this listing; presumably this is the pre-change variant shown by
 * the diff view — confirm before relying on it.
 */
const { registerUnhandledRejectionHandlerMock, emitUnhandledRejection, resetUnhandledRejection } =
|
||||
vi.hoisted(() => {
|
||||
let handler: ((reason: unknown) => boolean) | undefined;
|
||||
return {
|
||||
// Stores `next` as the active handler and returns an unregister callback.
registerUnhandledRejectionHandlerMock: vi.fn((next: (reason: unknown) => boolean) => {
|
||||
handler = next;
|
||||
return () => {
|
||||
// Only clear the slot when it still holds this registration, so a stale
// unregister call cannot remove a handler registered later.
if (handler === next) {
|
||||
handler = undefined;
|
||||
}
|
||||
};
|
||||
}),
|
||||
// Invokes the stored handler; yields false when no handler is registered.
emitUnhandledRejection: (reason: unknown) => handler?.(reason) ?? false,
|
||||
// Drops the stored handler unconditionally.
resetUnhandledRejection: () => {
|
||||
handler = undefined;
|
||||
},
|
||||
};
|
||||
});
|
||||
/**
 * Hoisted test doubles for the process-error handler registries.
 *
 * Two independent slots are kept: one for the unhandled-rejection handler and
 * one for the uncaught-exception handler. Each `register*Mock` stores the
 * callback it receives and returns an unregister function; each `emit*`
 * helper calls the stored callback and returns its boolean result, or false
 * when nothing is registered.
 */
const {
|
||||
registerUnhandledRejectionHandlerMock,
|
||||
registerUncaughtExceptionHandlerMock,
|
||||
emitUnhandledRejection,
|
||||
emitUncaughtException,
|
||||
resetProcessErrorHandlers,
|
||||
} = vi.hoisted(() => {
|
||||
let unhandledRejectionHandler: ((reason: unknown) => boolean) | undefined;
|
||||
let uncaughtExceptionHandler: ((error: unknown) => boolean) | undefined;
|
||||
return {
|
||||
// Stores `next` as the active unhandled-rejection handler.
registerUnhandledRejectionHandlerMock: vi.fn((next: (reason: unknown) => boolean) => {
|
||||
unhandledRejectionHandler = next;
|
||||
return () => {
|
||||
// Identity check: a stale unregister must not clear a newer registration.
if (unhandledRejectionHandler === next) {
|
||||
unhandledRejectionHandler = undefined;
|
||||
}
|
||||
};
|
||||
}),
|
||||
// Stores `next` as the active uncaught-exception handler.
registerUncaughtExceptionHandlerMock: vi.fn((next: (error: unknown) => boolean) => {
|
||||
uncaughtExceptionHandler = next;
|
||||
return () => {
|
||||
// Identity check: a stale unregister must not clear a newer registration.
if (uncaughtExceptionHandler === next) {
|
||||
uncaughtExceptionHandler = undefined;
|
||||
}
|
||||
};
|
||||
}),
|
||||
// Invoke the stored handler; false when none is registered.
emitUnhandledRejection: (reason: unknown) => unhandledRejectionHandler?.(reason) ?? false,
|
||||
// Invoke the stored handler; false when none is registered.
emitUncaughtException: (error: unknown) => uncaughtExceptionHandler?.(error) ?? false,
|
||||
// Clears both slots unconditionally.
resetProcessErrorHandlers: () => {
|
||||
unhandledRejectionHandler = undefined;
|
||||
|
||||
uncaughtExceptionHandler = undefined;
|
||||
},
|
||||
};
|
||||
});
|
||||
|
||||
const { createTelegramBotErrors } = vi.hoisted(() => ({
|
||||
createTelegramBotErrors: [] as unknown[],
|
||||
@@ -113,6 +129,16 @@ function makeRecoverableFetchError() {
|
||||
});
|
||||
}
|
||||
|
||||
/**
 * Error subclass used by the tests to wrap an underlying failure.
 *
 * The wrapped value is exposed as the read-only `error` property and the
 * instance `name` is forced to "HttpError".
 *
 * NOTE(review): presumably this mimics grammY's HttpError closely enough for
 * the production `isGrammyHttpError` check — confirm against that helper.
 */
class MockHttpError extends Error {
|
||||
constructor(
|
||||
message: string,
|
||||
public readonly error: unknown,
|
||||
) {
|
||||
super(message);
|
||||
this.name = "HttpError";
|
||||
}
|
||||
}
|
||||
|
||||
async function makeTaggedPollingFetchError() {
|
||||
const { tagTelegramNetworkError } = await import("./network-errors.js");
|
||||
const err = makeRecoverableFetchError();
|
||||
@@ -123,6 +149,13 @@ async function makeTaggedPollingFetchError() {
|
||||
return err;
|
||||
}
|
||||
|
||||
/**
 * Builds a `MockHttpError` whose `error` property is the value produced by
 * `makeTaggedPollingFetchError()`, using a fixed 'getUpdates' failure message.
 *
 * @returns A promise resolving to the wrapping `MockHttpError`.
 */
async function makeTaggedPollingHttpError() {
|
||||
return new MockHttpError(
|
||||
"Network request for 'getUpdates' failed!",
|
||||
await makeTaggedPollingFetchError(),
|
||||
);
|
||||
}
|
||||
|
||||
const createAbortTask = (
|
||||
abort: AbortController,
|
||||
beforeAbort?: () => void,
|
||||
@@ -316,6 +349,7 @@ vi.mock("openclaw/plugin-sdk/runtime-env", async () => {
|
||||
computeBackoff,
|
||||
sleepWithAbort,
|
||||
registerUnhandledRejectionHandler: registerUnhandledRejectionHandlerMock,
|
||||
registerUncaughtExceptionHandler: registerUncaughtExceptionHandlerMock,
|
||||
};
|
||||
});
|
||||
|
||||
@@ -364,7 +398,8 @@ describe("monitorTelegramProvider (grammY)", () => {
|
||||
close: vi.fn(async () => undefined),
|
||||
}));
|
||||
registerUnhandledRejectionHandlerMock.mockClear();
|
||||
resetUnhandledRejection();
|
||||
registerUncaughtExceptionHandlerMock.mockClear();
|
||||
resetProcessErrorHandlers();
|
||||
createTelegramBotErrors.length = 0;
|
||||
createdBotStops.length = 0;
|
||||
consoleErrorSpy = vi.spyOn(console, "error").mockImplementation(() => {});
|
||||
@@ -659,6 +694,38 @@ describe("monitorTelegramProvider (grammY)", () => {
|
||||
expectRecoverableRetryState(2);
|
||||
});
|
||||
|
||||
// Emits a tagged polling fetch error through the uncaught-exception mock while
// the first polling cycle is running, then checks that the handler reports the
// error as handled, that the first cycle's `stop` is called once, and that a
// second cycle starts before the monitor is aborted.
// NOTE(review): `mockRunOnceWithStalledPollingRunner` and
// `expectRecoverableRetryState` are defined outside this view; their exact
// semantics are assumed from their names — confirm.
it("force-restarts polling when uncaught network exception stalls runner", async () => {
|
||||
const abort = new AbortController();
|
||||
const firstCycle = mockRunOnceWithStalledPollingRunner();
|
||||
const secondCycle = mockRunOnceWithStalledPollingRunner();
|
||||
|
||||
const monitor = monitorTelegramProvider({ token: "tok", abortSignal: abort.signal });
|
||||
await firstCycle.waitForRunStart();
|
||||
|
||||
// A `true` result means the registered handler claimed the exception.
expect(emitUncaughtException(await makeTaggedPollingFetchError())).toBe(true);
|
||||
expect(firstCycle.stop).toHaveBeenCalledTimes(1);
|
||||
await secondCycle.waitForRunStart();
|
||||
abort.abort();
|
||||
await monitor;
|
||||
expectRecoverableRetryState(2);
|
||||
});
|
||||
|
||||
// Same scenario as the raw fetch-error case, but the emitted exception is a
// `MockHttpError` wrapping the tagged polling fetch error. The handler must
// still report it as handled, stop the first cycle once, and let a second
// cycle start.
// NOTE(review): `mockRunOnceWithStalledPollingRunner` and
// `expectRecoverableRetryState` are defined outside this view; their exact
// semantics are assumed from their names — confirm.
it("force-restarts polling when uncaught polling HttpError stalls runner", async () => {
|
||||
const abort = new AbortController();
|
||||
const firstCycle = mockRunOnceWithStalledPollingRunner();
|
||||
const secondCycle = mockRunOnceWithStalledPollingRunner();
|
||||
|
||||
const monitor = monitorTelegramProvider({ token: "tok", abortSignal: abort.signal });
|
||||
await firstCycle.waitForRunStart();
|
||||
|
||||
// A `true` result means the registered handler claimed the exception.
expect(emitUncaughtException(await makeTaggedPollingHttpError())).toBe(true);
|
||||
expect(firstCycle.stop).toHaveBeenCalledTimes(1);
|
||||
await secondCycle.waitForRunStart();
|
||||
abort.abort();
|
||||
await monitor;
|
||||
expectRecoverableRetryState(2);
|
||||
});
|
||||
|
||||
it("rebuilds the resolved transport after a stalled polling restart", async () => {
|
||||
vi.useFakeTimers({ shouldAdvanceTime: true });
|
||||
try {
|
||||
|
||||
@@ -4,8 +4,11 @@ import { registerChannelRuntimeContext } from "openclaw/plugin-sdk/channel-runti
|
||||
import type { OpenClawConfig } from "openclaw/plugin-sdk/config-types";
|
||||
import { resolveAgentMaxConcurrent } from "openclaw/plugin-sdk/model-session-runtime";
|
||||
import { getRuntimeConfig } from "openclaw/plugin-sdk/runtime-config-snapshot";
|
||||
import { waitForAbortSignal } from "openclaw/plugin-sdk/runtime-env";
|
||||
import { registerUnhandledRejectionHandler } from "openclaw/plugin-sdk/runtime-env";
|
||||
import {
|
||||
registerUncaughtExceptionHandler,
|
||||
registerUnhandledRejectionHandler,
|
||||
waitForAbortSignal,
|
||||
} from "openclaw/plugin-sdk/runtime-env";
|
||||
import type { RuntimeEnv } from "openclaw/plugin-sdk/runtime-env";
|
||||
import { formatErrorMessage } from "openclaw/plugin-sdk/ssrf-runtime";
|
||||
import { resolveTelegramAccount } from "./accounts.js";
|
||||
@@ -89,13 +92,9 @@ export async function monitorTelegramProvider(opts: MonitorTelegramOpts = {}) {
|
||||
const log = opts.runtime?.error ?? console.error;
|
||||
let pollingSession: TelegramPollingSessionInstance | undefined;
|
||||
|
||||
const unregisterHandler = registerUnhandledRejectionHandler((err) => {
|
||||
const handlePollingNetworkFailure = (err: unknown, label: string) => {
|
||||
const isNetworkError = isRecoverableTelegramNetworkError(err, { context: "polling" });
|
||||
const isTelegramPollingError = isTelegramPollingNetworkError(err);
|
||||
if (isGrammyHttpError(err) && isNetworkError && isTelegramPollingError) {
|
||||
log(`[telegram] Suppressed network error: ${formatErrorMessage(err)}`);
|
||||
return true;
|
||||
}
|
||||
|
||||
const activeRunner = pollingSession?.activeRunner;
|
||||
if (isNetworkError && isTelegramPollingError && activeRunner && activeRunner.isRunning()) {
|
||||
@@ -104,14 +103,24 @@ export async function monitorTelegramProvider(opts: MonitorTelegramOpts = {}) {
|
||||
pollingSession?.abortActiveFetch();
|
||||
void activeRunner.stop().catch(() => {});
|
||||
log("[telegram][diag] marking transport dirty after polling network failure");
|
||||
log(
|
||||
`[telegram] Restarting polling after unhandled network error: ${formatErrorMessage(err)}`,
|
||||
);
|
||||
log(`[telegram] Restarting polling after ${label}: ${formatErrorMessage(err)}`);
|
||||
return true;
|
||||
}
|
||||
|
||||
if (isGrammyHttpError(err) && isNetworkError && isTelegramPollingError) {
|
||||
log(`[telegram] Suppressed network error: ${formatErrorMessage(err)}`);
|
||||
return true;
|
||||
}
|
||||
|
||||
return false;
|
||||
});
|
||||
};
|
||||
|
||||
// Register the shared polling-failure handler for both process-level error
// channels. The only difference between the two registrations is the label
// passed through to the handler. Each call returns an unregister callback,
// which is invoked in the `finally` block further down this function.
const unregisterUnhandledRejectionHandler = registerUnhandledRejectionHandler((err) =>
|
||||
handlePollingNetworkFailure(err, "unhandled network error"),
|
||||
);
|
||||
const unregisterUncaughtExceptionHandler = registerUncaughtExceptionHandler((err) =>
|
||||
handlePollingNetworkFailure(err, "uncaught network error"),
|
||||
);
|
||||
|
||||
try {
|
||||
const cfg = opts.config ?? getRuntimeConfig();
|
||||
@@ -254,6 +263,7 @@ export async function monitorTelegramProvider(opts: MonitorTelegramOpts = {}) {
|
||||
pollingLease.release();
|
||||
}
|
||||
} finally {
|
||||
unregisterHandler();
|
||||
unregisterUnhandledRejectionHandler();
|
||||
unregisterUncaughtExceptionHandler();
|
||||
}
|
||||
}
|
||||
|
||||
@@ -711,11 +711,13 @@ describe("runCli exit behavior", () => {
|
||||
expect(typeof handler).toBe("function");
|
||||
|
||||
try {
|
||||
const epipe = Object.assign(new Error("write EPIPE"), { code: "EPIPE" });
|
||||
expect(() => (handler as (error: unknown) => void)(epipe)).not.toThrow();
|
||||
const hostUnreachable = Object.assign(new Error("connect EHOSTUNREACH 149.154.167.220:443"), {
|
||||
code: "EHOSTUNREACH",
|
||||
});
|
||||
expect(() => (handler as (error: unknown) => void)(hostUnreachable)).not.toThrow();
|
||||
expect(consoleWarnSpy).toHaveBeenCalledWith(
|
||||
"[openclaw] Non-fatal uncaught exception (continuing):",
|
||||
expect.stringContaining("write EPIPE"),
|
||||
expect.stringContaining("EHOSTUNREACH"),
|
||||
);
|
||||
expect(restoreTerminalStateMock).not.toHaveBeenCalled();
|
||||
expect(exitSpy).not.toHaveBeenCalled();
|
||||
|
||||
@@ -259,15 +259,23 @@ describe("isTransientSqliteError", () => {
|
||||
});
|
||||
|
||||
describe("isTransientUnhandledRejectionError", () => {
|
||||
it("keeps uncaught exception suppression scoped to broken pipes", () => {
|
||||
// Pins which errors `isBenignUncaughtExceptionError` accepts:
//   - EPIPE by code                          -> benign
//   - EHOSTUNREACH by code                   -> benign
//   - EHOSTUNREACH only in the message text  -> benign
//   - SQLITE_BUSY, ECONNRESET, generic Error -> not benign
// NOTE(review): the enclosing describe is titled
// "isTransientUnhandledRejectionError" although this case exercises
// `isBenignUncaughtExceptionError` — consider whether it belongs in its own
// describe block.
it("treats raw pre-connect network uncaught exceptions as benign", () => {
|
||||
const epipe = Object.assign(new Error("write EPIPE"), { code: "EPIPE" });
|
||||
const sqlite = Object.assign(new Error("database is locked"), { code: "SQLITE_BUSY" });
|
||||
const network = Object.assign(new Error("connection reset"), { code: "ECONNRESET" });
|
||||
const hostUnreachable = Object.assign(new Error("connect EHOSTUNREACH"), {
|
||||
code: "EHOSTUNREACH",
|
||||
});
|
||||
// No `code` property here: the code appears only inside the message string.
const rawHostUnreachable = new Error(
|
||||
"connect EHOSTUNREACH 149.154.167.220:443 - Local (10.0.10.40:50017)",
|
||||
);
|
||||
const generic = new Error("boom");
|
||||
|
||||
expect(isBenignUncaughtExceptionError(epipe)).toBe(true);
|
||||
expect(isBenignUncaughtExceptionError(sqlite)).toBe(false);
|
||||
expect(isBenignUncaughtExceptionError(network)).toBe(false);
|
||||
expect(isBenignUncaughtExceptionError(hostUnreachable)).toBe(true);
|
||||
expect(isBenignUncaughtExceptionError(rawHostUnreachable)).toBe(true);
|
||||
expect(isBenignUncaughtExceptionError(generic)).toBe(false);
|
||||
});
|
||||
it("returns true for transient SQLite errors", () => {
|
||||
|
||||
@@ -89,9 +89,22 @@ const TRANSIENT_SQLITE_CODES = new Set([
|
||||
const TRANSIENT_SQLITE_ERRCODES = new Set([5, 6, 10, 14]);
|
||||
|
||||
// Error codes that `isBenignUncaughtExceptionError` accepts via a direct
// `code` lookup.
const BENIGN_UNCAUGHT_EXCEPTION_CODES = new Set(["EPIPE", "EIO"]);
|
||||
// Network error codes that `isBenignUncaughtNetworkException` accepts via a
// direct `code` lookup. ECONNRESET is not in this set. Keep the entries in
// sync with the alternatives in
// BENIGN_UNCAUGHT_EXCEPTION_NETWORK_MESSAGE_CODE_RE below.
const BENIGN_UNCAUGHT_EXCEPTION_NETWORK_CODES = new Set([
|
||||
"ECONNREFUSED",
|
||||
"EHOSTUNREACH",
|
||||
"ENETUNREACH",
|
||||
"EAI_AGAIN",
|
||||
"ENOTFOUND",
|
||||
"ETIMEDOUT",
|
||||
"UND_ERR_CONNECT_TIMEOUT",
|
||||
"UND_ERR_DNS_RESOLVE_FAILED",
|
||||
"UND_ERR_CONNECT",
|
||||
]);
|
||||
|
||||
// Case-insensitive, word-bounded match for network error codes embedded in a
// message string.
// NOTE(review): the consumer of this pattern is outside this view; presumably
// the transient-network classifier — confirm.
const TRANSIENT_NETWORK_MESSAGE_CODE_RE =
|
||||
/\b(ECONNRESET|ECONNREFUSED|ENOTFOUND|ETIMEDOUT|ESOCKETTIMEDOUT|ECONNABORTED|EPIPE|EHOSTUNREACH|ENETUNREACH|EAI_AGAIN|EPROTO|UND_ERR_CONNECT_TIMEOUT|UND_ERR_DNS_RESOLVE_FAILED|UND_ERR_CONNECT|UND_ERR_SOCKET|UND_ERR_HEADERS_TIMEOUT|UND_ERR_BODY_TIMEOUT)\b/i;
|
||||
// Message-text counterpart of BENIGN_UNCAUGHT_EXCEPTION_NETWORK_CODES: matches
// the same nine codes when they appear as whole words in an error message,
// for errors that carry no `code` property.
const BENIGN_UNCAUGHT_EXCEPTION_NETWORK_MESSAGE_CODE_RE =
|
||||
/\b(ECONNREFUSED|EHOSTUNREACH|ENETUNREACH|EAI_AGAIN|ENOTFOUND|ETIMEDOUT|UND_ERR_CONNECT_TIMEOUT|UND_ERR_DNS_RESOLVE_FAILED|UND_ERR_CONNECT)\b/i;
|
||||
|
||||
const TRANSIENT_SQLITE_MESSAGE_CODE_RE =
|
||||
/\b(SQLITE_BUSY|SQLITE_CANTOPEN|SQLITE_IOERR|SQLITE_LOCKED)\b/i;
|
||||
@@ -341,7 +354,27 @@ export function isTransientUnhandledRejectionError(err: unknown): boolean {
|
||||
return isTransientNetworkError(err) || isTransientSqliteError(err);
|
||||
}
|
||||
|
||||
/**
 * Reports whether `err`, or any candidate yielded for it by
 * `collectNestedUnhandledErrorCandidates`, is a benign network failure.
 *
 * A candidate matches when its extracted code is in
 * BENIGN_UNCAUGHT_EXCEPTION_NETWORK_CODES, or when it is an object whose
 * normalized `message` matches
 * BENIGN_UNCAUGHT_EXCEPTION_NETWORK_MESSAGE_CODE_RE.
 *
 * @param err - The thrown value to classify.
 * @returns true on the first matching candidate, false when none match.
 */
function isBenignUncaughtNetworkException(err: unknown): boolean {
|
||||
for (const candidate of collectNestedUnhandledErrorCandidates(err)) {
|
||||
const code = extractErrorCodeOrErrno(candidate);
|
||||
if (code && BENIGN_UNCAUGHT_EXCEPTION_NETWORK_CODES.has(code)) {
|
||||
return true;
|
||||
}
|
||||
// Non-object candidates have no `message` property to inspect.
if (!candidate || typeof candidate !== "object") {
|
||||
continue;
|
||||
}
|
||||
// Fallback for errors without a usable code: look for the code in the
// message text instead.
const message = normalizeLowercaseStringOrEmpty((candidate as { message?: unknown }).message);
|
||||
if (message && BENIGN_UNCAUGHT_EXCEPTION_NETWORK_MESSAGE_CODE_RE.test(message)) {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
export function isBenignUncaughtExceptionError(err: unknown): boolean {
|
||||
if (isBenignUncaughtNetworkException(err)) {
|
||||
return true;
|
||||
}
|
||||
for (const candidate of collectNestedUnhandledErrorCandidates(err)) {
|
||||
const code = extractErrorCodeOrErrno(candidate);
|
||||
if (code && BENIGN_UNCAUGHT_EXCEPTION_CODES.has(code)) {
|
||||
|
||||
Reference in New Issue
Block a user