From a95da5b52db09700a1c117dcbd04547579496b7d Mon Sep 17 00:00:00 2001 From: Peter Steinberger Date: Mon, 27 Apr 2026 09:25:33 +0100 Subject: [PATCH] fix(models): enrich local transport failure diagnostics --- CHANGELOG.md | 1 + docs/gateway/local-models.md | 5 ++ docs/gateway/opentelemetry.md | 3 +- .../diagnostics-otel/src/service.test.ts | 15 ++++++ extensions/diagnostics-otel/src/service.ts | 6 +++ .../attempt.model-diagnostic-events.test.ts | 43 +++++++++++++++ .../run/attempt.model-diagnostic-events.ts | 22 +++++++- src/infra/diagnostic-error-metadata.test.ts | 34 ++++++++++++ src/infra/diagnostic-error-metadata.ts | 53 +++++++++++++++++++ src/infra/diagnostic-events.ts | 2 + src/logging/diagnostic-stability.test.ts | 16 ++++++ src/logging/diagnostic-stability.ts | 4 ++ src/plugins/hook-types.ts | 1 + 13 files changed, 203 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 5cc38b38a97..7606eaa0734 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -22,6 +22,7 @@ Docs: https://docs.openclaw.ai - Agents/Bedrock: stop heartbeat runs from persisting blank user transcript turns and repair existing blank user text messages before replay, preventing AWS Bedrock `ContentBlock` blank-text validation failures. Fixes #72640 and #72622. Thanks @goldzulu. - Agents/LM Studio: promote standalone bracketed local-model tool requests into registered tool calls and hide unsupported bracket blocks from visible replies, so MemPalace MCP lookups do not print raw `[tool]` JSON scaffolding in chat. Fixes #66178. Thanks @detroit357. - Local models: warn when an assistant reply looks like a tool call but the provider emitted plain text instead of a structured tool invocation, making fake/non-executed tool calls visible in logs. Fixes #51332. Thanks @emilclaw. +- Local models: classify terminated, reset, closed, timeout, and aborted model-call failures and attach a process memory snapshot to the diagnostic event, making LM Studio/Ollama RAM-pressure failures easier to prove from stability bundles. Refs #65551. Thanks @BigWiLLi111. - LM Studio: trust configured LM Studio loopback, LAN, and tailnet endpoints for guarded model requests by default, preserving explicit private-network opt-outs. Refs #60994. Thanks @tnowakow. - Docker/setup: route Docker onboarding defaults for host-side LM Studio and Ollama through `host.docker.internal` and add the Linux host-gateway mapping to the bundled Compose file, so containerized gateways can reach local providers without using container loopback. Fixes #68684; supersedes #68702. Thanks @safrano9999 and @skolez. - Agents/LM Studio: strip prior-turn Gemma 4 reasoning from OpenAI-compatible replay while preserving active tool-call continuation reasoning. Fixes #68704. Thanks @chip-snomo and @Kailigithub. diff --git a/docs/gateway/local-models.md b/docs/gateway/local-models.md index 65c52e39985..43b7129eb98 100644 --- a/docs/gateway/local-models.md +++ b/docs/gateway/local-models.md @@ -191,6 +191,11 @@ Compatibility notes for stricter OpenAI-compatible backends: - Gateway can reach the proxy? `curl http://127.0.0.1:1234/v1/models`. - LM Studio model unloaded? Reload; cold start is a common “hanging” cause. +- Local server says `terminated`, `ECONNRESET`, or closes the stream mid-turn? + OpenClaw records a low-cardinality `model.call.error.failureKind` plus the + OpenClaw process RSS/heap snapshot in diagnostics. For LM Studio/Ollama + memory pressure, match that timestamp against the server log or macOS crash / + jetsam log to confirm whether the model server was killed. - OpenClaw warns when the detected context window is below **32k** and blocks below **16k**. If you hit that preflight, raise the server/model context limit or choose a larger model. - Context errors? Lower `contextWindow` or raise your server limit. - OpenAI-compatible server returns `messages[].content ... expected a string`? diff --git a/docs/gateway/opentelemetry.md b/docs/gateway/opentelemetry.md index 7e1fa390e0d..55a108aa949 100644 --- a/docs/gateway/opentelemetry.md +++ b/docs/gateway/opentelemetry.md @@ -169,7 +169,7 @@ When any subkey is enabled, model and tool spans get bounded, redacted - `openclaw.context.tokens` (histogram, attrs: `openclaw.context`, `openclaw.channel`, `openclaw.provider`, `openclaw.model`) - `gen_ai.client.token.usage` (histogram, GenAI semantic-conventions metric, attrs: `gen_ai.token.type` = `input`/`output`, `gen_ai.provider.name`, `gen_ai.operation.name`, `gen_ai.request.model`) - `gen_ai.client.operation.duration` (histogram, seconds, GenAI semantic-conventions metric, attrs: `gen_ai.provider.name`, `gen_ai.operation.name`, `gen_ai.request.model`, optional `error.type`) -- `openclaw.model_call.duration_ms` (histogram, attrs: `openclaw.provider`, `openclaw.model`, `openclaw.api`, `openclaw.transport`) +- `openclaw.model_call.duration_ms` (histogram, attrs: `openclaw.provider`, `openclaw.model`, `openclaw.api`, `openclaw.transport`, plus `openclaw.errorCategory` and `openclaw.failureKind` on classified errors) - `openclaw.model_call.request_bytes` (histogram, UTF-8 byte size of the final model request payload; no raw payload content) - `openclaw.model_call.response_bytes` (histogram, UTF-8 byte size of streamed model response events; no raw response content) - `openclaw.model_call.time_to_first_byte_ms` (histogram, elapsed time before the first streamed response event) @@ -224,6 +224,7 @@ When any subkey is enabled, model and tool spans get bounded, redacted - `openclaw.model.call` - `gen_ai.system` by default, or `gen_ai.provider.name` when the latest GenAI semantic conventions are opted in - `gen_ai.request.model`, `gen_ai.operation.name`, `openclaw.provider`, `openclaw.model`, `openclaw.api`, `openclaw.transport` + - `openclaw.errorCategory` and optional `openclaw.failureKind` on errors - `openclaw.model_call.request_bytes`, `openclaw.model_call.response_bytes`, `openclaw.model_call.time_to_first_byte_ms` - `openclaw.provider.request_id_hash` (bounded SHA-based hash of the upstream provider request id; raw ids are not exported) - `openclaw.harness.run` diff --git a/extensions/diagnostics-otel/src/service.test.ts b/extensions/diagnostics-otel/src/service.test.ts index c4ac82b7d3f..9cfb9dce920 100644 --- a/extensions/diagnostics-otel/src/service.test.ts +++ b/extensions/diagnostics-otel/src/service.test.ts @@ -1525,6 +1525,7 @@ describe("diagnostics-otel service", () => { api: "openai-responses", durationMs: 40, errorCategory: "ProviderError", + failureKind: "terminated", upstreamRequestIdHash: "sha256:123456abcdef", }); await flushDiagnosticEvents(); @@ -1532,6 +1533,12 @@ describe("diagnostics-otel service", () => { const modelCall = telemetryState.tracer.startSpan.mock.calls.find( (call) => call[0] === "openclaw.model.call", ); + expect(modelCall?.[1]).toEqual({ + attributes: expect.objectContaining({ + "openclaw.failureKind": "terminated", + }), + startTime: expect.any(Number), + }); expect(modelCall?.[1]).toEqual({ attributes: expect.not.objectContaining({ "openclaw.upstreamRequestIdHash": expect.anything(), @@ -1542,6 +1549,14 @@ describe("diagnostics-otel service", () => { expect(span?.addEvent).toHaveBeenCalledWith("openclaw.provider.request", { "openclaw.upstreamRequestIdHash": "sha256:123456abcdef", }); + expect( + telemetryState.histograms.get("openclaw.model_call.duration_ms")?.record, + ).toHaveBeenCalledWith( + 40, + expect.objectContaining({ + "openclaw.failureKind": "terminated", + }), + ); expect( telemetryState.histograms.get("openclaw.model_call.duration_ms")?.record, ).toHaveBeenCalledWith( diff --git a/extensions/diagnostics-otel/src/service.ts b/extensions/diagnostics-otel/src/service.ts index 3ec59b05238..fdf0e026565 100644 --- a/extensions/diagnostics-otel/src/service.ts +++ b/extensions/diagnostics-otel/src/service.ts @@ -1834,6 +1834,9 @@ export function createDiagnosticsOtelService(): OpenClawPluginService { const metricAttrs = { ...modelCallMetricAttrs(evt), "openclaw.errorCategory": errorType, + ...(evt.failureKind + ? { "openclaw.failureKind": lowCardinalityAttr(evt.failureKind, "other") } + : {}), }; modelCallDurationHistogram.record(evt.durationMs, metricAttrs); recordModelCallSizeTimingMetrics(evt, metricAttrs); @@ -1850,6 +1853,9 @@ export function createDiagnosticsOtelService(): OpenClawPluginService { "openclaw.errorCategory": errorType, "error.type": errorType, }; + if (evt.failureKind) { + spanAttrs["openclaw.failureKind"] = lowCardinalityAttr(evt.failureKind, "other"); + } assignGenAiModelCallAttrs(spanAttrs, evt); if (evt.api) { spanAttrs["openclaw.api"] = evt.api; diff --git a/src/agents/pi-embedded-runner/run/attempt.model-diagnostic-events.test.ts b/src/agents/pi-embedded-runner/run/attempt.model-diagnostic-events.test.ts index 92dcdd4df21..0816a285b7a 100644 --- a/src/agents/pi-embedded-runner/run/attempt.model-diagnostic-events.test.ts +++ b/src/agents/pi-embedded-runner/run/attempt.model-diagnostic-events.test.ts @@ -254,6 +254,49 @@ describe("wrapStreamFnWithDiagnosticModelCallEvents", () => { expect(JSON.stringify(events[1])).not.toContain(requestId); }); + it("adds failure kind and memory diagnostics for terminated model calls", async () => { + const stream = { + [Symbol.asyncIterator]() { + return { + async next(): Promise> { + throw new Error("terminated"); + }, + }; + }, + }; + const wrapped = wrapStreamFnWithDiagnosticModelCallEvents( + (() => stream) as unknown as StreamFn, + { + runId: "run-1", + provider: "lmstudio", + model: "qwen/qwen3.5-9b", + trace: createDiagnosticTraceContext(), + nextCallId: () => "call-terminated", + }, + ); + + const events = await collectModelCallEvents(async () => { + await expect( + drain(wrapped({} as never, {} as never, {} as never) as AsyncIterable), + ).rejects.toThrow("terminated"); + }); + + expect(events.map((event) => event.type)).toEqual(["model.call.started", "model.call.error"]); + expect(events[1]).toMatchObject({ + type: "model.call.error", + callId: "call-terminated", + errorCategory: "Error", + failureKind: "terminated", + memory: { + rssBytes: expect.any(Number), + heapTotalBytes: expect.any(Number), + heapUsedBytes: expect.any(Number), + externalBytes: expect.any(Number), + arrayBuffersBytes: expect.any(Number), + }, + }); + }); + it("does not mutate non-configurable provider streams", async () => { const stream = {}; Object.defineProperty(stream, Symbol.asyncIterator, { diff --git a/src/agents/pi-embedded-runner/run/attempt.model-diagnostic-events.ts b/src/agents/pi-embedded-runner/run/attempt.model-diagnostic-events.ts index cf10564d934..aad6f1d548b 100644 --- a/src/agents/pi-embedded-runner/run/attempt.model-diagnostic-events.ts +++ b/src/agents/pi-embedded-runner/run/attempt.model-diagnostic-events.ts @@ -2,11 +2,13 @@ import type { StreamFn } from "@mariozechner/pi-agent-core"; import { fireAndForgetBoundedHook } from "../../../hooks/fire-and-forget.js"; import { diagnosticErrorCategory, + diagnosticErrorFailureKind, diagnosticProviderRequestIdHash, } from "../../../infra/diagnostic-error-metadata.js"; import { emitTrustedDiagnosticEvent, type DiagnosticEventInput, + type DiagnosticMemoryUsage, } from "../../../infra/diagnostic-events.js"; import { createChildDiagnosticTraceContext, @@ -41,7 +43,7 @@ type ModelCallEventBase = Omit< >; type ModelCallErrorFields = Pick< Extract, - "errorCategory" | "upstreamRequestIdHash" + "errorCategory" | "failureKind" | "memory" | "upstreamRequestIdHash" >; type ModelCallEndedHookFields = Pick< PluginHookModelCallEndedEvent, @@ -51,6 +53,7 @@ type ModelCallEndedHookFields = Pick< | "requestPayloadBytes" | "responseStreamBytes" | "timeToFirstByteMs" + | "failureKind" | "upstreamRequestIdHash" >; type ModelCallSizeTimingFields = Pick< @@ -152,12 +155,29 @@ function baseModelCallEvent( function modelCallErrorFields(err: unknown): ModelCallErrorFields { const upstreamRequestIdHash = diagnosticProviderRequestIdHash(err); + const failureKind = diagnosticErrorFailureKind(err); return { errorCategory: diagnosticErrorCategory(err), + ...(failureKind ? { failureKind, memory: processMemoryUsageSnapshot() } : {}), ...(upstreamRequestIdHash ? { upstreamRequestIdHash } : {}), }; } +function processMemoryUsageSnapshot(): DiagnosticMemoryUsage | undefined { + try { + const memory = process.memoryUsage(); + return { + rssBytes: memory.rss, + heapTotalBytes: memory.heapTotal, + heapUsedBytes: memory.heapUsed, + externalBytes: memory.external, + arrayBuffersBytes: memory.arrayBuffers, + }; + } catch { + return undefined; + } +} + function modelCallHookEventBase(eventBase: ModelCallEventBase): PluginHookModelCallStartedEvent { return { runId: eventBase.runId, diff --git a/src/infra/diagnostic-error-metadata.test.ts b/src/infra/diagnostic-error-metadata.test.ts index e88b97bf0aa..753efc69952 100644 --- a/src/infra/diagnostic-error-metadata.test.ts +++ b/src/infra/diagnostic-error-metadata.test.ts @@ -1,6 +1,7 @@ import { describe, expect, it } from "vitest"; import { diagnosticErrorCategory, + diagnosticErrorFailureKind, diagnosticHttpStatusCode, diagnosticProviderRequestIdHash, } from "./diagnostic-error-metadata.js"; @@ -76,4 +77,37 @@ describe("diagnostic error metadata", () => { expect(diagnosticProviderRequestIdHash(errorLike)).toBeUndefined(); }); + + it("classifies low-cardinality transport failure kinds without exposing messages", () => { + expect(diagnosticErrorFailureKind(new Error("terminated"))).toBe("terminated"); + expect( + diagnosticErrorFailureKind(Object.assign(new Error("fetch failed"), { code: "ECONNRESET" })), + ).toBe("connection_reset"); + expect( + diagnosticErrorFailureKind({ + error: Object.assign(new Error("socket closed"), { code: "UND_ERR_SOCKET" }), + }), + ).toBe("connection_closed"); + expect(diagnosticErrorFailureKind(new Error("request timed out after 120000ms"))).toBe( + "timeout", + ); + expect(diagnosticErrorFailureKind(new Error("operation was aborted"))).toBe("aborted"); + expect(diagnosticErrorFailureKind(new Error("provider rejected the request"))).toBeUndefined(); + }); + + it("does not invoke throwing getters while classifying failure kinds", () => { + const errorLike = {}; + Object.defineProperty(errorLike, "code", { + get() { + throw new Error("should not read getter"); + }, + }); + Object.defineProperty(errorLike, "message", { + get() { + throw new Error("should not read getter"); + }, + }); + + expect(diagnosticErrorFailureKind(errorLike)).toBeUndefined(); + }); }); diff --git a/src/infra/diagnostic-error-metadata.ts b/src/infra/diagnostic-error-metadata.ts index 043a94f32bb..d8692ad8cc0 100644 --- a/src/infra/diagnostic-error-metadata.ts +++ b/src/infra/diagnostic-error-metadata.ts @@ -15,6 +15,13 @@ const PROVIDER_REQUEST_ID_TEXT_PATTERNS = [ /\((?:request_id|trace_id)\s*:\s*([A-Za-z0-9._:-]{1,128})\)/i, ] as const; +export type DiagnosticErrorFailureKind = + | "aborted" + | "connection_closed" + | "connection_reset" + | "terminated" + | "timeout"; + function isObjectLike(value: unknown): value is object { return (typeof value === "object" || typeof value === "function") && value !== null; } @@ -101,6 +108,11 @@ function readDirectMessage(err: unknown): string | undefined { return typeof message === "string" ? message : undefined; } +function readDirectCode(err: unknown): string | undefined { + const code = readOwnDataProperty(err, "code"); + return typeof code === "string" ? code : undefined; +} + function extractProviderRequestIdFromText(text: string | undefined): string | undefined { if (!text) { return undefined; @@ -158,6 +170,47 @@ export function diagnosticHttpStatusCode(err: unknown): string | undefined { return undefined; } +export function diagnosticErrorFailureKind(err: unknown): DiagnosticErrorFailureKind | undefined { + const code = findDiagnosticErrorProperty(err, readDirectCode)?.trim().toUpperCase(); + switch (code) { + case undefined: + break; + case "ABORT_ERR": + case "ECONNABORTED": + case "ERR_ABORTED": + return "aborted"; + case "ECONNRESET": + return "connection_reset"; + case "ERR_STREAM_PREMATURE_CLOSE": + case "UND_ERR_SOCKET": + return "connection_closed"; + case "ETIMEDOUT": + case "ERR_SOCKET_CONNECTION_TIMEOUT": + return "timeout"; + } + + const message = findDiagnosticErrorProperty(err, readDirectMessage); + if (!message) { + return undefined; + } + if (/\b(?:terminated|sigkill|sigterm)\b/i.test(message)) { + return "terminated"; + } + if (/\b(?:econnreset|connection reset)\b/i.test(message)) { + return "connection_reset"; + } + if (/\b(?:socket hang up|premature close|connection closed|other side closed)\b/i.test(message)) { + return "connection_closed"; + } + if (/\b(?:timed out|timeout|etimedout)\b/i.test(message)) { + return "timeout"; + } + if (/\b(?:aborted|abort_err|operation was aborted)\b/i.test(message)) { + return "aborted"; + } + return undefined; +} + export function diagnosticProviderRequestIdHash(err: unknown): string | undefined { const fromProperty = findDiagnosticErrorProperty(err, readDirectProviderRequestId); if (fromProperty) { diff --git a/src/infra/diagnostic-events.ts b/src/infra/diagnostic-events.ts index 3892f96a665..ab3c055bdae 100644 --- a/src/infra/diagnostic-events.ts +++ b/src/infra/diagnostic-events.ts @@ -327,6 +327,8 @@ export type DiagnosticModelCallErrorEvent = DiagnosticModelCallBaseEvent & { type: "model.call.error"; durationMs: number; errorCategory: string; + failureKind?: "aborted" | "connection_closed" | "connection_reset" | "terminated" | "timeout"; + memory?: DiagnosticMemoryUsage; requestPayloadBytes?: number; responseStreamBytes?: number; timeToFirstByteMs?: number; diff --git a/src/logging/diagnostic-stability.test.ts b/src/logging/diagnostic-stability.test.ts index 98c5cd0b66d..549160e2964 100644 --- a/src/logging/diagnostic-stability.test.ts +++ b/src/logging/diagnostic-stability.test.ts @@ -156,6 +156,14 @@ describe("diagnostic stability recorder", () => { responseStreamBytes: 567, timeToFirstByteMs: 89, errorCategory: "TypeError", + failureKind: "terminated", + memory: { + rssBytes: 100, + heapTotalBytes: 80, + heapUsedBytes: 40, + externalBytes: 20, + arrayBuffersBytes: 10, + }, }); await new Promise((resolve) => setImmediate(resolve)); @@ -175,6 +183,14 @@ describe("diagnostic stability recorder", () => { responseBytes: 567, timeToFirstByteMs: 89, reason: "TypeError", + failureKind: "terminated", + memory: { + rssBytes: 100, + heapTotalBytes: 80, + heapUsedBytes: 40, + externalBytes: 20, + arrayBuffersBytes: 10, + }, }); expect(JSON.stringify(snapshot.events[1])).not.toContain("call-1"); }); diff --git a/src/logging/diagnostic-stability.ts b/src/logging/diagnostic-stability.ts index 310478f8102..b625e172b30 100644 --- a/src/logging/diagnostic-stability.ts +++ b/src/logging/diagnostic-stability.ts @@ -49,6 +49,7 @@ export type DiagnosticStabilityEventRecord = { queueDepth?: number; queueSize?: number; waitMs?: number; + failureKind?: string; active?: number; waiting?: number; queued?: number; @@ -293,6 +294,7 @@ function sanitizeDiagnosticEvent(event: DiagnosticEventPayload): DiagnosticStabi record.commandLength = event.commandLength; record.exitCode = event.exitCode; record.timedOut = event.timedOut; + record.failureKind = event.failureKind; assignReasonCode(record, event.failureKind); break; case "run.started": @@ -355,6 +357,8 @@ function sanitizeDiagnosticEvent(event: DiagnosticEventPayload): DiagnosticStabi record.requestBytes = event.requestPayloadBytes; record.responseBytes = event.responseStreamBytes; record.timeToFirstByteMs = event.timeToFirstByteMs; + record.failureKind = event.failureKind; + record.memory = event.memory ? copyMemory(event.memory) : undefined; assignReasonCode(record, event.errorCategory); break; case "log.record": diff --git a/src/plugins/hook-types.ts b/src/plugins/hook-types.ts index 334d69e7f2e..5fe204b9bd2 100644 --- a/src/plugins/hook-types.ts +++ b/src/plugins/hook-types.ts @@ -212,6 +212,7 @@ export type PluginHookModelCallEndedEvent = PluginHookModelCallBaseEvent & { durationMs: number; outcome: "completed" | "error"; errorCategory?: string; + failureKind?: "aborted" | "connection_closed" | "connection_reset" | "terminated" | "timeout"; requestPayloadBytes?: number; responseStreamBytes?: number; timeToFirstByteMs?: number;