fix(models): enrich local transport failure diagnostics

This commit is contained in:
Peter Steinberger
2026-04-27 09:25:33 +01:00
parent c2d82b87ee
commit a95da5b52d
13 changed files with 203 additions and 2 deletions

View File

@@ -22,6 +22,7 @@ Docs: https://docs.openclaw.ai
- Agents/Bedrock: stop heartbeat runs from persisting blank user transcript turns and repair existing blank user text messages before replay, preventing AWS Bedrock `ContentBlock` blank-text validation failures. Fixes #72640 and #72622. Thanks @goldzulu.
- Agents/LM Studio: promote standalone bracketed local-model tool requests into registered tool calls and hide unsupported bracket blocks from visible replies, so MemPalace MCP lookups do not print raw `[tool]` JSON scaffolding in chat. Fixes #66178. Thanks @detroit357.
- Local models: warn when an assistant reply looks like a tool call but the provider emitted plain text instead of a structured tool invocation, making fake/non-executed tool calls visible in logs. Fixes #51332. Thanks @emilclaw.
- Local models: classify terminated, reset, closed, timeout, and aborted model-call failures and attach a process memory snapshot to the diagnostic event, making LM Studio/Ollama RAM-pressure failures easier to prove from stability bundles. Refs #65551. Thanks @BigWiLLi111.
- LM Studio: trust configured LM Studio loopback, LAN, and tailnet endpoints for guarded model requests by default, preserving explicit private-network opt-outs. Refs #60994. Thanks @tnowakow.
- Docker/setup: route Docker onboarding defaults for host-side LM Studio and Ollama through `host.docker.internal` and add the Linux host-gateway mapping to the bundled Compose file, so containerized gateways can reach local providers without using container loopback. Fixes #68684; supersedes #68702. Thanks @safrano9999 and @skolez.
- Agents/LM Studio: strip prior-turn Gemma 4 reasoning from OpenAI-compatible replay while preserving active tool-call continuation reasoning. Fixes #68704. Thanks @chip-snomo and @Kailigithub.

View File

@@ -191,6 +191,11 @@ Compatibility notes for stricter OpenAI-compatible backends:
- Gateway can reach the proxy? `curl http://127.0.0.1:1234/v1/models`.
- LM Studio model unloaded? Reload; cold start is a common “hanging” cause.
- Local server says `terminated`, `ECONNRESET`, or closes the stream mid-turn?
OpenClaw records a low-cardinality `model.call.error.failureKind` plus an
OpenClaw-process RSS/heap snapshot in diagnostics. For LM Studio/Ollama
memory pressure, match the diagnostic event's timestamp against the server
log or the macOS crash/jetsam logs to confirm whether the model server was
killed.
- OpenClaw warns when the detected context window is below **32k** and blocks below **16k**. If you hit that preflight, raise the server/model context limit or choose a larger model.
- Context errors? Lower `contextWindow` or raise your server limit.
- OpenAI-compatible server returns `messages[].content ... expected a string`?

View File

@@ -169,7 +169,7 @@ When any subkey is enabled, model and tool spans get bounded, redacted
- `openclaw.context.tokens` (histogram, attrs: `openclaw.context`, `openclaw.channel`, `openclaw.provider`, `openclaw.model`)
- `gen_ai.client.token.usage` (histogram, GenAI semantic-conventions metric, attrs: `gen_ai.token.type` = `input`/`output`, `gen_ai.provider.name`, `gen_ai.operation.name`, `gen_ai.request.model`)
- `gen_ai.client.operation.duration` (histogram, seconds, GenAI semantic-conventions metric, attrs: `gen_ai.provider.name`, `gen_ai.operation.name`, `gen_ai.request.model`, optional `error.type`)
- `openclaw.model_call.duration_ms` (histogram, attrs: `openclaw.provider`, `openclaw.model`, `openclaw.api`, `openclaw.transport`)
- `openclaw.model_call.duration_ms` (histogram, attrs: `openclaw.provider`, `openclaw.model`, `openclaw.api`, `openclaw.transport`, plus `openclaw.errorCategory` and `openclaw.failureKind` on classified errors)
- `openclaw.model_call.request_bytes` (histogram, UTF-8 byte size of the final model request payload; no raw payload content)
- `openclaw.model_call.response_bytes` (histogram, UTF-8 byte size of streamed model response events; no raw response content)
- `openclaw.model_call.time_to_first_byte_ms` (histogram, elapsed time before the first streamed response event)
@@ -224,6 +224,7 @@ When any subkey is enabled, model and tool spans get bounded, redacted
- `openclaw.model.call`
- `gen_ai.system` by default, or `gen_ai.provider.name` when the latest GenAI semantic conventions are opted in
- `gen_ai.request.model`, `gen_ai.operation.name`, `openclaw.provider`, `openclaw.model`, `openclaw.api`, `openclaw.transport`
- `openclaw.errorCategory` and optional `openclaw.failureKind` on errors
- `openclaw.model_call.request_bytes`, `openclaw.model_call.response_bytes`, `openclaw.model_call.time_to_first_byte_ms`
- `openclaw.provider.request_id_hash` (bounded SHA-based hash of the upstream provider request id; raw ids are not exported)
- `openclaw.harness.run`

View File

@@ -1525,6 +1525,7 @@ describe("diagnostics-otel service", () => {
api: "openai-responses",
durationMs: 40,
errorCategory: "ProviderError",
failureKind: "terminated",
upstreamRequestIdHash: "sha256:123456abcdef",
});
await flushDiagnosticEvents();
@@ -1532,6 +1533,12 @@ describe("diagnostics-otel service", () => {
const modelCall = telemetryState.tracer.startSpan.mock.calls.find(
(call) => call[0] === "openclaw.model.call",
);
expect(modelCall?.[1]).toEqual({
attributes: expect.objectContaining({
"openclaw.failureKind": "terminated",
}),
startTime: expect.any(Number),
});
expect(modelCall?.[1]).toEqual({
attributes: expect.not.objectContaining({
"openclaw.upstreamRequestIdHash": expect.anything(),
@@ -1542,6 +1549,14 @@ describe("diagnostics-otel service", () => {
expect(span?.addEvent).toHaveBeenCalledWith("openclaw.provider.request", {
"openclaw.upstreamRequestIdHash": "sha256:123456abcdef",
});
expect(
telemetryState.histograms.get("openclaw.model_call.duration_ms")?.record,
).toHaveBeenCalledWith(
40,
expect.objectContaining({
"openclaw.failureKind": "terminated",
}),
);
expect(
telemetryState.histograms.get("openclaw.model_call.duration_ms")?.record,
).toHaveBeenCalledWith(

View File

@@ -1834,6 +1834,9 @@ export function createDiagnosticsOtelService(): OpenClawPluginService {
const metricAttrs = {
...modelCallMetricAttrs(evt),
"openclaw.errorCategory": errorType,
...(evt.failureKind
? { "openclaw.failureKind": lowCardinalityAttr(evt.failureKind, "other") }
: {}),
};
modelCallDurationHistogram.record(evt.durationMs, metricAttrs);
recordModelCallSizeTimingMetrics(evt, metricAttrs);
@@ -1850,6 +1853,9 @@ export function createDiagnosticsOtelService(): OpenClawPluginService {
"openclaw.errorCategory": errorType,
"error.type": errorType,
};
if (evt.failureKind) {
spanAttrs["openclaw.failureKind"] = lowCardinalityAttr(evt.failureKind, "other");
}
assignGenAiModelCallAttrs(spanAttrs, evt);
if (evt.api) {
spanAttrs["openclaw.api"] = evt.api;

View File

@@ -254,6 +254,49 @@ describe("wrapStreamFnWithDiagnosticModelCallEvents", () => {
expect(JSON.stringify(events[1])).not.toContain(requestId);
});
// Regression test: a model-call stream that dies with "terminated" must emit a
// `model.call.error` event carrying the classified failureKind plus a process
// memory snapshot with all five byte counters.
it("adds failure kind and memory diagnostics for terminated model calls", async () => {
// Minimal async iterable whose first next() rejects, simulating a local
// provider (LM Studio/Ollama) stream that is killed mid-turn.
const stream = {
[Symbol.asyncIterator]() {
return {
async next(): Promise<IteratorResult<unknown>> {
throw new Error("terminated");
},
};
},
};
const wrapped = wrapStreamFnWithDiagnosticModelCallEvents(
(() => stream) as unknown as StreamFn,
{
runId: "run-1",
provider: "lmstudio",
model: "qwen/qwen3.5-9b",
trace: createDiagnosticTraceContext(),
nextCallId: () => "call-terminated",
},
);
// Drain the wrapped stream while capturing diagnostic events; the stream
// itself is still expected to reject with the original error.
const events = await collectModelCallEvents(async () => {
await expect(
drain(wrapped({} as never, {} as never, {} as never) as AsyncIterable<unknown>),
).rejects.toThrow("terminated");
});
expect(events.map((event) => event.type)).toEqual(["model.call.started", "model.call.error"]);
// The error event must include the low-cardinality failure kind and a memory
// snapshot whose counters are all numeric.
expect(events[1]).toMatchObject({
type: "model.call.error",
callId: "call-terminated",
errorCategory: "Error",
failureKind: "terminated",
memory: {
rssBytes: expect.any(Number),
heapTotalBytes: expect.any(Number),
heapUsedBytes: expect.any(Number),
externalBytes: expect.any(Number),
arrayBuffersBytes: expect.any(Number),
},
});
});
it("does not mutate non-configurable provider streams", async () => {
const stream = {};
Object.defineProperty(stream, Symbol.asyncIterator, {

View File

@@ -2,11 +2,13 @@ import type { StreamFn } from "@mariozechner/pi-agent-core";
import { fireAndForgetBoundedHook } from "../../../hooks/fire-and-forget.js";
import {
diagnosticErrorCategory,
diagnosticErrorFailureKind,
diagnosticProviderRequestIdHash,
} from "../../../infra/diagnostic-error-metadata.js";
import {
emitTrustedDiagnosticEvent,
type DiagnosticEventInput,
type DiagnosticMemoryUsage,
} from "../../../infra/diagnostic-events.js";
import {
createChildDiagnosticTraceContext,
@@ -41,7 +43,7 @@ type ModelCallEventBase = Omit<
>;
type ModelCallErrorFields = Pick<
Extract<DiagnosticEventInput, { type: "model.call.error" }>,
"errorCategory" | "upstreamRequestIdHash"
"errorCategory" | "failureKind" | "memory" | "upstreamRequestIdHash"
>;
type ModelCallEndedHookFields = Pick<
PluginHookModelCallEndedEvent,
@@ -51,6 +53,7 @@ type ModelCallEndedHookFields = Pick<
| "requestPayloadBytes"
| "responseStreamBytes"
| "timeToFirstByteMs"
| "failureKind"
| "upstreamRequestIdHash"
>;
type ModelCallSizeTimingFields = Pick<
@@ -152,12 +155,29 @@ function baseModelCallEvent(
/**
 * Build the error-specific fields for a `model.call.error` diagnostic event.
 *
 * Each optional field is added only when it has a value, so the emitted event
 * never carries an explicitly-`undefined` key (which would appear in
 * `Object.keys` output and be rejected under `exactOptionalPropertyTypes`):
 * - `failureKind`: low-cardinality transport classification of `err`.
 * - `memory`: process memory snapshot, captured only for classified transport
 *   failures so RAM-pressure kills can be correlated with the event.
 * - `upstreamRequestIdHash`: bounded hash of the provider request id.
 */
function modelCallErrorFields(err: unknown): ModelCallErrorFields {
  const upstreamRequestIdHash = diagnosticProviderRequestIdHash(err);
  const failureKind = diagnosticErrorFailureKind(err);
  // The snapshot itself may be unavailable (it returns undefined on failure);
  // guard it separately instead of spreading `memory: undefined` alongside
  // failureKind as the previous single spread did.
  const memory = failureKind ? processMemoryUsageSnapshot() : undefined;
  return {
    errorCategory: diagnosticErrorCategory(err),
    ...(failureKind ? { failureKind } : {}),
    ...(memory ? { memory } : {}),
    ...(upstreamRequestIdHash ? { upstreamRequestIdHash } : {}),
  };
}
/**
 * Capture the current Node.js process memory counters for diagnostics.
 * Returns `undefined` when the runtime refuses to report memory usage, so
 * callers can simply omit the field rather than emit a partial snapshot.
 */
function processMemoryUsageSnapshot(): DiagnosticMemoryUsage | undefined {
  try {
    const { rss, heapTotal, heapUsed, external, arrayBuffers } = process.memoryUsage();
    return {
      rssBytes: rss,
      heapTotalBytes: heapTotal,
      heapUsedBytes: heapUsed,
      externalBytes: external,
      arrayBuffersBytes: arrayBuffers,
    };
  } catch {
    return undefined;
  }
}
function modelCallHookEventBase(eventBase: ModelCallEventBase): PluginHookModelCallStartedEvent {
return {
runId: eventBase.runId,

View File

@@ -1,6 +1,7 @@
import { describe, expect, it } from "vitest";
import {
diagnosticErrorCategory,
diagnosticErrorFailureKind,
diagnosticHttpStatusCode,
diagnosticProviderRequestIdHash,
} from "./diagnostic-error-metadata.js";
@@ -76,4 +77,37 @@ describe("diagnostic error metadata", () => {
expect(diagnosticProviderRequestIdHash(errorLike)).toBeUndefined();
});
// Each assertion feeds a representative transport error and expects the mapped
// low-cardinality kind; the final case confirms unrelated provider errors stay
// unclassified (undefined) rather than being bucketed incorrectly.
it("classifies low-cardinality transport failure kinds without exposing messages", () => {
expect(diagnosticErrorFailureKind(new Error("terminated"))).toBe("terminated");
// Structured `code` properties take precedence over message text.
expect(
diagnosticErrorFailureKind(Object.assign(new Error("fetch failed"), { code: "ECONNRESET" })),
).toBe("connection_reset");
// Errors wrapped under an `error` key (e.g. undici socket failures) are found.
expect(
diagnosticErrorFailureKind({
error: Object.assign(new Error("socket closed"), { code: "UND_ERR_SOCKET" }),
}),
).toBe("connection_closed");
expect(diagnosticErrorFailureKind(new Error("request timed out after 120000ms"))).toBe(
"timeout",
);
expect(diagnosticErrorFailureKind(new Error("operation was aborted"))).toBe("aborted");
// Non-transport failures must remain unclassified.
expect(diagnosticErrorFailureKind(new Error("provider rejected the request"))).toBeUndefined();
});
// Hostile error-like objects can hide throwing accessors behind `code` and
// `message`; classification must avoid triggering them and return undefined
// instead of propagating the getter's exception.
it("does not invoke throwing getters while classifying failure kinds", () => {
const errorLike = {};
Object.defineProperty(errorLike, "code", {
get() {
throw new Error("should not read getter");
},
});
Object.defineProperty(errorLike, "message", {
get() {
throw new Error("should not read getter");
},
});
expect(diagnosticErrorFailureKind(errorLike)).toBeUndefined();
});
});

View File

@@ -15,6 +15,13 @@ const PROVIDER_REQUEST_ID_TEXT_PATTERNS = [
/\((?:request_id|trace_id)\s*:\s*([A-Za-z0-9._:-]{1,128})\)/i,
] as const;
/**
 * Low-cardinality classification of a transport-level model-call failure.
 * Emitted as `failureKind` on `model.call.error` diagnostics (and as the
 * `openclaw.failureKind` telemetry attribute) so failures can be aggregated
 * without exposing raw error messages.
 */
export type DiagnosticErrorFailureKind =
| "aborted"
| "connection_closed"
| "connection_reset"
| "terminated"
| "timeout";
/** True for any non-null value that can carry properties (objects and functions). */
function isObjectLike(value: unknown): value is object {
  if (value === null) {
    return false;
  }
  const kind = typeof value;
  return kind === "object" || kind === "function";
}
@@ -101,6 +108,11 @@ function readDirectMessage(err: unknown): string | undefined {
return typeof message === "string" ? message : undefined;
}
/** Read an own-data `code` property from an error-like value, string only. */
function readDirectCode(err: unknown): string | undefined {
  const value = readOwnDataProperty(err, "code");
  if (typeof value === "string") {
    return value;
  }
  return undefined;
}
function extractProviderRequestIdFromText(text: string | undefined): string | undefined {
if (!text) {
return undefined;
@@ -158,6 +170,47 @@ export function diagnosticHttpStatusCode(err: unknown): string | undefined {
return undefined;
}
/**
 * Classify a model-call error into a low-cardinality transport failure kind.
 *
 * Structured `code` properties are checked first (exact match after trim and
 * uppercase); when the code is absent or unrecognized, the error message is
 * scanned against ordered keyword patterns. Returns `undefined` when the
 * error does not look like a transport-level failure, so callers can omit the
 * attribute entirely.
 */
export function diagnosticErrorFailureKind(err: unknown): DiagnosticErrorFailureKind | undefined {
  const rawCode = findDiagnosticErrorProperty(err, readDirectCode);
  if (rawCode !== undefined) {
    // Known Node/undici error codes, normalized to uppercase.
    const byCode: Record<string, DiagnosticErrorFailureKind> = {
      ABORT_ERR: "aborted",
      ECONNABORTED: "aborted",
      ERR_ABORTED: "aborted",
      ECONNRESET: "connection_reset",
      ERR_STREAM_PREMATURE_CLOSE: "connection_closed",
      UND_ERR_SOCKET: "connection_closed",
      ETIMEDOUT: "timeout",
      ERR_SOCKET_CONNECTION_TIMEOUT: "timeout",
    };
    const fromCode = byCode[rawCode.trim().toUpperCase()];
    if (fromCode) {
      return fromCode;
    }
    // An unrecognized code falls through to message-based classification.
  }
  const message = findDiagnosticErrorProperty(err, readDirectMessage);
  if (!message) {
    return undefined;
  }
  // Ordered keyword patterns: earlier (more specific) phrases win.
  const byMessage: Array<[RegExp, DiagnosticErrorFailureKind]> = [
    [/\b(?:terminated|sigkill|sigterm)\b/i, "terminated"],
    [/\b(?:econnreset|connection reset)\b/i, "connection_reset"],
    [/\b(?:socket hang up|premature close|connection closed|other side closed)\b/i, "connection_closed"],
    [/\b(?:timed out|timeout|etimedout)\b/i, "timeout"],
    [/\b(?:aborted|abort_err|operation was aborted)\b/i, "aborted"],
  ];
  for (const [pattern, kind] of byMessage) {
    if (pattern.test(message)) {
      return kind;
    }
  }
  return undefined;
}
export function diagnosticProviderRequestIdHash(err: unknown): string | undefined {
const fromProperty = findDiagnosticErrorProperty(err, readDirectProviderRequestId);
if (fromProperty) {

View File

@@ -327,6 +327,8 @@ export type DiagnosticModelCallErrorEvent = DiagnosticModelCallBaseEvent & {
type: "model.call.error";
durationMs: number;
errorCategory: string;
failureKind?: "aborted" | "connection_closed" | "connection_reset" | "terminated" | "timeout";
memory?: DiagnosticMemoryUsage;
requestPayloadBytes?: number;
responseStreamBytes?: number;
timeToFirstByteMs?: number;

View File

@@ -156,6 +156,14 @@ describe("diagnostic stability recorder", () => {
responseStreamBytes: 567,
timeToFirstByteMs: 89,
errorCategory: "TypeError",
failureKind: "terminated",
memory: {
rssBytes: 100,
heapTotalBytes: 80,
heapUsedBytes: 40,
externalBytes: 20,
arrayBuffersBytes: 10,
},
});
await new Promise<void>((resolve) => setImmediate(resolve));
@@ -175,6 +183,14 @@ describe("diagnostic stability recorder", () => {
responseBytes: 567,
timeToFirstByteMs: 89,
reason: "TypeError",
failureKind: "terminated",
memory: {
rssBytes: 100,
heapTotalBytes: 80,
heapUsedBytes: 40,
externalBytes: 20,
arrayBuffersBytes: 10,
},
});
expect(JSON.stringify(snapshot.events[1])).not.toContain("call-1");
});

View File

@@ -49,6 +49,7 @@ export type DiagnosticStabilityEventRecord = {
queueDepth?: number;
queueSize?: number;
waitMs?: number;
failureKind?: string;
active?: number;
waiting?: number;
queued?: number;
@@ -293,6 +294,7 @@ function sanitizeDiagnosticEvent(event: DiagnosticEventPayload): DiagnosticStabi
record.commandLength = event.commandLength;
record.exitCode = event.exitCode;
record.timedOut = event.timedOut;
record.failureKind = event.failureKind;
assignReasonCode(record, event.failureKind);
break;
case "run.started":
@@ -355,6 +357,8 @@ function sanitizeDiagnosticEvent(event: DiagnosticEventPayload): DiagnosticStabi
record.requestBytes = event.requestPayloadBytes;
record.responseBytes = event.responseStreamBytes;
record.timeToFirstByteMs = event.timeToFirstByteMs;
record.failureKind = event.failureKind;
record.memory = event.memory ? copyMemory(event.memory) : undefined;
assignReasonCode(record, event.errorCategory);
break;
case "log.record":

View File

@@ -212,6 +212,7 @@ export type PluginHookModelCallEndedEvent = PluginHookModelCallBaseEvent & {
durationMs: number;
outcome: "completed" | "error";
errorCategory?: string;
failureKind?: "aborted" | "connection_closed" | "connection_reset" | "terminated" | "timeout";
requestPayloadBytes?: number;
responseStreamBytes?: number;
timeToFirstByteMs?: number;