mirror of
https://fastgit.cc/github.com/openclaw/openclaw
synced 2026-05-01 06:36:23 +08:00
fix(models): enrich local transport failure diagnostics
This commit is contained in:
@@ -22,6 +22,7 @@ Docs: https://docs.openclaw.ai
|
||||
- Agents/Bedrock: stop heartbeat runs from persisting blank user transcript turns and repair existing blank user text messages before replay, preventing AWS Bedrock `ContentBlock` blank-text validation failures. Fixes #72640 and #72622. Thanks @goldzulu.
|
||||
- Agents/LM Studio: promote standalone bracketed local-model tool requests into registered tool calls and hide unsupported bracket blocks from visible replies, so MemPalace MCP lookups do not print raw `[tool]` JSON scaffolding in chat. Fixes #66178. Thanks @detroit357.
|
||||
- Local models: warn when an assistant reply looks like a tool call but the provider emitted plain text instead of a structured tool invocation, making fake/non-executed tool calls visible in logs. Fixes #51332. Thanks @emilclaw.
|
||||
- Local models: classify terminated, reset, closed, timeout, and aborted model-call failures and attach a process memory snapshot to the diagnostic event, making LM Studio/Ollama RAM-pressure failures easier to prove from stability bundles. Refs #65551. Thanks @BigWiLLi111.
|
||||
- LM Studio: trust configured LM Studio loopback, LAN, and tailnet endpoints for guarded model requests by default, preserving explicit private-network opt-outs. Refs #60994. Thanks @tnowakow.
|
||||
- Docker/setup: route Docker onboarding defaults for host-side LM Studio and Ollama through `host.docker.internal` and add the Linux host-gateway mapping to the bundled Compose file, so containerized gateways can reach local providers without using container loopback. Fixes #68684; supersedes #68702. Thanks @safrano9999 and @skolez.
|
||||
- Agents/LM Studio: strip prior-turn Gemma 4 reasoning from OpenAI-compatible replay while preserving active tool-call continuation reasoning. Fixes #68704. Thanks @chip-snomo and @Kailigithub.
|
||||
|
||||
@@ -191,6 +191,11 @@ Compatibility notes for stricter OpenAI-compatible backends:
|
||||
|
||||
- Gateway can reach the proxy? `curl http://127.0.0.1:1234/v1/models`.
|
||||
- LM Studio model unloaded? Reload; cold start is a common “hanging” cause.
|
||||
- Local server says `terminated`, `ECONNRESET`, or closes the stream mid-turn?
|
||||
OpenClaw records a low-cardinality `model.call.error.failureKind` plus the
|
||||
OpenClaw process RSS/heap snapshot in diagnostics. For LM Studio/Ollama
|
||||
memory pressure, match that timestamp against the server log or macOS crash /
|
||||
jetsam log to confirm whether the model server was killed.
|
||||
- OpenClaw warns when the detected context window is below **32k** and blocks below **16k**. If you hit that preflight, raise the server/model context limit or choose a larger model.
|
||||
- Context errors? Lower `contextWindow` or raise your server limit.
|
||||
- OpenAI-compatible server returns `messages[].content ... expected a string`?
|
||||
|
||||
@@ -169,7 +169,7 @@ When any subkey is enabled, model and tool spans get bounded, redacted
|
||||
- `openclaw.context.tokens` (histogram, attrs: `openclaw.context`, `openclaw.channel`, `openclaw.provider`, `openclaw.model`)
|
||||
- `gen_ai.client.token.usage` (histogram, GenAI semantic-conventions metric, attrs: `gen_ai.token.type` = `input`/`output`, `gen_ai.provider.name`, `gen_ai.operation.name`, `gen_ai.request.model`)
|
||||
- `gen_ai.client.operation.duration` (histogram, seconds, GenAI semantic-conventions metric, attrs: `gen_ai.provider.name`, `gen_ai.operation.name`, `gen_ai.request.model`, optional `error.type`)
|
||||
- `openclaw.model_call.duration_ms` (histogram, attrs: `openclaw.provider`, `openclaw.model`, `openclaw.api`, `openclaw.transport`)
|
||||
- `openclaw.model_call.duration_ms` (histogram, attrs: `openclaw.provider`, `openclaw.model`, `openclaw.api`, `openclaw.transport`, plus `openclaw.errorCategory` and `openclaw.failureKind` on classified errors)
|
||||
- `openclaw.model_call.request_bytes` (histogram, UTF-8 byte size of the final model request payload; no raw payload content)
|
||||
- `openclaw.model_call.response_bytes` (histogram, UTF-8 byte size of streamed model response events; no raw response content)
|
||||
- `openclaw.model_call.time_to_first_byte_ms` (histogram, elapsed time before the first streamed response event)
|
||||
@@ -224,6 +224,7 @@ When any subkey is enabled, model and tool spans get bounded, redacted
|
||||
- `openclaw.model.call`
|
||||
- `gen_ai.system` by default, or `gen_ai.provider.name` when the latest GenAI semantic conventions are opted in
|
||||
- `gen_ai.request.model`, `gen_ai.operation.name`, `openclaw.provider`, `openclaw.model`, `openclaw.api`, `openclaw.transport`
|
||||
- `openclaw.errorCategory` and optional `openclaw.failureKind` on errors
|
||||
- `openclaw.model_call.request_bytes`, `openclaw.model_call.response_bytes`, `openclaw.model_call.time_to_first_byte_ms`
|
||||
- `openclaw.provider.request_id_hash` (bounded SHA-based hash of the upstream provider request id; raw ids are not exported)
|
||||
- `openclaw.harness.run`
|
||||
|
||||
@@ -1525,6 +1525,7 @@ describe("diagnostics-otel service", () => {
|
||||
api: "openai-responses",
|
||||
durationMs: 40,
|
||||
errorCategory: "ProviderError",
|
||||
failureKind: "terminated",
|
||||
upstreamRequestIdHash: "sha256:123456abcdef",
|
||||
});
|
||||
await flushDiagnosticEvents();
|
||||
@@ -1532,6 +1533,12 @@ describe("diagnostics-otel service", () => {
|
||||
const modelCall = telemetryState.tracer.startSpan.mock.calls.find(
|
||||
(call) => call[0] === "openclaw.model.call",
|
||||
);
|
||||
expect(modelCall?.[1]).toEqual({
|
||||
attributes: expect.objectContaining({
|
||||
"openclaw.failureKind": "terminated",
|
||||
}),
|
||||
startTime: expect.any(Number),
|
||||
});
|
||||
expect(modelCall?.[1]).toEqual({
|
||||
attributes: expect.not.objectContaining({
|
||||
"openclaw.upstreamRequestIdHash": expect.anything(),
|
||||
@@ -1542,6 +1549,14 @@ describe("diagnostics-otel service", () => {
|
||||
expect(span?.addEvent).toHaveBeenCalledWith("openclaw.provider.request", {
|
||||
"openclaw.upstreamRequestIdHash": "sha256:123456abcdef",
|
||||
});
|
||||
expect(
|
||||
telemetryState.histograms.get("openclaw.model_call.duration_ms")?.record,
|
||||
).toHaveBeenCalledWith(
|
||||
40,
|
||||
expect.objectContaining({
|
||||
"openclaw.failureKind": "terminated",
|
||||
}),
|
||||
);
|
||||
expect(
|
||||
telemetryState.histograms.get("openclaw.model_call.duration_ms")?.record,
|
||||
).toHaveBeenCalledWith(
|
||||
|
||||
@@ -1834,6 +1834,9 @@ export function createDiagnosticsOtelService(): OpenClawPluginService {
|
||||
const metricAttrs = {
|
||||
...modelCallMetricAttrs(evt),
|
||||
"openclaw.errorCategory": errorType,
|
||||
...(evt.failureKind
|
||||
? { "openclaw.failureKind": lowCardinalityAttr(evt.failureKind, "other") }
|
||||
: {}),
|
||||
};
|
||||
modelCallDurationHistogram.record(evt.durationMs, metricAttrs);
|
||||
recordModelCallSizeTimingMetrics(evt, metricAttrs);
|
||||
@@ -1850,6 +1853,9 @@ export function createDiagnosticsOtelService(): OpenClawPluginService {
|
||||
"openclaw.errorCategory": errorType,
|
||||
"error.type": errorType,
|
||||
};
|
||||
if (evt.failureKind) {
|
||||
spanAttrs["openclaw.failureKind"] = lowCardinalityAttr(evt.failureKind, "other");
|
||||
}
|
||||
assignGenAiModelCallAttrs(spanAttrs, evt);
|
||||
if (evt.api) {
|
||||
spanAttrs["openclaw.api"] = evt.api;
|
||||
|
||||
@@ -254,6 +254,49 @@ describe("wrapStreamFnWithDiagnosticModelCallEvents", () => {
|
||||
expect(JSON.stringify(events[1])).not.toContain(requestId);
|
||||
});
|
||||
|
||||
it("adds failure kind and memory diagnostics for terminated model calls", async () => {
|
||||
const stream = {
|
||||
[Symbol.asyncIterator]() {
|
||||
return {
|
||||
async next(): Promise<IteratorResult<unknown>> {
|
||||
throw new Error("terminated");
|
||||
},
|
||||
};
|
||||
},
|
||||
};
|
||||
const wrapped = wrapStreamFnWithDiagnosticModelCallEvents(
|
||||
(() => stream) as unknown as StreamFn,
|
||||
{
|
||||
runId: "run-1",
|
||||
provider: "lmstudio",
|
||||
model: "qwen/qwen3.5-9b",
|
||||
trace: createDiagnosticTraceContext(),
|
||||
nextCallId: () => "call-terminated",
|
||||
},
|
||||
);
|
||||
|
||||
const events = await collectModelCallEvents(async () => {
|
||||
await expect(
|
||||
drain(wrapped({} as never, {} as never, {} as never) as AsyncIterable<unknown>),
|
||||
).rejects.toThrow("terminated");
|
||||
});
|
||||
|
||||
expect(events.map((event) => event.type)).toEqual(["model.call.started", "model.call.error"]);
|
||||
expect(events[1]).toMatchObject({
|
||||
type: "model.call.error",
|
||||
callId: "call-terminated",
|
||||
errorCategory: "Error",
|
||||
failureKind: "terminated",
|
||||
memory: {
|
||||
rssBytes: expect.any(Number),
|
||||
heapTotalBytes: expect.any(Number),
|
||||
heapUsedBytes: expect.any(Number),
|
||||
externalBytes: expect.any(Number),
|
||||
arrayBuffersBytes: expect.any(Number),
|
||||
},
|
||||
});
|
||||
});
|
||||
|
||||
it("does not mutate non-configurable provider streams", async () => {
|
||||
const stream = {};
|
||||
Object.defineProperty(stream, Symbol.asyncIterator, {
|
||||
|
||||
@@ -2,11 +2,13 @@ import type { StreamFn } from "@mariozechner/pi-agent-core";
|
||||
import { fireAndForgetBoundedHook } from "../../../hooks/fire-and-forget.js";
|
||||
import {
|
||||
diagnosticErrorCategory,
|
||||
diagnosticErrorFailureKind,
|
||||
diagnosticProviderRequestIdHash,
|
||||
} from "../../../infra/diagnostic-error-metadata.js";
|
||||
import {
|
||||
emitTrustedDiagnosticEvent,
|
||||
type DiagnosticEventInput,
|
||||
type DiagnosticMemoryUsage,
|
||||
} from "../../../infra/diagnostic-events.js";
|
||||
import {
|
||||
createChildDiagnosticTraceContext,
|
||||
@@ -41,7 +43,7 @@ type ModelCallEventBase = Omit<
|
||||
>;
|
||||
type ModelCallErrorFields = Pick<
|
||||
Extract<DiagnosticEventInput, { type: "model.call.error" }>,
|
||||
"errorCategory" | "upstreamRequestIdHash"
|
||||
"errorCategory" | "failureKind" | "memory" | "upstreamRequestIdHash"
|
||||
>;
|
||||
type ModelCallEndedHookFields = Pick<
|
||||
PluginHookModelCallEndedEvent,
|
||||
@@ -51,6 +53,7 @@ type ModelCallEndedHookFields = Pick<
|
||||
| "requestPayloadBytes"
|
||||
| "responseStreamBytes"
|
||||
| "timeToFirstByteMs"
|
||||
| "failureKind"
|
||||
| "upstreamRequestIdHash"
|
||||
>;
|
||||
type ModelCallSizeTimingFields = Pick<
|
||||
@@ -152,12 +155,29 @@ function baseModelCallEvent(
|
||||
|
||||
/**
 * Builds the error-specific fields of a `model.call.error` diagnostic event.
 *
 * - `errorCategory` is always present.
 * - `failureKind` is added only when the error could be classified.
 * - `memory` is added only alongside `failureKind`, and only when a process
 *   memory snapshot could actually be taken. The key is omitted rather than
 *   set to `undefined`, matching how `upstreamRequestIdHash` is handled.
 * - `upstreamRequestIdHash` is added only when a request id hash is available.
 *
 * @param err - The value thrown by the model call.
 * @returns The optional-field bag merged into the emitted error event.
 */
function modelCallErrorFields(err: unknown): ModelCallErrorFields {
  const upstreamRequestIdHash = diagnosticProviderRequestIdHash(err);
  const failureKind = diagnosticErrorFailureKind(err);
  // Only pay for a memory snapshot when the failure was classified.
  const memory = failureKind ? processMemoryUsageSnapshot() : undefined;
  return {
    errorCategory: diagnosticErrorCategory(err),
    ...(failureKind ? { failureKind } : {}),
    // Skip the key entirely if the snapshot failed; never emit `memory: undefined`.
    ...(memory ? { memory } : {}),
    ...(upstreamRequestIdHash ? { upstreamRequestIdHash } : {}),
  };
}
|
||||
|
||||
/**
 * Captures the current process memory counters as a diagnostic payload.
 *
 * Best-effort: if `process.memoryUsage()` throws, the failure is swallowed
 * and `undefined` is returned so the caller can continue without a snapshot.
 *
 * @returns Byte counts for RSS, heap total/used, external and array buffers,
 *          or `undefined` when the snapshot could not be read.
 */
function processMemoryUsageSnapshot(): DiagnosticMemoryUsage | undefined {
  try {
    const { rss, heapTotal, heapUsed, external, arrayBuffers } = process.memoryUsage();
    return {
      rssBytes: rss,
      heapTotalBytes: heapTotal,
      heapUsedBytes: heapUsed,
      externalBytes: external,
      arrayBuffersBytes: arrayBuffers,
    };
  } catch {
    return undefined;
  }
}
|
||||
|
||||
function modelCallHookEventBase(eventBase: ModelCallEventBase): PluginHookModelCallStartedEvent {
|
||||
return {
|
||||
runId: eventBase.runId,
|
||||
|
||||
@@ -1,6 +1,7 @@
|
||||
import { describe, expect, it } from "vitest";
|
||||
import {
|
||||
diagnosticErrorCategory,
|
||||
diagnosticErrorFailureKind,
|
||||
diagnosticHttpStatusCode,
|
||||
diagnosticProviderRequestIdHash,
|
||||
} from "./diagnostic-error-metadata.js";
|
||||
@@ -76,4 +77,37 @@ describe("diagnostic error metadata", () => {
|
||||
|
||||
expect(diagnosticProviderRequestIdHash(errorLike)).toBeUndefined();
|
||||
});
|
||||
|
||||
it("classifies low-cardinality transport failure kinds without exposing messages", () => {
|
||||
expect(diagnosticErrorFailureKind(new Error("terminated"))).toBe("terminated");
|
||||
expect(
|
||||
diagnosticErrorFailureKind(Object.assign(new Error("fetch failed"), { code: "ECONNRESET" })),
|
||||
).toBe("connection_reset");
|
||||
expect(
|
||||
diagnosticErrorFailureKind({
|
||||
error: Object.assign(new Error("socket closed"), { code: "UND_ERR_SOCKET" }),
|
||||
}),
|
||||
).toBe("connection_closed");
|
||||
expect(diagnosticErrorFailureKind(new Error("request timed out after 120000ms"))).toBe(
|
||||
"timeout",
|
||||
);
|
||||
expect(diagnosticErrorFailureKind(new Error("operation was aborted"))).toBe("aborted");
|
||||
expect(diagnosticErrorFailureKind(new Error("provider rejected the request"))).toBeUndefined();
|
||||
});
|
||||
|
||||
it("does not invoke throwing getters while classifying failure kinds", () => {
|
||||
const errorLike = {};
|
||||
Object.defineProperty(errorLike, "code", {
|
||||
get() {
|
||||
throw new Error("should not read getter");
|
||||
},
|
||||
});
|
||||
Object.defineProperty(errorLike, "message", {
|
||||
get() {
|
||||
throw new Error("should not read getter");
|
||||
},
|
||||
});
|
||||
|
||||
expect(diagnosticErrorFailureKind(errorLike)).toBeUndefined();
|
||||
});
|
||||
});
|
||||
|
||||
@@ -15,6 +15,13 @@ const PROVIDER_REQUEST_ID_TEXT_PATTERNS = [
|
||||
/\((?:request_id|trace_id)\s*:\s*([A-Za-z0-9._:-]{1,128})\)/i,
|
||||
] as const;
|
||||
|
||||
/**
 * Closed set of transport-level failure labels that
 * `diagnosticErrorFailureKind` can assign to an error.
 */
export type DiagnosticErrorFailureKind =
  "aborted" | "connection_closed" | "connection_reset" | "terminated" | "timeout";
|
||||
|
||||
/**
 * Type guard for values that can carry properties: any non-null value whose
 * `typeof` is `"object"` or `"function"`.
 *
 * @param value - Arbitrary value to inspect.
 * @returns `true` for objects, arrays and functions; `false` for `null` and primitives.
 */
function isObjectLike(value: unknown): value is object {
  if (value === null) {
    return false;
  }
  const kind = typeof value;
  return kind === "object" || kind === "function";
}
|
||||
@@ -101,6 +108,11 @@ function readDirectMessage(err: unknown): string | undefined {
|
||||
return typeof message === "string" ? message : undefined;
|
||||
}
|
||||
|
||||
/**
 * Reads the `code` property of an error-like value via `readOwnDataProperty`
 * and returns it only when it is a string.
 *
 * NOTE(review): `readOwnDataProperty` is defined outside this view; presumably
 * it reads own data properties without invoking getters — confirm.
 *
 * @param err - Candidate error-like value.
 * @returns The string `code`, or `undefined` when absent or not a string.
 */
function readDirectCode(err: unknown): string | undefined {
  const candidate = readOwnDataProperty(err, "code");
  if (typeof candidate !== "string") {
    return undefined;
  }
  return candidate;
}
|
||||
|
||||
function extractProviderRequestIdFromText(text: string | undefined): string | undefined {
|
||||
if (!text) {
|
||||
return undefined;
|
||||
@@ -158,6 +170,47 @@ export function diagnosticHttpStatusCode(err: unknown): string | undefined {
|
||||
return undefined;
|
||||
}
|
||||
|
||||
/** Exact (upper-cased) error codes mapped to their failure kind. */
const FAILURE_KIND_BY_ERROR_CODE: ReadonlyMap<string, DiagnosticErrorFailureKind> = new Map<
  string,
  DiagnosticErrorFailureKind
>([
  ["ABORT_ERR", "aborted"],
  ["ECONNABORTED", "aborted"],
  ["ERR_ABORTED", "aborted"],
  ["ECONNRESET", "connection_reset"],
  ["ERR_STREAM_PREMATURE_CLOSE", "connection_closed"],
  ["UND_ERR_SOCKET", "connection_closed"],
  ["ETIMEDOUT", "timeout"],
  ["ERR_SOCKET_CONNECTION_TIMEOUT", "timeout"],
]);

/** Message patterns tried in order; the first match decides the failure kind. */
const FAILURE_KIND_MESSAGE_RULES: ReadonlyArray<readonly [RegExp, DiagnosticErrorFailureKind]> = [
  [/\b(?:terminated|sigkill|sigterm)\b/i, "terminated"],
  [/\b(?:econnreset|connection reset)\b/i, "connection_reset"],
  [/\b(?:socket hang up|premature close|connection closed|other side closed)\b/i, "connection_closed"],
  [/\b(?:timed out|timeout|etimedout)\b/i, "timeout"],
  [/\b(?:aborted|abort_err|operation was aborted)\b/i, "aborted"],
];

/**
 * Classifies an error into a low-cardinality transport failure kind.
 *
 * Resolution order:
 * 1. The error `code` (trimmed, upper-cased) is looked up in a fixed table.
 * 2. If there is no code, or the code is not in the table, the error message
 *    is tested against a fixed list of patterns, first match wins.
 *
 * Only the derived label is returned; the message text itself is never
 * included in the result.
 *
 * NOTE(review): `findDiagnosticErrorProperty` is defined outside this view;
 * presumably it also searches nested error wrappers — confirm.
 *
 * @param err - The thrown value to classify.
 * @returns The failure kind, or `undefined` when neither code nor message matches.
 */
export function diagnosticErrorFailureKind(err: unknown): DiagnosticErrorFailureKind | undefined {
  const normalizedCode = findDiagnosticErrorProperty(err, readDirectCode)?.trim().toUpperCase();
  if (normalizedCode !== undefined) {
    const kindFromCode = FAILURE_KIND_BY_ERROR_CODE.get(normalizedCode);
    if (kindFromCode !== undefined) {
      return kindFromCode;
    }
    // Unknown codes fall through to message-based classification.
  }

  const text = findDiagnosticErrorProperty(err, readDirectMessage);
  if (!text) {
    return undefined;
  }
  for (const [pattern, kind] of FAILURE_KIND_MESSAGE_RULES) {
    if (pattern.test(text)) {
      return kind;
    }
  }
  return undefined;
}
|
||||
|
||||
export function diagnosticProviderRequestIdHash(err: unknown): string | undefined {
|
||||
const fromProperty = findDiagnosticErrorProperty(err, readDirectProviderRequestId);
|
||||
if (fromProperty) {
|
||||
|
||||
@@ -327,6 +327,8 @@ export type DiagnosticModelCallErrorEvent = DiagnosticModelCallBaseEvent & {
|
||||
type: "model.call.error";
|
||||
durationMs: number;
|
||||
errorCategory: string;
|
||||
failureKind?: "aborted" | "connection_closed" | "connection_reset" | "terminated" | "timeout";
|
||||
memory?: DiagnosticMemoryUsage;
|
||||
requestPayloadBytes?: number;
|
||||
responseStreamBytes?: number;
|
||||
timeToFirstByteMs?: number;
|
||||
|
||||
@@ -156,6 +156,14 @@ describe("diagnostic stability recorder", () => {
|
||||
responseStreamBytes: 567,
|
||||
timeToFirstByteMs: 89,
|
||||
errorCategory: "TypeError",
|
||||
failureKind: "terminated",
|
||||
memory: {
|
||||
rssBytes: 100,
|
||||
heapTotalBytes: 80,
|
||||
heapUsedBytes: 40,
|
||||
externalBytes: 20,
|
||||
arrayBuffersBytes: 10,
|
||||
},
|
||||
});
|
||||
await new Promise<void>((resolve) => setImmediate(resolve));
|
||||
|
||||
@@ -175,6 +183,14 @@ describe("diagnostic stability recorder", () => {
|
||||
responseBytes: 567,
|
||||
timeToFirstByteMs: 89,
|
||||
reason: "TypeError",
|
||||
failureKind: "terminated",
|
||||
memory: {
|
||||
rssBytes: 100,
|
||||
heapTotalBytes: 80,
|
||||
heapUsedBytes: 40,
|
||||
externalBytes: 20,
|
||||
arrayBuffersBytes: 10,
|
||||
},
|
||||
});
|
||||
expect(JSON.stringify(snapshot.events[1])).not.toContain("call-1");
|
||||
});
|
||||
|
||||
@@ -49,6 +49,7 @@ export type DiagnosticStabilityEventRecord = {
|
||||
queueDepth?: number;
|
||||
queueSize?: number;
|
||||
waitMs?: number;
|
||||
failureKind?: string;
|
||||
active?: number;
|
||||
waiting?: number;
|
||||
queued?: number;
|
||||
@@ -293,6 +294,7 @@ function sanitizeDiagnosticEvent(event: DiagnosticEventPayload): DiagnosticStabi
|
||||
record.commandLength = event.commandLength;
|
||||
record.exitCode = event.exitCode;
|
||||
record.timedOut = event.timedOut;
|
||||
record.failureKind = event.failureKind;
|
||||
assignReasonCode(record, event.failureKind);
|
||||
break;
|
||||
case "run.started":
|
||||
@@ -355,6 +357,8 @@ function sanitizeDiagnosticEvent(event: DiagnosticEventPayload): DiagnosticStabi
|
||||
record.requestBytes = event.requestPayloadBytes;
|
||||
record.responseBytes = event.responseStreamBytes;
|
||||
record.timeToFirstByteMs = event.timeToFirstByteMs;
|
||||
record.failureKind = event.failureKind;
|
||||
record.memory = event.memory ? copyMemory(event.memory) : undefined;
|
||||
assignReasonCode(record, event.errorCategory);
|
||||
break;
|
||||
case "log.record":
|
||||
|
||||
@@ -212,6 +212,7 @@ export type PluginHookModelCallEndedEvent = PluginHookModelCallBaseEvent & {
|
||||
durationMs: number;
|
||||
outcome: "completed" | "error";
|
||||
errorCategory?: string;
|
||||
failureKind?: "aborted" | "connection_closed" | "connection_reset" | "terminated" | "timeout";
|
||||
requestPayloadBytes?: number;
|
||||
responseStreamBytes?: number;
|
||||
timeToFirstByteMs?: number;
|
||||
|
||||
Reference in New Issue
Block a user