fix(models): enrich local transport failure diagnostics

This commit is contained in:
Peter Steinberger
2026-04-27 09:25:33 +01:00
parent c2d82b87ee
commit a95da5b52d
13 changed files with 203 additions and 2 deletions

View File

@@ -22,6 +22,7 @@ Docs: https://docs.openclaw.ai
- Agents/Bedrock: stop heartbeat runs from persisting blank user transcript turns and repair existing blank user text messages before replay, preventing AWS Bedrock `ContentBlock` blank-text validation failures. Fixes #72640 and #72622. Thanks @goldzulu.
- Agents/LM Studio: promote standalone bracketed local-model tool requests into registered tool calls and hide unsupported bracket blocks from visible replies, so MemPalace MCP lookups do not print raw `[tool]` JSON scaffolding in chat. Fixes #66178. Thanks @detroit357.
- Local models: warn when an assistant reply looks like a tool call but the provider emitted plain text instead of a structured tool invocation, making fake/non-executed tool calls visible in logs. Fixes #51332. Thanks @emilclaw.
- Local models: classify terminated, reset, closed, timeout, and aborted model-call failures and attach a process memory snapshot to the diagnostic event, making LM Studio/Ollama RAM-pressure failures easier to prove from stability bundles. Refs #65551. Thanks @BigWiLLi111.
- LM Studio: trust configured LM Studio loopback, LAN, and tailnet endpoints for guarded model requests by default, preserving explicit private-network opt-outs. Refs #60994. Thanks @tnowakow.
- Docker/setup: route Docker onboarding defaults for host-side LM Studio and Ollama through `host.docker.internal` and add the Linux host-gateway mapping to the bundled Compose file, so containerized gateways can reach local providers without using container loopback. Fixes #68684; supersedes #68702. Thanks @safrano9999 and @skolez.
- Agents/LM Studio: strip prior-turn Gemma 4 reasoning from OpenAI-compatible replay while preserving active tool-call continuation reasoning. Fixes #68704. Thanks @chip-snomo and @Kailigithub.

View File

@@ -191,6 +191,11 @@ Compatibility notes for stricter OpenAI-compatible backends:
- Gateway can reach the proxy? `curl http://127.0.0.1:1234/v1/models`.
- LM Studio model unloaded? Reload; cold start is a common “hanging” cause.
- Local server says `terminated`, `ECONNRESET`, or closes the stream mid-turn?
OpenClaw records a low-cardinality `model.call.error.failureKind` plus an
OpenClaw-process RSS/heap snapshot in diagnostics. For LM Studio/Ollama
memory pressure, match the diagnostic event's timestamp against the server
log or the macOS crash/jetsam logs to confirm whether the model server was
killed.
- OpenClaw warns when the detected context window is below **32k** and blocks below **16k**. If you hit that preflight, raise the server/model context limit or choose a larger model.
- Context errors? Lower `contextWindow` or raise your server limit.
- OpenAI-compatible server returns `messages[].content ... expected a string`?

View File

@@ -169,7 +169,7 @@ When any subkey is enabled, model and tool spans get bounded, redacted
- `openclaw.context.tokens` (histogram, attrs: `openclaw.context`, `openclaw.channel`, `openclaw.provider`, `openclaw.model`)
- `gen_ai.client.token.usage` (histogram, GenAI semantic-conventions metric, attrs: `gen_ai.token.type` = `input`/`output`, `gen_ai.provider.name`, `gen_ai.operation.name`, `gen_ai.request.model`)
- `gen_ai.client.operation.duration` (histogram, seconds, GenAI semantic-conventions metric, attrs: `gen_ai.provider.name`, `gen_ai.operation.name`, `gen_ai.request.model`, optional `error.type`)
- `openclaw.model_call.duration_ms` (histogram, attrs: `openclaw.provider`, `openclaw.model`, `openclaw.api`, `openclaw.transport`)
- `openclaw.model_call.duration_ms` (histogram, attrs: `openclaw.provider`, `openclaw.model`, `openclaw.api`, `openclaw.transport`, plus `openclaw.errorCategory` and `openclaw.failureKind` on classified errors)
- `openclaw.model_call.request_bytes` (histogram, UTF-8 byte size of the final model request payload; no raw payload content)
- `openclaw.model_call.response_bytes` (histogram, UTF-8 byte size of streamed model response events; no raw response content)
- `openclaw.model_call.time_to_first_byte_ms` (histogram, elapsed time before the first streamed response event)
@@ -224,6 +224,7 @@ When any subkey is enabled, model and tool spans get bounded, redacted
- `openclaw.model.call`
- `gen_ai.system` by default, or `gen_ai.provider.name` when the latest GenAI semantic conventions are opted in
- `gen_ai.request.model`, `gen_ai.operation.name`, `openclaw.provider`, `openclaw.model`, `openclaw.api`, `openclaw.transport`
- `openclaw.errorCategory` and optional `openclaw.failureKind` on errors
- `openclaw.model_call.request_bytes`, `openclaw.model_call.response_bytes`, `openclaw.model_call.time_to_first_byte_ms`
- `openclaw.provider.request_id_hash` (bounded SHA-based hash of the upstream provider request id; raw ids are not exported)
- `openclaw.harness.run`

View File

@@ -1525,6 +1525,7 @@ describe("diagnostics-otel service", () => {
api: "openai-responses",
durationMs: 40,
errorCategory: "ProviderError",
failureKind: "terminated",
upstreamRequestIdHash: "sha256:123456abcdef",
});
await flushDiagnosticEvents();
@@ -1532,6 +1533,12 @@ describe("diagnostics-otel service", () => {
const modelCall = telemetryState.tracer.startSpan.mock.calls.find(
(call) => call[0] === "openclaw.model.call",
);
expect(modelCall?.[1]).toEqual({
attributes: expect.objectContaining({
"openclaw.failureKind": "terminated",
}),
startTime: expect.any(Number),
});
expect(modelCall?.[1]).toEqual({
attributes: expect.not.objectContaining({
"openclaw.upstreamRequestIdHash": expect.anything(),
@@ -1542,6 +1549,14 @@ describe("diagnostics-otel service", () => {
expect(span?.addEvent).toHaveBeenCalledWith("openclaw.provider.request", {
"openclaw.upstreamRequestIdHash": "sha256:123456abcdef",
});
expect(
telemetryState.histograms.get("openclaw.model_call.duration_ms")?.record,
).toHaveBeenCalledWith(
40,
expect.objectContaining({
"openclaw.failureKind": "terminated",
}),
);
expect(
telemetryState.histograms.get("openclaw.model_call.duration_ms")?.record,
).toHaveBeenCalledWith(

View File

@@ -1834,6 +1834,9 @@ export function createDiagnosticsOtelService(): OpenClawPluginService {
const metricAttrs = {
...modelCallMetricAttrs(evt),
"openclaw.errorCategory": errorType,
...(evt.failureKind
? { "openclaw.failureKind": lowCardinalityAttr(evt.failureKind, "other") }
: {}),
};
modelCallDurationHistogram.record(evt.durationMs, metricAttrs);
recordModelCallSizeTimingMetrics(evt, metricAttrs);
@@ -1850,6 +1853,9 @@ export function createDiagnosticsOtelService(): OpenClawPluginService {
"openclaw.errorCategory": errorType,
"error.type": errorType,
};
if (evt.failureKind) {
spanAttrs["openclaw.failureKind"] = lowCardinalityAttr(evt.failureKind, "other");
}
assignGenAiModelCallAttrs(spanAttrs, evt);
if (evt.api) {
spanAttrs["openclaw.api"] = evt.api;

View File

@@ -254,6 +254,49 @@ describe("wrapStreamFnWithDiagnosticModelCallEvents", () => {
expect(JSON.stringify(events[1])).not.toContain(requestId);
});
// Regression test: a model-call stream that dies with "terminated" must emit a
// `model.call.error` event carrying the classified failureKind plus a process
// memory snapshot with all five byte counters.
it("adds failure kind and memory diagnostics for terminated model calls", async () => {
// Minimal async iterable whose first next() rejects, simulating a local
// provider (LM Studio/Ollama) stream that is killed mid-turn.
const stream = {
[Symbol.asyncIterator]() {
return {
async next(): Promise<IteratorResult<unknown>> {
throw new Error("terminated");
},
};
},
};
const wrapped = wrapStreamFnWithDiagnosticModelCallEvents(
(() => stream) as unknown as StreamFn,
{
runId: "run-1",
provider: "lmstudio",
model: "qwen/qwen3.5-9b",
trace: createDiagnosticTraceContext(),
nextCallId: () => "call-terminated",
},
);
// Drain the wrapped stream while capturing diagnostic events; the stream
// itself is still expected to reject with the original error.
const events = await collectModelCallEvents(async () => {
await expect(
drain(wrapped({} as never, {} as never, {} as never) as AsyncIterable<unknown>),
).rejects.toThrow("terminated");
});
expect(events.map((event) => event.type)).toEqual(["model.call.started", "model.call.error"]);
// The error event must include the low-cardinality failure kind and a memory
// snapshot whose counters are all numeric.
expect(events[1]).toMatchObject({
type: "model.call.error",
callId: "call-terminated",
errorCategory: "Error",
failureKind: "terminated",
memory: {
rssBytes: expect.any(Number),
heapTotalBytes: expect.any(Number),
heapUsedBytes: expect.any(Number),
externalBytes: expect.any(Number),
arrayBuffersBytes: expect.any(Number),
},
});
});
it("does not mutate non-configurable provider streams", async () => {
const stream = {};
Object.defineProperty(stream, Symbol.asyncIterator, {

View File

@@ -2,11 +2,13 @@ import type { StreamFn } from "@mariozechner/pi-agent-core";
import { fireAndForgetBoundedHook } from "../../../hooks/fire-and-forget.js";
import {
diagnosticErrorCategory,
diagnosticErrorFailureKind,
diagnosticProviderRequestIdHash,
} from "../../../infra/diagnostic-error-metadata.js";
import {
emitTrustedDiagnosticEvent,
type DiagnosticEventInput,
type DiagnosticMemoryUsage,
} from "../../../infra/diagnostic-events.js";
import {
createChildDiagnosticTraceContext,
@@ -41,7 +43,7 @@ type ModelCallEventBase = Omit<
>;
type ModelCallErrorFields = Pick<
Extract<DiagnosticEventInput, { type: "model.call.error" }>,
"errorCategory" | "upstreamRequestIdHash"
"errorCategory" | "failureKind" | "memory" | "upstreamRequestIdHash"
>;
type ModelCallEndedHookFields = Pick<
PluginHookModelCallEndedEvent,
@@ -51,6 +53,7 @@ type ModelCallEndedHookFields = Pick<
| "requestPayloadBytes"
| "responseStreamBytes"
| "timeToFirstByteMs"
| "failureKind"
| "upstreamRequestIdHash"
>;
type ModelCallSizeTimingFields = Pick<
@@ -152,12 +155,29 @@ function baseModelCallEvent(
/**
 * Build the error-specific fields for a `model.call.error` diagnostic event.
 *
 * Each optional field is added only when it has a value, so the emitted event
 * never carries an explicitly-`undefined` key (which would appear in
 * `Object.keys` output and be rejected under `exactOptionalPropertyTypes`):
 * - `failureKind`: low-cardinality transport classification of `err`.
 * - `memory`: process memory snapshot, captured only for classified transport
 *   failures so RAM-pressure kills can be correlated with the event.
 * - `upstreamRequestIdHash`: bounded hash of the provider request id.
 */
function modelCallErrorFields(err: unknown): ModelCallErrorFields {
  const upstreamRequestIdHash = diagnosticProviderRequestIdHash(err);
  const failureKind = diagnosticErrorFailureKind(err);
  // The snapshot itself may be unavailable (it returns undefined on failure);
  // guard it separately instead of spreading `memory: undefined` alongside
  // failureKind as the previous single spread did.
  const memory = failureKind ? processMemoryUsageSnapshot() : undefined;
  return {
    errorCategory: diagnosticErrorCategory(err),
    ...(failureKind ? { failureKind } : {}),
    ...(memory ? { memory } : {}),
    ...(upstreamRequestIdHash ? { upstreamRequestIdHash } : {}),
  };
}
/**
 * Capture the current Node.js process memory counters for diagnostics.
 * Returns `undefined` when the runtime refuses to report memory usage, so
 * callers can simply omit the field rather than emit a partial snapshot.
 */
function processMemoryUsageSnapshot(): DiagnosticMemoryUsage | undefined {
  try {
    const { rss, heapTotal, heapUsed, external, arrayBuffers } = process.memoryUsage();
    return {
      rssBytes: rss,
      heapTotalBytes: heapTotal,
      heapUsedBytes: heapUsed,
      externalBytes: external,
      arrayBuffersBytes: arrayBuffers,
    };
  } catch {
    return undefined;
  }
}
function modelCallHookEventBase(eventBase: ModelCallEventBase): PluginHookModelCallStartedEvent {
return {
runId: eventBase.runId,

View File

@@ -1,6 +1,7 @@
import { describe, expect, it } from "vitest";
import {
diagnosticErrorCategory,
diagnosticErrorFailureKind,
diagnosticHttpStatusCode,
diagnosticProviderRequestIdHash,
} from "./diagnostic-error-metadata.js";
@@ -76,4 +77,37 @@ describe("diagnostic error metadata", () => {
expect(diagnosticProviderRequestIdHash(errorLike)).toBeUndefined();
});
// Each assertion feeds a representative transport error and expects the mapped
// low-cardinality kind; the final case confirms unrelated provider errors stay
// unclassified (undefined) rather than being bucketed incorrectly.
it("classifies low-cardinality transport failure kinds without exposing messages", () => {
expect(diagnosticErrorFailureKind(new Error("terminated"))).toBe("terminated");
// Structured `code` properties take precedence over message text.
expect(
diagnosticErrorFailureKind(Object.assign(new Error("fetch failed"), { code: "ECONNRESET" })),
).toBe("connection_reset");
// Errors wrapped under an `error` key (e.g. undici socket failures) are found.
expect(
diagnosticErrorFailureKind({
error: Object.assign(new Error("socket closed"), { code: "UND_ERR_SOCKET" }),
}),
).toBe("connection_closed");
expect(diagnosticErrorFailureKind(new Error("request timed out after 120000ms"))).toBe(
"timeout",
);
expect(diagnosticErrorFailureKind(new Error("operation was aborted"))).toBe("aborted");
// Non-transport failures must remain unclassified.
expect(diagnosticErrorFailureKind(new Error("provider rejected the request"))).toBeUndefined();
});
// Hostile error-like objects can hide throwing accessors behind `code` and
// `message`; classification must avoid triggering them and return undefined
// instead of propagating the getter's exception.
it("does not invoke throwing getters while classifying failure kinds", () => {
const errorLike = {};
Object.defineProperty(errorLike, "code", {
get() {
throw new Error("should not read getter");
},
});
Object.defineProperty(errorLike, "message", {
get() {
throw new Error("should not read getter");
},
});
expect(diagnosticErrorFailureKind(errorLike)).toBeUndefined();
});
});

View File

@@ -15,6 +15,13 @@ const PROVIDER_REQUEST_ID_TEXT_PATTERNS = [
/\((?:request_id|trace_id)\s*:\s*([A-Za-z0-9._:-]{1,128})\)/i,
] as const;
/**
 * Low-cardinality classification of a transport-level model-call failure.
 * Emitted as `failureKind` on `model.call.error` diagnostics (and as the
 * `openclaw.failureKind` telemetry attribute) so failures can be aggregated
 * without exposing raw error messages.
 */
export type DiagnosticErrorFailureKind =
| "aborted"
| "connection_closed"
| "connection_reset"
| "terminated"
| "timeout";
/** True for any non-null value that can carry properties (objects and functions). */
function isObjectLike(value: unknown): value is object {
  if (value === null) {
    return false;
  }
  const kind = typeof value;
  return kind === "object" || kind === "function";
}
@@ -101,6 +108,11 @@ function readDirectMessage(err: unknown): string | undefined {
return typeof message === "string" ? message : undefined;
}
/** Read an own-data `code` property from an error-like value, string only. */
function readDirectCode(err: unknown): string | undefined {
  const value = readOwnDataProperty(err, "code");
  if (typeof value === "string") {
    return value;
  }
  return undefined;
}
function extractProviderRequestIdFromText(text: string | undefined): string | undefined {
if (!text) {
return undefined;
@@ -158,6 +170,47 @@ export function diagnosticHttpStatusCode(err: unknown): string | undefined {
return undefined;
}
/**
 * Classify a model-call error into a low-cardinality transport failure kind.
 *
 * Structured `code` properties are checked first (exact match after trim and
 * uppercase); when the code is absent or unrecognized, the error message is
 * scanned against ordered keyword patterns. Returns `undefined` when the
 * error does not look like a transport-level failure, so callers can omit the
 * attribute entirely.
 */
export function diagnosticErrorFailureKind(err: unknown): DiagnosticErrorFailureKind | undefined {
  const rawCode = findDiagnosticErrorProperty(err, readDirectCode);
  if (rawCode !== undefined) {
    // Known Node/undici error codes, normalized to uppercase.
    const byCode: Record<string, DiagnosticErrorFailureKind> = {
      ABORT_ERR: "aborted",
      ECONNABORTED: "aborted",
      ERR_ABORTED: "aborted",
      ECONNRESET: "connection_reset",
      ERR_STREAM_PREMATURE_CLOSE: "connection_closed",
      UND_ERR_SOCKET: "connection_closed",
      ETIMEDOUT: "timeout",
      ERR_SOCKET_CONNECTION_TIMEOUT: "timeout",
    };
    const fromCode = byCode[rawCode.trim().toUpperCase()];
    if (fromCode) {
      return fromCode;
    }
    // An unrecognized code falls through to message-based classification.
  }
  const message = findDiagnosticErrorProperty(err, readDirectMessage);
  if (!message) {
    return undefined;
  }
  // Ordered keyword patterns: earlier (more specific) phrases win.
  const byMessage: Array<[RegExp, DiagnosticErrorFailureKind]> = [
    [/\b(?:terminated|sigkill|sigterm)\b/i, "terminated"],
    [/\b(?:econnreset|connection reset)\b/i, "connection_reset"],
    [/\b(?:socket hang up|premature close|connection closed|other side closed)\b/i, "connection_closed"],
    [/\b(?:timed out|timeout|etimedout)\b/i, "timeout"],
    [/\b(?:aborted|abort_err|operation was aborted)\b/i, "aborted"],
  ];
  for (const [pattern, kind] of byMessage) {
    if (pattern.test(message)) {
      return kind;
    }
  }
  return undefined;
}
export function diagnosticProviderRequestIdHash(err: unknown): string | undefined {
const fromProperty = findDiagnosticErrorProperty(err, readDirectProviderRequestId);
if (fromProperty) {

View File

@@ -327,6 +327,8 @@ export type DiagnosticModelCallErrorEvent = DiagnosticModelCallBaseEvent & {
type: "model.call.error";
durationMs: number;
errorCategory: string;
failureKind?: "aborted" | "connection_closed" | "connection_reset" | "terminated" | "timeout";
memory?: DiagnosticMemoryUsage;
requestPayloadBytes?: number;
responseStreamBytes?: number;
timeToFirstByteMs?: number;

View File

@@ -156,6 +156,14 @@ describe("diagnostic stability recorder", () => {
responseStreamBytes: 567,
timeToFirstByteMs: 89,
errorCategory: "TypeError",
failureKind: "terminated",
memory: {
rssBytes: 100,
heapTotalBytes: 80,
heapUsedBytes: 40,
externalBytes: 20,
arrayBuffersBytes: 10,
},
});
await new Promise<void>((resolve) => setImmediate(resolve));
@@ -175,6 +183,14 @@ describe("diagnostic stability recorder", () => {
responseBytes: 567,
timeToFirstByteMs: 89,
reason: "TypeError",
failureKind: "terminated",
memory: {
rssBytes: 100,
heapTotalBytes: 80,
heapUsedBytes: 40,
externalBytes: 20,
arrayBuffersBytes: 10,
},
});
expect(JSON.stringify(snapshot.events[1])).not.toContain("call-1");
});

View File

@@ -49,6 +49,7 @@ export type DiagnosticStabilityEventRecord = {
queueDepth?: number;
queueSize?: number;
waitMs?: number;
failureKind?: string;
active?: number;
waiting?: number;
queued?: number;
@@ -293,6 +294,7 @@ function sanitizeDiagnosticEvent(event: DiagnosticEventPayload): DiagnosticStabi
record.commandLength = event.commandLength;
record.exitCode = event.exitCode;
record.timedOut = event.timedOut;
record.failureKind = event.failureKind;
assignReasonCode(record, event.failureKind);
break;
case "run.started":
@@ -355,6 +357,8 @@ function sanitizeDiagnosticEvent(event: DiagnosticEventPayload): DiagnosticStabi
record.requestBytes = event.requestPayloadBytes;
record.responseBytes = event.responseStreamBytes;
record.timeToFirstByteMs = event.timeToFirstByteMs;
record.failureKind = event.failureKind;
record.memory = event.memory ? copyMemory(event.memory) : undefined;
assignReasonCode(record, event.errorCategory);
break;
case "log.record":

View File

@@ -212,6 +212,7 @@ export type PluginHookModelCallEndedEvent = PluginHookModelCallBaseEvent & {
durationMs: number;
outcome: "completed" | "error";
errorCategory?: string;
failureKind?: "aborted" | "connection_closed" | "connection_reset" | "terminated" | "timeout";
requestPayloadBytes?: number;
responseStreamBytes?: number;
timeToFirstByteMs?: number;