From 769d04b4ce7d6631f35ad78dd531609fc0309837 Mon Sep 17 00:00:00 2001
From: Peter Steinberger
Date: Mon, 27 Apr 2026 12:53:42 +0100
Subject: [PATCH] docs(models): clarify local chat completions routing

---
 docs/gateway/config-tools.md                |  2 +-
 docs/gateway/local-models.md                | 21 ++++++++++--
 docs/gateway/troubleshooting.md             |  3 ++
 src/agents/model-selection.test.ts          |  6 ++++
 src/agents/openai-transport-stream.test.ts  | 19 +++++++----
 src/agents/pi-embedded-runner/model.test.ts | 37 +++++++++++++++++++++
 6 files changed, 78 insertions(+), 10 deletions(-)

diff --git a/docs/gateway/config-tools.md b/docs/gateway/config-tools.md
index de0322798ff..096eee8881b 100644
--- a/docs/gateway/config-tools.md
+++ b/docs/gateway/config-tools.md
@@ -432,7 +432,7 @@ OpenClaw uses the built-in model catalog. Add custom providers via `models.provi
 - Safe edits: use `openclaw config set models.providers.<provider> '<json>' --strict-json --merge` or `openclaw config set models.providers.<provider>.models '<json>' --strict-json --merge` for additive updates. `config set` refuses destructive replacements unless you pass `--replace`.
-  - `models.providers.*.api`: request adapter (`openai-completions`, `openai-responses`, `anthropic-messages`, `google-generative-ai`, etc).
+  - `models.providers.*.api`: request adapter (`openai-completions`, `openai-responses`, `anthropic-messages`, `google-generative-ai`, etc.). For self-hosted `/v1/chat/completions` backends such as MLX, vLLM, SGLang, and most OpenAI-compatible local servers, use `openai-completions`. Use `openai-responses` only when the backend supports `/v1/responses`.
   - `models.providers.*.apiKey`: provider credential (prefer SecretRef/env substitution).
   - `models.providers.*.auth`: auth strategy (`api-key`, `token`, `oauth`, `aws-sdk`).
   - `models.providers.*.contextWindow`: default native context window for models under this provider when the model entry does not set `contextWindow`.
diff --git a/docs/gateway/local-models.md b/docs/gateway/local-models.md
index c72cbd7295e..5595ca5c5e5 100644
--- a/docs/gateway/local-models.md
+++ b/docs/gateway/local-models.md
@@ -113,17 +113,26 @@ Swap the primary and fallback order; keep the same providers block and `models.m
 ## Other OpenAI-compatible local proxies
 
-vLLM, LiteLLM, OAI-proxy, or custom gateways work if they expose an OpenAI-style `/v1` endpoint. Replace the provider block above with your endpoint and model ID:
+MLX (`mlx_lm.server`), vLLM, SGLang, LiteLLM, OAI-proxy, or custom
+gateways work if they expose an OpenAI-style `/v1/chat/completions`
+endpoint. Use the Chat Completions adapter unless the backend explicitly
+documents `/v1/responses` support. Replace the provider block above with your
+endpoint and model ID:
 
 ```json5
 {
+  agents: {
+    defaults: {
+      model: { primary: "local/my-local-model" },
+    },
+  },
   models: {
     mode: "merge",
     providers: {
       local: {
         baseUrl: "http://127.0.0.1:8000/v1",
         apiKey: "sk-local",
-        api: "openai-responses",
+        api: "openai-completions",
         timeoutSeconds: 300,
         models: [
           {
@@ -142,6 +151,14 @@
 }
 ```
 
+The `models.providers.<provider>.models[].id` value is provider-local. Do not
+include the provider prefix there. For example, an MLX server started with
+`mlx_lm.server --model mlx-community/Qwen3-30B-A3B-6bit` should use this
+catalog id and model ref:
+
+- `models.providers.mlx.models[].id: "mlx-community/Qwen3-30B-A3B-6bit"`
+- `agents.defaults.model.primary: "mlx/mlx-community/Qwen3-30B-A3B-6bit"`
+
 Keep `models.mode: "merge"` so hosted models stay available as fallbacks.
 Use `models.providers.<provider>.timeoutSeconds` for slow local or remote model servers
 before raising `agents.defaults.timeoutSeconds`. The provider timeout
diff --git a/docs/gateway/troubleshooting.md b/docs/gateway/troubleshooting.md
index 7fca62f86a6..32e669ffe2e 100644
--- a/docs/gateway/troubleshooting.md
+++ b/docs/gateway/troubleshooting.md
@@ -118,12 +118,15 @@ openclaw logs --follow
 
 Look for:
 
 - direct tiny calls succeed, but OpenClaw runs fail only on larger prompts
+- `model_not_found` or 404 errors even though direct `/v1/chat/completions`
+  works with the same bare model id
 - backend errors about `messages[].content` expecting a string
 - intermittent `incomplete turn detected ... stopReason=stop payloads=0`
   warnings with an OpenAI-compatible local backend
 - backend crashes that appear only with larger prompt-token counts or full
   agent runtime prompts
 
+- `model_not_found` with a local MLX/vLLM-style server → verify `baseUrl` includes `/v1`, `api` is `"openai-completions"` for `/v1/chat/completions` backends, and `models.providers.<provider>.models[].id` is the bare provider-local id. Select it with the provider prefix once, for example `mlx/mlx-community/Qwen3-30B-A3B-6bit`; keep the catalog entry as `mlx-community/Qwen3-30B-A3B-6bit`.
 - `messages[...].content: invalid type: sequence, expected a string` → backend rejects structured Chat Completions content parts. Fix: set `models.providers.<provider>.models[].compat.requiresStringContent: true`.
 - `incomplete turn detected ... stopReason=stop payloads=0` → the backend completed the Chat Completions request but returned no user-visible assistant text for that turn. OpenClaw retries replay-safe empty OpenAI-compatible turns once; persistent failures usually mean the backend is emitting empty/non-text content or suppressing final-answer text.
 - direct tiny requests succeed, but OpenClaw agent runs fail with backend/model crashes (for example Gemma on some `inferrs` builds) → OpenClaw transport is likely already correct; the backend is failing on the larger agent-runtime prompt shape.
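+
+Before touching OpenClaw config, confirm the backend accepts the bare id
+directly. A minimal sketch, assuming an MLX-style server listening on
+`127.0.0.1:8080` (substitute your own host, port, and model id):
+
+```bash
+# List the ids the server actually advertises; the catalog entry must match one exactly.
+curl -sS http://127.0.0.1:8080/v1/models
+
+# Minimal Chat Completions request using the bare, provider-local model id.
+curl -sS http://127.0.0.1:8080/v1/chat/completions \
+  -H 'Content-Type: application/json' \
+  -d '{"model": "mlx-community/Qwen3-30B-A3B-6bit", "messages": [{"role": "user", "content": "ping"}]}'
+```
+
+If these direct calls succeed but OpenClaw still reports `model_not_found`,
+the mismatch is in the catalog entry or the provider prefix, not the backend.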
diff --git a/src/agents/model-selection.test.ts b/src/agents/model-selection.test.ts
index 78fcc88b4bc..21ac8e79b7a 100644
--- a/src/agents/model-selection.test.ts
+++ b/src/agents/model-selection.test.ts
@@ -216,6 +216,12 @@ describe("model-selection", () => {
       defaultProvider: "anthropic",
       expected: { provider: "nvidia", model: "moonshotai/kimi-k2.5" },
     },
+    {
+      name: "preserves nested MLX model ids after the provider prefix",
+      variants: ["mlx/mlx-community/Qwen3-30B-A3B-6bit"],
+      defaultProvider: "anthropic",
+      expected: { provider: "mlx", model: "mlx-community/Qwen3-30B-A3B-6bit" },
+    },
     {
       name: "normalizes anthropic shorthand aliases",
       variants: ["anthropic/opus-4.6", "opus-4.6", " anthropic / opus-4.6 "],
diff --git a/src/agents/openai-transport-stream.test.ts b/src/agents/openai-transport-stream.test.ts
index e961c131352..6f4a0cf22b0 100644
--- a/src/agents/openai-transport-stream.test.ts
+++ b/src/agents/openai-transport-stream.test.ts
@@ -425,7 +425,7 @@ describe("openai transport stream", () => {
   });
 
   it("streams OpenAI-compatible loopback requests with the configured SDK timeout", async () => {
-    let captured: { path?: string; timeout?: string; roles?: string[] } = {};
+    let captured: { path?: string; timeout?: string; model?: string; roles?: string[] } = {};
     const server = createServer((req, res) => {
       let body = "";
       req.setEncoding("utf8");
@@ -433,12 +433,16 @@
         body += chunk;
       });
       req.on("end", () => {
-        const parsed = JSON.parse(body) as { messages?: Array<{ role?: string }> };
+        const parsed = JSON.parse(body) as {
+          model?: string;
+          messages?: Array<{ role?: string }>;
+        };
         captured = {
           path: req.url,
           timeout: Array.isArray(req.headers["x-stainless-timeout"])
            ? req.headers["x-stainless-timeout"][0]
            : req.headers["x-stainless-timeout"],
+          model: parsed.model,
          roles: parsed.messages?.map((message) => message.role ?? ""),
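+          // `model` above is asserted at the end of this test to prove the
+          // request body carries the bare provider-local id
+          // ("mlx-community/Qwen3-30B-A3B-6bit"), never the "mlx/"-prefixed ref.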
""), }; res.writeHead(200, { @@ -452,7 +456,7 @@ describe("openai transport stream", () => { id: "chatcmpl-timeout-proof", object: "chat.completion.chunk", created, - model: "slow-local", + model: "mlx-community/Qwen3-30B-A3B-6bit", choices: [ { index: 0, @@ -467,7 +471,7 @@ describe("openai transport stream", () => { id: "chatcmpl-timeout-proof", object: "chat.completion.chunk", created, - model: "slow-local", + model: "mlx-community/Qwen3-30B-A3B-6bit", choices: [{ index: 0, delta: {}, finish_reason: "stop" }], usage: { prompt_tokens: 1, completion_tokens: 1, total_tokens: 2 }, })}\n\n`, @@ -484,10 +488,10 @@ describe("openai transport stream", () => { throw new Error("Missing loopback server address"); } const baseModel = { - id: "slow-local", - name: "Slow Local", + id: "mlx-community/Qwen3-30B-A3B-6bit", + name: "Qwen3 MLX", api: "openai-completions", - provider: "custom-openai-compatible", + provider: "mlx", baseUrl: `http://127.0.0.1:${address.port}/v1`, reasoning: false, input: ["text"], @@ -524,6 +528,7 @@ describe("openai transport stream", () => { expect(captured.path).toBe("/v1/chat/completions"); expect(captured.timeout).toBe("900"); + expect(captured.model).toBe("mlx-community/Qwen3-30B-A3B-6bit"); expect(captured.roles).toEqual(["system", "user"]); expect(doneReason).toBe("stop"); expect(text).toBe("OK"); diff --git a/src/agents/pi-embedded-runner/model.test.ts b/src/agents/pi-embedded-runner/model.test.ts index 22980337139..fad57243061 100644 --- a/src/agents/pi-embedded-runner/model.test.ts +++ b/src/agents/pi-embedded-runner/model.test.ts @@ -674,6 +674,43 @@ describe("resolveModel", () => { expect(result.model?.input).toEqual(["text"]); }); + it("resolves custom MLX-style Hugging Face ids without adding the provider prefix", () => { + const modelId = "mlx-community/Qwen3-30B-A3B-6bit"; + const cfg = { + agents: { + defaults: { + model: { primary: `mlx/${modelId}` }, + }, + }, + models: { + providers: { + mlx: { + baseUrl: "http://127.0.0.1:8080/v1", + apiKey: "mlx-local", + api: "openai-completions", + models: [ + { + ...makeModel(modelId), + contextWindow: 131072, + maxTokens: 8192, + }, + ], + }, + }, + }, + } as unknown as OpenClawConfig; + + const result = resolveModelForTest("mlx", modelId, "/tmp/agent", cfg); + + expect(result.error).toBeUndefined(); + expect(result.model).toMatchObject({ + provider: "mlx", + id: modelId, + api: "openai-completions", + baseUrl: "http://127.0.0.1:8080/v1", + }); + }); + it("prefers provider-prefixed configured metadata over discovered text-only models", () => { mockDiscoveredModel(discoverModels, { provider: "custom",