From 769d04b4ce7d6631f35ad78dd531609fc0309837 Mon Sep 17 00:00:00 2001
From: Peter Steinberger
Date: Mon, 27 Apr 2026 12:53:42 +0100
Subject: [PATCH] docs(models): clarify local chat completions routing

---
 docs/gateway/config-tools.md                |  2 +-
 docs/gateway/local-models.md                | 21 ++++++++++--
 docs/gateway/troubleshooting.md             |  3 ++
 src/agents/model-selection.test.ts          |  6 ++++
 src/agents/openai-transport-stream.test.ts  | 19 +++++++----
 src/agents/pi-embedded-runner/model.test.ts | 37 +++++++++++++++++++++
 6 files changed, 78 insertions(+), 10 deletions(-)

diff --git a/docs/gateway/config-tools.md b/docs/gateway/config-tools.md
index de0322798ff..096eee8881b 100644
--- a/docs/gateway/config-tools.md
+++ b/docs/gateway/config-tools.md
@@ -432,7 +432,7 @@ OpenClaw uses the built-in model catalog. Add custom providers via `models.provi
 - Safe edits: use `openclaw config set models.providers.<provider> '<json>' --strict-json --merge` or `openclaw config set models.providers.<provider>.models '<json>' --strict-json --merge` for additive updates. `config set` refuses destructive replacements unless you pass `--replace`.
-  - `models.providers.*.api`: request adapter (`openai-completions`, `openai-responses`, `anthropic-messages`, `google-generative-ai`, etc).
+  - `models.providers.*.api`: request adapter (`openai-completions`, `openai-responses`, `anthropic-messages`, `google-generative-ai`, etc.). For self-hosted `/v1/chat/completions` backends such as MLX, vLLM, SGLang, and most OpenAI-compatible local servers, use `openai-completions`. Use `openai-responses` only when the backend supports `/v1/responses`.
   - `models.providers.*.apiKey`: provider credential (prefer SecretRef/env substitution).
   - `models.providers.*.auth`: auth strategy (`api-key`, `token`, `oauth`, `aws-sdk`).
   - `models.providers.*.contextWindow`: default native context window for models under this provider when the model entry does not set `contextWindow`.
diff --git a/docs/gateway/local-models.md b/docs/gateway/local-models.md
index c72cbd7295e..5595ca5c5e5 100644
--- a/docs/gateway/local-models.md
+++ b/docs/gateway/local-models.md
@@ -113,17 +113,26 @@ Swap the primary and fallback order; keep the same providers block and `models.m
 ## Other OpenAI-compatible local proxies
 
-vLLM, LiteLLM, OAI-proxy, or custom gateways work if they expose an OpenAI-style `/v1` endpoint. Replace the provider block above with your endpoint and model ID:
+MLX (`mlx_lm.server`), vLLM, SGLang, LiteLLM, OAI-proxy, or custom
+gateways work if they expose an OpenAI-style `/v1/chat/completions`
+endpoint. Use the Chat Completions adapter unless the backend explicitly
+documents `/v1/responses` support. Replace the provider block above with your
+endpoint and model ID:
 
 ```json5
 {
+  agents: {
+    defaults: {
+      model: { primary: "local/my-local-model" },
+    },
+  },
   models: {
     mode: "merge",
     providers: {
       local: {
         baseUrl: "http://127.0.0.1:8000/v1",
         apiKey: "sk-local",
-        api: "openai-responses",
+        api: "openai-completions",
         timeoutSeconds: 300,
         models: [
           {
@@ -142,6 +151,14 @@
 }
 ```
 
+The `models.providers.<provider>.models[].id` value is provider-local. Do not
+include the provider prefix there. For example, an MLX server started with
+`mlx_lm.server --model mlx-community/Qwen3-30B-A3B-6bit` should use this
+catalog id and model ref:
+
+- `models.providers.mlx.models[].id: "mlx-community/Qwen3-30B-A3B-6bit"`
+- `agents.defaults.model.primary: "mlx/mlx-community/Qwen3-30B-A3B-6bit"`
+
 Keep `models.mode: "merge"` so hosted models stay available as fallbacks.
 Use `models.providers.<provider>.timeoutSeconds` for slow local or remote model servers
 before raising `agents.defaults.timeoutSeconds`. The provider timeout
diff --git a/docs/gateway/troubleshooting.md b/docs/gateway/troubleshooting.md
index 7fca62f86a6..32e669ffe2e 100644
--- a/docs/gateway/troubleshooting.md
+++ b/docs/gateway/troubleshooting.md
@@ -118,12 +118,15 @@ openclaw logs --follow
 
 Look for:
 
 - direct tiny calls succeed, but OpenClaw runs fail only on larger prompts
+- `model_not_found` or 404 errors even though direct `/v1/chat/completions`
+  works with the same bare model id
 - backend errors about `messages[].content` expecting a string
 - intermittent `incomplete turn detected ... stopReason=stop payloads=0`
   warnings with an OpenAI-compatible local backend
 - backend crashes that appear only with larger prompt-token counts or full
   agent runtime prompts
 
+- `model_not_found` with a local MLX/vLLM-style server → verify `baseUrl` includes `/v1`, `api` is `"openai-completions"` for `/v1/chat/completions` backends, and `models.providers.<provider>.models[].id` is the bare provider-local id. Select it with the provider prefix once, for example `mlx/mlx-community/Qwen3-30B-A3B-6bit`; keep the catalog entry as `mlx-community/Qwen3-30B-A3B-6bit`.
 - `messages[...].content: invalid type: sequence, expected a string` → backend rejects structured Chat Completions content parts. Fix: set `models.providers.<provider>.models[].compat.requiresStringContent: true`.
 - `incomplete turn detected ... stopReason=stop payloads=0` → the backend completed the Chat Completions request but returned no user-visible assistant text for that turn. OpenClaw retries replay-safe empty OpenAI-compatible turns once; persistent failures usually mean the backend is emitting empty/non-text content or suppressing final-answer text.
 - direct tiny requests succeed, but OpenClaw agent runs fail with backend/model crashes (for example Gemma on some `inferrs` builds) → OpenClaw transport is likely already correct; the backend is failing on the larger agent-runtime prompt shape.
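+
+Before touching OpenClaw config, confirm the backend accepts the bare id
+directly. A minimal sketch, assuming an MLX-style server listening on
+`127.0.0.1:8080` (substitute your own host, port, and model id):
+
+```bash
+# List the ids the server actually advertises; the catalog entry must match one exactly.
+curl -sS http://127.0.0.1:8080/v1/models
+
+# Minimal Chat Completions request using the bare, provider-local model id.
+curl -sS http://127.0.0.1:8080/v1/chat/completions \
+  -H 'Content-Type: application/json' \
+  -d '{"model": "mlx-community/Qwen3-30B-A3B-6bit", "messages": [{"role": "user", "content": "ping"}]}'
+```
+
+If these direct calls succeed but OpenClaw still reports `model_not_found`,
+the mismatch is in the catalog entry or the provider prefix, not the backend.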
diff --git a/src/agents/model-selection.test.ts b/src/agents/model-selection.test.ts
index 78fcc88b4bc..21ac8e79b7a 100644
--- a/src/agents/model-selection.test.ts
+++ b/src/agents/model-selection.test.ts
@@ -216,6 +216,12 @@ describe("model-selection", () => {
       defaultProvider: "anthropic",
       expected: { provider: "nvidia", model: "moonshotai/kimi-k2.5" },
     },
+    {
+      name: "preserves nested MLX model ids after the provider prefix",
+      variants: ["mlx/mlx-community/Qwen3-30B-A3B-6bit"],
+      defaultProvider: "anthropic",
+      expected: { provider: "mlx", model: "mlx-community/Qwen3-30B-A3B-6bit" },
+    },
     {
       name: "normalizes anthropic shorthand aliases",
       variants: ["anthropic/opus-4.6", "opus-4.6", " anthropic / opus-4.6 "],
diff --git a/src/agents/openai-transport-stream.test.ts b/src/agents/openai-transport-stream.test.ts
index e961c131352..6f4a0cf22b0 100644
--- a/src/agents/openai-transport-stream.test.ts
+++ b/src/agents/openai-transport-stream.test.ts
@@ -425,7 +425,7 @@ describe("openai transport stream", () => {
   });
 
   it("streams OpenAI-compatible loopback requests with the configured SDK timeout", async () => {
-    let captured: { path?: string; timeout?: string; roles?: string[] } = {};
+    let captured: { path?: string; timeout?: string; model?: string; roles?: string[] } = {};
     const server = createServer((req, res) => {
       let body = "";
       req.setEncoding("utf8");
@@ -433,12 +433,16 @@
         body += chunk;
       });
       req.on("end", () => {
-        const parsed = JSON.parse(body) as { messages?: Array<{ role?: string }> };
+        const parsed = JSON.parse(body) as {
+          model?: string;
+          messages?: Array<{ role?: string }>;
+        };
         captured = {
           path: req.url,
           timeout: Array.isArray(req.headers["x-stainless-timeout"])
            ? req.headers["x-stainless-timeout"][0]
            : req.headers["x-stainless-timeout"],
+          model: parsed.model,
          roles: parsed.messages?.map((message) => message.role ?? ""),
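+          // `model` above is asserted at the end of this test to prove the
+          // request body carries the bare provider-local id
+          // ("mlx-community/Qwen3-30B-A3B-6bit"), never the "mlx/"-prefixed ref.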
""), }; res.writeHead(200, { @@ -452,7 +456,7 @@ describe("openai transport stream", () => { id: "chatcmpl-timeout-proof", object: "chat.completion.chunk", created, - model: "slow-local", + model: "mlx-community/Qwen3-30B-A3B-6bit", choices: [ { index: 0, @@ -467,7 +471,7 @@ describe("openai transport stream", () => { id: "chatcmpl-timeout-proof", object: "chat.completion.chunk", created, - model: "slow-local", + model: "mlx-community/Qwen3-30B-A3B-6bit", choices: [{ index: 0, delta: {}, finish_reason: "stop" }], usage: { prompt_tokens: 1, completion_tokens: 1, total_tokens: 2 }, })}\n\n`, @@ -484,10 +488,10 @@ describe("openai transport stream", () => { throw new Error("Missing loopback server address"); } const baseModel = { - id: "slow-local", - name: "Slow Local", + id: "mlx-community/Qwen3-30B-A3B-6bit", + name: "Qwen3 MLX", api: "openai-completions", - provider: "custom-openai-compatible", + provider: "mlx", baseUrl: `http://127.0.0.1:${address.port}/v1`, reasoning: false, input: ["text"], @@ -524,6 +528,7 @@ describe("openai transport stream", () => { expect(captured.path).toBe("/v1/chat/completions"); expect(captured.timeout).toBe("900"); + expect(captured.model).toBe("mlx-community/Qwen3-30B-A3B-6bit"); expect(captured.roles).toEqual(["system", "user"]); expect(doneReason).toBe("stop"); expect(text).toBe("OK"); diff --git a/src/agents/pi-embedded-runner/model.test.ts b/src/agents/pi-embedded-runner/model.test.ts index 22980337139..fad57243061 100644 --- a/src/agents/pi-embedded-runner/model.test.ts +++ b/src/agents/pi-embedded-runner/model.test.ts @@ -674,6 +674,43 @@ describe("resolveModel", () => { expect(result.model?.input).toEqual(["text"]); }); + it("resolves custom MLX-style Hugging Face ids without adding the provider prefix", () => { + const modelId = "mlx-community/Qwen3-30B-A3B-6bit"; + const cfg = { + agents: { + defaults: { + model: { primary: `mlx/${modelId}` }, + }, + }, + models: { + providers: { + mlx: { + baseUrl: "http://127.0.0.1:8080/v1", + apiKey: "mlx-local", + api: "openai-completions", + models: [ + { + ...makeModel(modelId), + contextWindow: 131072, + maxTokens: 8192, + }, + ], + }, + }, + }, + } as unknown as OpenClawConfig; + + const result = resolveModelForTest("mlx", modelId, "/tmp/agent", cfg); + + expect(result.error).toBeUndefined(); + expect(result.model).toMatchObject({ + provider: "mlx", + id: modelId, + api: "openai-completions", + baseUrl: "http://127.0.0.1:8080/v1", + }); + }); + it("prefers provider-prefixed configured metadata over discovered text-only models", () => { mockDiscoveredModel(discoverModels, { provider: "custom",