From 91adb69c57b1e4a1ebaf1c151b394628c258ce5d Mon Sep 17 00:00:00 2001 From: zhang-guiping Date: Sun, 26 Apr 2026 06:19:28 +0800 Subject: [PATCH] fix(image): resolve configured image models --- CHANGELOG.md | 3 + src/media-understanding/image.test.ts | 105 ++++++++++++++++++++++++++ src/media-understanding/image.ts | 50 +++++++++++- 3 files changed, 156 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 51e7673cd7b..ffb46dfbb50 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -67,6 +67,9 @@ Docs: https://docs.openclaw.ai - CLI/plugins: keep `message` startup, `channels logs`, `agents delete`, and `agents set-identity` off broad plugin preloading; message delivery still loads plugins when the action actually runs. +- Image understanding: resolve configured image models such as local LM Studio + vision entries before reporting `Unknown model` when the discovery registry + has not registered that provider. Fixes #66486. Thanks @zhanggpcsu. - CLI/agents: keep `agents bind`, `agents unbind`, and `agents bindings` on setup-safe channel metadata paths so they do not preload bundled plugin runtimes or stage runtime dependencies. Fixes #71743. diff --git a/src/media-understanding/image.test.ts b/src/media-understanding/image.test.ts index 5cfd61090be..dee9ec0384c 100644 --- a/src/media-understanding/image.test.ts +++ b/src/media-understanding/image.test.ts @@ -18,6 +18,8 @@ const hoisted = vi.hoisted(() => ({ discoverModelsMock: vi.fn(), fetchMock: vi.fn(), registerProviderStreamForModelMock: vi.fn(), + prepareProviderDynamicModelMock: vi.fn(async () => {}), + resolveModelWithRegistryMock: vi.fn(), })); const { completeMock, @@ -29,8 +31,16 @@ const { discoverModelsMock, fetchMock, registerProviderStreamForModelMock, + prepareProviderDynamicModelMock, + resolveModelWithRegistryMock, } = hoisted; +type ResolveModelWithRegistryTestParams = { + modelRegistry: { find: (provider: string, modelId: string) => unknown }; + provider: string; + modelId: string; +}; + vi.mock("@mariozechner/pi-ai", async () => { const actual = await vi.importActual("@mariozechner/pi-ai"); return { @@ -63,6 +73,17 @@ vi.mock("../agents/pi-model-discovery-runtime.js", () => ({ discoverModels: discoverModelsMock, })); +vi.mock("../plugins/provider-runtime.js", async () => ({ + ...(await vi.importActual( + "../plugins/provider-runtime.js", + )), + prepareProviderDynamicModel: prepareProviderDynamicModelMock, +})); + +vi.mock("../agents/pi-embedded-runner/model.js", () => ({ + resolveModelWithRegistry: resolveModelWithRegistryMock, +})); + const { describeImageWithModel } = await import("./image.js"); describe("describeImageWithModel", () => { @@ -93,6 +114,12 @@ describe("describeImageWithModel", () => { baseUrl: "https://api.minimax.io/anthropic", })), }); + resolveModelWithRegistryMock.mockImplementation( + // Delegate to modelRegistry.find so tests that override discoverModelsMock + // automatically get the right model through resolveModelWithRegistry. + ({ modelRegistry, provider, modelId }: ResolveModelWithRegistryTestParams) => + modelRegistry.find(provider, modelId), + ); }); it("routes minimax-portal image models through the MiniMax VLM endpoint", async () => { @@ -188,6 +215,84 @@ describe("describeImageWithModel", () => { expect(fetchMock).not.toHaveBeenCalled(); }); + it("resolves configured image models when discovery has not registered the provider", async () => { + const registryFind = vi.fn(() => null); + discoverModelsMock.mockReturnValue({ find: registryFind }); + resolveModelWithRegistryMock.mockImplementationOnce( + ({ provider, modelId }: ResolveModelWithRegistryTestParams) => ({ + provider, + id: modelId, + api: "anthropic-messages", + input: ["text", "image"], + baseUrl: "http://127.0.0.1:1234", + }), + ); + completeMock.mockResolvedValue({ + role: "assistant", + api: "anthropic-messages", + provider: "lmstudio", + model: "google/gemma-4-e2b", + stopReason: "stop", + timestamp: Date.now(), + content: [{ type: "text", text: "local vision ok" }], + }); + + const result = await describeImageWithModel({ + cfg: { + models: { + providers: { + lmstudio: { + api: "anthropic-messages", + baseUrl: "http://127.0.0.1:1234", + models: [ + { + id: "google/gemma-4-e2b", + name: "google/gemma-4-e2b", + input: ["text", "image"], + reasoning: false, + cost: { input: 0, output: 0, cacheRead: 0, cacheWrite: 0 }, + contextWindow: 131_072, + maxTokens: 4096, + }, + ], + }, + }, + }, + }, + agentDir: "/tmp/openclaw-agent", + provider: "lmstudio", + model: "google/gemma-4-e2b", + buffer: Buffer.from("png-bytes"), + fileName: "image.png", + mime: "image/png", + prompt: "Describe the image.", + timeoutMs: 1000, + }); + + expect(result).toEqual({ + text: "local vision ok", + model: "google/gemma-4-e2b", + }); + expect(registryFind).not.toHaveBeenCalled(); + expect(resolveModelWithRegistryMock).toHaveBeenCalledWith( + expect.objectContaining({ + provider: "lmstudio", + modelId: "google/gemma-4-e2b", + cfg: expect.objectContaining({ + models: expect.objectContaining({ + providers: expect.objectContaining({ + lmstudio: expect.objectContaining({ + baseUrl: "http://127.0.0.1:1234", + }), + }), + }), + }), + }), + ); + expect(prepareProviderDynamicModelMock).not.toHaveBeenCalled(); + expect(completeMock).toHaveBeenCalledOnce(); + }); + it("passes image prompt as system instructions for codex image requests", async () => { discoverModelsMock.mockReturnValue({ find: vi.fn(() => ({ diff --git a/src/media-understanding/image.ts b/src/media-understanding/image.ts index 261fdb9b8a4..02d7624285c 100644 --- a/src/media-understanding/image.ts +++ b/src/media-understanding/image.ts @@ -6,14 +6,16 @@ import { requireApiKey, resolveApiKeyForProvider, } from "../agents/model-auth.js"; -import { normalizeModelRef } from "../agents/model-selection.js"; +import { findNormalizedProviderValue, normalizeModelRef } from "../agents/model-selection.js"; import { ensureOpenClawModelsJson } from "../agents/models-config.js"; +import { resolveModelWithRegistry } from "../agents/pi-embedded-runner/model.js"; import { resolveProviderRequestCapabilities } from "../agents/provider-attribution.js"; import { registerProviderStreamForModel } from "../agents/provider-stream.js"; import { coerceImageAssistantText, hasImageReasoningOnlyResponse, } from "../agents/tools/image-tool.helpers.js"; +import { prepareProviderDynamicModel } from "../plugins/provider-runtime.js"; import type { ImageDescriptionRequest, ImageDescriptionResult, @@ -141,11 +143,55 @@ async function resolveImageRuntime(params: { const authStorage = discoverAuthStorage(params.agentDir); const modelRegistry = discoverModels(authStorage, params.agentDir); const resolvedRef = normalizeModelRef(params.provider, params.model); - const model = modelRegistry.find(resolvedRef.provider, resolvedRef.model) as Model | null; + const configuredProviders = params.cfg.models?.providers; + const providerConfig = + configuredProviders?.[resolvedRef.provider] ?? + findNormalizedProviderValue(configuredProviders, resolvedRef.provider); + // Fast path: resolve without dynamic model preparation first. + // This avoids unnecessary prepare hooks (e.g. OpenRouter catalog fetch) + // for models that are already explicitly resolvable. + let model = resolveModelWithRegistry({ + provider: resolvedRef.provider, + modelId: resolvedRef.model, + modelRegistry, + cfg: params.cfg, + agentDir: params.agentDir, + }) as Model | null; + + // If the model is not in the registry yet, prepare dynamic provider models + // and retry (needed for provider-runtime-backed dynamic models). + if (!model) { + await prepareProviderDynamicModel({ + provider: resolvedRef.provider, + config: params.cfg, + context: { + config: params.cfg, + agentDir: params.agentDir, + provider: resolvedRef.provider, + modelId: resolvedRef.model, + modelRegistry, + providerConfig, + }, + }); + model = resolveModelWithRegistry({ + provider: resolvedRef.provider, + modelId: resolvedRef.model, + modelRegistry, + cfg: params.cfg, + agentDir: params.agentDir, + }) as Model | null; + } if (!model) { throw new Error(`Unknown model: ${resolvedRef.provider}/${resolvedRef.model}`); } if (!model.input?.includes("image")) { + // resolveModelWithRegistry may synthesize a text-only fallback for configured + // providers, which would change "Unknown model" → "Model does not support images" + // and skip the MiniMax VLM recovery path. Throw Unknown model for MiniMax VLM + // models so the caller can attempt the fallback. + if (isMinimaxVlmModel(resolvedRef.provider, resolvedRef.model)) { + throw new Error(`Unknown model: ${resolvedRef.provider}/${resolvedRef.model}`); + } throw new Error(`Model does not support images: ${params.provider}/${params.model}`); } const apiKeyInfo = await getApiKeyForModel({