refactor(pdf): move document extraction to plugin

* refactor(pdf): move document extraction to plugin

* fix(deps): sync document extract lockfile

* fix(pdf): harden document extraction plugin
This commit is contained in:
Vincent Koc
2026-04-24 17:15:05 -07:00
committed by GitHub
parent 915931aa38
commit e3cba98f39
34 changed files with 1023 additions and 321 deletions

View File

@@ -6,6 +6,7 @@ Docs: https://docs.openclaw.ai
### Changes
- Plugins/PDF: move local PDF extraction into a bundled `document-extract` plugin so core no longer owns `pdfjs-dist` or PDF image-rendering dependencies. Thanks @vincentkoc.
- Matrix: require full cross-signing identity trust for self-device verification and add `openclaw matrix verify self` so operators can establish that trust from the CLI. (#70401) Thanks @gumadeiras.
### Fixes

View File

@@ -172,8 +172,9 @@ Current behavior:
rasterized into images and passed to the model, and the injected file block uses
the placeholder `[PDF content rendered to images]`.
PDF parsing uses the Node-friendly `pdfjs-dist` legacy build (no worker). The modern
PDF.js build expects browser workers/DOM globals, so it is not used in the Gateway.
PDF parsing is provided by the bundled `document-extract` plugin, which uses the
Node-friendly `pdfjs-dist` legacy build (no worker). The modern PDF.js build
expects browser workers/DOM globals, so it is not used in the Gateway.
URL fetch defaults:

View File

@@ -112,7 +112,9 @@ Fallback details:
- If text extraction succeeds but image extraction would require vision on a
text-only model, OpenClaw drops the rendered images and continues with the
extracted text.
- Extraction fallback requires `pdfjs-dist` (and `@napi-rs/canvas` for image rendering).
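- Extraction fallback uses the bundled `document-extract` plugin. The plugin owns
`pdfjs-dist`; `@napi-rs/canvas` is an optional peer dependency that is loaded only
for the image rendering fallback. If it cannot be loaded, image rendering is
skipped and only the extracted text is returned.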
## Config

View File

@@ -0,0 +1,62 @@
import { beforeEach, describe, expect, it, vi } from "vitest";
// Fixtures are created with vi.hoisted so the vi.mock factories below can reference them.
const { canvasSizes, pdfDocument } = vi.hoisted(() => ({
// Records the width/height of every canvas requested through the mocked createCanvas.
canvasSizes: [] as Array<{ width: number; height: number }>,
// Fake two-page document: pages expose no text items and a 1000x1000 viewport at scale 1.
pdfDocument: {
numPages: 2,
getPage: vi.fn(async () => ({
getTextContent: vi.fn(async () => ({ items: [] })),
getViewport: vi.fn(({ scale }: { scale: number }) => ({
width: 1000 * scale,
height: 1000 * scale,
})),
render: vi.fn(() => ({ promise: Promise.resolve() })),
})),
},
}));
// Replace pdf.js so getDocument always resolves to the fake document above.
vi.mock("pdfjs-dist/legacy/build/pdf.mjs", () => ({
getDocument: vi.fn(() => ({ promise: Promise.resolve(pdfDocument) })),
}));
// Replace the native canvas module; each createCanvas call is logged in canvasSizes.
vi.mock("@napi-rs/canvas", () => ({
createCanvas: vi.fn((width: number, height: number) => {
canvasSizes.push({ width, height });
return {
toBuffer: vi.fn(() => Buffer.from("png")),
};
}),
}));
import { createPdfDocumentExtractor } from "./document-extractor.js";
// Exercises the extractor returned by createPdfDocumentExtractor against the mocked pdf.js and canvas modules.
describe("PDF document extractor", () => {
beforeEach(() => {
// Reset state shared between tests: recorded canvas sizes and getPage call history.
canvasSizes.length = 0;
pdfDocument.getPage.mockClear();
});
it("declares PDF support", () => {
const extractor = createPdfDocumentExtractor();
expect(extractor).toMatchObject({
id: "pdf",
label: "PDF",
mimeTypes: ["application/pdf"],
});
});
it("treats maxPixels as a hard total image rendering budget", async () => {
const extractor = createPdfDocumentExtractor();
// The mocked pages yield no text, so minTextChars is not met and image rendering is attempted.
const result = await extractor.extract({
buffer: Buffer.from("%PDF-1.4"),
mimeType: "application/pdf",
maxPages: 2,
maxPixels: 100,
minTextChars: 10,
});
// Only one 10x10 canvas (100 pixels) is expected even though the fake document has two pages.
expect(result?.images).toHaveLength(1);
expect(canvasSizes).toEqual([{ width: 10, height: 10 }]);
});
});

View File

@@ -0,0 +1,216 @@
import type {
DocumentExtractedImage,
DocumentExtractionRequest,
DocumentExtractionResult,
DocumentExtractorPlugin,
} from "openclaw/plugin-sdk/document-extractor";
/** Canvas surface as used by this module: it only needs to be encodable as PNG. */
type CanvasLike = {
toBuffer(type: "image/png"): Buffer;
};
/** The part of the dynamically imported canvas module that this module calls. */
type CanvasModule = {
createCanvas(width: number, height: number): CanvasLike;
};
/** A text-content item that carries a string. */
type PdfTextItem = {
str: string;
};
/** Result of getTextContent(); items without a `str` property are typed as plain objects. */
type PdfTextContent = {
items: Array<PdfTextItem | object>;
};
/** Page dimensions returned by getViewport() for a given scale. */
type PdfViewport = {
width: number;
height: number;
};
/** The page operations this module relies on: text extraction, viewport sizing and rendering. */
type PdfPage = {
getTextContent(): Promise<PdfTextContent>;
getViewport(params: { scale: number }): PdfViewport;
render(params: { canvas: unknown; viewport: PdfViewport }): { promise: Promise<void> };
};
/** A loaded document: page count plus page lookup by page number. */
type PdfDocument = {
numPages: number;
getPage(pageNumber: number): Promise<PdfPage>;
};
/** The part of the dynamically imported pdf.js module that this module calls. */
type PdfJsModule = {
getDocument(params: { data: Uint8Array; disableWorker?: boolean }): {
promise: Promise<PdfDocument>;
};
};
// Module specifiers handed to dynamic import() by loadCanvasModule / loadPdfJsModule.
const CANVAS_MODULE = "@napi-rs/canvas";
const PDFJS_MODULE = "pdfjs-dist/legacy/build/pdf.mjs";
// Cap on the total length of page text collected per extraction (see appendTextWithinLimit).
const MAX_EXTRACTED_TEXT_CHARS = 200_000;
// Cap on the width and on the height of a rendered page canvas (see resolveRenderPlan).
const MAX_RENDER_DIMENSION = 10_000;
// Cached import promises; each loader resets its cache to null when the import fails.
let canvasModulePromise: Promise<CanvasModule> | null = null;
let pdfJsModulePromise: Promise<PdfJsModule> | null = null;
/**
 * Lazily imports `@napi-rs/canvas`, sharing one import promise between callers.
 *
 * A failed import clears the cached promise so that a later call tries again.
 *
 * @returns The loaded canvas module.
 * @throws Error carrying the import failure as `cause` when the module cannot be loaded.
 */
async function loadCanvasModule(): Promise<CanvasModule> {
  if (canvasModulePromise) {
    return canvasModulePromise;
  }
  const pending = (import(CANVAS_MODULE) as Promise<CanvasModule>).catch((err) => {
    // Forget the failed attempt before surfacing the wrapped error.
    canvasModulePromise = null;
    throw new Error("Optional dependency @napi-rs/canvas is required for PDF image extraction", {
      cause: err,
    });
  });
  canvasModulePromise = pending;
  return pending;
}
/**
 * Lazily imports the pdf.js legacy build and memoizes the import promise.
 *
 * When the import rejects, the memoized promise is dropped so the next call retries.
 *
 * @returns The loaded pdf.js module.
 * @throws Error carrying the import failure as `cause` when pdf.js cannot be loaded.
 */
async function loadPdfJsModule(): Promise<PdfJsModule> {
  if (pdfJsModulePromise) {
    return pdfJsModulePromise;
  }
  const pending = (import(PDFJS_MODULE) as Promise<PdfJsModule>).catch((err) => {
    // Reset first so a later call can attempt the import again.
    pdfJsModulePromise = null;
    throw new Error("Optional dependency pdfjs-dist is required for PDF extraction", {
      cause: err,
    });
  });
  pdfJsModulePromise = pending;
  return pending;
}
/**
 * Appends one page of text to `parts` without letting the running total exceed
 * MAX_EXTRACTED_TEXT_CHARS.
 *
 * When the page has to be truncated, the cut never lands between the two halves of a
 * UTF-16 surrogate pair, so the collected text never ends in a lone high surrogate.
 *
 * @param parts - Accumulator of page texts; mutated in place.
 * @param pageText - Text of the page to append; an empty string is ignored.
 * @param currentLength - Number of characters already collected in `parts`.
 * @returns The new running total. Once a page has been truncated this is
 *   MAX_EXTRACTED_TEXT_CHARS, signalling that the budget is spent.
 */
function appendTextWithinLimit(parts: string[], pageText: string, currentLength: number): number {
  if (!pageText) {
    return currentLength;
  }
  const remaining = MAX_EXTRACTED_TEXT_CHARS - currentLength;
  if (remaining <= 0) {
    return currentLength;
  }
  if (pageText.length <= remaining) {
    parts.push(pageText);
    return currentLength + pageText.length;
  }
  let cut = remaining;
  const lastKeptUnit = pageText.charCodeAt(cut - 1);
  // A high surrogate (0xD800-0xDBFF) at the cut would be separated from its low surrogate.
  if (lastKeptUnit >= 0xd800 && lastKeptUnit <= 0xdbff) {
    cut -= 1;
  }
  if (cut > 0) {
    parts.push(pageText.slice(0, cut));
  }
  // Report the budget as exhausted even if one unit was dropped, so no later page is appended.
  return MAX_EXTRACTED_TEXT_CHARS;
}
/**
 * Picks the largest render scale (never above 1) at which a page fits both the remaining
 * pixel budget and the MAX_RENDER_DIMENSION cap on width and height.
 *
 * The scale is found by bisection over 32 steps because canvas dimensions are rounded up,
 * so the rounded area is re-checked for every candidate scale.
 *
 * @param viewport - Page dimensions at scale 1.
 * @param remainingPixels - Pixels still available for rendering.
 * @returns The chosen scale with the rounded canvas size and its pixel count, or `null`
 *   when the budget is spent, the viewport is unusable, or no candidate fits.
 */
function resolveRenderPlan(
  viewport: PdfViewport,
  remainingPixels: number,
): { scale: number; width: number; height: number; pixels: number } | null {
  if (remainingPixels <= 0) {
    return null;
  }
  const pageWidth = viewport.width;
  const pageHeight = viewport.height;
  const usableViewport =
    Number.isFinite(pageWidth) && Number.isFinite(pageHeight) && pageWidth > 0 && pageHeight > 0;
  if (!usableViewport) {
    return null;
  }

  const pageArea = Math.max(1, pageWidth * pageHeight);
  const scaleCeiling = Math.min(
    1,
    Math.sqrt(remainingPixels / pageArea),
    MAX_RENDER_DIMENSION / pageWidth,
    MAX_RENDER_DIMENSION / pageHeight,
  );
  if (!(Number.isFinite(scaleCeiling) && scaleCeiling > 0)) {
    return null;
  }

  let plan: { scale: number; width: number; height: number; pixels: number } | null = null;
  let lower = 0;
  let upper = scaleCeiling;
  let step = 0;
  while (step < 32) {
    const candidate = (lower + upper) / 2;
    const canvasWidth = Math.max(1, Math.ceil(pageWidth * candidate));
    const canvasHeight = Math.max(1, Math.ceil(pageHeight * candidate));
    const area = canvasWidth * canvasHeight;
    const fits =
      canvasWidth <= MAX_RENDER_DIMENSION &&
      canvasHeight <= MAX_RENDER_DIMENSION &&
      area <= remainingPixels;
    if (fits) {
      // Keep the candidate and look for a larger scale.
      plan = { scale: candidate, width: canvasWidth, height: canvasHeight, pixels: area };
      lower = candidate;
    } else {
      upper = candidate;
    }
    step += 1;
  }
  return plan;
}
/**
 * Releases a loaded pdf.js document if it exposes a `destroy()` method.
 * Cleanup is best-effort: a failure here must not replace the extraction result or error.
 */
async function destroyPdfDocument(pdf: object): Promise<void> {
  if (!("destroy" in pdf) || typeof pdf.destroy !== "function") {
    return;
  }
  try {
    await pdf.destroy();
  } catch {
    // Deliberately ignored; see the function comment.
  }
}

/**
 * Extracts content from a PDF: page text first, rendered page images only as a fallback.
 *
 * Text is collected from the selected pages, capped at MAX_EXTRACTED_TEXT_CHARS. If the
 * trimmed text reaches `request.minTextChars` it is returned without images. Otherwise the
 * selected pages are rendered to PNG, with `request.maxPixels` treated as a total budget
 * shared by all rendered pages. If the canvas module cannot be loaded, the error is passed
 * to `request.onImageExtractionError` and the text is returned without images.
 *
 * @param request - PDF bytes plus page selection and size limits.
 * @returns The extracted text and any base64-encoded PNG page images.
 * @throws Propagates errors from loading pdf.js, opening the document, reading page text,
 *   or rendering a page.
 */
async function extractPdfContent(
  request: DocumentExtractionRequest,
): Promise<DocumentExtractionResult> {
  const pdfJsModule = await loadPdfJsModule();
  const pdf = await pdfJsModule.getDocument({
    // new Uint8Array(buffer) copies the bytes, so pdf.js never works on the caller's buffer.
    data: new Uint8Array(request.buffer),
    disableWorker: true,
  }).promise;

  try {
    // Drop out-of-range and non-integer page numbers up front; pdf.js rejects getPage()
    // for anything that is not an integer page index, which would fail the whole extraction.
    const effectivePages: number[] = request.pageNumbers
      ? request.pageNumbers
          .filter((p) => Number.isInteger(p) && p >= 1 && p <= pdf.numPages)
          .slice(0, request.maxPages)
      : Array.from({ length: Math.min(pdf.numPages, request.maxPages) }, (_, i) => i + 1);

    const textParts: string[] = [];
    let extractedTextLength = 0;
    for (const pageNum of effectivePages) {
      const page = await pdf.getPage(pageNum);
      const textContent = await page.getTextContent();
      const pageText = textContent.items
        .map((item) => ("str" in item ? item.str : ""))
        .filter(Boolean)
        .join(" ");
      if (pageText) {
        extractedTextLength = appendTextWithinLimit(textParts, pageText, extractedTextLength);
        // Stop reading pages once the text cap has been reached.
        if (extractedTextLength >= MAX_EXTRACTED_TEXT_CHARS) {
          break;
        }
      }
    }

    const text = textParts.join("\n\n");
    if (text.trim().length >= request.minTextChars) {
      return { text, images: [] };
    }

    let canvasModule: CanvasModule;
    try {
      canvasModule = await loadCanvasModule();
    } catch (err) {
      // Image rendering is optional: report the problem and fall back to text only.
      request.onImageExtractionError?.(err);
      return { text, images: [] };
    }

    const images: DocumentExtractedImage[] = [];
    let remainingPixels = Math.max(1, Math.floor(request.maxPixels));
    for (const pageNum of effectivePages) {
      if (remainingPixels <= 0) {
        break;
      }
      const page = await pdf.getPage(pageNum);
      const viewport = page.getViewport({ scale: 1 });
      const plan = resolveRenderPlan(viewport, remainingPixels);
      if (!plan) {
        break;
      }
      const scaled = page.getViewport({ scale: plan.scale });
      const canvas = canvasModule.createCanvas(plan.width, plan.height);
      await page.render({ canvas, viewport: scaled }).promise;
      const png = canvas.toBuffer("image/png");
      images.push({ type: "image", data: png.toString("base64"), mimeType: "image/png" });
      // Each rendered page consumes part of the shared pixel budget.
      remainingPixels -= plan.pixels;
    }
    return { text, images };
  } finally {
    // Release pdf.js resources whether extraction succeeded or threw.
    await destroyPdfDocument(pdf);
  }
}
/**
 * Builds the extractor descriptor for PDF files.
 *
 * @returns A plugin object identified as `pdf` that handles `application/pdf` and
 *   delegates extraction to `extractPdfContent`.
 */
export function createPdfDocumentExtractor(): DocumentExtractorPlugin {
  const pdfExtractor: DocumentExtractorPlugin = {
    id: "pdf",
    label: "PDF",
    mimeTypes: ["application/pdf"],
    autoDetectOrder: 10,
    extract: extractPdfContent,
  };
  return pdfExtractor;
}

View File

@@ -0,0 +1,11 @@
import { definePluginEntry } from "openclaw/plugin-sdk/plugin-entry";
// Plugin entry for `document-extract`. It only declares metadata; register() does nothing.
export default definePluginEntry({
id: "document-extract",
name: "Document Extraction",
description: "Extract text and fallback page images from local document attachments.",
register() {
// Runtime is exposed through document-extractor.ts so document hot paths can
// load only the narrow extractor artifact instead of the full plugin entrypoint.
},
});

View File

@@ -0,0 +1,14 @@
{
"id": "document-extract",
"enabledByDefault": true,
"name": "Document Extraction",
"description": "Extract text and fallback page images from local document attachments.",
"contracts": {
"documentExtractors": ["pdf"]
},
"configSchema": {
"type": "object",
"additionalProperties": false,
"properties": {}
}
}

View File

@@ -0,0 +1,26 @@
{
"name": "@openclaw/document-extract-plugin",
"version": "2026.4.24",
"private": true,
"description": "OpenClaw local document extraction plugin",
"type": "module",
"dependencies": {
"pdfjs-dist": "^5.6.205"
},
"devDependencies": {
"@openclaw/plugin-sdk": "workspace:*"
},
"peerDependencies": {
"@napi-rs/canvas": "^0.1.89"
},
"peerDependenciesMeta": {
"@napi-rs/canvas": {
"optional": true
}
},
"openclaw": {
"extensions": [
"./index.ts"
]
}
}

View File

@@ -1125,6 +1125,10 @@
"types": "./dist/plugin-sdk/web-content-extractor.d.ts",
"default": "./dist/plugin-sdk/web-content-extractor.js"
},
"./plugin-sdk/document-extractor": {
"types": "./dist/plugin-sdk/document-extractor.d.ts",
"default": "./dist/plugin-sdk/document-extractor.js"
},
"./plugin-sdk/provider-web-fetch-contract": {
"types": "./dist/plugin-sdk/provider-web-fetch-contract.d.ts",
"default": "./dist/plugin-sdk/provider-web-fetch-contract.js"
@@ -1609,7 +1613,6 @@
"markdown-it": "14.1.1",
"openai": "^6.34.0",
"osc-progress": "^0.3.0",
"pdfjs-dist": "^5.6.205",
"proxy-agent": "^8.0.1",
"semver": "7.7.4",
"sharp": "^0.34.5",
@@ -1647,7 +1650,6 @@
"vitest": "^4.1.5"
},
"peerDependencies": {
"@napi-rs/canvas": "^0.1.89",
"node-llama-cpp": "3.18.1"
},
"peerDependenciesMeta": {
@@ -1693,7 +1695,6 @@
"onlyBuiltDependencies": [
"@lydell/node-pty",
"@matrix-org/matrix-sdk-crypto-nodejs",
"@napi-rs/canvas",
"@tloncorp/api",
"@tloncorp/tlon-skill",
"@whiskeysockets/baileys",

142
pnpm-lock.yaml generated
View File

@@ -63,9 +63,6 @@ importers:
'@modelcontextprotocol/sdk':
specifier: 1.29.0
version: 1.29.0(zod@4.3.6)
'@napi-rs/canvas':
specifier: ^0.1.89
version: 0.1.92
'@vincentkoc/qrcode-tui':
specifier: 0.2.1
version: 0.2.1
@@ -120,9 +117,6 @@ importers:
osc-progress:
specifier: ^0.3.0
version: 0.3.0
pdfjs-dist:
specifier: ^5.6.205
version: 5.6.205
proxy-agent:
specifier: ^8.0.1
version: 8.0.1
@@ -519,6 +513,19 @@ importers:
specifier: workspace:*
version: link:../..
extensions/document-extract:
dependencies:
'@napi-rs/canvas':
specifier: ^0.1.89
version: 0.1.99
pdfjs-dist:
specifier: ^5.6.205
version: 5.6.205
devDependencies:
'@openclaw/plugin-sdk':
specifier: workspace:*
version: link:../../packages/plugin-sdk
extensions/duckduckgo:
devDependencies:
'@openclaw/plugin-sdk':
@@ -2774,61 +2781,30 @@ packages:
resolution: {integrity: sha512-juG5VWh4qAivzTAeMzvY9xs9HY5rAcr2E4I7tiSSCokRFi7XIZCAu92ZkSTsIj1OPceCifL3cpfteP3pDT9/QQ==}
engines: {node: '>=14.0.0'}
'@napi-rs/canvas-android-arm64@0.1.92':
resolution: {integrity: sha512-rDOtq53ujfOuevD5taxAuIFALuf1QsQWZe1yS/N4MtT+tNiDBEdjufvQRPWZ11FubL2uwgP8ApYU3YOaNu1ZsQ==}
engines: {node: '>= 10'}
cpu: [arm64]
os: [android]
'@napi-rs/canvas-android-arm64@0.1.99':
resolution: {integrity: sha512-9OCRt8VVxA17m32NWZKyNC2qamdaS/SC5CEOIQwFngRq0DIeVm4PDal+6Ljnhqm2whZiC63DNuKZ4xSp2nbj9w==}
engines: {node: '>= 10'}
cpu: [arm64]
os: [android]
'@napi-rs/canvas-darwin-arm64@0.1.92':
resolution: {integrity: sha512-4PT6GRGCr7yMRehp42x0LJb1V0IEy1cDZDDayv7eKbFUIGbPFkV7CRC9Bee5MPkjg1EB4ZPXXUyy3gjQm7mR8Q==}
engines: {node: '>= 10'}
cpu: [arm64]
os: [darwin]
'@napi-rs/canvas-darwin-arm64@0.1.99':
resolution: {integrity: sha512-lupMDMy1+H38dhyCcLirOKKVUyzzlxi7j7rGPLI3vViMHOoPjcXO1b10ivy+ad+q6MiwHfoLjKTCoLke5ySOBg==}
engines: {node: '>= 10'}
cpu: [arm64]
os: [darwin]
'@napi-rs/canvas-darwin-x64@0.1.92':
resolution: {integrity: sha512-5e/3ZapP7CqPtDcZPtmowCsjoyQwuNMMD7c0GKPtZQ8pgQhLkeq/3fmk0HqNSD1i227FyJN/9pDrhw/UMTkaWA==}
engines: {node: '>= 10'}
cpu: [x64]
os: [darwin]
'@napi-rs/canvas-darwin-x64@0.1.99':
resolution: {integrity: sha512-fdz02t4w8n6Ii/rYhWig6STb/zcTmCC/6YZTGmjoDeidDwn9Wf0ukQVynhCPEs29vqUc66wHZKsuIgMs9tycCg==}
engines: {node: '>= 10'}
cpu: [x64]
os: [darwin]
'@napi-rs/canvas-linux-arm-gnueabihf@0.1.92':
resolution: {integrity: sha512-j6KaLL9iir68lwpzzY+aBGag1PZp3+gJE2mQ3ar4VJVmyLRVOh+1qsdNK1gfWoAVy5w6U7OEYFrLzN2vOFUSng==}
engines: {node: '>= 10'}
cpu: [arm]
os: [linux]
'@napi-rs/canvas-linux-arm-gnueabihf@0.1.99':
resolution: {integrity: sha512-w4FwVwlNo00ezeRhfY62IVIyt6G3u8wodkPtiqWc52BUHx+VDBUM2vkS3ogfANaLI7hnf3s6WK4LyZVUjBg1lA==}
engines: {node: '>= 10'}
cpu: [arm]
os: [linux]
'@napi-rs/canvas-linux-arm64-gnu@0.1.92':
resolution: {integrity: sha512-s3NlnJMHOSotUYVoTCoC1OcomaChFdKmZg0VsHFeIkeHbwX0uPHP4eCX1irjSfMykyvsGHTQDfBAtGYuqxCxhQ==}
engines: {node: '>= 10'}
cpu: [arm64]
os: [linux]
libc: [glibc]
'@napi-rs/canvas-linux-arm64-gnu@0.1.99':
resolution: {integrity: sha512-8JvHeexKQ8c7g0q7YJ29NVQwnf1ePghP9ys9ZN0R0qzyqJQ9Uw6N9qnDINArlm3IYHexB7LjzArIfhQiqSDGvQ==}
engines: {node: '>= 10'}
@@ -2836,13 +2812,6 @@ packages:
os: [linux]
libc: [glibc]
'@napi-rs/canvas-linux-arm64-musl@0.1.92':
resolution: {integrity: sha512-xV0GQnukYq5qY+ebkAwHjnP2OrSGBxS3vSi1zQNQj0bkXU6Ou+Tw7JjCM7pZcQ28MUyEBS1yKfo7rc7ip2IPFQ==}
engines: {node: '>= 10'}
cpu: [arm64]
os: [linux]
libc: [musl]
'@napi-rs/canvas-linux-arm64-musl@0.1.99':
resolution: {integrity: sha512-Z+6nyLdJXWzLPVxi4H6g9TJop4DwN3KSgHWto5JCbZV5/uKoVqcSynPs0tGlUHOoWI8S8tEvJspz51GQkvr07w==}
engines: {node: '>= 10'}
@@ -2850,13 +2819,6 @@ packages:
os: [linux]
libc: [musl]
'@napi-rs/canvas-linux-riscv64-gnu@0.1.92':
resolution: {integrity: sha512-+GKvIFbQ74eB/TopEdH6XIXcvOGcuKvCITLGXy7WLJAyNp3Kdn1ncjxg91ihatBaPR+t63QOE99yHuIWn3UQ9w==}
engines: {node: '>= 10'}
cpu: [riscv64]
os: [linux]
libc: [glibc]
'@napi-rs/canvas-linux-riscv64-gnu@0.1.99':
resolution: {integrity: sha512-jAnfOUv4IO1l8Levk5t85oVtEBOXLa07KnIUgWo1CDlPxiqpxS3uBfiE38Lvj/CQgHaNF6Nxk/SaemwLgsVJgw==}
engines: {node: '>= 10'}
@@ -2864,13 +2826,6 @@ packages:
os: [linux]
libc: [glibc]
'@napi-rs/canvas-linux-x64-gnu@0.1.92':
resolution: {integrity: sha512-tFd6MwbEhZ1g64iVY2asV+dOJC+GT3Yd6UH4G3Hp0/VHQ6qikB+nvXEULskFYZ0+wFqlGPtXjG1Jmv7sJy+3Ww==}
engines: {node: '>= 10'}
cpu: [x64]
os: [linux]
libc: [glibc]
'@napi-rs/canvas-linux-x64-gnu@0.1.99':
resolution: {integrity: sha512-mIkXw3fGmbYyFjSmfWEvty4jN+rwEOmv0+Dy9bRvvTzLYWCgm3RMgUEQVfAKFw96nIRFnyNZiK83KNQaVVFjng==}
engines: {node: '>= 10'}
@@ -2878,13 +2833,6 @@ packages:
os: [linux]
libc: [glibc]
'@napi-rs/canvas-linux-x64-musl@0.1.92':
resolution: {integrity: sha512-uSuqeSveB/ZGd72VfNbHCSXO9sArpZTvznMVsb42nqPP7gBGEH6NJQ0+hmF+w24unEmxBhPYakP/Wiosm16KkA==}
engines: {node: '>= 10'}
cpu: [x64]
os: [linux]
libc: [musl]
'@napi-rs/canvas-linux-x64-musl@0.1.99':
resolution: {integrity: sha512-f3Uz2P0RgrtBHISxZqr6yiYXJlTDyCVBumDacxo+4AmSg7z0HiqYZKGWC/gszq3fbPhyQUya1W2AEteKxT9Y6A==}
engines: {node: '>= 10'}
@@ -2892,34 +2840,18 @@ packages:
os: [linux]
libc: [musl]
'@napi-rs/canvas-win32-arm64-msvc@0.1.92':
resolution: {integrity: sha512-20SK5AU/OUNz9ZuoAPj5ekWai45EIBDh/XsdrVZ8le/pJVlhjFU3olbumSQUXRFn7lBRS+qwM8kA//uLaDx6iQ==}
engines: {node: '>= 10'}
cpu: [arm64]
os: [win32]
'@napi-rs/canvas-win32-arm64-msvc@0.1.99':
resolution: {integrity: sha512-XE6KUkfqRsCNejcoRMiMr3RaUeObxNf6y7dut3hrq2rn7PzfRTZgrjF1F/B2C7FcdgqY/vSHWpQeMuNz1vTNHg==}
engines: {node: '>= 10'}
cpu: [arm64]
os: [win32]
'@napi-rs/canvas-win32-x64-msvc@0.1.92':
resolution: {integrity: sha512-KEhyZLzq1MXCNlXybz4k25MJmHFp+uK1SIb8yJB0xfrQjz5aogAMhyseSzewo+XxAq3OAOdyKvfHGNzT3w1RPg==}
engines: {node: '>= 10'}
cpu: [x64]
os: [win32]
'@napi-rs/canvas-win32-x64-msvc@0.1.99':
resolution: {integrity: sha512-plMYGVbc/vmmPF9MtmHbwNk1rL1Aj53vQZt+Gnv1oZn6gmd9jEHHJ0n9Nd2nxa5sKH7TS5IjkCDM6289O0d6PQ==}
engines: {node: '>= 10'}
cpu: [x64]
os: [win32]
'@napi-rs/canvas@0.1.92':
resolution: {integrity: sha512-q7ZaUCJkEU5BeOdE7fBx1XWRd2T5Ady65nxq4brMf5L4cE1VV/ACq5w9Z5b/IVJs8CwSSIwc30nlthH0gFo4Ig==}
engines: {node: '>= 10'}
'@napi-rs/canvas@0.1.99':
resolution: {integrity: sha512-zN4eQlK3eBf7aJBcTHZilpBH3tDekBzPMIWC8r0s94Ecl73XfOyFi4w7yKFMRVUT0lvNQjtOL8YSrwqQj6mZFg==}
engines: {node: '>= 10'}
@@ -9914,86 +9846,39 @@ snapshots:
'@mozilla/readability@0.6.0': {}
'@napi-rs/canvas-android-arm64@0.1.92':
optional: true
'@napi-rs/canvas-android-arm64@0.1.99':
optional: true
'@napi-rs/canvas-darwin-arm64@0.1.92':
optional: true
'@napi-rs/canvas-darwin-arm64@0.1.99':
optional: true
'@napi-rs/canvas-darwin-x64@0.1.92':
optional: true
'@napi-rs/canvas-darwin-x64@0.1.99':
optional: true
'@napi-rs/canvas-linux-arm-gnueabihf@0.1.92':
optional: true
'@napi-rs/canvas-linux-arm-gnueabihf@0.1.99':
optional: true
'@napi-rs/canvas-linux-arm64-gnu@0.1.92':
optional: true
'@napi-rs/canvas-linux-arm64-gnu@0.1.99':
optional: true
'@napi-rs/canvas-linux-arm64-musl@0.1.92':
optional: true
'@napi-rs/canvas-linux-arm64-musl@0.1.99':
optional: true
'@napi-rs/canvas-linux-riscv64-gnu@0.1.92':
optional: true
'@napi-rs/canvas-linux-riscv64-gnu@0.1.99':
optional: true
'@napi-rs/canvas-linux-x64-gnu@0.1.92':
optional: true
'@napi-rs/canvas-linux-x64-gnu@0.1.99':
optional: true
'@napi-rs/canvas-linux-x64-musl@0.1.92':
optional: true
'@napi-rs/canvas-linux-x64-musl@0.1.99':
optional: true
'@napi-rs/canvas-win32-arm64-msvc@0.1.92':
optional: true
'@napi-rs/canvas-win32-arm64-msvc@0.1.99':
optional: true
'@napi-rs/canvas-win32-x64-msvc@0.1.92':
optional: true
'@napi-rs/canvas-win32-x64-msvc@0.1.99':
optional: true
'@napi-rs/canvas@0.1.92':
optionalDependencies:
'@napi-rs/canvas-android-arm64': 0.1.92
'@napi-rs/canvas-darwin-arm64': 0.1.92
'@napi-rs/canvas-darwin-x64': 0.1.92
'@napi-rs/canvas-linux-arm-gnueabihf': 0.1.92
'@napi-rs/canvas-linux-arm64-gnu': 0.1.92
'@napi-rs/canvas-linux-arm64-musl': 0.1.92
'@napi-rs/canvas-linux-riscv64-gnu': 0.1.92
'@napi-rs/canvas-linux-x64-gnu': 0.1.92
'@napi-rs/canvas-linux-x64-musl': 0.1.92
'@napi-rs/canvas-win32-arm64-msvc': 0.1.92
'@napi-rs/canvas-win32-x64-msvc': 0.1.92
'@napi-rs/canvas@0.1.99':
optionalDependencies:
'@napi-rs/canvas-android-arm64': 0.1.99
@@ -10007,7 +9892,6 @@ snapshots:
'@napi-rs/canvas-linux-x64-musl': 0.1.99
'@napi-rs/canvas-win32-arm64-msvc': 0.1.99
'@napi-rs/canvas-win32-x64-msvc': 0.1.99
optional: true
'@napi-rs/wasm-runtime@1.1.4(@emnapi/core@1.10.0)(@emnapi/runtime@1.10.0)':
dependencies:

View File

@@ -48,8 +48,12 @@
"risk": ["parser", "untrusted-html"]
},
"@napi-rs/canvas": {
"owner": "capability:document-and-image-rendering",
"class": "default-runtime-initially",
"owner": "plugin:document-extract",
"class": "optional-peer-runtime",
"activation": [
"input_file.application_pdf.image_fallback",
"plugins.entries.document-extract.enabled"
],
"risk": ["native", "parser", "untrusted-files"]
},
"@vincentkoc/qrcode-tui": {
@@ -149,8 +153,9 @@
"risk": ["terminal-rendering"]
},
"pdfjs-dist": {
"owner": "capability:document-extract",
"class": "default-runtime-initially",
"owner": "plugin:document-extract",
"class": "plugin-runtime",
"activation": ["input_file.application_pdf", "plugins.entries.document-extract.enabled"],
"risk": ["parser", "untrusted-files"]
},
"proxy-agent": {

View File

@@ -266,6 +266,7 @@
"provider-stream",
"provider-tools",
"provider-usage",
"document-extractor",
"web-content-extractor",
"provider-web-fetch-contract",
"provider-web-fetch",

View File

@@ -432,6 +432,7 @@ export function createPdfTool(options?: {
maxPixels: PDF_MAX_PIXELS,
minTextChars: PDF_MIN_TEXT_CHARS,
pageNumbers,
config: options?.config,
});
extractedAll.push(extracted);
}

View File

@@ -343,10 +343,11 @@ function isBinaryMediaMime(mime?: string): boolean {
async function extractFileBlocks(params: {
attachments: ReturnType<typeof normalizeMediaAttachments>;
cache: ReturnType<typeof createMediaAttachmentCache>;
cfg: OpenClawConfig;
limits: ReturnType<typeof resolveFileLimits>;
skipAttachmentIndexes?: Set<number>;
}): Promise<string[]> {
const { attachments, cache, limits, skipAttachmentIndexes } = params;
const { attachments, cache, cfg, limits, skipAttachmentIndexes } = params;
if (!attachments || attachments.length === 0) {
return [];
}
@@ -447,6 +448,7 @@ async function extractFileBlocks(params: {
...baseLimits,
allowedMimes,
},
config: cfg,
});
} catch (err) {
if (shouldLogVerbose()) {
@@ -565,6 +567,7 @@ export async function applyMediaUnderstanding(params: {
const fileBlocks = await extractFileBlocks({
attachments,
cache,
cfg,
limits: resolveFileLimits(cfg),
skipAttachmentIndexes: audioAttachmentIndexes.size > 0 ? audioAttachmentIndexes : undefined,
});

View File

@@ -0,0 +1,81 @@
import { beforeEach, describe, expect, it, vi } from "vitest";
// Created with vi.hoisted so the vi.mock factory below can reference it.
const { resolvePluginDocumentExtractorsMock } = vi.hoisted(() => ({
resolvePluginDocumentExtractorsMock: vi.fn(),
}));
// Replace plugin resolution so each test decides which extractors are available.
vi.mock("../plugins/document-extractors.runtime.js", () => ({
resolvePluginDocumentExtractors: resolvePluginDocumentExtractorsMock,
}));
import { extractDocumentContent } from "./document-extractors.runtime.js";
// Covers request forwarding and error reporting of extractDocumentContent.
describe("extractDocumentContent", () => {
beforeEach(() => {
resolvePluginDocumentExtractorsMock.mockReset();
});
it("passes only public extraction request fields to plugins", async () => {
const extract = vi.fn().mockResolvedValue({ text: "pdf text", images: [] });
resolvePluginDocumentExtractorsMock.mockReturnValue([
{
id: "pdf",
pluginId: "document-extract",
label: "PDF",
mimeTypes: ["application/pdf"],
extract,
},
]);
// The caller supplies a config holding a secret-looking value alongside the request fields.
await expect(
extractDocumentContent({
buffer: Buffer.from("pdf"),
mimeType: "application/pdf",
maxPages: 1,
maxPixels: 100,
minTextChars: 10,
config: {
env: {
vars: {
SECRET_VALUE: "do-not-pass",
},
},
},
}),
).resolves.toMatchObject({ text: "pdf text", extractor: "pdf" });
// The extractor must be called with the request fields only; `config` is not among them.
expect(extract).toHaveBeenCalledWith({
buffer: Buffer.from("pdf"),
mimeType: "application/pdf",
maxPages: 1,
maxPixels: 100,
minTextChars: 10,
});
});
it("surfaces matching extractor failures instead of reporting disablement", async () => {
const cause = new Error("password required");
resolvePluginDocumentExtractorsMock.mockReturnValue([
{
id: "pdf",
pluginId: "document-extract",
label: "PDF",
mimeTypes: ["application/pdf"],
extract: vi.fn().mockRejectedValue(cause),
},
]);
// A rejecting extractor must produce a wrapped error that keeps the original as `cause`.
await expect(
extractDocumentContent({
buffer: Buffer.from("pdf"),
mimeType: "application/pdf",
maxPages: 1,
maxPixels: 100,
minTextChars: 10,
}),
).rejects.toMatchObject({
message: "Document extraction failed for application/pdf",
cause,
});
});
});

View File

@@ -0,0 +1,76 @@
import type { OpenClawConfig } from "../config/types.openclaw.js";
import type {
DocumentExtractionRequest,
DocumentExtractionResult,
} from "../plugins/document-extractor-types.js";
import { resolvePluginDocumentExtractors } from "../plugins/document-extractors.runtime.js";
import { normalizeLowercaseStringOrEmpty } from "../shared/string-coerce.js";
// Memoized extractor list for calls made without a config object.
let extractorPromise: Promise<ReturnType<typeof resolvePluginDocumentExtractors>> | undefined;
// Memoized extractor lists keyed by config object identity.
const extractorPromisesByConfig = new WeakMap<
OpenClawConfig,
Promise<ReturnType<typeof resolvePluginDocumentExtractors>>
>();
/**
 * Resolves the available document extractors, memoizing the result.
 *
 * Without a config the result is stored in a single module-level promise. With a config
 * the result is stored per config object, and a rejected lookup is evicted so that a
 * later call with the same config resolves again.
 *
 * @param config - Optional configuration forwarded to plugin resolution.
 * @returns The resolved extractors.
 */
async function loadDocumentExtractors(config?: OpenClawConfig) {
  if (!config) {
    extractorPromise ??= Promise.resolve(resolvePluginDocumentExtractors());
    return await extractorPromise;
  }

  const existing = extractorPromisesByConfig.get(config);
  if (existing) {
    return await existing;
  }

  // Starting from a resolved promise turns a synchronous throw into a rejection.
  const pending = Promise.resolve().then(() => resolvePluginDocumentExtractors({ config }));
  extractorPromisesByConfig.set(config, pending);
  void pending.catch(() => {
    extractorPromisesByConfig.delete(config);
  });
  return await pending;
}
/**
 * Runs the first document extractor that supports the requested MIME type and yields a result.
 *
 * MIME types are compared after normalization with `normalizeLowercaseStringOrEmpty`.
 * Extractors receive a request rebuilt from the extraction fields only; `config` is used
 * for extractor lookup and is not forwarded. Failures of matching extractors are collected
 * and only reported when no extractor produced a result.
 *
 * @param params - Extraction request plus an optional config used to resolve extractors.
 * @returns The extraction result tagged with the id of the extractor that produced it,
 *   or `null` when no matching extractor returned a result and none failed.
 * @throws Error whose `cause` is the single failure, or an AggregateError of all failures,
 *   when at least one matching extractor threw and none succeeded.
 */
export async function extractDocumentContent(
  params: DocumentExtractionRequest & {
    config?: OpenClawConfig;
  },
): Promise<(DocumentExtractionResult & { extractor: string }) | null> {
  const wantedMimeType = normalizeLowercaseStringOrEmpty(params.mimeType);
  const candidates = await loadDocumentExtractors(params.config);

  const { pageNumbers, onImageExtractionError } = params;
  const pluginRequest: DocumentExtractionRequest = {
    buffer: params.buffer,
    mimeType: params.mimeType,
    maxPages: params.maxPages,
    maxPixels: params.maxPixels,
    minTextChars: params.minTextChars,
    ...(pageNumbers ? { pageNumbers } : {}),
    ...(onImageExtractionError ? { onImageExtractionError } : {}),
  };

  const failures: unknown[] = [];
  for (const candidate of candidates) {
    const supported = candidate.mimeTypes.map((entry) => normalizeLowercaseStringOrEmpty(entry));
    if (supported.includes(wantedMimeType)) {
      try {
        const outcome = await candidate.extract(pluginRequest);
        if (outcome) {
          return { ...outcome, extractor: candidate.id };
        }
      } catch (error) {
        // Remember the failure and give the remaining extractors a chance.
        failures.push(error);
      }
    }
  }

  if (failures.length === 0) {
    return null;
  }
  throw new Error(`Document extraction failed for ${wantedMimeType || "unknown MIME type"}`, {
    cause: failures.length === 1 ? failures[0] : new AggregateError(failures),
  });
}

View File

@@ -1,3 +1,4 @@
import type { OpenClawConfig } from "../config/types.openclaw.js";
import { fetchWithSsrFGuard } from "../infra/net/fetch-guard.js";
import type { SsrFPolicy } from "../infra/net/ssrf.js";
import { logWarn } from "../logger.js";
@@ -322,6 +323,7 @@ export async function extractImageContentFromSource(
export async function extractFileContentFromSource(params: {
source: InputFileSource;
limits: InputFileLimits;
config?: OpenClawConfig;
}): Promise<InputFileExtractResult> {
const { source, limits } = params;
const filename = source.filename || "file";
@@ -378,6 +380,7 @@ export async function extractFileContentFromSource(params: {
maxPages: limits.pdf.maxPages,
maxPixels: limits.pdf.maxPixels,
minTextChars: limits.pdf.minTextChars,
...(params.config ? { config: params.config } : {}),
onImageExtractionError: (err) => {
logWarn(`media: PDF image extraction skipped, ${String(err)}`);
},

View File

@@ -0,0 +1,54 @@
import { beforeEach, describe, expect, it, vi } from "vitest";
// Created with vi.hoisted so the vi.mock factory below can reference it.
const { extractDocumentContentMock } = vi.hoisted(() => ({
extractDocumentContentMock: vi.fn(),
}));
// Replace the extractor dispatcher so these tests only cover the extractPdfContent wrapper.
vi.mock("./document-extractors.runtime.js", () => ({
extractDocumentContent: extractDocumentContentMock,
}));
import { extractPdfContent } from "./pdf-extract.js";
// Covers the extractPdfContent wrapper: request forwarding and the no-extractor error.
describe("extractPdfContent", () => {
beforeEach(() => {
extractDocumentContentMock.mockReset();
});
it("dispatches PDF extraction through document extractors", async () => {
extractDocumentContentMock.mockResolvedValue({
text: "extracted pdf",
images: [],
extractor: "pdf",
});
// The wrapper is expected to return text and images only, without the `extractor` field.
await expect(
extractPdfContent({
buffer: Buffer.from("%PDF-1.4"),
maxPages: 2,
maxPixels: 100,
minTextChars: 10,
}),
).resolves.toEqual({ text: "extracted pdf", images: [] });
// The wrapper adds the PDF MIME type to the forwarded request.
expect(extractDocumentContentMock).toHaveBeenCalledWith({
buffer: Buffer.from("%PDF-1.4"),
mimeType: "application/pdf",
maxPages: 2,
maxPixels: 100,
minTextChars: 10,
});
});
it("throws a clear disabled error when no document extractor is available", async () => {
// A null result from the dispatcher stands for "no extractor produced a result".
extractDocumentContentMock.mockResolvedValue(null);
await expect(
extractPdfContent({
buffer: Buffer.from("%PDF-1.4"),
maxPages: 2,
maxPixels: 100,
minTextChars: 10,
}),
).rejects.toThrow("PDF extraction disabled or unavailable");
});
});

View File

@@ -1,81 +1,12 @@
type CanvasLike = {
toBuffer(type: "image/png"): Buffer;
};
import type { OpenClawConfig } from "../config/types.openclaw.js";
import type {
DocumentExtractedImage,
DocumentExtractionResult,
} from "../plugins/document-extractor-types.js";
import { extractDocumentContent } from "./document-extractors.runtime.js";
type CanvasModule = {
createCanvas(width: number, height: number): CanvasLike;
};
type PdfTextItem = {
str: string;
};
type PdfTextContent = {
items: Array<PdfTextItem | object>;
};
type PdfViewport = {
width: number;
height: number;
};
type PdfPage = {
getTextContent(): Promise<PdfTextContent>;
getViewport(params: { scale: number }): PdfViewport;
render(params: { canvas: unknown; viewport: PdfViewport }): { promise: Promise<void> };
};
type PdfDocument = {
numPages: number;
getPage(pageNumber: number): Promise<PdfPage>;
};
type PdfJsModule = {
getDocument(params: { data: Uint8Array; disableWorker?: boolean }): {
promise: Promise<PdfDocument>;
};
};
const CANVAS_MODULE = "@napi-rs/canvas";
const PDFJS_MODULE = "pdfjs-dist/legacy/build/pdf.mjs";
let canvasModulePromise: Promise<CanvasModule> | null = null;
let pdfJsModulePromise: Promise<PdfJsModule> | null = null;
async function loadCanvasModule(): Promise<CanvasModule> {
if (!canvasModulePromise) {
canvasModulePromise = (import(CANVAS_MODULE) as Promise<CanvasModule>).catch((err) => {
canvasModulePromise = null;
throw new Error(
`Optional dependency @napi-rs/canvas is required for PDF image extraction: ${String(err)}`,
);
});
}
return canvasModulePromise;
}
async function loadPdfJsModule(): Promise<PdfJsModule> {
if (!pdfJsModulePromise) {
pdfJsModulePromise = (import(PDFJS_MODULE) as Promise<PdfJsModule>).catch((err) => {
pdfJsModulePromise = null;
throw new Error(
`Optional dependency pdfjs-dist is required for PDF extraction: ${String(err)}`,
);
});
}
return pdfJsModulePromise;
}
export type PdfExtractedImage = {
type: "image";
data: string;
mimeType: string;
};
export type PdfExtractedContent = {
text: string;
images: PdfExtractedImage[];
};
export type PdfExtractedImage = DocumentExtractedImage;
export type PdfExtractedContent = DocumentExtractionResult;
// Extracts text, and possibly page images, from a PDF buffer.
// NOTE(review): this span appears to interleave two revisions of the function (a diff hunk
// header sits inside the parameter list) — confirm which statements belong to the kept version.
// One visible path reads page text through pdfjs and, when the trimmed text is shorter than
// `minTextChars`, renders the selected pages to base64 PNG images. The other path delegates to
// `extractDocumentContent` with mimeType "application/pdf" and throws when that call yields a
// falsy result.
export async function extractPdfContent(params: {
buffer: Buffer;
@@ -83,60 +14,28 @@ export async function extractPdfContent(params: {
maxPixels: number;
minTextChars: number;
pageNumbers?: number[];
config?: OpenClawConfig;
onImageExtractionError?: (error: unknown) => void;
}): Promise<PdfExtractedContent> {
const { buffer, maxPages, maxPixels, minTextChars, pageNumbers, onImageExtractionError } = params;
const pdfJsModule = await loadPdfJsModule();
const pdf = await pdfJsModule.getDocument({ data: new Uint8Array(buffer), disableWorker: true })
.promise;
const effectivePages: number[] = pageNumbers
? pageNumbers.filter((p) => p >= 1 && p <= pdf.numPages).slice(0, maxPages)
: Array.from({ length: Math.min(pdf.numPages, maxPages) }, (_, i) => i + 1);
const textParts: string[] = [];
for (const pageNum of effectivePages) {
const page = await pdf.getPage(pageNum);
const textContent = await page.getTextContent();
const pageText = textContent.items
.map((item) => ("str" in item ? item.str : ""))
.filter(Boolean)
.join(" ");
if (pageText) {
textParts.push(pageText);
}
const extracted = await extractDocumentContent({
buffer: params.buffer,
mimeType: "application/pdf",
maxPages: params.maxPages,
maxPixels: params.maxPixels,
minTextChars: params.minTextChars,
...(params.pageNumbers ? { pageNumbers: params.pageNumbers } : {}),
...(params.config ? { config: params.config } : {}),
...(params.onImageExtractionError
? { onImageExtractionError: params.onImageExtractionError }
: {}),
});
if (!extracted) {
throw new Error(
"PDF extraction disabled or unavailable: enable the document-extract plugin to process application/pdf files.",
);
}
const text = textParts.join("\n\n");
if (text.trim().length >= minTextChars) {
return { text, images: [] };
}
let canvasModule: CanvasModule;
try {
canvasModule = await loadCanvasModule();
} catch (err) {
onImageExtractionError?.(err);
return { text, images: [] };
}
const images: PdfExtractedImage[] = [];
const pixelBudget = Math.max(1, maxPixels);
for (const pageNum of effectivePages) {
const page = await pdf.getPage(pageNum);
const viewport = page.getViewport({ scale: 1 });
const pagePixels = viewport.width * viewport.height;
const scale = Math.min(1, Math.sqrt(pixelBudget / Math.max(1, pagePixels)));
const scaled = page.getViewport({ scale: Math.max(0.1, scale) });
const canvas = canvasModule.createCanvas(Math.ceil(scaled.width), Math.ceil(scaled.height));
await page.render({
canvas: canvas as unknown as HTMLCanvasElement,
viewport: scaled,
}).promise;
const png = canvas.toBuffer("image/png");
images.push({ type: "image", data: png.toString("base64"), mimeType: "image/png" });
}
return { text, images };
return {
text: extracted.text,
images: extracted.images,
};
}

View File

@@ -0,0 +1,6 @@
export type {
DocumentExtractedImage,
DocumentExtractionRequest,
DocumentExtractionResult,
DocumentExtractorPlugin,
} from "../plugins/document-extractor-types.js";

View File

@@ -20,6 +20,7 @@ export type BundledPluginContractSnapshot = {
realtimeTranscriptionProviderIds: string[];
realtimeVoiceProviderIds: string[];
mediaUnderstandingProviderIds: string[];
documentExtractorIds: string[];
imageGenerationProviderIds: string[];
videoGenerationProviderIds: string[];
musicGenerationProviderIds: string[];
@@ -116,6 +117,9 @@ export function buildBundledPluginContractSnapshot(
manifest.contracts?.mediaUnderstandingProviders,
(value) => value.trim(),
),
documentExtractorIds: uniqueStrings(manifest.contracts?.documentExtractors, (value) =>
value.trim(),
),
imageGenerationProviderIds: uniqueStrings(
manifest.contracts?.imageGenerationProviders,
(value) => value.trim(),
@@ -151,6 +155,7 @@ export function hasBundledPluginContractSnapshotCapabilities(
entry.realtimeTranscriptionProviderIds.length > 0 ||
entry.realtimeVoiceProviderIds.length > 0 ||
entry.mediaUnderstandingProviderIds.length > 0 ||
entry.documentExtractorIds.length > 0 ||
entry.imageGenerationProviderIds.length > 0 ||
entry.videoGenerationProviderIds.length > 0 ||
entry.musicGenerationProviderIds.length > 0 ||

View File

@@ -64,6 +64,7 @@ type ManifestContractKey =
| "realtimeTranscriptionProviders"
| "realtimeVoiceProviders"
| "mediaUnderstandingProviders"
| "documentExtractors"
| "imageGenerationProviders"
| "videoGenerationProviders"
| "musicGenerationProviders"
@@ -84,6 +85,7 @@ function resolveBundledManifestContracts(): PluginRegistrationContractEntry[] {
realtimeTranscriptionProviderIds: [...entry.realtimeTranscriptionProviderIds],
realtimeVoiceProviderIds: [...entry.realtimeVoiceProviderIds],
mediaUnderstandingProviderIds: [...entry.mediaUnderstandingProviderIds],
documentExtractorIds: [...entry.documentExtractorIds],
imageGenerationProviderIds: [...entry.imageGenerationProviderIds],
videoGenerationProviderIds: [...entry.videoGenerationProviderIds],
musicGenerationProviderIds: [...entry.musicGenerationProviderIds],
@@ -103,6 +105,7 @@ function resolveBundledManifestContracts(): PluginRegistrationContractEntry[] {
(plugin.contracts?.realtimeTranscriptionProviders?.length ?? 0) > 0 ||
(plugin.contracts?.realtimeVoiceProviders?.length ?? 0) > 0 ||
(plugin.contracts?.mediaUnderstandingProviders?.length ?? 0) > 0 ||
(plugin.contracts?.documentExtractors?.length ?? 0) > 0 ||
(plugin.contracts?.imageGenerationProviders?.length ?? 0) > 0 ||
(plugin.contracts?.videoGenerationProviders?.length ?? 0) > 0 ||
(plugin.contracts?.musicGenerationProviders?.length ?? 0) > 0 ||
@@ -123,6 +126,7 @@ function resolveBundledManifestContracts(): PluginRegistrationContractEntry[] {
mediaUnderstandingProviderIds: uniqueStrings(
plugin.contracts?.mediaUnderstandingProviders ?? [],
),
documentExtractorIds: uniqueStrings(plugin.contracts?.documentExtractors ?? []),
imageGenerationProviderIds: uniqueStrings(plugin.contracts?.imageGenerationProviders ?? []),
videoGenerationProviderIds: uniqueStrings(plugin.contracts?.videoGenerationProviders ?? []),
musicGenerationProviderIds: uniqueStrings(plugin.contracts?.musicGenerationProviders ?? []),
@@ -175,6 +179,8 @@ function resolveBundledManifestPluginIdsForContract(contract: ManifestContractKe
return entry.realtimeVoiceProviderIds.length > 0;
case "mediaUnderstandingProviders":
return entry.mediaUnderstandingProviderIds.length > 0;
case "documentExtractors":
return entry.documentExtractorIds.length > 0;
case "imageGenerationProviders":
return entry.imageGenerationProviderIds.length > 0;
case "videoGenerationProviders":

View File

@@ -0,0 +1,55 @@
import { beforeEach, describe, expect, it, vi } from "vitest";
// Shared stand-in for a plugin's public artifact module; tests add and remove exports on it.
// Created through vi.hoisted so the vi.mock factory below can reference it (vitest hoists
// vi.mock calls above ordinary module-level statements).
const { publicArtifactModule } = vi.hoisted(() => ({
publicArtifactModule: {} as Record<string, unknown>,
}));
// Replace the real loader: every load returns the shared object above, and path resolution
// always reports the same fixed source path.
vi.mock("./public-surface-loader.js", () => ({
loadBundledPluginPublicArtifactModuleSync: vi.fn(() => publicArtifactModule),
resolveBundledPluginPublicArtifactPath: vi.fn(
() => "/repo/extensions/demo/document-extractor.ts",
),
}));
import { loadBundledDocumentExtractorEntriesFromDir } from "./document-extractor-public-artifacts.js";
describe("loadBundledDocumentExtractorEntriesFromDir", () => {
  // Every case loads the same mocked plugin directory.
  const loadDemoEntries = () =>
    loadBundledDocumentExtractorEntriesFromDir({ dirName: "demo", pluginId: "demo" });

  beforeEach(() => {
    // Empty the shared mock module in place so exports never leak between cases.
    Object.keys(publicArtifactModule).forEach((exportName) => {
      delete publicArtifactModule[exportName];
    });
  });

  it("isolates a throwing factory when another extractor factory succeeds", () => {
    Object.assign(publicArtifactModule, {
      createBrokenDocumentExtractor: () => {
        throw new Error("native probe failed");
      },
      createPdfDocumentExtractor: () => ({
        id: "pdf",
        label: "PDF",
        mimeTypes: ["application/pdf"],
        extract: vi.fn(),
      }),
    });

    const entries = loadDemoEntries();

    expect(entries).toMatchObject([{ id: "pdf", pluginId: "demo" }]);
  });

  it("surfaces initialization failure when every matching factory throws", () => {
    const cause = new Error("native probe failed");
    publicArtifactModule.createPdfDocumentExtractor = () => {
      throw cause;
    };

    expect(() => loadDemoEntries()).toThrow(
      "Unable to initialize document extractors for plugin demo",
    );
  });
});

View File

@@ -0,0 +1,108 @@
import type {
DocumentExtractorPlugin,
PluginDocumentExtractorEntry,
} from "./document-extractor-types.js";
import {
loadBundledPluginPublicArtifactModuleSync,
resolveBundledPluginPublicArtifactPath,
} from "./public-surface-loader.js";
// Artifact basenames probed, in this order, when looking for a bundled plugin's
// document-extractor public surface.
const DOCUMENT_EXTRACTOR_ARTIFACT_CANDIDATES = [
"document-extractor.js",
"document-extractor-api.js",
] as const;
/** Narrows an unknown value to a keyed object: `typeof "object"`, not `null`, not an array. */
function isRecord(value: unknown): value is Record<string, unknown> {
  if (value === null || Array.isArray(value)) {
    return false;
  }
  return typeof value === "object";
}
/**
 * Runtime shape check for a value returned by an extractor factory.
 *
 * Accepts only keyed objects with string `id` and `label`, a `mimeTypes` array whose entries
 * are all strings that are non-empty after trimming, an `autoDetectOrder` that is either absent
 * or a number, and a callable `extract`.
 */
function isDocumentExtractorPlugin(value: unknown): value is DocumentExtractorPlugin {
  if (!isRecord(value)) {
    return false;
  }
  if (typeof value.id !== "string" || typeof value.label !== "string") {
    return false;
  }
  const declaredMimeTypes = value.mimeTypes;
  if (!Array.isArray(declaredMimeTypes)) {
    return false;
  }
  const everyMimeTypeNamed = declaredMimeTypes.every(
    (entry) => typeof entry === "string" && entry.trim().length > 0,
  );
  if (!everyMimeTypeNamed) {
    return false;
  }
  const order = value.autoDetectOrder;
  if (order !== undefined && typeof order !== "number") {
    return false;
  }
  return typeof value.extract === "function";
}
/**
 * Loads the first document-extractor public artifact that resolves for a bundled plugin.
 *
 * Candidates are tried in declaration order. An error whose message starts with
 * "Unable to resolve bundled plugin public surface " means "try the next candidate"; any other
 * error is rethrown unchanged.
 *
 * @returns The loaded module, or `null` when no candidate resolves.
 */
function tryLoadBundledPublicArtifactModule(params: {
  dirName: string;
}): Record<string, unknown> | null {
  const { dirName } = params;
  for (const artifactBasename of DOCUMENT_EXTRACTOR_ARTIFACT_CANDIDATES) {
    try {
      return loadBundledPluginPublicArtifactModuleSync<Record<string, unknown>>({
        dirName,
        artifactBasename,
      });
    } catch (error) {
      const isUnresolvedSurface =
        error instanceof Error &&
        error.message.startsWith("Unable to resolve bundled plugin public surface ");
      if (!isUnresolvedSurface) {
        throw error;
      }
    }
  }
  return null;
}
/**
 * Invokes every zero-argument export named `create…DocumentExtractor` and sorts the results
 * into valid extractors and thrown errors.
 *
 * Factories run in `localeCompare` order of their export names. A factory that throws is
 * recorded in `errors`; a factory whose return value fails the shape check is ignored.
 */
function collectExtractorFactories(mod: Record<string, unknown>): {
  extractors: DocumentExtractorPlugin[];
  errors: unknown[];
} {
  const factories = Object.entries(mod)
    .filter(
      (entry): entry is [string, () => unknown] =>
        typeof entry[1] === "function" &&
        entry[1].length === 0 &&
        entry[0].startsWith("create") &&
        entry[0].endsWith("DocumentExtractor"),
    )
    .toSorted(([left], [right]) => left.localeCompare(right));

  const extractors: DocumentExtractorPlugin[] = [];
  const errors: unknown[] = [];
  for (const [, createExtractor] of factories) {
    let candidate: unknown;
    try {
      candidate = createExtractor();
    } catch (error) {
      // Keep going: one broken factory must not hide the others.
      errors.push(error);
      continue;
    }
    if (isDocumentExtractorPlugin(candidate)) {
      extractors.push(candidate);
    }
  }
  return { extractors, errors };
}
/**
 * Loads a bundled plugin's document extractors and tags each one with the owning plugin id.
 *
 * @returns The tagged extractors, or `null` when the plugin has no extractor artifact or its
 *   artifact yields no valid extractor and no factory threw.
 * @throws Error (with the factory failure(s) as `cause`) when no extractor was produced and at
 *   least one factory threw.
 */
export function loadBundledDocumentExtractorEntriesFromDir(params: {
  dirName: string;
  pluginId: string;
}): PluginDocumentExtractorEntry[] | null {
  const { dirName, pluginId } = params;
  const mod = tryLoadBundledPublicArtifactModule({ dirName });
  if (mod === null) {
    return null;
  }
  const { extractors, errors } = collectExtractorFactories(mod);
  if (extractors.length > 0) {
    return extractors.map((extractor) => Object.assign({}, extractor, { pluginId }));
  }
  if (errors.length === 0) {
    return null;
  }
  // A lone failure is attached directly; several are wrapped in an AggregateError.
  const cause = errors.length === 1 ? errors[0] : new AggregateError(errors);
  throw new Error(`Unable to initialize document extractors for plugin ${pluginId}`, { cause });
}
/**
 * Reports whether any document-extractor artifact candidate resolves to a path for the plugin.
 * The plugin id is passed to the resolver as the directory name.
 */
export function hasBundledDocumentExtractorPublicArtifact(pluginId: string): boolean {
  for (const artifactBasename of DOCUMENT_EXTRACTOR_ARTIFACT_CANDIDATES) {
    const resolvedPath = resolveBundledPluginPublicArtifactPath({
      dirName: pluginId,
      artifactBasename,
    });
    if (resolvedPath) {
      return true;
    }
  }
  return false;
}

View File

@@ -0,0 +1,32 @@
/**
 * An image produced while extracting a document.
 * NOTE(review): `data` is presumably base64-encoded image bytes described by `mimeType` —
 * confirm against the extractor implementations.
 */
export type DocumentExtractedImage = {
type: "image";
data: string;
mimeType: string;
};
/**
 * Input handed to a document extractor's `extract` function.
 * NOTE(review): the limit fields are presumably interpreted by each extractor (page cap, pixel
 * budget for rendered pages, minimum text length before falling back to images) — confirm
 * against the extractor implementations.
 */
export type DocumentExtractionRequest = {
// Raw document bytes.
buffer: Buffer;
// MIME type of the document in `buffer`.
mimeType: string;
maxPages: number;
maxPixels: number;
minTextChars: number;
// Optional explicit page selection.
pageNumbers?: number[];
// Optional callback for reporting an image-extraction failure to the caller.
onImageExtractionError?: (error: unknown) => void;
};
/** Output of a document extractor: the extracted text plus any extracted images. */
export type DocumentExtractionResult = {
text: string;
images: DocumentExtractedImage[];
};
/** Contract a document extractor exposes to the plugin loader. */
export type DocumentExtractorPlugin = {
id: string;
label: string;
// MIME types this extractor declares.
mimeTypes: readonly string[];
// Optional numeric ordering hint.
autoDetectOrder?: number;
// Resolves to the extraction result, or to null (NOTE(review): presumably meaning the
// extractor did not handle the request — confirm with callers).
extract: (request: DocumentExtractionRequest) => Promise<DocumentExtractionResult | null>;
};
/** A document extractor together with the id of the plugin it was loaded from. */
export type PluginDocumentExtractorEntry = DocumentExtractorPlugin & {
pluginId: string;
};

View File

@@ -0,0 +1,28 @@
import { describe, expect, it } from "vitest";
import { resolvePluginDocumentExtractors } from "./document-extractors.runtime.js";
describe("resolvePluginDocumentExtractors", () => {
  it("respects global plugin disablement", () => {
    // Plugins switched off globally: nothing may be resolved.
    const resolved = resolvePluginDocumentExtractors({
      config: { plugins: { enabled: false } },
    });

    expect(resolved).toEqual([]);
  });

  it("does not expand an operator plugin allowlist", () => {
    // The allowlist names only an unrelated plugin.
    const resolved = resolvePluginDocumentExtractors({
      config: { plugins: { allow: ["openai"] } },
    });

    expect(resolved).toEqual([]);
  });
});

View File

@@ -0,0 +1,134 @@
import type { OpenClawConfig } from "../config/types.openclaw.js";
import { resolveBundledPluginCompatibleLoadValues } from "./activation-context.js";
import {
createPluginActivationSource,
normalizePluginsConfig,
resolveEffectivePluginActivationState,
} from "./config-state.js";
import { loadBundledDocumentExtractorEntriesFromDir } from "./document-extractor-public-artifacts.js";
import type { PluginDocumentExtractorEntry } from "./document-extractor-types.js";
import { loadPluginManifestRegistry } from "./manifest-registry.js";
import type { PluginManifestRecord } from "./manifest-registry.js";
/**
 * Sort comparator for extractor entries.
 *
 * Orders by `autoDetectOrder` ascending, treating a missing value as
 * `Number.MAX_SAFE_INTEGER`. Equal orders are broken by `id`, then by `pluginId`, both via
 * `localeCompare`.
 */
function compareExtractors(
  left: PluginDocumentExtractorEntry,
  right: PluginDocumentExtractorEntry,
): number {
  const rankOf = (entry: PluginDocumentExtractorEntry): number =>
    entry.autoDetectOrder ?? Number.MAX_SAFE_INTEGER;
  const leftRank = rankOf(left);
  const rightRank = rankOf(right);
  if (leftRank === rightRank) {
    const byId = left.id.localeCompare(right.id);
    return byId !== 0 ? byId : left.pluginId.localeCompare(right.pluginId);
  }
  return leftRank - rightRank;
}
/**
 * Lists the ids of bundled plugins whose manifest declares at least one document extractor.
 *
 * A non-empty `onlyPluginIds` restricts the result to those ids. The returned ids are sorted
 * with `localeCompare`.
 */
function resolveBundledDocumentExtractorCompatPluginIds(params: {
  config?: OpenClawConfig;
  workspaceDir?: string;
  env?: NodeJS.ProcessEnv;
  onlyPluginIds?: readonly string[];
}): string[] {
  const { config, workspaceDir, env, onlyPluginIds } = params;
  // An empty or missing filter list means "no restriction".
  const selectedIds = onlyPluginIds && onlyPluginIds.length > 0 ? new Set(onlyPluginIds) : null;
  const registry = loadPluginManifestRegistry({ config, workspaceDir, env });

  const pluginIds: string[] = [];
  for (const plugin of registry.plugins) {
    if (plugin.origin !== "bundled") {
      continue;
    }
    if (selectedIds && !selectedIds.has(plugin.id)) {
      continue;
    }
    if ((plugin.contracts?.documentExtractors?.length ?? 0) > 0) {
      pluginIds.push(plugin.id);
    }
  }
  return pluginIds.toSorted((left, right) => left.localeCompare(right));
}
/**
 * Returns the manifest records of bundled plugins that declare document extractors and are
 * effectively enabled under the given config.
 *
 * Returns an empty list immediately when `plugins.enabled` is explicitly `false`. Otherwise the
 * activation values are resolved first, and each candidate plugin is checked against the
 * resulting effective activation state. A non-empty `onlyPluginIds` restricts the candidates.
 */
function resolveEnabledBundledDocumentExtractorPlugins(params: {
  config?: OpenClawConfig;
  workspaceDir?: string;
  env?: NodeJS.ProcessEnv;
  onlyPluginIds?: readonly string[];
}): PluginManifestRecord[] {
  const { config, workspaceDir, env, onlyPluginIds } = params;
  if (config?.plugins?.enabled === false) {
    return [];
  }

  const activation = resolveBundledPluginCompatibleLoadValues({
    rawConfig: config,
    env,
    workspaceDir,
    onlyPluginIds,
    applyAutoEnable: true,
    compatMode: {
      allowlist: false,
      enablement: "allowlist",
      vitest: true,
    },
    resolveCompatPluginIds: resolveBundledDocumentExtractorCompatPluginIds,
  });
  const normalizedPlugins = normalizePluginsConfig(activation.config?.plugins);
  const activationSource = createPluginActivationSource({
    config: activation.activationSourceConfig,
  });
  // An empty or missing filter list means "no restriction".
  const selectedIds = onlyPluginIds && onlyPluginIds.length > 0 ? new Set(onlyPluginIds) : null;
  const registry = loadPluginManifestRegistry({
    config: activation.config,
    workspaceDir,
    env,
  });

  return registry.plugins.filter((plugin) => {
    if (plugin.origin !== "bundled") {
      return false;
    }
    if (selectedIds && !selectedIds.has(plugin.id)) {
      return false;
    }
    if ((plugin.contracts?.documentExtractors?.length ?? 0) === 0) {
      return false;
    }
    const state = resolveEffectivePluginActivationState({
      id: plugin.id,
      origin: plugin.origin,
      config: normalizedPlugins,
      rootConfig: activation.config,
      enabledByDefault: plugin.enabledByDefault,
      activationSource,
    });
    return state.enabled;
  });
}
/**
 * Loads the document extractors of every enabled bundled plugin and returns them sorted with
 * `compareExtractors`.
 *
 * A plugin whose extractors fail to load is skipped as long as some other plugin yields an
 * extractor.
 *
 * @throws Error (with the load failure(s) as `cause`) when no extractor was loaded and at least
 *   one plugin failed to load.
 */
export function resolvePluginDocumentExtractors(params?: {
  config?: OpenClawConfig;
  workspaceDir?: string;
  env?: NodeJS.ProcessEnv;
  onlyPluginIds?: readonly string[];
}): PluginDocumentExtractorEntry[] {
  const extractors: PluginDocumentExtractorEntry[] = [];
  const loadErrors: unknown[] = [];

  // Records a load failure and reports "nothing loaded" for that plugin.
  const loadEntriesFor = (pluginId: string): PluginDocumentExtractorEntry[] | null => {
    try {
      return loadBundledDocumentExtractorEntriesFromDir({ dirName: pluginId, pluginId });
    } catch (error) {
      loadErrors.push(error);
      return null;
    }
  };

  const enabledPlugins = resolveEnabledBundledDocumentExtractorPlugins({
    config: params?.config,
    workspaceDir: params?.workspaceDir,
    env: params?.env,
    onlyPluginIds: params?.onlyPluginIds,
  });
  for (const plugin of enabledPlugins) {
    const loaded = loadEntriesFor(plugin.id);
    if (loaded) {
      extractors.push(...loaded);
    }
  }

  const everyLoadFailed = extractors.length === 0 && loadErrors.length > 0;
  if (everyLoadFailed) {
    // A lone failure is attached directly; several are wrapped in an AggregateError.
    const cause = loadErrors.length === 1 ? loadErrors[0] : new AggregateError(loadErrors);
    throw new Error("Unable to load document extractor plugins", { cause });
  }
  return extractors.toSorted(compareExtractors);
}

View File

@@ -52,6 +52,7 @@ function hasRuntimeContractSurface(plugin: PluginManifestRecord): boolean {
plugin.cliBackends.length > 0 ||
plugin.contracts?.speechProviders?.length ||
plugin.contracts?.mediaUnderstandingProviders?.length ||
plugin.contracts?.documentExtractors?.length ||
plugin.contracts?.imageGenerationProviders?.length ||
plugin.contracts?.videoGenerationProviders?.length ||
plugin.contracts?.musicGenerationProviders?.length ||

View File

@@ -67,6 +67,7 @@ type PluginManifestContractListKey =
| "speechProviders"
| "externalAuthProviders"
| "mediaUnderstandingProviders"
| "documentExtractors"
| "realtimeVoiceProviders"
| "realtimeTranscriptionProviders"
| "imageGenerationProviders"

View File

@@ -251,6 +251,7 @@ export type PluginManifestContracts = {
realtimeTranscriptionProviders?: string[];
realtimeVoiceProviders?: string[];
mediaUnderstandingProviders?: string[];
documentExtractors?: string[];
imageGenerationProviders?: string[];
videoGenerationProviders?: string[];
musicGenerationProviders?: string[];
@@ -443,6 +444,7 @@ function normalizeManifestContracts(value: unknown): PluginManifestContracts | u
);
const realtimeVoiceProviders = normalizeTrimmedStringList(value.realtimeVoiceProviders);
const mediaUnderstandingProviders = normalizeTrimmedStringList(value.mediaUnderstandingProviders);
const documentExtractors = normalizeTrimmedStringList(value.documentExtractors);
const imageGenerationProviders = normalizeTrimmedStringList(value.imageGenerationProviders);
const videoGenerationProviders = normalizeTrimmedStringList(value.videoGenerationProviders);
const musicGenerationProviders = normalizeTrimmedStringList(value.musicGenerationProviders);
@@ -459,6 +461,7 @@ function normalizeManifestContracts(value: unknown): PluginManifestContracts | u
...(realtimeTranscriptionProviders.length > 0 ? { realtimeTranscriptionProviders } : {}),
...(realtimeVoiceProviders.length > 0 ? { realtimeVoiceProviders } : {}),
...(mediaUnderstandingProviders.length > 0 ? { mediaUnderstandingProviders } : {}),
...(documentExtractors.length > 0 ? { documentExtractors } : {}),
...(imageGenerationProviders.length > 0 ? { imageGenerationProviders } : {}),
...(videoGenerationProviders.length > 0 ? { videoGenerationProviders } : {}),
...(musicGenerationProviders.length > 0 ? { musicGenerationProviders } : {}),

View File

@@ -5,6 +5,7 @@ import { afterEach, describe, expect, it } from "vitest";
import {
PUBLIC_SURFACE_SOURCE_EXTENSIONS,
normalizeBundledPluginArtifactSubpath,
normalizeBundledPluginDirName,
resolveBundledPluginPublicSurfacePath,
resolveBundledPluginSourcePublicSurfacePath,
} from "./public-surface-runtime.js";
@@ -96,4 +97,12 @@ describe("bundled plugin public surface runtime", () => {
/must stay plugin-local/,
);
});
it("rejects bundled plugin directory traversal", () => {
  // A plain single-segment name passes through untouched.
  expect(normalizeBundledPluginDirName("document-extract")).toBe("document-extract");

  // Parent traversal, both separator styles, and a drive-style prefix must all be refused.
  const unsafeDirNames = ["../outside", "nested/plugin", "nested\\plugin", "C:plugin"];
  for (const unsafeDirName of unsafeDirNames) {
    expect(() => normalizeBundledPluginDirName(unsafeDirName)).toThrow(/single directory/);
  }
});
});

View File

@@ -38,19 +38,31 @@ export function normalizeBundledPluginArtifactSubpath(artifactBasename: string):
return normalized;
}
/**
 * Validates that a bundled plugin directory name is one path segment and returns it trimmed.
 *
 * @param dirName Candidate directory name; surrounding whitespace is ignored.
 * @returns The trimmed name.
 * @throws Error when the trimmed name is empty, is `.` or `..`, or contains `/`, `\` or `:`.
 *   The message quotes the original, untrimmed input.
 */
export function normalizeBundledPluginDirName(dirName: string): string {
  const trimmed = dirName.trim();
  const isDotSegment = trimmed === "." || trimmed === "..";
  // Separators of either style, or a colon, would let the name address another location.
  const hasPathSyntax = /[\\/:]/u.test(trimmed);
  if (trimmed.length === 0 || isDotSegment || hasPathSyntax) {
    throw new Error(`Bundled plugin dirName must be a single directory: ${dirName}`);
  }
  return trimmed;
}
export function resolveBundledPluginSourcePublicSurfacePath(params: {
sourceRoot: string;
dirName: string;
artifactBasename: string;
}): string | null {
const artifactBasename = normalizeBundledPluginArtifactSubpath(params.artifactBasename);
const dirName = normalizeBundledPluginDirName(params.dirName);
const sourceBaseName = artifactBasename.replace(/\.js$/u, "");
for (const ext of PUBLIC_SURFACE_SOURCE_EXTENSIONS) {
const sourceCandidate = path.resolve(
params.sourceRoot,
params.dirName,
`${sourceBaseName}${ext}`,
);
const sourceCandidate = path.resolve(params.sourceRoot, dirName, `${sourceBaseName}${ext}`);
if (fs.existsSync(sourceCandidate)) {
return sourceCandidate;
}
@@ -88,11 +100,12 @@ export function resolveBundledPluginPublicSurfacePath(params: {
bundledPluginsDir?: string;
}): string | null {
const artifactBasename = normalizeBundledPluginArtifactSubpath(params.artifactBasename);
const dirName = normalizeBundledPluginDirName(params.dirName);
const explicitBundledPluginsDir =
params.bundledPluginsDir ?? resolveBundledPluginsDir(params.env ?? process.env);
if (explicitBundledPluginsDir) {
const explicitPluginDir = path.resolve(explicitBundledPluginsDir, params.dirName);
const explicitPluginDir = path.resolve(explicitBundledPluginsDir, dirName);
const explicitBuiltCandidate = path.join(explicitPluginDir, artifactBasename);
if (fs.existsSync(explicitBuiltCandidate)) {
return explicitBuiltCandidate;
@@ -100,21 +113,21 @@ export function resolveBundledPluginPublicSurfacePath(params: {
return (
resolveBundledPluginSourcePublicSurfacePath({
sourceRoot: explicitBundledPluginsDir,
dirName: params.dirName,
dirName,
artifactBasename,
}) ??
resolvePackageSourceFallbackForBundledDir({
rootDir: params.rootDir,
bundledPluginsDir: explicitBundledPluginsDir,
dirName: params.dirName,
dirName,
artifactBasename,
})
);
}
for (const candidate of [
path.resolve(params.rootDir, "dist", "extensions", params.dirName, artifactBasename),
path.resolve(params.rootDir, "dist-runtime", "extensions", params.dirName, artifactBasename),
path.resolve(params.rootDir, "dist", "extensions", dirName, artifactBasename),
path.resolve(params.rootDir, "dist-runtime", "extensions", dirName, artifactBasename),
]) {
if (fs.existsSync(candidate)) {
return candidate;
@@ -123,7 +136,7 @@ export function resolveBundledPluginPublicSurfacePath(params: {
return resolveBundledPluginSourcePublicSurfacePath({
sourceRoot: path.resolve(params.rootDir, "extensions"),
dirName: params.dirName,
dirName,
artifactBasename,
});
}

View File

@@ -1,7 +0,0 @@
// Minimal ambient typing for "@napi-rs/canvas": only `createCanvas` and `Canvas.toBuffer` are
// declared here.
declare module "@napi-rs/canvas" {
// A canvas that can be serialized to a Buffer, optionally for a given type string.
export type Canvas = {
toBuffer(type?: string): Buffer;
};
// Creates a canvas of the given width and height.
export function createCanvas(width: number, height: number): Canvas;
}

View File

@@ -1,33 +0,0 @@
// Minimal ambient typing for the "pdfjs-dist/legacy/build/pdf.mjs" entry point: only the
// members declared below are typed.
declare module "pdfjs-dist/legacy/build/pdf.mjs" {
// A text-content item that carries a string.
export type TextItem = {
str: string;
};
// A text-content item without `str`; only an optional `type` is declared.
export type TextMarkedContent = {
type?: string;
};
// Text content of a page: a list mixing both item kinds.
export type TextContent = {
items: Array<TextItem | TextMarkedContent>;
};
// Page dimensions at a given scale.
export type Viewport = {
width: number;
height: number;
};
// A single page: text extraction, viewport computation, and rendering onto a canvas.
export type PDFPageProxy = {
getTextContent(): Promise<TextContent>;
getViewport(params: { scale: number }): Viewport;
render(params: { canvas: unknown; viewport: Viewport }): { promise: Promise<void> };
};
// A loaded document: page count plus page lookup by page number.
export type PDFDocumentProxy = {
numPages: number;
getPage(pageNumber: number): Promise<PDFPageProxy>;
};
// Starts loading a document from bytes; the loaded document is exposed through `promise`.
export function getDocument(params: { data: Uint8Array; disableWorker?: boolean }): {
promise: Promise<PDFDocumentProxy>;
};
}