mirror of
https://fastgit.cc/github.com/openclaw/openclaw
synced 2026-05-01 06:36:23 +08:00
refactor(pdf): move document extraction to plugin
* refactor(pdf): move document extraction to plugin * fix(deps): sync document extract lockfile * fix(pdf): harden document extraction plugin
This commit is contained in:
@@ -6,6 +6,7 @@ Docs: https://docs.openclaw.ai
|
||||
|
||||
### Changes
|
||||
|
||||
- Plugins/PDF: move local PDF extraction into a bundled `document-extract` plugin so core no longer owns `pdfjs-dist` or PDF image-rendering dependencies. Thanks @vincentkoc.
|
||||
- Matrix: require full cross-signing identity trust for self-device verification and add `openclaw matrix verify self` so operators can establish that trust from the CLI. (#70401) Thanks @gumadeiras.
|
||||
|
||||
### Fixes
|
||||
|
||||
@@ -172,8 +172,9 @@ Current behavior:
|
||||
rasterized into images and passed to the model, and the injected file block uses
|
||||
the placeholder `[PDF content rendered to images]`.
|
||||
|
||||
PDF parsing uses the Node-friendly `pdfjs-dist` legacy build (no worker). The modern
|
||||
PDF.js build expects browser workers/DOM globals, so it is not used in the Gateway.
|
||||
PDF parsing is provided by the bundled `document-extract` plugin, which uses the
|
||||
Node-friendly `pdfjs-dist` legacy build (no worker). The modern PDF.js build
|
||||
expects browser workers/DOM globals, so it is not used in the Gateway.
|
||||
|
||||
URL fetch defaults:
|
||||
|
||||
|
||||
@@ -112,7 +112,9 @@ Fallback details:
|
||||
- If text extraction succeeds but image extraction would require vision on a
|
||||
text-only model, OpenClaw drops the rendered images and continues with the
|
||||
extracted text.
|
||||
- Extraction fallback requires `pdfjs-dist` (and `@napi-rs/canvas` for image rendering).
|
||||
- Extraction fallback uses the bundled `document-extract` plugin. The plugin owns
|
||||
`pdfjs-dist`; `@napi-rs/canvas` is an optional peer dependency needed only for the
|
||||
image-rendering fallback (without it, extraction returns the extracted text only).
|
||||
|
||||
## Config
|
||||
|
||||
|
||||
62
extensions/document-extract/document-extractor.test.ts
Normal file
62
extensions/document-extract/document-extractor.test.ts
Normal file
@@ -0,0 +1,62 @@
|
||||
import { beforeEach, describe, expect, it, vi } from "vitest";
|
||||
|
||||
// Shared fixtures are created inside vi.hoisted so the vi.mock factories below
// can reference them.
const { canvasSizes, pdfDocument } = vi.hoisted(() => {
  // Records the dimensions of every canvas the extractor asks for.
  const canvasSizes: Array<{ width: number; height: number }> = [];

  // Fake page: no text items and a 1000x1000 viewport at scale 1.
  const createFakePage = async () => ({
    getTextContent: vi.fn(async () => ({ items: [] })),
    getViewport: vi.fn(({ scale }: { scale: number }) => ({
      width: 1000 * scale,
      height: 1000 * scale,
    })),
    render: vi.fn(() => ({ promise: Promise.resolve() })),
  });

  return {
    canvasSizes,
    pdfDocument: {
      numPages: 2,
      getPage: vi.fn(createFakePage),
    },
  };
});

// pdf.js stand-in: getDocument always resolves to the fake two-page document.
vi.mock("pdfjs-dist/legacy/build/pdf.mjs", () => {
  return {
    getDocument: vi.fn(() => ({ promise: Promise.resolve(pdfDocument) })),
  };
});

// Canvas stand-in: remembers the requested size and encodes to a fixed buffer.
vi.mock("@napi-rs/canvas", () => {
  return {
    createCanvas: vi.fn((width: number, height: number) => {
      canvasSizes.push({ width, height });
      return {
        toBuffer: vi.fn(() => Buffer.from("png")),
      };
    }),
  };
});
|
||||
|
||||
import { createPdfDocumentExtractor } from "./document-extractor.js";
|
||||
|
||||
describe("PDF document extractor", () => {
  beforeEach(() => {
    // Reset state shared with the module mocks between cases.
    pdfDocument.getPage.mockClear();
    canvasSizes.length = 0;
  });

  it("declares PDF support", () => {
    const plugin = createPdfDocumentExtractor();

    expect(plugin).toMatchObject({
      id: "pdf",
      label: "PDF",
      mimeTypes: ["application/pdf"],
    });
  });

  it("treats maxPixels as a hard total image rendering budget", async () => {
    const plugin = createPdfDocumentExtractor();

    // The fake pages carry no text, so minTextChars is not met and the
    // extractor falls back to rendering. Each page is 1000x1000 at scale 1;
    // a 100-pixel budget allows one 10x10 canvas and leaves nothing for the
    // second page.
    const outcome = await plugin.extract({
      buffer: Buffer.from("%PDF-1.4"),
      mimeType: "application/pdf",
      maxPages: 2,
      maxPixels: 100,
      minTextChars: 10,
    });

    expect(outcome?.images).toHaveLength(1);
    expect(canvasSizes).toEqual([{ width: 10, height: 10 }]);
  });
});
|
||||
216
extensions/document-extract/document-extractor.ts
Normal file
216
extensions/document-extract/document-extractor.ts
Normal file
@@ -0,0 +1,216 @@
|
||||
import type {
|
||||
DocumentExtractedImage,
|
||||
DocumentExtractionRequest,
|
||||
DocumentExtractionResult,
|
||||
DocumentExtractorPlugin,
|
||||
} from "openclaw/plugin-sdk/document-extractor";
|
||||
|
||||
/** The only canvas capability this module uses: encoding to PNG. */
interface CanvasLike {
  toBuffer(type: "image/png"): Buffer;
}

/** Shape expected from the dynamically imported canvas module. */
interface CanvasModule {
  createCanvas(width: number, height: number): CanvasLike;
}

/** A text-content item that carries a string. */
interface PdfTextItem {
  str: string;
}

/** Text content of a page; items without a `str` field are skipped by the extractor. */
interface PdfTextContent {
  items: (PdfTextItem | object)[];
}

/** Page dimensions at a given scale. */
interface PdfViewport {
  width: number;
  height: number;
}

/** The subset of a PDF page that the extractor calls. */
interface PdfPage {
  getTextContent(): Promise<PdfTextContent>;
  getViewport(params: { scale: number }): PdfViewport;
  render(params: { canvas: unknown; viewport: PdfViewport }): { promise: Promise<void> };
}

/** The subset of a loaded PDF document that the extractor calls. */
interface PdfDocument {
  numPages: number;
  getPage(pageNumber: number): Promise<PdfPage>;
}

/** Shape expected from the dynamically imported PDF.js module. */
interface PdfJsModule {
  getDocument(params: { data: Uint8Array; disableWorker?: boolean }): {
    promise: Promise<PdfDocument>;
  };
}
|
||||
|
||||
// Module specifiers handed to dynamic import() by the loaders below.
// NOTE(review): presumably kept as constants so the optional packages are not
// resolved eagerly by tooling — confirm.
const CANVAS_MODULE = "@napi-rs/canvas";
|
||||
const PDFJS_MODULE = "pdfjs-dist/legacy/build/pdf.mjs";
|
||||
// Upper bound on the number of text characters collected across pages.
const MAX_EXTRACTED_TEXT_CHARS = 200_000;
|
||||
// Largest canvas width or height, in pixels, that a render plan may use.
const MAX_RENDER_DIMENSION = 10_000;
|
||||
|
||||
// Cached dynamic imports. Each is reset to null by its loader when the import
// fails, so a later call can try again.
let canvasModulePromise: Promise<CanvasModule> | null = null;
|
||||
let pdfJsModulePromise: Promise<PdfJsModule> | null = null;
|
||||
|
||||
/**
 * Lazily imports the canvas module and memoizes the in-flight import.
 *
 * @returns The imported canvas module.
 * @throws Error wrapping the import failure as `cause`; the memoized promise is
 *   cleared first so the next call retries the import.
 */
async function loadCanvasModule(): Promise<CanvasModule> {
  if (canvasModulePromise) {
    return canvasModulePromise;
  }
  const pending = (import(CANVAS_MODULE) as Promise<CanvasModule>).catch((cause) => {
    // Forget the failed attempt before surfacing the error.
    canvasModulePromise = null;
    throw new Error("Optional dependency @napi-rs/canvas is required for PDF image extraction", {
      cause,
    });
  });
  canvasModulePromise = pending;
  return pending;
}
|
||||
|
||||
/**
 * Lazily imports the PDF.js module and memoizes the in-flight import.
 *
 * @returns The imported PDF.js module.
 * @throws Error wrapping the import failure as `cause`; the memoized promise is
 *   cleared first so the next call retries the import.
 */
async function loadPdfJsModule(): Promise<PdfJsModule> {
  if (pdfJsModulePromise) {
    return pdfJsModulePromise;
  }
  const pending = (import(PDFJS_MODULE) as Promise<PdfJsModule>).catch((cause) => {
    // Forget the failed attempt before surfacing the error.
    pdfJsModulePromise = null;
    throw new Error("Optional dependency pdfjs-dist is required for PDF extraction", {
      cause,
    });
  });
  pdfJsModulePromise = pending;
  return pending;
}
|
||||
|
||||
/**
 * Appends one page's text to `parts` while keeping the running character count
 * at or below MAX_EXTRACTED_TEXT_CHARS.
 *
 * Only the characters pushed into `parts` are counted; separators added when
 * the caller later joins the parts are not.
 *
 * @param parts - Accumulated per-page text; mutated in place.
 * @param pageText - Text of the page being added. Empty text is ignored.
 * @param currentLength - Characters already accounted for in `parts`.
 * @returns The updated running length. When `pageText` had to be truncated this
 *   is MAX_EXTRACTED_TEXT_CHARS, signalling that the budget is exhausted.
 */
function appendTextWithinLimit(parts: string[], pageText: string, currentLength: number): number {
  if (!pageText) {
    return currentLength;
  }
  const remaining = MAX_EXTRACTED_TEXT_CHARS - currentLength;
  if (remaining <= 0) {
    return currentLength;
  }
  if (pageText.length <= remaining) {
    parts.push(pageText);
    return currentLength + pageText.length;
  }

  // Truncation: do not cut between the two halves of a UTF-16 surrogate pair,
  // which would leave an unpaired high surrogate at the end of the text.
  let cut = remaining;
  const lastKept = pageText.charCodeAt(cut - 1);
  const firstDropped = pageText.charCodeAt(cut);
  const splitsPair =
    lastKept >= 0xd800 && lastKept <= 0xdbff && firstDropped >= 0xdc00 && firstDropped <= 0xdfff;
  if (splitsPair) {
    cut -= 1;
  }
  if (cut > 0) {
    parts.push(pageText.slice(0, cut));
  }
  // Report the budget as used up even if one unit was dropped, so callers stop
  // appending instead of squeezing in a stray character from the next page.
  return MAX_EXTRACTED_TEXT_CHARS;
}
|
||||
|
||||
/**
 * Chooses the largest render scale (never above 1) at which a page fits both
 * the remaining pixel budget and the per-side MAX_RENDER_DIMENSION cap.
 *
 * Canvas sides are rounded up to whole pixels, so the analytic upper bound on
 * the scale can overshoot the budget by a fraction of a pixel. The bound itself
 * is tried first; only if rounding pushes it over budget does a bisection look
 * for the largest scale that still fits.
 *
 * @param viewport - Page dimensions at scale 1.
 * @param remainingPixels - Pixels still available in the rendering budget.
 * @returns The chosen scale with the integer canvas size and its pixel count,
 *   or `null` when the inputs are unusable or nothing fits.
 */
function resolveRenderPlan(
  viewport: PdfViewport,
  remainingPixels: number,
): { scale: number; width: number; height: number; pixels: number } | null {
  if (
    remainingPixels <= 0 ||
    !Number.isFinite(viewport.width) ||
    !Number.isFinite(viewport.height) ||
    viewport.width <= 0 ||
    viewport.height <= 0
  ) {
    return null;
  }

  const pagePixels = Math.max(1, viewport.width * viewport.height);
  const maxScale = Math.min(
    1,
    Math.sqrt(remainingPixels / pagePixels),
    MAX_RENDER_DIMENSION / viewport.width,
    MAX_RENDER_DIMENSION / viewport.height,
  );
  // Also rejects a NaN budget, which slips past the `<= 0` guard above.
  if (!Number.isFinite(maxScale) || maxScale <= 0) {
    return null;
  }

  // Integer canvas size for a scale, or null when it breaks a limit.
  const planFor = (
    scale: number,
  ): { scale: number; width: number; height: number; pixels: number } | null => {
    const width = Math.max(1, Math.ceil(viewport.width * scale));
    const height = Math.max(1, Math.ceil(viewport.height * scale));
    const pixels = width * height;
    const fits =
      width <= MAX_RENDER_DIMENSION && height <= MAX_RENDER_DIMENSION && pixels <= remainingPixels;
    return fits ? { scale, width, height, pixels } : null;
  };

  // Common case: the bound itself fits, e.g. a page rendered at exactly scale 1.
  const atMaxScale = planFor(maxScale);
  if (atMaxScale) {
    return atMaxScale;
  }

  // Rounding overshoot: bisect (0, maxScale) for the largest scale that fits.
  let best: { scale: number; width: number; height: number; pixels: number } | null = null;
  let low = 0;
  let high = maxScale;
  for (let i = 0; i < 32; i += 1) {
    const scale = (low + high) / 2;
    const candidate = planFor(scale);
    if (candidate) {
      best = candidate;
      low = scale;
    } else {
      high = scale;
    }
  }
  return best;
}
|
||||
|
||||
/**
 * Extracts text from a PDF and, when too little text is found, falls back to
 * rendering the selected pages as PNG images.
 *
 * Steps:
 * 1. Load the document through PDF.js with the worker disabled.
 * 2. Select pages: `request.pageNumbers` filtered to pages that exist, or pages
 *    1..numPages; both are capped at `request.maxPages`.
 * 3. Collect page text up to MAX_EXTRACTED_TEXT_CHARS. If the trimmed text has
 *    at least `request.minTextChars` characters, return it with no images.
 * 4. Otherwise render pages until `request.maxPixels` (a total across all
 *    pages) is used up.
 *
 * @param request - Document bytes plus page, pixel and text limits.
 * @returns The extracted text and any rendered page images (base64 PNG).
 * @throws Whatever PDF.js loading, parsing or rendering throws. A failure to
 *   load the canvas module is not thrown: it is passed to
 *   `request.onImageExtractionError` and the text-only result is returned.
 */
async function extractPdfContent(
  request: DocumentExtractionRequest,
): Promise<DocumentExtractionResult> {
  const pdfJsModule = await loadPdfJsModule();
  const loadingTask = pdfJsModule.getDocument({
    data: new Uint8Array(request.buffer),
    disableWorker: true,
  });
  const pdf = await loadingTask.promise;

  const effectivePages: number[] = request.pageNumbers
    ? request.pageNumbers.filter((p) => p >= 1 && p <= pdf.numPages).slice(0, request.maxPages)
    : Array.from({ length: Math.min(pdf.numPages, request.maxPages) }, (_, i) => i + 1);

  // Pass 1: text.
  const textParts: string[] = [];
  let extractedTextLength = 0;
  for (const pageNum of effectivePages) {
    const page = await pdf.getPage(pageNum);
    const textContent = await page.getTextContent();
    const pageText = textContent.items
      .map((item) => ("str" in item ? item.str : ""))
      .filter(Boolean)
      .join(" ");
    if (!pageText) {
      continue;
    }
    extractedTextLength = appendTextWithinLimit(textParts, pageText, extractedTextLength);
    if (extractedTextLength >= MAX_EXTRACTED_TEXT_CHARS) {
      break;
    }
  }

  const text = textParts.join("\n\n");
  if (text.trim().length >= request.minTextChars) {
    return { text, images: [] };
  }

  // Pass 2: image fallback. The canvas module is optional, so a load failure
  // degrades to a text-only result instead of failing the extraction.
  let canvasModule: CanvasModule;
  try {
    canvasModule = await loadCanvasModule();
  } catch (err) {
    request.onImageExtractionError?.(err);
    return { text, images: [] };
  }

  const images: DocumentExtractedImage[] = [];
  let remainingPixels = Math.max(1, Math.floor(request.maxPixels));

  for (const pageNum of effectivePages) {
    if (remainingPixels <= 0) {
      break;
    }
    const page = await pdf.getPage(pageNum);
    const plan = resolveRenderPlan(page.getViewport({ scale: 1 }), remainingPixels);
    if (!plan) {
      // Nothing fits in what is left of the budget; stop rendering.
      break;
    }
    const scaled = page.getViewport({ scale: plan.scale });
    const canvas = canvasModule.createCanvas(plan.width, plan.height);
    // `render` accepts `canvas: unknown`, so the canvas is passed as-is; no
    // assertion to a DOM canvas type is needed.
    await page.render({ canvas, viewport: scaled }).promise;
    const png = canvas.toBuffer("image/png");
    images.push({ type: "image", data: png.toString("base64"), mimeType: "image/png" });
    remainingPixels -= plan.pixels;
  }

  return { text, images };
}
|
||||
|
||||
/**
 * Builds the plugin descriptor for the PDF extractor.
 *
 * @returns A descriptor with id `pdf`, the `application/pdf` MIME type, an
 *   auto-detect order of 10, and `extractPdfContent` as the extract function.
 */
export function createPdfDocumentExtractor(): DocumentExtractorPlugin {
  const descriptor: DocumentExtractorPlugin = {
    id: "pdf",
    label: "PDF",
    mimeTypes: ["application/pdf"],
    autoDetectOrder: 10,
    extract: extractPdfContent,
  };
  return descriptor;
}
|
||||
11
extensions/document-extract/index.ts
Normal file
11
extensions/document-extract/index.ts
Normal file
@@ -0,0 +1,11 @@
|
||||
import { definePluginEntry } from "openclaw/plugin-sdk/plugin-entry";
|
||||
|
||||
// Plugin entry for the `document-extract` plugin. It declares identity metadata
// only; `register` deliberately does nothing (see the comment inside it).
export default definePluginEntry({
|
||||
id: "document-extract",
|
||||
name: "Document Extraction",
|
||||
description: "Extract text and fallback page images from local document attachments.",
|
||||
register() {
|
||||
// Runtime is exposed through document-extractor.ts so document hot paths can
|
||||
// load only the narrow extractor artifact instead of the full plugin entrypoint.
|
||||
},
|
||||
});
|
||||
14
extensions/document-extract/openclaw.plugin.json
Normal file
14
extensions/document-extract/openclaw.plugin.json
Normal file
@@ -0,0 +1,14 @@
|
||||
{
|
||||
"id": "document-extract",
|
||||
"enabledByDefault": true,
|
||||
"name": "Document Extraction",
|
||||
"description": "Extract text and fallback page images from local document attachments.",
|
||||
"contracts": {
|
||||
"documentExtractors": ["pdf"]
|
||||
},
|
||||
"configSchema": {
|
||||
"type": "object",
|
||||
"additionalProperties": false,
|
||||
"properties": {}
|
||||
}
|
||||
}
|
||||
26
extensions/document-extract/package.json
Normal file
26
extensions/document-extract/package.json
Normal file
@@ -0,0 +1,26 @@
|
||||
{
|
||||
"name": "@openclaw/document-extract-plugin",
|
||||
"version": "2026.4.24",
|
||||
"private": true,
|
||||
"description": "OpenClaw local document extraction plugin",
|
||||
"type": "module",
|
||||
"dependencies": {
|
||||
"pdfjs-dist": "^5.6.205"
|
||||
},
|
||||
"devDependencies": {
|
||||
"@openclaw/plugin-sdk": "workspace:*"
|
||||
},
|
||||
"peerDependencies": {
|
||||
"@napi-rs/canvas": "^0.1.89"
|
||||
},
|
||||
"peerDependenciesMeta": {
|
||||
"@napi-rs/canvas": {
|
||||
"optional": true
|
||||
}
|
||||
},
|
||||
"openclaw": {
|
||||
"extensions": [
|
||||
"./index.ts"
|
||||
]
|
||||
}
|
||||
}
|
||||
@@ -1125,6 +1125,10 @@
|
||||
"types": "./dist/plugin-sdk/web-content-extractor.d.ts",
|
||||
"default": "./dist/plugin-sdk/web-content-extractor.js"
|
||||
},
|
||||
"./plugin-sdk/document-extractor": {
|
||||
"types": "./dist/plugin-sdk/document-extractor.d.ts",
|
||||
"default": "./dist/plugin-sdk/document-extractor.js"
|
||||
},
|
||||
"./plugin-sdk/provider-web-fetch-contract": {
|
||||
"types": "./dist/plugin-sdk/provider-web-fetch-contract.d.ts",
|
||||
"default": "./dist/plugin-sdk/provider-web-fetch-contract.js"
|
||||
@@ -1609,7 +1613,6 @@
|
||||
"markdown-it": "14.1.1",
|
||||
"openai": "^6.34.0",
|
||||
"osc-progress": "^0.3.0",
|
||||
"pdfjs-dist": "^5.6.205",
|
||||
"proxy-agent": "^8.0.1",
|
||||
"semver": "7.7.4",
|
||||
"sharp": "^0.34.5",
|
||||
@@ -1647,7 +1650,6 @@
|
||||
"vitest": "^4.1.5"
|
||||
},
|
||||
"peerDependencies": {
|
||||
"@napi-rs/canvas": "^0.1.89",
|
||||
"node-llama-cpp": "3.18.1"
|
||||
},
|
||||
"peerDependenciesMeta": {
|
||||
@@ -1693,7 +1695,6 @@
|
||||
"onlyBuiltDependencies": [
|
||||
"@lydell/node-pty",
|
||||
"@matrix-org/matrix-sdk-crypto-nodejs",
|
||||
"@napi-rs/canvas",
|
||||
"@tloncorp/api",
|
||||
"@tloncorp/tlon-skill",
|
||||
"@whiskeysockets/baileys",
|
||||
|
||||
142
pnpm-lock.yaml
generated
142
pnpm-lock.yaml
generated
@@ -63,9 +63,6 @@ importers:
|
||||
'@modelcontextprotocol/sdk':
|
||||
specifier: 1.29.0
|
||||
version: 1.29.0(zod@4.3.6)
|
||||
'@napi-rs/canvas':
|
||||
specifier: ^0.1.89
|
||||
version: 0.1.92
|
||||
'@vincentkoc/qrcode-tui':
|
||||
specifier: 0.2.1
|
||||
version: 0.2.1
|
||||
@@ -120,9 +117,6 @@ importers:
|
||||
osc-progress:
|
||||
specifier: ^0.3.0
|
||||
version: 0.3.0
|
||||
pdfjs-dist:
|
||||
specifier: ^5.6.205
|
||||
version: 5.6.205
|
||||
proxy-agent:
|
||||
specifier: ^8.0.1
|
||||
version: 8.0.1
|
||||
@@ -519,6 +513,19 @@ importers:
|
||||
specifier: workspace:*
|
||||
version: link:../..
|
||||
|
||||
extensions/document-extract:
|
||||
dependencies:
|
||||
'@napi-rs/canvas':
|
||||
specifier: ^0.1.89
|
||||
version: 0.1.99
|
||||
pdfjs-dist:
|
||||
specifier: ^5.6.205
|
||||
version: 5.6.205
|
||||
devDependencies:
|
||||
'@openclaw/plugin-sdk':
|
||||
specifier: workspace:*
|
||||
version: link:../../packages/plugin-sdk
|
||||
|
||||
extensions/duckduckgo:
|
||||
devDependencies:
|
||||
'@openclaw/plugin-sdk':
|
||||
@@ -2774,61 +2781,30 @@ packages:
|
||||
resolution: {integrity: sha512-juG5VWh4qAivzTAeMzvY9xs9HY5rAcr2E4I7tiSSCokRFi7XIZCAu92ZkSTsIj1OPceCifL3cpfteP3pDT9/QQ==}
|
||||
engines: {node: '>=14.0.0'}
|
||||
|
||||
'@napi-rs/canvas-android-arm64@0.1.92':
|
||||
resolution: {integrity: sha512-rDOtq53ujfOuevD5taxAuIFALuf1QsQWZe1yS/N4MtT+tNiDBEdjufvQRPWZ11FubL2uwgP8ApYU3YOaNu1ZsQ==}
|
||||
engines: {node: '>= 10'}
|
||||
cpu: [arm64]
|
||||
os: [android]
|
||||
|
||||
'@napi-rs/canvas-android-arm64@0.1.99':
|
||||
resolution: {integrity: sha512-9OCRt8VVxA17m32NWZKyNC2qamdaS/SC5CEOIQwFngRq0DIeVm4PDal+6Ljnhqm2whZiC63DNuKZ4xSp2nbj9w==}
|
||||
engines: {node: '>= 10'}
|
||||
cpu: [arm64]
|
||||
os: [android]
|
||||
|
||||
'@napi-rs/canvas-darwin-arm64@0.1.92':
|
||||
resolution: {integrity: sha512-4PT6GRGCr7yMRehp42x0LJb1V0IEy1cDZDDayv7eKbFUIGbPFkV7CRC9Bee5MPkjg1EB4ZPXXUyy3gjQm7mR8Q==}
|
||||
engines: {node: '>= 10'}
|
||||
cpu: [arm64]
|
||||
os: [darwin]
|
||||
|
||||
'@napi-rs/canvas-darwin-arm64@0.1.99':
|
||||
resolution: {integrity: sha512-lupMDMy1+H38dhyCcLirOKKVUyzzlxi7j7rGPLI3vViMHOoPjcXO1b10ivy+ad+q6MiwHfoLjKTCoLke5ySOBg==}
|
||||
engines: {node: '>= 10'}
|
||||
cpu: [arm64]
|
||||
os: [darwin]
|
||||
|
||||
'@napi-rs/canvas-darwin-x64@0.1.92':
|
||||
resolution: {integrity: sha512-5e/3ZapP7CqPtDcZPtmowCsjoyQwuNMMD7c0GKPtZQ8pgQhLkeq/3fmk0HqNSD1i227FyJN/9pDrhw/UMTkaWA==}
|
||||
engines: {node: '>= 10'}
|
||||
cpu: [x64]
|
||||
os: [darwin]
|
||||
|
||||
'@napi-rs/canvas-darwin-x64@0.1.99':
|
||||
resolution: {integrity: sha512-fdz02t4w8n6Ii/rYhWig6STb/zcTmCC/6YZTGmjoDeidDwn9Wf0ukQVynhCPEs29vqUc66wHZKsuIgMs9tycCg==}
|
||||
engines: {node: '>= 10'}
|
||||
cpu: [x64]
|
||||
os: [darwin]
|
||||
|
||||
'@napi-rs/canvas-linux-arm-gnueabihf@0.1.92':
|
||||
resolution: {integrity: sha512-j6KaLL9iir68lwpzzY+aBGag1PZp3+gJE2mQ3ar4VJVmyLRVOh+1qsdNK1gfWoAVy5w6U7OEYFrLzN2vOFUSng==}
|
||||
engines: {node: '>= 10'}
|
||||
cpu: [arm]
|
||||
os: [linux]
|
||||
|
||||
'@napi-rs/canvas-linux-arm-gnueabihf@0.1.99':
|
||||
resolution: {integrity: sha512-w4FwVwlNo00ezeRhfY62IVIyt6G3u8wodkPtiqWc52BUHx+VDBUM2vkS3ogfANaLI7hnf3s6WK4LyZVUjBg1lA==}
|
||||
engines: {node: '>= 10'}
|
||||
cpu: [arm]
|
||||
os: [linux]
|
||||
|
||||
'@napi-rs/canvas-linux-arm64-gnu@0.1.92':
|
||||
resolution: {integrity: sha512-s3NlnJMHOSotUYVoTCoC1OcomaChFdKmZg0VsHFeIkeHbwX0uPHP4eCX1irjSfMykyvsGHTQDfBAtGYuqxCxhQ==}
|
||||
engines: {node: '>= 10'}
|
||||
cpu: [arm64]
|
||||
os: [linux]
|
||||
libc: [glibc]
|
||||
|
||||
'@napi-rs/canvas-linux-arm64-gnu@0.1.99':
|
||||
resolution: {integrity: sha512-8JvHeexKQ8c7g0q7YJ29NVQwnf1ePghP9ys9ZN0R0qzyqJQ9Uw6N9qnDINArlm3IYHexB7LjzArIfhQiqSDGvQ==}
|
||||
engines: {node: '>= 10'}
|
||||
@@ -2836,13 +2812,6 @@ packages:
|
||||
os: [linux]
|
||||
libc: [glibc]
|
||||
|
||||
'@napi-rs/canvas-linux-arm64-musl@0.1.92':
|
||||
resolution: {integrity: sha512-xV0GQnukYq5qY+ebkAwHjnP2OrSGBxS3vSi1zQNQj0bkXU6Ou+Tw7JjCM7pZcQ28MUyEBS1yKfo7rc7ip2IPFQ==}
|
||||
engines: {node: '>= 10'}
|
||||
cpu: [arm64]
|
||||
os: [linux]
|
||||
libc: [musl]
|
||||
|
||||
'@napi-rs/canvas-linux-arm64-musl@0.1.99':
|
||||
resolution: {integrity: sha512-Z+6nyLdJXWzLPVxi4H6g9TJop4DwN3KSgHWto5JCbZV5/uKoVqcSynPs0tGlUHOoWI8S8tEvJspz51GQkvr07w==}
|
||||
engines: {node: '>= 10'}
|
||||
@@ -2850,13 +2819,6 @@ packages:
|
||||
os: [linux]
|
||||
libc: [musl]
|
||||
|
||||
'@napi-rs/canvas-linux-riscv64-gnu@0.1.92':
|
||||
resolution: {integrity: sha512-+GKvIFbQ74eB/TopEdH6XIXcvOGcuKvCITLGXy7WLJAyNp3Kdn1ncjxg91ihatBaPR+t63QOE99yHuIWn3UQ9w==}
|
||||
engines: {node: '>= 10'}
|
||||
cpu: [riscv64]
|
||||
os: [linux]
|
||||
libc: [glibc]
|
||||
|
||||
'@napi-rs/canvas-linux-riscv64-gnu@0.1.99':
|
||||
resolution: {integrity: sha512-jAnfOUv4IO1l8Levk5t85oVtEBOXLa07KnIUgWo1CDlPxiqpxS3uBfiE38Lvj/CQgHaNF6Nxk/SaemwLgsVJgw==}
|
||||
engines: {node: '>= 10'}
|
||||
@@ -2864,13 +2826,6 @@ packages:
|
||||
os: [linux]
|
||||
libc: [glibc]
|
||||
|
||||
'@napi-rs/canvas-linux-x64-gnu@0.1.92':
|
||||
resolution: {integrity: sha512-tFd6MwbEhZ1g64iVY2asV+dOJC+GT3Yd6UH4G3Hp0/VHQ6qikB+nvXEULskFYZ0+wFqlGPtXjG1Jmv7sJy+3Ww==}
|
||||
engines: {node: '>= 10'}
|
||||
cpu: [x64]
|
||||
os: [linux]
|
||||
libc: [glibc]
|
||||
|
||||
'@napi-rs/canvas-linux-x64-gnu@0.1.99':
|
||||
resolution: {integrity: sha512-mIkXw3fGmbYyFjSmfWEvty4jN+rwEOmv0+Dy9bRvvTzLYWCgm3RMgUEQVfAKFw96nIRFnyNZiK83KNQaVVFjng==}
|
||||
engines: {node: '>= 10'}
|
||||
@@ -2878,13 +2833,6 @@ packages:
|
||||
os: [linux]
|
||||
libc: [glibc]
|
||||
|
||||
'@napi-rs/canvas-linux-x64-musl@0.1.92':
|
||||
resolution: {integrity: sha512-uSuqeSveB/ZGd72VfNbHCSXO9sArpZTvznMVsb42nqPP7gBGEH6NJQ0+hmF+w24unEmxBhPYakP/Wiosm16KkA==}
|
||||
engines: {node: '>= 10'}
|
||||
cpu: [x64]
|
||||
os: [linux]
|
||||
libc: [musl]
|
||||
|
||||
'@napi-rs/canvas-linux-x64-musl@0.1.99':
|
||||
resolution: {integrity: sha512-f3Uz2P0RgrtBHISxZqr6yiYXJlTDyCVBumDacxo+4AmSg7z0HiqYZKGWC/gszq3fbPhyQUya1W2AEteKxT9Y6A==}
|
||||
engines: {node: '>= 10'}
|
||||
@@ -2892,34 +2840,18 @@ packages:
|
||||
os: [linux]
|
||||
libc: [musl]
|
||||
|
||||
'@napi-rs/canvas-win32-arm64-msvc@0.1.92':
|
||||
resolution: {integrity: sha512-20SK5AU/OUNz9ZuoAPj5ekWai45EIBDh/XsdrVZ8le/pJVlhjFU3olbumSQUXRFn7lBRS+qwM8kA//uLaDx6iQ==}
|
||||
engines: {node: '>= 10'}
|
||||
cpu: [arm64]
|
||||
os: [win32]
|
||||
|
||||
'@napi-rs/canvas-win32-arm64-msvc@0.1.99':
|
||||
resolution: {integrity: sha512-XE6KUkfqRsCNejcoRMiMr3RaUeObxNf6y7dut3hrq2rn7PzfRTZgrjF1F/B2C7FcdgqY/vSHWpQeMuNz1vTNHg==}
|
||||
engines: {node: '>= 10'}
|
||||
cpu: [arm64]
|
||||
os: [win32]
|
||||
|
||||
'@napi-rs/canvas-win32-x64-msvc@0.1.92':
|
||||
resolution: {integrity: sha512-KEhyZLzq1MXCNlXybz4k25MJmHFp+uK1SIb8yJB0xfrQjz5aogAMhyseSzewo+XxAq3OAOdyKvfHGNzT3w1RPg==}
|
||||
engines: {node: '>= 10'}
|
||||
cpu: [x64]
|
||||
os: [win32]
|
||||
|
||||
'@napi-rs/canvas-win32-x64-msvc@0.1.99':
|
||||
resolution: {integrity: sha512-plMYGVbc/vmmPF9MtmHbwNk1rL1Aj53vQZt+Gnv1oZn6gmd9jEHHJ0n9Nd2nxa5sKH7TS5IjkCDM6289O0d6PQ==}
|
||||
engines: {node: '>= 10'}
|
||||
cpu: [x64]
|
||||
os: [win32]
|
||||
|
||||
'@napi-rs/canvas@0.1.92':
|
||||
resolution: {integrity: sha512-q7ZaUCJkEU5BeOdE7fBx1XWRd2T5Ady65nxq4brMf5L4cE1VV/ACq5w9Z5b/IVJs8CwSSIwc30nlthH0gFo4Ig==}
|
||||
engines: {node: '>= 10'}
|
||||
|
||||
'@napi-rs/canvas@0.1.99':
|
||||
resolution: {integrity: sha512-zN4eQlK3eBf7aJBcTHZilpBH3tDekBzPMIWC8r0s94Ecl73XfOyFi4w7yKFMRVUT0lvNQjtOL8YSrwqQj6mZFg==}
|
||||
engines: {node: '>= 10'}
|
||||
@@ -9914,86 +9846,39 @@ snapshots:
|
||||
|
||||
'@mozilla/readability@0.6.0': {}
|
||||
|
||||
'@napi-rs/canvas-android-arm64@0.1.92':
|
||||
optional: true
|
||||
|
||||
'@napi-rs/canvas-android-arm64@0.1.99':
|
||||
optional: true
|
||||
|
||||
'@napi-rs/canvas-darwin-arm64@0.1.92':
|
||||
optional: true
|
||||
|
||||
'@napi-rs/canvas-darwin-arm64@0.1.99':
|
||||
optional: true
|
||||
|
||||
'@napi-rs/canvas-darwin-x64@0.1.92':
|
||||
optional: true
|
||||
|
||||
'@napi-rs/canvas-darwin-x64@0.1.99':
|
||||
optional: true
|
||||
|
||||
'@napi-rs/canvas-linux-arm-gnueabihf@0.1.92':
|
||||
optional: true
|
||||
|
||||
'@napi-rs/canvas-linux-arm-gnueabihf@0.1.99':
|
||||
optional: true
|
||||
|
||||
'@napi-rs/canvas-linux-arm64-gnu@0.1.92':
|
||||
optional: true
|
||||
|
||||
'@napi-rs/canvas-linux-arm64-gnu@0.1.99':
|
||||
optional: true
|
||||
|
||||
'@napi-rs/canvas-linux-arm64-musl@0.1.92':
|
||||
optional: true
|
||||
|
||||
'@napi-rs/canvas-linux-arm64-musl@0.1.99':
|
||||
optional: true
|
||||
|
||||
'@napi-rs/canvas-linux-riscv64-gnu@0.1.92':
|
||||
optional: true
|
||||
|
||||
'@napi-rs/canvas-linux-riscv64-gnu@0.1.99':
|
||||
optional: true
|
||||
|
||||
'@napi-rs/canvas-linux-x64-gnu@0.1.92':
|
||||
optional: true
|
||||
|
||||
'@napi-rs/canvas-linux-x64-gnu@0.1.99':
|
||||
optional: true
|
||||
|
||||
'@napi-rs/canvas-linux-x64-musl@0.1.92':
|
||||
optional: true
|
||||
|
||||
'@napi-rs/canvas-linux-x64-musl@0.1.99':
|
||||
optional: true
|
||||
|
||||
'@napi-rs/canvas-win32-arm64-msvc@0.1.92':
|
||||
optional: true
|
||||
|
||||
'@napi-rs/canvas-win32-arm64-msvc@0.1.99':
|
||||
optional: true
|
||||
|
||||
'@napi-rs/canvas-win32-x64-msvc@0.1.92':
|
||||
optional: true
|
||||
|
||||
'@napi-rs/canvas-win32-x64-msvc@0.1.99':
|
||||
optional: true
|
||||
|
||||
'@napi-rs/canvas@0.1.92':
|
||||
optionalDependencies:
|
||||
'@napi-rs/canvas-android-arm64': 0.1.92
|
||||
'@napi-rs/canvas-darwin-arm64': 0.1.92
|
||||
'@napi-rs/canvas-darwin-x64': 0.1.92
|
||||
'@napi-rs/canvas-linux-arm-gnueabihf': 0.1.92
|
||||
'@napi-rs/canvas-linux-arm64-gnu': 0.1.92
|
||||
'@napi-rs/canvas-linux-arm64-musl': 0.1.92
|
||||
'@napi-rs/canvas-linux-riscv64-gnu': 0.1.92
|
||||
'@napi-rs/canvas-linux-x64-gnu': 0.1.92
|
||||
'@napi-rs/canvas-linux-x64-musl': 0.1.92
|
||||
'@napi-rs/canvas-win32-arm64-msvc': 0.1.92
|
||||
'@napi-rs/canvas-win32-x64-msvc': 0.1.92
|
||||
|
||||
'@napi-rs/canvas@0.1.99':
|
||||
optionalDependencies:
|
||||
'@napi-rs/canvas-android-arm64': 0.1.99
|
||||
@@ -10007,7 +9892,6 @@ snapshots:
|
||||
'@napi-rs/canvas-linux-x64-musl': 0.1.99
|
||||
'@napi-rs/canvas-win32-arm64-msvc': 0.1.99
|
||||
'@napi-rs/canvas-win32-x64-msvc': 0.1.99
|
||||
optional: true
|
||||
|
||||
'@napi-rs/wasm-runtime@1.1.4(@emnapi/core@1.10.0)(@emnapi/runtime@1.10.0)':
|
||||
dependencies:
|
||||
|
||||
@@ -48,8 +48,12 @@
|
||||
"risk": ["parser", "untrusted-html"]
|
||||
},
|
||||
"@napi-rs/canvas": {
|
||||
"owner": "capability:document-and-image-rendering",
|
||||
"class": "default-runtime-initially",
|
||||
"owner": "plugin:document-extract",
|
||||
"class": "optional-peer-runtime",
|
||||
"activation": [
|
||||
"input_file.application_pdf.image_fallback",
|
||||
"plugins.entries.document-extract.enabled"
|
||||
],
|
||||
"risk": ["native", "parser", "untrusted-files"]
|
||||
},
|
||||
"@vincentkoc/qrcode-tui": {
|
||||
@@ -149,8 +153,9 @@
|
||||
"risk": ["terminal-rendering"]
|
||||
},
|
||||
"pdfjs-dist": {
|
||||
"owner": "capability:document-extract",
|
||||
"class": "default-runtime-initially",
|
||||
"owner": "plugin:document-extract",
|
||||
"class": "plugin-runtime",
|
||||
"activation": ["input_file.application_pdf", "plugins.entries.document-extract.enabled"],
|
||||
"risk": ["parser", "untrusted-files"]
|
||||
},
|
||||
"proxy-agent": {
|
||||
|
||||
@@ -266,6 +266,7 @@
|
||||
"provider-stream",
|
||||
"provider-tools",
|
||||
"provider-usage",
|
||||
"document-extractor",
|
||||
"web-content-extractor",
|
||||
"provider-web-fetch-contract",
|
||||
"provider-web-fetch",
|
||||
|
||||
@@ -432,6 +432,7 @@ export function createPdfTool(options?: {
|
||||
maxPixels: PDF_MAX_PIXELS,
|
||||
minTextChars: PDF_MIN_TEXT_CHARS,
|
||||
pageNumbers,
|
||||
config: options?.config,
|
||||
});
|
||||
extractedAll.push(extracted);
|
||||
}
|
||||
|
||||
@@ -343,10 +343,11 @@ function isBinaryMediaMime(mime?: string): boolean {
|
||||
async function extractFileBlocks(params: {
|
||||
attachments: ReturnType<typeof normalizeMediaAttachments>;
|
||||
cache: ReturnType<typeof createMediaAttachmentCache>;
|
||||
cfg: OpenClawConfig;
|
||||
limits: ReturnType<typeof resolveFileLimits>;
|
||||
skipAttachmentIndexes?: Set<number>;
|
||||
}): Promise<string[]> {
|
||||
const { attachments, cache, limits, skipAttachmentIndexes } = params;
|
||||
const { attachments, cache, cfg, limits, skipAttachmentIndexes } = params;
|
||||
if (!attachments || attachments.length === 0) {
|
||||
return [];
|
||||
}
|
||||
@@ -447,6 +448,7 @@ async function extractFileBlocks(params: {
|
||||
...baseLimits,
|
||||
allowedMimes,
|
||||
},
|
||||
config: cfg,
|
||||
});
|
||||
} catch (err) {
|
||||
if (shouldLogVerbose()) {
|
||||
@@ -565,6 +567,7 @@ export async function applyMediaUnderstanding(params: {
|
||||
const fileBlocks = await extractFileBlocks({
|
||||
attachments,
|
||||
cache,
|
||||
cfg,
|
||||
limits: resolveFileLimits(cfg),
|
||||
skipAttachmentIndexes: audioAttachmentIndexes.size > 0 ? audioAttachmentIndexes : undefined,
|
||||
});
|
||||
|
||||
81
src/media/document-extractors.runtime.test.ts
Normal file
81
src/media/document-extractors.runtime.test.ts
Normal file
@@ -0,0 +1,81 @@
|
||||
import { beforeEach, describe, expect, it, vi } from "vitest";
|
||||
|
||||
// Created via vi.hoisted so the vi.mock factory below can reference it.
const { resolvePluginDocumentExtractorsMock } = vi.hoisted(() => {
  return { resolvePluginDocumentExtractorsMock: vi.fn() };
});

// Replace plugin resolution with the controllable mock.
vi.mock("../plugins/document-extractors.runtime.js", () => {
  return { resolvePluginDocumentExtractors: resolvePluginDocumentExtractorsMock };
});
|
||||
|
||||
import { extractDocumentContent } from "./document-extractors.runtime.js";
|
||||
|
||||
describe("extractDocumentContent", () => {
  // Public request fields used by every case; built fresh on each call.
  const pdfRequest = () => ({
    buffer: Buffer.from("pdf"),
    mimeType: "application/pdf",
    maxPages: 1,
    maxPixels: 100,
    minTextChars: 10,
  });

  // Makes plugin resolution return a single PDF extractor backed by `extract`.
  const registerPdfExtractor = (extract: unknown) => {
    resolvePluginDocumentExtractorsMock.mockReturnValue([
      {
        id: "pdf",
        pluginId: "document-extract",
        label: "PDF",
        mimeTypes: ["application/pdf"],
        extract,
      },
    ]);
  };

  beforeEach(() => {
    resolvePluginDocumentExtractorsMock.mockReset();
  });

  it("passes only public extraction request fields to plugins", async () => {
    const extract = vi.fn().mockResolvedValue({ text: "pdf text", images: [] });
    registerPdfExtractor(extract);

    const pending = extractDocumentContent({
      ...pdfRequest(),
      config: {
        env: {
          vars: {
            SECRET_VALUE: "do-not-pass",
          },
        },
      },
    });
    await expect(pending).resolves.toMatchObject({ text: "pdf text", extractor: "pdf" });

    // The config (and the secret inside it) must not reach the plugin.
    expect(extract).toHaveBeenCalledWith(pdfRequest());
  });

  it("surfaces matching extractor failures instead of reporting disablement", async () => {
    const cause = new Error("password required");
    registerPdfExtractor(vi.fn().mockRejectedValue(cause));

    await expect(extractDocumentContent(pdfRequest())).rejects.toMatchObject({
      message: "Document extraction failed for application/pdf",
      cause,
    });
  });
});
|
||||
76
src/media/document-extractors.runtime.ts
Normal file
76
src/media/document-extractors.runtime.ts
Normal file
@@ -0,0 +1,76 @@
|
||||
import type { OpenClawConfig } from "../config/types.openclaw.js";
|
||||
import type {
|
||||
DocumentExtractionRequest,
|
||||
DocumentExtractionResult,
|
||||
} from "../plugins/document-extractor-types.js";
|
||||
import { resolvePluginDocumentExtractors } from "../plugins/document-extractors.runtime.js";
|
||||
import { normalizeLowercaseStringOrEmpty } from "../shared/string-coerce.js";
|
||||
|
||||
// Cached extractor resolution for calls made without a config.
let extractorPromise: Promise<ReturnType<typeof resolvePluginDocumentExtractors>> | undefined;
|
||||
// Cached extractor resolution per config object. Keyed by object identity in a
// WeakMap, so cache entries do not keep config objects alive.
const extractorPromisesByConfig = new WeakMap<
|
||||
OpenClawConfig,
|
||||
Promise<ReturnType<typeof resolvePluginDocumentExtractors>>
|
||||
>();
|
||||
|
||||
/**
 * Resolves the plugin document extractors, memoizing the result.
 *
 * With a config, the resolution is cached per config object and the cache entry
 * is dropped again if the resolution rejects. Without a config, a single
 * module-level promise is reused.
 *
 * @param config - Optional configuration passed to the plugin resolver.
 * @returns The resolved extractor list.
 */
async function loadDocumentExtractors(config?: OpenClawConfig) {
  if (config) {
    let pending = extractorPromisesByConfig.get(config);
    if (!pending) {
      // Deferring through a resolved promise turns a synchronous throw from the
      // resolver into a rejection that the eviction handler below can see.
      pending = Promise.resolve().then(() => resolvePluginDocumentExtractors({ config }));
      extractorPromisesByConfig.set(config, pending);
      void pending.catch(() => {
        extractorPromisesByConfig.delete(config);
      });
    }
    return await pending;
  }
  extractorPromise ??= Promise.resolve(resolvePluginDocumentExtractors());
  return await extractorPromise;
}
|
||||
|
||||
/**
 * Runs the first plugin extractor that supports the request's MIME type and
 * yields a result.
 *
 * MIME types are compared after normalization with
 * `normalizeLowercaseStringOrEmpty`. `config` is used only to resolve the
 * extractors; plugins receive a request rebuilt from the public fields.
 *
 * @param params - Extraction request plus an optional config for plugin resolution.
 * @returns The extractor's result tagged with the extractor id, or `null` when
 *   no matching extractor produced a result and none failed.
 * @throws Error with the failure (or an AggregateError of several) as `cause`
 *   when at least one matching extractor threw and none produced a result.
 */
export async function extractDocumentContent(
  params: DocumentExtractionRequest & {
    config?: OpenClawConfig;
  },
): Promise<(DocumentExtractionResult & { extractor: string }) | null> {
  const wantedMime = normalizeLowercaseStringOrEmpty(params.mimeType);
  const candidates = await loadDocumentExtractors(params.config);

  // Rebuild the request from known fields so nothing else on `params`
  // (notably `config`) is handed to plugin code.
  const { buffer, maxPages, maxPixels, minTextChars, pageNumbers, onImageExtractionError } = params;
  const pluginRequest: DocumentExtractionRequest = {
    buffer,
    mimeType: params.mimeType,
    maxPages,
    maxPixels,
    minTextChars,
    ...(pageNumbers ? { pageNumbers } : {}),
    ...(onImageExtractionError ? { onImageExtractionError } : {}),
  };

  const failures: unknown[] = [];
  for (const candidate of candidates) {
    const handlesMime = candidate.mimeTypes.some(
      (entry) => normalizeLowercaseStringOrEmpty(entry) === wantedMime,
    );
    if (!handlesMime) {
      continue;
    }
    try {
      const outcome = await candidate.extract(pluginRequest);
      if (outcome) {
        return { ...outcome, extractor: candidate.id };
      }
    } catch (error) {
      // Keep going: a later extractor for the same MIME type may still succeed.
      failures.push(error);
    }
  }

  if (failures.length === 0) {
    return null;
  }
  const cause = failures.length === 1 ? failures[0] : new AggregateError(failures);
  throw new Error(`Document extraction failed for ${wantedMime || "unknown MIME type"}`, {
    cause,
  });
}
|
||||
@@ -1,3 +1,4 @@
|
||||
import type { OpenClawConfig } from "../config/types.openclaw.js";
|
||||
import { fetchWithSsrFGuard } from "../infra/net/fetch-guard.js";
|
||||
import type { SsrFPolicy } from "../infra/net/ssrf.js";
|
||||
import { logWarn } from "../logger.js";
|
||||
@@ -322,6 +323,7 @@ export async function extractImageContentFromSource(
|
||||
export async function extractFileContentFromSource(params: {
|
||||
source: InputFileSource;
|
||||
limits: InputFileLimits;
|
||||
config?: OpenClawConfig;
|
||||
}): Promise<InputFileExtractResult> {
|
||||
const { source, limits } = params;
|
||||
const filename = source.filename || "file";
|
||||
@@ -378,6 +380,7 @@ export async function extractFileContentFromSource(params: {
|
||||
maxPages: limits.pdf.maxPages,
|
||||
maxPixels: limits.pdf.maxPixels,
|
||||
minTextChars: limits.pdf.minTextChars,
|
||||
...(params.config ? { config: params.config } : {}),
|
||||
onImageExtractionError: (err) => {
|
||||
logWarn(`media: PDF image extraction skipped, ${String(err)}`);
|
||||
},
|
||||
|
||||
54
src/media/pdf-extract.test.ts
Normal file
54
src/media/pdf-extract.test.ts
Normal file
@@ -0,0 +1,54 @@
|
||||
import { beforeEach, describe, expect, it, vi } from "vitest";
|
||||
|
||||
const { extractDocumentContentMock } = vi.hoisted(() => ({
|
||||
extractDocumentContentMock: vi.fn(),
|
||||
}));
|
||||
|
||||
vi.mock("./document-extractors.runtime.js", () => ({
|
||||
extractDocumentContent: extractDocumentContentMock,
|
||||
}));
|
||||
|
||||
import { extractPdfContent } from "./pdf-extract.js";
|
||||
|
||||
describe("extractPdfContent", () => {
|
||||
beforeEach(() => {
|
||||
extractDocumentContentMock.mockReset();
|
||||
});
|
||||
|
||||
it("dispatches PDF extraction through document extractors", async () => {
|
||||
extractDocumentContentMock.mockResolvedValue({
|
||||
text: "extracted pdf",
|
||||
images: [],
|
||||
extractor: "pdf",
|
||||
});
|
||||
|
||||
await expect(
|
||||
extractPdfContent({
|
||||
buffer: Buffer.from("%PDF-1.4"),
|
||||
maxPages: 2,
|
||||
maxPixels: 100,
|
||||
minTextChars: 10,
|
||||
}),
|
||||
).resolves.toEqual({ text: "extracted pdf", images: [] });
|
||||
expect(extractDocumentContentMock).toHaveBeenCalledWith({
|
||||
buffer: Buffer.from("%PDF-1.4"),
|
||||
mimeType: "application/pdf",
|
||||
maxPages: 2,
|
||||
maxPixels: 100,
|
||||
minTextChars: 10,
|
||||
});
|
||||
});
|
||||
|
||||
it("throws a clear disabled error when no document extractor is available", async () => {
|
||||
extractDocumentContentMock.mockResolvedValue(null);
|
||||
|
||||
await expect(
|
||||
extractPdfContent({
|
||||
buffer: Buffer.from("%PDF-1.4"),
|
||||
maxPages: 2,
|
||||
maxPixels: 100,
|
||||
minTextChars: 10,
|
||||
}),
|
||||
).rejects.toThrow("PDF extraction disabled or unavailable");
|
||||
});
|
||||
});
|
||||
@@ -1,81 +1,12 @@
|
||||
type CanvasLike = {
|
||||
toBuffer(type: "image/png"): Buffer;
|
||||
};
|
||||
import type { OpenClawConfig } from "../config/types.openclaw.js";
|
||||
import type {
|
||||
DocumentExtractedImage,
|
||||
DocumentExtractionResult,
|
||||
} from "../plugins/document-extractor-types.js";
|
||||
import { extractDocumentContent } from "./document-extractors.runtime.js";
|
||||
|
||||
type CanvasModule = {
|
||||
createCanvas(width: number, height: number): CanvasLike;
|
||||
};
|
||||
|
||||
type PdfTextItem = {
|
||||
str: string;
|
||||
};
|
||||
|
||||
type PdfTextContent = {
|
||||
items: Array<PdfTextItem | object>;
|
||||
};
|
||||
|
||||
type PdfViewport = {
|
||||
width: number;
|
||||
height: number;
|
||||
};
|
||||
|
||||
type PdfPage = {
|
||||
getTextContent(): Promise<PdfTextContent>;
|
||||
getViewport(params: { scale: number }): PdfViewport;
|
||||
render(params: { canvas: unknown; viewport: PdfViewport }): { promise: Promise<void> };
|
||||
};
|
||||
|
||||
type PdfDocument = {
|
||||
numPages: number;
|
||||
getPage(pageNumber: number): Promise<PdfPage>;
|
||||
};
|
||||
|
||||
type PdfJsModule = {
|
||||
getDocument(params: { data: Uint8Array; disableWorker?: boolean }): {
|
||||
promise: Promise<PdfDocument>;
|
||||
};
|
||||
};
|
||||
|
||||
const CANVAS_MODULE = "@napi-rs/canvas";
|
||||
const PDFJS_MODULE = "pdfjs-dist/legacy/build/pdf.mjs";
|
||||
|
||||
let canvasModulePromise: Promise<CanvasModule> | null = null;
|
||||
let pdfJsModulePromise: Promise<PdfJsModule> | null = null;
|
||||
|
||||
async function loadCanvasModule(): Promise<CanvasModule> {
|
||||
if (!canvasModulePromise) {
|
||||
canvasModulePromise = (import(CANVAS_MODULE) as Promise<CanvasModule>).catch((err) => {
|
||||
canvasModulePromise = null;
|
||||
throw new Error(
|
||||
`Optional dependency @napi-rs/canvas is required for PDF image extraction: ${String(err)}`,
|
||||
);
|
||||
});
|
||||
}
|
||||
return canvasModulePromise;
|
||||
}
|
||||
|
||||
async function loadPdfJsModule(): Promise<PdfJsModule> {
|
||||
if (!pdfJsModulePromise) {
|
||||
pdfJsModulePromise = (import(PDFJS_MODULE) as Promise<PdfJsModule>).catch((err) => {
|
||||
pdfJsModulePromise = null;
|
||||
throw new Error(
|
||||
`Optional dependency pdfjs-dist is required for PDF extraction: ${String(err)}`,
|
||||
);
|
||||
});
|
||||
}
|
||||
return pdfJsModulePromise;
|
||||
}
|
||||
|
||||
export type PdfExtractedImage = {
|
||||
type: "image";
|
||||
data: string;
|
||||
mimeType: string;
|
||||
};
|
||||
|
||||
export type PdfExtractedContent = {
|
||||
text: string;
|
||||
images: PdfExtractedImage[];
|
||||
};
|
||||
export type PdfExtractedImage = DocumentExtractedImage;
|
||||
export type PdfExtractedContent = DocumentExtractionResult;
|
||||
|
||||
export async function extractPdfContent(params: {
|
||||
buffer: Buffer;
|
||||
@@ -83,60 +14,28 @@ export async function extractPdfContent(params: {
|
||||
maxPixels: number;
|
||||
minTextChars: number;
|
||||
pageNumbers?: number[];
|
||||
config?: OpenClawConfig;
|
||||
onImageExtractionError?: (error: unknown) => void;
|
||||
}): Promise<PdfExtractedContent> {
|
||||
const { buffer, maxPages, maxPixels, minTextChars, pageNumbers, onImageExtractionError } = params;
|
||||
const pdfJsModule = await loadPdfJsModule();
|
||||
const pdf = await pdfJsModule.getDocument({ data: new Uint8Array(buffer), disableWorker: true })
|
||||
.promise;
|
||||
|
||||
const effectivePages: number[] = pageNumbers
|
||||
? pageNumbers.filter((p) => p >= 1 && p <= pdf.numPages).slice(0, maxPages)
|
||||
: Array.from({ length: Math.min(pdf.numPages, maxPages) }, (_, i) => i + 1);
|
||||
|
||||
const textParts: string[] = [];
|
||||
for (const pageNum of effectivePages) {
|
||||
const page = await pdf.getPage(pageNum);
|
||||
const textContent = await page.getTextContent();
|
||||
const pageText = textContent.items
|
||||
.map((item) => ("str" in item ? item.str : ""))
|
||||
.filter(Boolean)
|
||||
.join(" ");
|
||||
if (pageText) {
|
||||
textParts.push(pageText);
|
||||
}
|
||||
const extracted = await extractDocumentContent({
|
||||
buffer: params.buffer,
|
||||
mimeType: "application/pdf",
|
||||
maxPages: params.maxPages,
|
||||
maxPixels: params.maxPixels,
|
||||
minTextChars: params.minTextChars,
|
||||
...(params.pageNumbers ? { pageNumbers: params.pageNumbers } : {}),
|
||||
...(params.config ? { config: params.config } : {}),
|
||||
...(params.onImageExtractionError
|
||||
? { onImageExtractionError: params.onImageExtractionError }
|
||||
: {}),
|
||||
});
|
||||
if (!extracted) {
|
||||
throw new Error(
|
||||
"PDF extraction disabled or unavailable: enable the document-extract plugin to process application/pdf files.",
|
||||
);
|
||||
}
|
||||
|
||||
const text = textParts.join("\n\n");
|
||||
if (text.trim().length >= minTextChars) {
|
||||
return { text, images: [] };
|
||||
}
|
||||
|
||||
let canvasModule: CanvasModule;
|
||||
try {
|
||||
canvasModule = await loadCanvasModule();
|
||||
} catch (err) {
|
||||
onImageExtractionError?.(err);
|
||||
return { text, images: [] };
|
||||
}
|
||||
|
||||
const images: PdfExtractedImage[] = [];
|
||||
const pixelBudget = Math.max(1, maxPixels);
|
||||
|
||||
for (const pageNum of effectivePages) {
|
||||
const page = await pdf.getPage(pageNum);
|
||||
const viewport = page.getViewport({ scale: 1 });
|
||||
const pagePixels = viewport.width * viewport.height;
|
||||
const scale = Math.min(1, Math.sqrt(pixelBudget / Math.max(1, pagePixels)));
|
||||
const scaled = page.getViewport({ scale: Math.max(0.1, scale) });
|
||||
const canvas = canvasModule.createCanvas(Math.ceil(scaled.width), Math.ceil(scaled.height));
|
||||
await page.render({
|
||||
canvas: canvas as unknown as HTMLCanvasElement,
|
||||
viewport: scaled,
|
||||
}).promise;
|
||||
const png = canvas.toBuffer("image/png");
|
||||
images.push({ type: "image", data: png.toString("base64"), mimeType: "image/png" });
|
||||
}
|
||||
|
||||
return { text, images };
|
||||
return {
|
||||
text: extracted.text,
|
||||
images: extracted.images,
|
||||
};
|
||||
}
|
||||
|
||||
6
src/plugin-sdk/document-extractor.ts
Normal file
6
src/plugin-sdk/document-extractor.ts
Normal file
@@ -0,0 +1,6 @@
|
||||
export type {
|
||||
DocumentExtractedImage,
|
||||
DocumentExtractionRequest,
|
||||
DocumentExtractionResult,
|
||||
DocumentExtractorPlugin,
|
||||
} from "../plugins/document-extractor-types.js";
|
||||
@@ -20,6 +20,7 @@ export type BundledPluginContractSnapshot = {
|
||||
realtimeTranscriptionProviderIds: string[];
|
||||
realtimeVoiceProviderIds: string[];
|
||||
mediaUnderstandingProviderIds: string[];
|
||||
documentExtractorIds: string[];
|
||||
imageGenerationProviderIds: string[];
|
||||
videoGenerationProviderIds: string[];
|
||||
musicGenerationProviderIds: string[];
|
||||
@@ -116,6 +117,9 @@ export function buildBundledPluginContractSnapshot(
|
||||
manifest.contracts?.mediaUnderstandingProviders,
|
||||
(value) => value.trim(),
|
||||
),
|
||||
documentExtractorIds: uniqueStrings(manifest.contracts?.documentExtractors, (value) =>
|
||||
value.trim(),
|
||||
),
|
||||
imageGenerationProviderIds: uniqueStrings(
|
||||
manifest.contracts?.imageGenerationProviders,
|
||||
(value) => value.trim(),
|
||||
@@ -151,6 +155,7 @@ export function hasBundledPluginContractSnapshotCapabilities(
|
||||
entry.realtimeTranscriptionProviderIds.length > 0 ||
|
||||
entry.realtimeVoiceProviderIds.length > 0 ||
|
||||
entry.mediaUnderstandingProviderIds.length > 0 ||
|
||||
entry.documentExtractorIds.length > 0 ||
|
||||
entry.imageGenerationProviderIds.length > 0 ||
|
||||
entry.videoGenerationProviderIds.length > 0 ||
|
||||
entry.musicGenerationProviderIds.length > 0 ||
|
||||
|
||||
@@ -64,6 +64,7 @@ type ManifestContractKey =
|
||||
| "realtimeTranscriptionProviders"
|
||||
| "realtimeVoiceProviders"
|
||||
| "mediaUnderstandingProviders"
|
||||
| "documentExtractors"
|
||||
| "imageGenerationProviders"
|
||||
| "videoGenerationProviders"
|
||||
| "musicGenerationProviders"
|
||||
@@ -84,6 +85,7 @@ function resolveBundledManifestContracts(): PluginRegistrationContractEntry[] {
|
||||
realtimeTranscriptionProviderIds: [...entry.realtimeTranscriptionProviderIds],
|
||||
realtimeVoiceProviderIds: [...entry.realtimeVoiceProviderIds],
|
||||
mediaUnderstandingProviderIds: [...entry.mediaUnderstandingProviderIds],
|
||||
documentExtractorIds: [...entry.documentExtractorIds],
|
||||
imageGenerationProviderIds: [...entry.imageGenerationProviderIds],
|
||||
videoGenerationProviderIds: [...entry.videoGenerationProviderIds],
|
||||
musicGenerationProviderIds: [...entry.musicGenerationProviderIds],
|
||||
@@ -103,6 +105,7 @@ function resolveBundledManifestContracts(): PluginRegistrationContractEntry[] {
|
||||
(plugin.contracts?.realtimeTranscriptionProviders?.length ?? 0) > 0 ||
|
||||
(plugin.contracts?.realtimeVoiceProviders?.length ?? 0) > 0 ||
|
||||
(plugin.contracts?.mediaUnderstandingProviders?.length ?? 0) > 0 ||
|
||||
(plugin.contracts?.documentExtractors?.length ?? 0) > 0 ||
|
||||
(plugin.contracts?.imageGenerationProviders?.length ?? 0) > 0 ||
|
||||
(plugin.contracts?.videoGenerationProviders?.length ?? 0) > 0 ||
|
||||
(plugin.contracts?.musicGenerationProviders?.length ?? 0) > 0 ||
|
||||
@@ -123,6 +126,7 @@ function resolveBundledManifestContracts(): PluginRegistrationContractEntry[] {
|
||||
mediaUnderstandingProviderIds: uniqueStrings(
|
||||
plugin.contracts?.mediaUnderstandingProviders ?? [],
|
||||
),
|
||||
documentExtractorIds: uniqueStrings(plugin.contracts?.documentExtractors ?? []),
|
||||
imageGenerationProviderIds: uniqueStrings(plugin.contracts?.imageGenerationProviders ?? []),
|
||||
videoGenerationProviderIds: uniqueStrings(plugin.contracts?.videoGenerationProviders ?? []),
|
||||
musicGenerationProviderIds: uniqueStrings(plugin.contracts?.musicGenerationProviders ?? []),
|
||||
@@ -175,6 +179,8 @@ function resolveBundledManifestPluginIdsForContract(contract: ManifestContractKe
|
||||
return entry.realtimeVoiceProviderIds.length > 0;
|
||||
case "mediaUnderstandingProviders":
|
||||
return entry.mediaUnderstandingProviderIds.length > 0;
|
||||
case "documentExtractors":
|
||||
return entry.documentExtractorIds.length > 0;
|
||||
case "imageGenerationProviders":
|
||||
return entry.imageGenerationProviderIds.length > 0;
|
||||
case "videoGenerationProviders":
|
||||
|
||||
55
src/plugins/document-extractor-public-artifacts.test.ts
Normal file
55
src/plugins/document-extractor-public-artifacts.test.ts
Normal file
@@ -0,0 +1,55 @@
|
||||
import { beforeEach, describe, expect, it, vi } from "vitest";
|
||||
|
||||
const { publicArtifactModule } = vi.hoisted(() => ({
|
||||
publicArtifactModule: {} as Record<string, unknown>,
|
||||
}));
|
||||
|
||||
vi.mock("./public-surface-loader.js", () => ({
|
||||
loadBundledPluginPublicArtifactModuleSync: vi.fn(() => publicArtifactModule),
|
||||
resolveBundledPluginPublicArtifactPath: vi.fn(
|
||||
() => "/repo/extensions/demo/document-extractor.ts",
|
||||
),
|
||||
}));
|
||||
|
||||
import { loadBundledDocumentExtractorEntriesFromDir } from "./document-extractor-public-artifacts.js";
|
||||
|
||||
describe("loadBundledDocumentExtractorEntriesFromDir", () => {
|
||||
beforeEach(() => {
|
||||
for (const key of Object.keys(publicArtifactModule)) {
|
||||
delete publicArtifactModule[key];
|
||||
}
|
||||
});
|
||||
|
||||
it("isolates a throwing factory when another extractor factory succeeds", () => {
|
||||
publicArtifactModule.createBrokenDocumentExtractor = () => {
|
||||
throw new Error("native probe failed");
|
||||
};
|
||||
publicArtifactModule.createPdfDocumentExtractor = () => ({
|
||||
id: "pdf",
|
||||
label: "PDF",
|
||||
mimeTypes: ["application/pdf"],
|
||||
extract: vi.fn(),
|
||||
});
|
||||
|
||||
expect(
|
||||
loadBundledDocumentExtractorEntriesFromDir({
|
||||
dirName: "demo",
|
||||
pluginId: "demo",
|
||||
}),
|
||||
).toMatchObject([{ id: "pdf", pluginId: "demo" }]);
|
||||
});
|
||||
|
||||
it("surfaces initialization failure when every matching factory throws", () => {
|
||||
const cause = new Error("native probe failed");
|
||||
publicArtifactModule.createPdfDocumentExtractor = () => {
|
||||
throw cause;
|
||||
};
|
||||
|
||||
expect(() =>
|
||||
loadBundledDocumentExtractorEntriesFromDir({
|
||||
dirName: "demo",
|
||||
pluginId: "demo",
|
||||
}),
|
||||
).toThrow("Unable to initialize document extractors for plugin demo");
|
||||
});
|
||||
});
|
||||
108
src/plugins/document-extractor-public-artifacts.ts
Normal file
108
src/plugins/document-extractor-public-artifacts.ts
Normal file
@@ -0,0 +1,108 @@
|
||||
import type {
|
||||
DocumentExtractorPlugin,
|
||||
PluginDocumentExtractorEntry,
|
||||
} from "./document-extractor-types.js";
|
||||
import {
|
||||
loadBundledPluginPublicArtifactModuleSync,
|
||||
resolveBundledPluginPublicArtifactPath,
|
||||
} from "./public-surface-loader.js";
|
||||
|
||||
// Artifact basenames probed, in this order, when looking for a bundled plugin's
// document-extractor public artifact.
const DOCUMENT_EXTRACTOR_ARTIFACT_CANDIDATES = [
|
||||
"document-extractor.js",
|
||||
"document-extractor-api.js",
|
||||
] as const;
|
||||
|
||||
/** Type guard: true for any non-null, non-array value whose `typeof` is "object". */
function isRecord(value: unknown): value is Record<string, unknown> {
  if (value === null || Array.isArray(value)) {
    return false;
  }
  return typeof value === "object";
}
|
||||
|
||||
/**
 * Structural type guard for a value returned by an extractor factory.
 *
 * Requires string `id` and `label`, an array `mimeTypes` whose entries are all
 * strings that are non-empty after trimming, an `autoDetectOrder` that is
 * either absent or a number, and a function `extract`.
 */
function isDocumentExtractorPlugin(value: unknown): value is DocumentExtractorPlugin {
  if (!isRecord(value)) {
    return false;
  }
  if (typeof value.id !== "string" || typeof value.label !== "string") {
    return false;
  }
  if (!Array.isArray(value.mimeTypes)) {
    return false;
  }
  const hasInvalidMimeType = value.mimeTypes.some(
    (mimeType) => typeof mimeType !== "string" || !mimeType.trim(),
  );
  if (hasInvalidMimeType) {
    return false;
  }
  if (value.autoDetectOrder !== undefined && typeof value.autoDetectOrder !== "number") {
    return false;
  }
  return typeof value.extract === "function";
}
|
||||
|
||||
/**
 * Loads the first document-extractor public artifact that can be resolved for
 * a bundled plugin directory.
 *
 * Candidates are tried in DOCUMENT_EXTRACTOR_ARTIFACT_CANDIDATES order. A
 * candidate whose load fails with an Error whose message starts with
 * "Unable to resolve bundled plugin public surface " is skipped; any other
 * failure is rethrown unchanged.
 *
 * @returns The loaded module, or `null` when no candidate could be resolved.
 */
function tryLoadBundledPublicArtifactModule(params: {
  dirName: string;
}): Record<string, unknown> | null {
  const { dirName } = params;
  for (const artifactBasename of DOCUMENT_EXTRACTOR_ARTIFACT_CANDIDATES) {
    try {
      return loadBundledPluginPublicArtifactModuleSync<Record<string, unknown>>({
        dirName,
        artifactBasename,
      });
    } catch (error) {
      const isUnresolvedSurface =
        error instanceof Error &&
        error.message.startsWith("Unable to resolve bundled plugin public surface ");
      if (!isUnresolvedSurface) {
        throw error;
      }
      // Otherwise fall through and try the next candidate basename.
    }
  }
  return null;
}
|
||||
|
||||
/**
 * Invokes every extractor factory exported by a module and collects the results.
 *
 * An export counts as a factory when it is a function declaring zero
 * parameters whose name starts with "create" and ends with "DocumentExtractor".
 * Factories run in export-name order (localeCompare). A factory that throws is
 * recorded in `errors`; a return value that fails isDocumentExtractorPlugin is
 * dropped without being recorded.
 *
 * @returns The valid extractors and the errors thrown by failing factories.
 */
function collectExtractorFactories(mod: Record<string, unknown>): {
  extractors: DocumentExtractorPlugin[];
  errors: unknown[];
} {
  const built: DocumentExtractorPlugin[] = [];
  const failures: unknown[] = [];
  const exportsByName = Object.entries(mod).toSorted(([left], [right]) =>
    left.localeCompare(right),
  );

  for (const [exportName, exportedValue] of exportsByName) {
    if (
      !(
        typeof exportedValue === "function" &&
        exportedValue.length === 0 &&
        exportName.startsWith("create") &&
        exportName.endsWith("DocumentExtractor")
      )
    ) {
      continue;
    }
    let produced: unknown;
    try {
      produced = exportedValue();
    } catch (error) {
      failures.push(error);
      continue;
    }
    if (isDocumentExtractorPlugin(produced)) {
      built.push(produced);
    }
  }

  return { extractors: built, errors: failures };
}
|
||||
|
||||
/**
 * Loads the document extractors exposed by one bundled plugin directory.
 *
 * @param params.dirName Bundled plugin directory to load the public artifact from.
 * @param params.pluginId Plugin id stamped onto every returned entry.
 * @returns The extractor entries (each a shallow copy with `pluginId` added), or
 *   `null` when the plugin has no loadable artifact or the artifact yields no
 *   extractors and no factory errors.
 * @throws Error when no extractor could be created and at least one factory
 *   threw; `cause` is the single error, or an AggregateError for several.
 */
export function loadBundledDocumentExtractorEntriesFromDir(params: {
  dirName: string;
  pluginId: string;
}): PluginDocumentExtractorEntry[] | null {
  const artifactModule = tryLoadBundledPublicArtifactModule({ dirName: params.dirName });
  if (!artifactModule) {
    return null;
  }

  const { extractors, errors } = collectExtractorFactories(artifactModule);
  if (extractors.length > 0) {
    // One working factory is enough; errors from sibling factories are ignored.
    return extractors.map((extractor) =>
      Object.assign({}, extractor, { pluginId: params.pluginId }),
    );
  }
  if (errors.length === 0) {
    return null;
  }
  const cause = errors.length === 1 ? errors[0] : new AggregateError(errors);
  throw new Error(`Unable to initialize document extractors for plugin ${params.pluginId}`, {
    cause,
  });
}
|
||||
|
||||
/**
 * Reports whether any candidate document-extractor artifact path resolves for
 * the given plugin. The plugin id is used as the bundled directory name.
 */
export function hasBundledDocumentExtractorPublicArtifact(pluginId: string): boolean {
  for (const artifactBasename of DOCUMENT_EXTRACTOR_ARTIFACT_CANDIDATES) {
    const resolvedPath = resolveBundledPluginPublicArtifactPath({
      dirName: pluginId,
      artifactBasename,
    });
    if (resolvedPath) {
      return true;
    }
  }
  return false;
}
|
||||
32
src/plugins/document-extractor-types.ts
Normal file
32
src/plugins/document-extractor-types.ts
Normal file
@@ -0,0 +1,32 @@
|
||||
/**
 * An image produced while extracting a document.
 *
 * `mimeType` describes the payload carried in `data`.
 * NOTE(review): the in-core PDF renderer this type replaces stored base64-encoded
 * PNG bytes in `data`; presumably plugin extractors keep that encoding - confirm.
 */
export type DocumentExtractedImage = {
|
||||
type: "image";
|
||||
data: string;
|
||||
mimeType: string;
|
||||
};
|
||||
|
||||
/**
 * Input handed to a document extractor's `extract` function.
 *
 * `buffer` holds the raw document bytes and `mimeType` the MIME type supplied by
 * the caller (extractDocumentContent forwards it without normalising it).
 *
 * NOTE(review): the limits below are interpreted by each extractor. In the in-core
 * PDF implementation this type replaces, `maxPages` capped the pages processed,
 * `maxPixels` was the per-page pixel budget for rendered images, `minTextChars` was
 * the trimmed text length at which image rendering was skipped, `pageNumbers`
 * selected explicit 1-based pages, and `onImageExtractionError` was called when
 * image rendering could not be set up. Confirm plugin extractors keep these meanings.
 */
export type DocumentExtractionRequest = {
|
||||
buffer: Buffer;
|
||||
mimeType: string;
|
||||
maxPages: number;
|
||||
maxPixels: number;
|
||||
minTextChars: number;
|
||||
pageNumbers?: number[];
|
||||
onImageExtractionError?: (error: unknown) => void;
|
||||
};
|
||||
|
||||
/** Output of a document extractor: the extracted text plus any extracted images. */
export type DocumentExtractionResult = {
|
||||
text: string;
|
||||
images: DocumentExtractedImage[];
|
||||
};
|
||||
|
||||
/**
 * Contract implemented by a document extractor.
 *
 * - `id`: reported as `extractor` on results returned by extractDocumentContent.
 * - `label`: a string label; how it is displayed is not visible in this file.
 * - `mimeTypes`: MIME types the extractor handles; extractDocumentContent compares
 *   them after lowercase normalisation.
 * - `autoDetectOrder`: optional sort key used by compareExtractors; lower values
 *   sort first and extractors without one sort last.
 * - `extract`: performs the extraction. When it resolves to `null`,
 *   extractDocumentContent moves on to the next matching extractor.
 */
export type DocumentExtractorPlugin = {
|
||||
id: string;
|
||||
label: string;
|
||||
mimeTypes: readonly string[];
|
||||
autoDetectOrder?: number;
|
||||
extract: (request: DocumentExtractionRequest) => Promise<DocumentExtractionResult | null>;
|
||||
};
|
||||
|
||||
/** A document extractor together with the id of the plugin it was loaded from. */
export type PluginDocumentExtractorEntry = DocumentExtractorPlugin & {
|
||||
pluginId: string;
|
||||
};
|
||||
28
src/plugins/document-extractors.runtime.test.ts
Normal file
28
src/plugins/document-extractors.runtime.test.ts
Normal file
@@ -0,0 +1,28 @@
|
||||
import { describe, expect, it } from "vitest";
|
||||
import { resolvePluginDocumentExtractors } from "./document-extractors.runtime.js";
|
||||
|
||||
describe("resolvePluginDocumentExtractors", () => {
|
||||
it("respects global plugin disablement", () => {
|
||||
expect(
|
||||
resolvePluginDocumentExtractors({
|
||||
config: {
|
||||
plugins: {
|
||||
enabled: false,
|
||||
},
|
||||
},
|
||||
}),
|
||||
).toEqual([]);
|
||||
});
|
||||
|
||||
it("does not expand an operator plugin allowlist", () => {
|
||||
expect(
|
||||
resolvePluginDocumentExtractors({
|
||||
config: {
|
||||
plugins: {
|
||||
allow: ["openai"],
|
||||
},
|
||||
},
|
||||
}),
|
||||
).toEqual([]);
|
||||
});
|
||||
});
|
||||
134
src/plugins/document-extractors.runtime.ts
Normal file
134
src/plugins/document-extractors.runtime.ts
Normal file
@@ -0,0 +1,134 @@
|
||||
import type { OpenClawConfig } from "../config/types.openclaw.js";
|
||||
import { resolveBundledPluginCompatibleLoadValues } from "./activation-context.js";
|
||||
import {
|
||||
createPluginActivationSource,
|
||||
normalizePluginsConfig,
|
||||
resolveEffectivePluginActivationState,
|
||||
} from "./config-state.js";
|
||||
import { loadBundledDocumentExtractorEntriesFromDir } from "./document-extractor-public-artifacts.js";
|
||||
import type { PluginDocumentExtractorEntry } from "./document-extractor-types.js";
|
||||
import { loadPluginManifestRegistry } from "./manifest-registry.js";
|
||||
import type { PluginManifestRecord } from "./manifest-registry.js";
|
||||
|
||||
/**
 * Sort comparator for extractor entries.
 *
 * Orders by `autoDetectOrder` ascending, then by `id`, then by `pluginId`
 * (both via localeCompare). Entries without a usable order sort last.
 *
 * A `NaN` order is treated like a missing one. `NaN` passes a
 * `typeof === "number"` validation, but subtracting it would make this
 * comparator return `NaN`, which gives the sort an inconsistent ordering.
 *
 * @returns A negative, zero, or positive number per the Array sort contract.
 */
function compareExtractors(
  left: PluginDocumentExtractorEntry,
  right: PluginDocumentExtractorEntry,
): number {
  const resolveOrder = (order: number | undefined): number =>
    typeof order === "number" && !Number.isNaN(order) ? order : Number.MAX_SAFE_INTEGER;

  const leftOrder = resolveOrder(left.autoDetectOrder);
  const rightOrder = resolveOrder(right.autoDetectOrder);
  if (leftOrder !== rightOrder) {
    return leftOrder - rightOrder;
  }
  return left.id.localeCompare(right.id) || left.pluginId.localeCompare(right.pluginId);
}
|
||||
|
||||
/**
 * Lists the ids of bundled plugins whose manifest declares at least one
 * document extractor contract, sorted with localeCompare.
 *
 * When `onlyPluginIds` is non-empty, only those ids are considered.
 */
function resolveBundledDocumentExtractorCompatPluginIds(params: {
  config?: OpenClawConfig;
  workspaceDir?: string;
  env?: NodeJS.ProcessEnv;
  onlyPluginIds?: readonly string[];
}): string[] {
  const requestedIds = params.onlyPluginIds;
  const allowedIds = requestedIds && requestedIds.length > 0 ? new Set(requestedIds) : null;

  const registry = loadPluginManifestRegistry({
    config: params.config,
    workspaceDir: params.workspaceDir,
    env: params.env,
  });

  const declaresExtractor = (plugin: PluginManifestRecord) => {
    if (plugin.origin !== "bundled") {
      return false;
    }
    if (allowedIds && !allowedIds.has(plugin.id)) {
      return false;
    }
    const declaredCount = plugin.contracts?.documentExtractors?.length ?? 0;
    return declaredCount > 0;
  };

  const pluginIds = registry.plugins.filter(declaresExtractor).map((plugin) => plugin.id);
  return pluginIds.toSorted((left, right) => left.localeCompare(right));
}
|
||||
|
||||
/**
 * Returns the manifest records of bundled plugins that declare a document
 * extractor contract and are enabled under the effective plugin activation state.
 *
 * Returns an empty list immediately when `config.plugins.enabled` is `false`.
 * Otherwise the activation config is derived via
 * resolveBundledPluginCompatibleLoadValues and each candidate is checked with
 * resolveEffectivePluginActivationState.
 * NOTE(review): the meaning of the `compatMode` flags is defined by
 * resolveBundledPluginCompatibleLoadValues, which is not visible here.
 */
function resolveEnabledBundledDocumentExtractorPlugins(params: {
  config?: OpenClawConfig;
  workspaceDir?: string;
  env?: NodeJS.ProcessEnv;
  onlyPluginIds?: readonly string[];
}): PluginManifestRecord[] {
  // Global kill switch: nothing is enabled when plugins are turned off.
  if (params.config?.plugins?.enabled === false) {
    return [];
  }

  const activation = resolveBundledPluginCompatibleLoadValues({
    rawConfig: params.config,
    env: params.env,
    workspaceDir: params.workspaceDir,
    onlyPluginIds: params.onlyPluginIds,
    applyAutoEnable: true,
    compatMode: {
      allowlist: false,
      enablement: "allowlist",
      vitest: true,
    },
    resolveCompatPluginIds: resolveBundledDocumentExtractorCompatPluginIds,
  });
  const pluginsConfig = normalizePluginsConfig(activation.config?.plugins);
  const activationSource = createPluginActivationSource({
    config: activation.activationSourceConfig,
  });
  const requestedIds = params.onlyPluginIds;
  const allowedIds = requestedIds && requestedIds.length > 0 ? new Set(requestedIds) : null;

  const registry = loadPluginManifestRegistry({
    config: activation.config,
    workspaceDir: params.workspaceDir,
    env: params.env,
  });

  return registry.plugins.filter((plugin) => {
    if (plugin.origin !== "bundled") {
      return false;
    }
    if (allowedIds && !allowedIds.has(plugin.id)) {
      return false;
    }
    if ((plugin.contracts?.documentExtractors?.length ?? 0) === 0) {
      return false;
    }
    const state = resolveEffectivePluginActivationState({
      id: plugin.id,
      origin: plugin.origin,
      config: pluginsConfig,
      rootConfig: activation.config,
      enabledByDefault: plugin.enabledByDefault,
      activationSource,
    });
    return state.enabled;
  });
}
|
||||
|
||||
/**
 * Loads the document extractors of every enabled bundled extractor plugin.
 *
 * A plugin whose extractors fail to load is skipped as long as at least one
 * extractor was loaded overall.
 *
 * @returns All loaded extractor entries, sorted with compareExtractors.
 * @throws Error when no extractor was loaded and at least one plugin failed;
 *   `cause` is the single error, or an AggregateError for several.
 */
export function resolvePluginDocumentExtractors(params?: {
  config?: OpenClawConfig;
  workspaceDir?: string;
  env?: NodeJS.ProcessEnv;
  onlyPluginIds?: readonly string[];
}): PluginDocumentExtractorEntry[] {
  const collected: PluginDocumentExtractorEntry[] = [];
  const failures: unknown[] = [];

  // Loads one plugin's entries; a failure is recorded and reported as "nothing loaded".
  const loadEntries = (pluginId: string): PluginDocumentExtractorEntry[] | null => {
    try {
      return loadBundledDocumentExtractorEntriesFromDir({
        dirName: pluginId,
        pluginId,
      });
    } catch (error) {
      failures.push(error);
      return null;
    }
  };

  const enabledPlugins = resolveEnabledBundledDocumentExtractorPlugins({
    config: params?.config,
    workspaceDir: params?.workspaceDir,
    env: params?.env,
    onlyPluginIds: params?.onlyPluginIds,
  });
  for (const plugin of enabledPlugins) {
    const entries = loadEntries(plugin.id);
    if (entries) {
      collected.push(...entries);
    }
  }

  if (collected.length === 0 && failures.length > 0) {
    const cause = failures.length === 1 ? failures[0] : new AggregateError(failures);
    throw new Error("Unable to load document extractor plugins", { cause });
  }
  return collected.toSorted(compareExtractors);
}
|
||||
@@ -52,6 +52,7 @@ function hasRuntimeContractSurface(plugin: PluginManifestRecord): boolean {
|
||||
plugin.cliBackends.length > 0 ||
|
||||
plugin.contracts?.speechProviders?.length ||
|
||||
plugin.contracts?.mediaUnderstandingProviders?.length ||
|
||||
plugin.contracts?.documentExtractors?.length ||
|
||||
plugin.contracts?.imageGenerationProviders?.length ||
|
||||
plugin.contracts?.videoGenerationProviders?.length ||
|
||||
plugin.contracts?.musicGenerationProviders?.length ||
|
||||
|
||||
@@ -67,6 +67,7 @@ type PluginManifestContractListKey =
|
||||
| "speechProviders"
|
||||
| "externalAuthProviders"
|
||||
| "mediaUnderstandingProviders"
|
||||
| "documentExtractors"
|
||||
| "realtimeVoiceProviders"
|
||||
| "realtimeTranscriptionProviders"
|
||||
| "imageGenerationProviders"
|
||||
|
||||
@@ -251,6 +251,7 @@ export type PluginManifestContracts = {
|
||||
realtimeTranscriptionProviders?: string[];
|
||||
realtimeVoiceProviders?: string[];
|
||||
mediaUnderstandingProviders?: string[];
|
||||
documentExtractors?: string[];
|
||||
imageGenerationProviders?: string[];
|
||||
videoGenerationProviders?: string[];
|
||||
musicGenerationProviders?: string[];
|
||||
@@ -443,6 +444,7 @@ function normalizeManifestContracts(value: unknown): PluginManifestContracts | u
|
||||
);
|
||||
const realtimeVoiceProviders = normalizeTrimmedStringList(value.realtimeVoiceProviders);
|
||||
const mediaUnderstandingProviders = normalizeTrimmedStringList(value.mediaUnderstandingProviders);
|
||||
const documentExtractors = normalizeTrimmedStringList(value.documentExtractors);
|
||||
const imageGenerationProviders = normalizeTrimmedStringList(value.imageGenerationProviders);
|
||||
const videoGenerationProviders = normalizeTrimmedStringList(value.videoGenerationProviders);
|
||||
const musicGenerationProviders = normalizeTrimmedStringList(value.musicGenerationProviders);
|
||||
@@ -459,6 +461,7 @@ function normalizeManifestContracts(value: unknown): PluginManifestContracts | u
|
||||
...(realtimeTranscriptionProviders.length > 0 ? { realtimeTranscriptionProviders } : {}),
|
||||
...(realtimeVoiceProviders.length > 0 ? { realtimeVoiceProviders } : {}),
|
||||
...(mediaUnderstandingProviders.length > 0 ? { mediaUnderstandingProviders } : {}),
|
||||
...(documentExtractors.length > 0 ? { documentExtractors } : {}),
|
||||
...(imageGenerationProviders.length > 0 ? { imageGenerationProviders } : {}),
|
||||
...(videoGenerationProviders.length > 0 ? { videoGenerationProviders } : {}),
|
||||
...(musicGenerationProviders.length > 0 ? { musicGenerationProviders } : {}),
|
||||
|
||||
@@ -5,6 +5,7 @@ import { afterEach, describe, expect, it } from "vitest";
|
||||
import {
|
||||
PUBLIC_SURFACE_SOURCE_EXTENSIONS,
|
||||
normalizeBundledPluginArtifactSubpath,
|
||||
normalizeBundledPluginDirName,
|
||||
resolveBundledPluginPublicSurfacePath,
|
||||
resolveBundledPluginSourcePublicSurfacePath,
|
||||
} from "./public-surface-runtime.js";
|
||||
@@ -96,4 +97,12 @@ describe("bundled plugin public surface runtime", () => {
|
||||
/must stay plugin-local/,
|
||||
);
|
||||
});
|
||||
|
||||
it("rejects bundled plugin directory traversal", () => {
|
||||
expect(normalizeBundledPluginDirName("document-extract")).toBe("document-extract");
|
||||
expect(() => normalizeBundledPluginDirName("../outside")).toThrow(/single directory/);
|
||||
expect(() => normalizeBundledPluginDirName("nested/plugin")).toThrow(/single directory/);
|
||||
expect(() => normalizeBundledPluginDirName("nested\\plugin")).toThrow(/single directory/);
|
||||
expect(() => normalizeBundledPluginDirName("C:plugin")).toThrow(/single directory/);
|
||||
});
|
||||
});
|
||||
|
||||
@@ -38,19 +38,31 @@ export function normalizeBundledPluginArtifactSubpath(artifactBasename: string):
|
||||
return normalized;
|
||||
}
|
||||
|
||||
/**
 * Validates that a bundled plugin directory name is a single path segment.
 *
 * The name is trimmed first. Empty names, "." and "..", and names containing
 * "/", "\\" or ":" are rejected.
 *
 * @returns The trimmed directory name.
 * @throws Error when the name is not a single directory segment; the message
 *   quotes the name exactly as it was passed in.
 */
export function normalizeBundledPluginDirName(dirName: string): string {
  const trimmed = dirName.trim();
  const isSingleSegment =
    trimmed !== "" && trimmed !== "." && trimmed !== ".." && !/[\\/:]/u.test(trimmed);
  if (!isSingleSegment) {
    throw new Error(`Bundled plugin dirName must be a single directory: ${dirName}`);
  }
  return trimmed;
}
|
||||
|
||||
export function resolveBundledPluginSourcePublicSurfacePath(params: {
|
||||
sourceRoot: string;
|
||||
dirName: string;
|
||||
artifactBasename: string;
|
||||
}): string | null {
|
||||
const artifactBasename = normalizeBundledPluginArtifactSubpath(params.artifactBasename);
|
||||
const dirName = normalizeBundledPluginDirName(params.dirName);
|
||||
const sourceBaseName = artifactBasename.replace(/\.js$/u, "");
|
||||
for (const ext of PUBLIC_SURFACE_SOURCE_EXTENSIONS) {
|
||||
const sourceCandidate = path.resolve(
|
||||
params.sourceRoot,
|
||||
params.dirName,
|
||||
`${sourceBaseName}${ext}`,
|
||||
);
|
||||
const sourceCandidate = path.resolve(params.sourceRoot, dirName, `${sourceBaseName}${ext}`);
|
||||
if (fs.existsSync(sourceCandidate)) {
|
||||
return sourceCandidate;
|
||||
}
|
||||
@@ -88,11 +100,12 @@ export function resolveBundledPluginPublicSurfacePath(params: {
|
||||
bundledPluginsDir?: string;
|
||||
}): string | null {
|
||||
const artifactBasename = normalizeBundledPluginArtifactSubpath(params.artifactBasename);
|
||||
const dirName = normalizeBundledPluginDirName(params.dirName);
|
||||
|
||||
const explicitBundledPluginsDir =
|
||||
params.bundledPluginsDir ?? resolveBundledPluginsDir(params.env ?? process.env);
|
||||
if (explicitBundledPluginsDir) {
|
||||
const explicitPluginDir = path.resolve(explicitBundledPluginsDir, params.dirName);
|
||||
const explicitPluginDir = path.resolve(explicitBundledPluginsDir, dirName);
|
||||
const explicitBuiltCandidate = path.join(explicitPluginDir, artifactBasename);
|
||||
if (fs.existsSync(explicitBuiltCandidate)) {
|
||||
return explicitBuiltCandidate;
|
||||
@@ -100,21 +113,21 @@ export function resolveBundledPluginPublicSurfacePath(params: {
|
||||
return (
|
||||
resolveBundledPluginSourcePublicSurfacePath({
|
||||
sourceRoot: explicitBundledPluginsDir,
|
||||
dirName: params.dirName,
|
||||
dirName,
|
||||
artifactBasename,
|
||||
}) ??
|
||||
resolvePackageSourceFallbackForBundledDir({
|
||||
rootDir: params.rootDir,
|
||||
bundledPluginsDir: explicitBundledPluginsDir,
|
||||
dirName: params.dirName,
|
||||
dirName,
|
||||
artifactBasename,
|
||||
})
|
||||
);
|
||||
}
|
||||
|
||||
for (const candidate of [
|
||||
path.resolve(params.rootDir, "dist", "extensions", params.dirName, artifactBasename),
|
||||
path.resolve(params.rootDir, "dist-runtime", "extensions", params.dirName, artifactBasename),
|
||||
path.resolve(params.rootDir, "dist", "extensions", dirName, artifactBasename),
|
||||
path.resolve(params.rootDir, "dist-runtime", "extensions", dirName, artifactBasename),
|
||||
]) {
|
||||
if (fs.existsSync(candidate)) {
|
||||
return candidate;
|
||||
@@ -123,7 +136,7 @@ export function resolveBundledPluginPublicSurfacePath(params: {
|
||||
|
||||
return resolveBundledPluginSourcePublicSurfacePath({
|
||||
sourceRoot: path.resolve(params.rootDir, "extensions"),
|
||||
dirName: params.dirName,
|
||||
dirName,
|
||||
artifactBasename,
|
||||
});
|
||||
}
|
||||
|
||||
7
src/types/napi-rs-canvas.d.ts
vendored
7
src/types/napi-rs-canvas.d.ts
vendored
@@ -1,7 +0,0 @@
|
||||
/**
 * Minimal ambient typings for `@napi-rs/canvas`.
 *
 * Only the members declared here are typed: a canvas factory and the
 * canvas-to-buffer method on the object it returns.
 */
declare module "@napi-rs/canvas" {
  /** Canvas object as returned by {@link createCanvas}. */
  export type Canvas = {
    /** Returns the canvas contents as a `Buffer`; `type` is an optional string selector. */
    toBuffer(type?: string): Buffer;
  };

  /** Creates a canvas from the given width and height. */
  export function createCanvas(width: number, height: number): Canvas;
}
|
||||
33
src/types/pdfjs-dist-legacy.d.ts
vendored
33
src/types/pdfjs-dist-legacy.d.ts
vendored
@@ -1,33 +0,0 @@
|
||||
/**
 * Minimal ambient typings for the `pdfjs-dist/legacy/build/pdf.mjs` entry point.
 *
 * Only the members declared here are typed: document loading, page lookup,
 * text content, viewports and page rendering.
 */
declare module "pdfjs-dist/legacy/build/pdf.mjs" {
  /** Text content entry that carries a string. */
  export type TextItem = {
    str: string;
  };

  /** Text content entry without a string payload; it may carry a `type` tag. */
  export type TextMarkedContent = {
    type?: string;
  };

  /** Result of `PDFPageProxy.getTextContent()`. */
  export type TextContent = {
    items: (TextItem | TextMarkedContent)[];
  };

  /** Page dimensions as returned by `PDFPageProxy.getViewport()`. */
  export type Viewport = {
    width: number;
    height: number;
  };

  /** Handle to a single page of a loaded document. */
  export type PDFPageProxy = {
    getTextContent(): Promise<TextContent>;
    getViewport(params: { scale: number }): Viewport;
    /** Starts rendering into `canvas`; completion is signalled through the returned `promise`. */
    render(params: { canvas: unknown; viewport: Viewport }): { promise: Promise<void> };
  };

  /** Handle to a loaded document. */
  export type PDFDocumentProxy = {
    numPages: number;
    getPage(pageNumber: number): Promise<PDFPageProxy>;
  };

  /** Starts loading a document from raw bytes; the document is delivered through `promise`. */
  export function getDocument(params: { data: Uint8Array; disableWorker?: boolean }): {
    promise: Promise<PDFDocumentProxy>;
  };
}
|
||||
Reference in New Issue
Block a user