From e3cba98f3904c74702464082f07fd50fecb38085 Mon Sep 17 00:00:00 2001 From: Vincent Koc Date: Fri, 24 Apr 2026 17:15:05 -0700 Subject: [PATCH] refactor(pdf): move document extraction to plugin * refactor(pdf): move document extraction to plugin * fix(deps): sync document extract lockfile * fix(pdf): harden document extraction plugin --- CHANGELOG.md | 1 + docs/gateway/openresponses-http-api.md | 5 +- docs/tools/pdf.md | 4 +- .../document-extractor.test.ts | 62 +++++ .../document-extract/document-extractor.ts | 216 ++++++++++++++++++ extensions/document-extract/index.ts | 11 + .../document-extract/openclaw.plugin.json | 14 ++ extensions/document-extract/package.json | 26 +++ package.json | 7 +- pnpm-lock.yaml | 142 ++---------- scripts/lib/dependency-ownership.json | 13 +- scripts/lib/plugin-sdk-entrypoints.json | 1 + src/agents/tools/pdf-tool.ts | 1 + src/media-understanding/apply.ts | 5 +- src/media/document-extractors.runtime.test.ts | 81 +++++++ src/media/document-extractors.runtime.ts | 76 ++++++ src/media/input-files.ts | 3 + src/media/pdf-extract.test.ts | 54 +++++ src/media/pdf-extract.ts | 159 +++---------- src/plugin-sdk/document-extractor.ts | 6 + .../inventory/bundled-capability-metadata.ts | 5 + src/plugins/contracts/registry.ts | 6 + ...ocument-extractor-public-artifacts.test.ts | 55 +++++ .../document-extractor-public-artifacts.ts | 108 +++++++++ src/plugins/document-extractor-types.ts | 32 +++ .../document-extractors.runtime.test.ts | 28 +++ src/plugins/document-extractors.runtime.ts | 134 +++++++++++ src/plugins/gateway-startup-plugin-ids.ts | 1 + src/plugins/manifest-registry.ts | 1 + src/plugins/manifest.ts | 3 + src/plugins/public-surface-runtime.test.ts | 9 + src/plugins/public-surface-runtime.ts | 35 ++- src/types/napi-rs-canvas.d.ts | 7 - src/types/pdfjs-dist-legacy.d.ts | 33 --- 34 files changed, 1023 insertions(+), 321 deletions(-) create mode 100644 extensions/document-extract/document-extractor.test.ts create mode 100644 extensions/document-extract/document-extractor.ts create mode 100644 extensions/document-extract/index.ts create mode 100644 extensions/document-extract/openclaw.plugin.json create mode 100644 extensions/document-extract/package.json create mode 100644 src/media/document-extractors.runtime.test.ts create mode 100644 src/media/document-extractors.runtime.ts create mode 100644 src/media/pdf-extract.test.ts create mode 100644 src/plugin-sdk/document-extractor.ts create mode 100644 src/plugins/document-extractor-public-artifacts.test.ts create mode 100644 src/plugins/document-extractor-public-artifacts.ts create mode 100644 src/plugins/document-extractor-types.ts create mode 100644 src/plugins/document-extractors.runtime.test.ts create mode 100644 src/plugins/document-extractors.runtime.ts delete mode 100644 src/types/napi-rs-canvas.d.ts delete mode 100644 src/types/pdfjs-dist-legacy.d.ts diff --git a/CHANGELOG.md b/CHANGELOG.md index bd5b1460bdc..1c70982b432 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -6,6 +6,7 @@ Docs: https://docs.openclaw.ai ### Changes +- Plugins/PDF: move local PDF extraction into a bundled `document-extract` plugin so core no longer owns `pdfjs-dist` or PDF image-rendering dependencies. Thanks @vincentkoc. - Matrix: require full cross-signing identity trust for self-device verification and add `openclaw matrix verify self` so operators can establish that trust from the CLI. (#70401) Thanks @gumadeiras. ### Fixes diff --git a/docs/gateway/openresponses-http-api.md b/docs/gateway/openresponses-http-api.md index 8e8d4de5ac9..9b6d2ebcb73 100644 --- a/docs/gateway/openresponses-http-api.md +++ b/docs/gateway/openresponses-http-api.md @@ -172,8 +172,9 @@ Current behavior: rasterized into images and passed to the model, and the injected file block uses the placeholder `[PDF content rendered to images]`. -PDF parsing uses the Node-friendly `pdfjs-dist` legacy build (no worker). The modern -PDF.js build expects browser workers/DOM globals, so it is not used in the Gateway. +PDF parsing is provided by the bundled `document-extract` plugin, which uses the +Node-friendly `pdfjs-dist` legacy build (no worker). The modern PDF.js build +expects browser workers/DOM globals, so it is not used in the Gateway. URL fetch defaults: diff --git a/docs/tools/pdf.md b/docs/tools/pdf.md index 51c847dd935..c849dc5c810 100644 --- a/docs/tools/pdf.md +++ b/docs/tools/pdf.md @@ -112,7 +112,9 @@ Fallback details: - If text extraction succeeds but image extraction would require vision on a text-only model, OpenClaw drops the rendered images and continues with the extracted text. -- Extraction fallback requires `pdfjs-dist` (and `@napi-rs/canvas` for image rendering). +- Extraction fallback uses the bundled `document-extract` plugin. The plugin owns + `pdfjs-dist`; `@napi-rs/canvas` is used only when image rendering fallback is + available. ## Config diff --git a/extensions/document-extract/document-extractor.test.ts b/extensions/document-extract/document-extractor.test.ts new file mode 100644 index 00000000000..d65168d3234 --- /dev/null +++ b/extensions/document-extract/document-extractor.test.ts @@ -0,0 +1,62 @@ +import { beforeEach, describe, expect, it, vi } from "vitest"; + +const { canvasSizes, pdfDocument } = vi.hoisted(() => ({ + canvasSizes: [] as Array<{ width: number; height: number }>, + pdfDocument: { + numPages: 2, + getPage: vi.fn(async () => ({ + getTextContent: vi.fn(async () => ({ items: [] })), + getViewport: vi.fn(({ scale }: { scale: number }) => ({ + width: 1000 * scale, + height: 1000 * scale, + })), + render: vi.fn(() => ({ promise: Promise.resolve() })), + })), + }, +})); + +vi.mock("pdfjs-dist/legacy/build/pdf.mjs", () => ({ + getDocument: vi.fn(() => ({ promise: Promise.resolve(pdfDocument) })), +})); + +vi.mock("@napi-rs/canvas", () => ({ + createCanvas: vi.fn((width: number, height: number) => { + canvasSizes.push({ width, height }); + return { + toBuffer: vi.fn(() => Buffer.from("png")), + }; + }), +})); + +import { createPdfDocumentExtractor } from "./document-extractor.js"; + +describe("PDF document extractor", () => { + beforeEach(() => { + canvasSizes.length = 0; + pdfDocument.getPage.mockClear(); + }); + + it("declares PDF support", () => { + const extractor = createPdfDocumentExtractor(); + expect(extractor).toMatchObject({ + id: "pdf", + label: "PDF", + mimeTypes: ["application/pdf"], + }); + }); + + it("treats maxPixels as a hard total image rendering budget", async () => { + const extractor = createPdfDocumentExtractor(); + + const result = await extractor.extract({ + buffer: Buffer.from("%PDF-1.4"), + mimeType: "application/pdf", + maxPages: 2, + maxPixels: 100, + minTextChars: 10, + }); + + expect(result?.images).toHaveLength(1); + expect(canvasSizes).toEqual([{ width: 10, height: 10 }]); + }); +}); diff --git a/extensions/document-extract/document-extractor.ts b/extensions/document-extract/document-extractor.ts new file mode 100644 index 00000000000..4bea3b566ef --- /dev/null +++ b/extensions/document-extract/document-extractor.ts @@ -0,0 +1,216 @@ +import type { + DocumentExtractedImage, + DocumentExtractionRequest, + DocumentExtractionResult, + DocumentExtractorPlugin, +} from "openclaw/plugin-sdk/document-extractor"; + +type CanvasLike = { + toBuffer(type: "image/png"): Buffer; +}; + +type CanvasModule = { + createCanvas(width: number, height: number): CanvasLike; +}; + +type PdfTextItem = { + str: string; +}; + +type PdfTextContent = { + items: Array; +}; + +type PdfViewport = { + width: number; + height: number; +}; + +type PdfPage = { + getTextContent(): Promise; + getViewport(params: { scale: number }): PdfViewport; + render(params: { canvas: unknown; viewport: PdfViewport }): { promise: Promise }; +}; + +type PdfDocument = { + numPages: number; + getPage(pageNumber: number): Promise; +}; + +type PdfJsModule = { + getDocument(params: { data: Uint8Array; disableWorker?: boolean }): { + promise: Promise; + }; +}; + +const CANVAS_MODULE = "@napi-rs/canvas"; +const PDFJS_MODULE = "pdfjs-dist/legacy/build/pdf.mjs"; +const MAX_EXTRACTED_TEXT_CHARS = 200_000; +const MAX_RENDER_DIMENSION = 10_000; + +let canvasModulePromise: Promise | null = null; +let pdfJsModulePromise: Promise | null = null; + +async function loadCanvasModule(): Promise { + if (!canvasModulePromise) { + canvasModulePromise = (import(CANVAS_MODULE) as Promise).catch((err) => { + canvasModulePromise = null; + throw new Error("Optional dependency @napi-rs/canvas is required for PDF image extraction", { + cause: err, + }); + }); + } + return canvasModulePromise; +} + +async function loadPdfJsModule(): Promise { + if (!pdfJsModulePromise) { + pdfJsModulePromise = (import(PDFJS_MODULE) as Promise).catch((err) => { + pdfJsModulePromise = null; + throw new Error("Optional dependency pdfjs-dist is required for PDF extraction", { + cause: err, + }); + }); + } + return pdfJsModulePromise; +} + +function appendTextWithinLimit(parts: string[], pageText: string, currentLength: number): number { + if (!pageText) { + return currentLength; + } + const remaining = MAX_EXTRACTED_TEXT_CHARS - currentLength; + if (remaining <= 0) { + return currentLength; + } + const nextText = pageText.length > remaining ? pageText.slice(0, remaining) : pageText; + parts.push(nextText); + return currentLength + nextText.length; +} + +function resolveRenderPlan( + viewport: PdfViewport, + remainingPixels: number, +): { scale: number; width: number; height: number; pixels: number } | null { + if ( + remainingPixels <= 0 || + !Number.isFinite(viewport.width) || + !Number.isFinite(viewport.height) || + viewport.width <= 0 || + viewport.height <= 0 + ) { + return null; + } + + const pagePixels = Math.max(1, viewport.width * viewport.height); + const maxScale = Math.min( + 1, + Math.sqrt(remainingPixels / pagePixels), + MAX_RENDER_DIMENSION / viewport.width, + MAX_RENDER_DIMENSION / viewport.height, + ); + if (!Number.isFinite(maxScale) || maxScale <= 0) { + return null; + } + + let best: { scale: number; width: number; height: number; pixels: number } | null = null; + let low = 0; + let high = maxScale; + for (let i = 0; i < 32; i += 1) { + const scale = (low + high) / 2; + const width = Math.max(1, Math.ceil(viewport.width * scale)); + const height = Math.max(1, Math.ceil(viewport.height * scale)); + const pixels = width * height; + if ( + width <= MAX_RENDER_DIMENSION && + height <= MAX_RENDER_DIMENSION && + pixels <= remainingPixels + ) { + best = { scale, width, height, pixels }; + low = scale; + } else { + high = scale; + } + } + return best; +} + +async function extractPdfContent( + request: DocumentExtractionRequest, +): Promise { + const pdfJsModule = await loadPdfJsModule(); + const pdf = await pdfJsModule.getDocument({ + data: new Uint8Array(request.buffer), + disableWorker: true, + }).promise; + + const effectivePages: number[] = request.pageNumbers + ? request.pageNumbers.filter((p) => p >= 1 && p <= pdf.numPages).slice(0, request.maxPages) + : Array.from({ length: Math.min(pdf.numPages, request.maxPages) }, (_, i) => i + 1); + + const textParts: string[] = []; + let extractedTextLength = 0; + for (const pageNum of effectivePages) { + const page = await pdf.getPage(pageNum); + const textContent = await page.getTextContent(); + const pageText = textContent.items + .map((item) => ("str" in item ? item.str : "")) + .filter(Boolean) + .join(" "); + if (pageText) { + extractedTextLength = appendTextWithinLimit(textParts, pageText, extractedTextLength); + if (extractedTextLength >= MAX_EXTRACTED_TEXT_CHARS) { + break; + } + } + } + + const text = textParts.join("\n\n"); + if (text.trim().length >= request.minTextChars) { + return { text, images: [] }; + } + + let canvasModule: CanvasModule; + try { + canvasModule = await loadCanvasModule(); + } catch (err) { + request.onImageExtractionError?.(err); + return { text, images: [] }; + } + + const images: DocumentExtractedImage[] = []; + let remainingPixels = Math.max(1, Math.floor(request.maxPixels)); + + for (const pageNum of effectivePages) { + if (remainingPixels <= 0) { + break; + } + const page = await pdf.getPage(pageNum); + const viewport = page.getViewport({ scale: 1 }); + const plan = resolveRenderPlan(viewport, remainingPixels); + if (!plan) { + break; + } + const scaled = page.getViewport({ scale: plan.scale }); + const canvas = canvasModule.createCanvas(plan.width, plan.height); + await page.render({ + canvas: canvas as unknown as HTMLCanvasElement, + viewport: scaled, + }).promise; + const png = canvas.toBuffer("image/png"); + images.push({ type: "image", data: png.toString("base64"), mimeType: "image/png" }); + remainingPixels -= plan.pixels; + } + + return { text, images }; +} + +export function createPdfDocumentExtractor(): DocumentExtractorPlugin { + return { + id: "pdf", + label: "PDF", + mimeTypes: ["application/pdf"], + autoDetectOrder: 10, + extract: extractPdfContent, + }; +} diff --git a/extensions/document-extract/index.ts b/extensions/document-extract/index.ts new file mode 100644 index 00000000000..d92af29dbc9 --- /dev/null +++ b/extensions/document-extract/index.ts @@ -0,0 +1,11 @@ +import { definePluginEntry } from "openclaw/plugin-sdk/plugin-entry"; + +export default definePluginEntry({ + id: "document-extract", + name: "Document Extraction", + description: "Extract text and fallback page images from local document attachments.", + register() { + // Runtime is exposed through document-extractor.ts so document hot paths can + // load only the narrow extractor artifact instead of the full plugin entrypoint. + }, +}); diff --git a/extensions/document-extract/openclaw.plugin.json b/extensions/document-extract/openclaw.plugin.json new file mode 100644 index 00000000000..fda53cb0537 --- /dev/null +++ b/extensions/document-extract/openclaw.plugin.json @@ -0,0 +1,14 @@ +{ + "id": "document-extract", + "enabledByDefault": true, + "name": "Document Extraction", + "description": "Extract text and fallback page images from local document attachments.", + "contracts": { + "documentExtractors": ["pdf"] + }, + "configSchema": { + "type": "object", + "additionalProperties": false, + "properties": {} + } +} diff --git a/extensions/document-extract/package.json b/extensions/document-extract/package.json new file mode 100644 index 00000000000..e5dd18fb91d --- /dev/null +++ b/extensions/document-extract/package.json @@ -0,0 +1,26 @@ +{ + "name": "@openclaw/document-extract-plugin", + "version": "2026.4.24", + "private": true, + "description": "OpenClaw local document extraction plugin", + "type": "module", + "dependencies": { + "pdfjs-dist": "^5.6.205" + }, + "devDependencies": { + "@openclaw/plugin-sdk": "workspace:*" + }, + "peerDependencies": { + "@napi-rs/canvas": "^0.1.89" + }, + "peerDependenciesMeta": { + "@napi-rs/canvas": { + "optional": true + } + }, + "openclaw": { + "extensions": [ + "./index.ts" + ] + } +} diff --git a/package.json b/package.json index e55dc4de4f7..b16620f229c 100644 --- a/package.json +++ b/package.json @@ -1125,6 +1125,10 @@ "types": "./dist/plugin-sdk/web-content-extractor.d.ts", "default": "./dist/plugin-sdk/web-content-extractor.js" }, + "./plugin-sdk/document-extractor": { + "types": "./dist/plugin-sdk/document-extractor.d.ts", + "default": "./dist/plugin-sdk/document-extractor.js" + }, "./plugin-sdk/provider-web-fetch-contract": { "types": "./dist/plugin-sdk/provider-web-fetch-contract.d.ts", "default": "./dist/plugin-sdk/provider-web-fetch-contract.js" @@ -1609,7 +1613,6 @@ "markdown-it": "14.1.1", "openai": "^6.34.0", "osc-progress": "^0.3.0", - "pdfjs-dist": "^5.6.205", "proxy-agent": "^8.0.1", "semver": "7.7.4", "sharp": "^0.34.5", @@ -1647,7 +1650,6 @@ "vitest": "^4.1.5" }, "peerDependencies": { - "@napi-rs/canvas": "^0.1.89", "node-llama-cpp": "3.18.1" }, "peerDependenciesMeta": { @@ -1693,7 +1695,6 @@ "onlyBuiltDependencies": [ "@lydell/node-pty", "@matrix-org/matrix-sdk-crypto-nodejs", - "@napi-rs/canvas", "@tloncorp/api", "@tloncorp/tlon-skill", "@whiskeysockets/baileys", diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml index 991d3e73d64..d82fceca9b5 100644 --- a/pnpm-lock.yaml +++ b/pnpm-lock.yaml @@ -63,9 +63,6 @@ importers: '@modelcontextprotocol/sdk': specifier: 1.29.0 version: 1.29.0(zod@4.3.6) - '@napi-rs/canvas': - specifier: ^0.1.89 - version: 0.1.92 '@vincentkoc/qrcode-tui': specifier: 0.2.1 version: 0.2.1 @@ -120,9 +117,6 @@ importers: osc-progress: specifier: ^0.3.0 version: 0.3.0 - pdfjs-dist: - specifier: ^5.6.205 - version: 5.6.205 proxy-agent: specifier: ^8.0.1 version: 8.0.1 @@ -519,6 +513,19 @@ importers: specifier: workspace:* version: link:../.. + extensions/document-extract: + dependencies: + '@napi-rs/canvas': + specifier: ^0.1.89 + version: 0.1.99 + pdfjs-dist: + specifier: ^5.6.205 + version: 5.6.205 + devDependencies: + '@openclaw/plugin-sdk': + specifier: workspace:* + version: link:../../packages/plugin-sdk + extensions/duckduckgo: devDependencies: '@openclaw/plugin-sdk': @@ -2774,61 +2781,30 @@ packages: resolution: {integrity: sha512-juG5VWh4qAivzTAeMzvY9xs9HY5rAcr2E4I7tiSSCokRFi7XIZCAu92ZkSTsIj1OPceCifL3cpfteP3pDT9/QQ==} engines: {node: '>=14.0.0'} - '@napi-rs/canvas-android-arm64@0.1.92': - resolution: {integrity: sha512-rDOtq53ujfOuevD5taxAuIFALuf1QsQWZe1yS/N4MtT+tNiDBEdjufvQRPWZ11FubL2uwgP8ApYU3YOaNu1ZsQ==} - engines: {node: '>= 10'} - cpu: [arm64] - os: [android] - '@napi-rs/canvas-android-arm64@0.1.99': resolution: {integrity: sha512-9OCRt8VVxA17m32NWZKyNC2qamdaS/SC5CEOIQwFngRq0DIeVm4PDal+6Ljnhqm2whZiC63DNuKZ4xSp2nbj9w==} engines: {node: '>= 10'} cpu: [arm64] os: [android] - '@napi-rs/canvas-darwin-arm64@0.1.92': - resolution: {integrity: sha512-4PT6GRGCr7yMRehp42x0LJb1V0IEy1cDZDDayv7eKbFUIGbPFkV7CRC9Bee5MPkjg1EB4ZPXXUyy3gjQm7mR8Q==} - engines: {node: '>= 10'} - cpu: [arm64] - os: [darwin] - '@napi-rs/canvas-darwin-arm64@0.1.99': resolution: {integrity: sha512-lupMDMy1+H38dhyCcLirOKKVUyzzlxi7j7rGPLI3vViMHOoPjcXO1b10ivy+ad+q6MiwHfoLjKTCoLke5ySOBg==} engines: {node: '>= 10'} cpu: [arm64] os: [darwin] - '@napi-rs/canvas-darwin-x64@0.1.92': - resolution: {integrity: sha512-5e/3ZapP7CqPtDcZPtmowCsjoyQwuNMMD7c0GKPtZQ8pgQhLkeq/3fmk0HqNSD1i227FyJN/9pDrhw/UMTkaWA==} - engines: {node: '>= 10'} - cpu: [x64] - os: [darwin] - '@napi-rs/canvas-darwin-x64@0.1.99': resolution: {integrity: sha512-fdz02t4w8n6Ii/rYhWig6STb/zcTmCC/6YZTGmjoDeidDwn9Wf0ukQVynhCPEs29vqUc66wHZKsuIgMs9tycCg==} engines: {node: '>= 10'} cpu: [x64] os: [darwin] - '@napi-rs/canvas-linux-arm-gnueabihf@0.1.92': - resolution: {integrity: sha512-j6KaLL9iir68lwpzzY+aBGag1PZp3+gJE2mQ3ar4VJVmyLRVOh+1qsdNK1gfWoAVy5w6U7OEYFrLzN2vOFUSng==} - engines: {node: '>= 10'} - cpu: [arm] - os: [linux] - '@napi-rs/canvas-linux-arm-gnueabihf@0.1.99': resolution: {integrity: sha512-w4FwVwlNo00ezeRhfY62IVIyt6G3u8wodkPtiqWc52BUHx+VDBUM2vkS3ogfANaLI7hnf3s6WK4LyZVUjBg1lA==} engines: {node: '>= 10'} cpu: [arm] os: [linux] - '@napi-rs/canvas-linux-arm64-gnu@0.1.92': - resolution: {integrity: sha512-s3NlnJMHOSotUYVoTCoC1OcomaChFdKmZg0VsHFeIkeHbwX0uPHP4eCX1irjSfMykyvsGHTQDfBAtGYuqxCxhQ==} - engines: {node: '>= 10'} - cpu: [arm64] - os: [linux] - libc: [glibc] - '@napi-rs/canvas-linux-arm64-gnu@0.1.99': resolution: {integrity: sha512-8JvHeexKQ8c7g0q7YJ29NVQwnf1ePghP9ys9ZN0R0qzyqJQ9Uw6N9qnDINArlm3IYHexB7LjzArIfhQiqSDGvQ==} engines: {node: '>= 10'} @@ -2836,13 +2812,6 @@ packages: os: [linux] libc: [glibc] - '@napi-rs/canvas-linux-arm64-musl@0.1.92': - resolution: {integrity: sha512-xV0GQnukYq5qY+ebkAwHjnP2OrSGBxS3vSi1zQNQj0bkXU6Ou+Tw7JjCM7pZcQ28MUyEBS1yKfo7rc7ip2IPFQ==} - engines: {node: '>= 10'} - cpu: [arm64] - os: [linux] - libc: [musl] - '@napi-rs/canvas-linux-arm64-musl@0.1.99': resolution: {integrity: sha512-Z+6nyLdJXWzLPVxi4H6g9TJop4DwN3KSgHWto5JCbZV5/uKoVqcSynPs0tGlUHOoWI8S8tEvJspz51GQkvr07w==} engines: {node: '>= 10'} @@ -2850,13 +2819,6 @@ packages: os: [linux] libc: [musl] - '@napi-rs/canvas-linux-riscv64-gnu@0.1.92': - resolution: {integrity: sha512-+GKvIFbQ74eB/TopEdH6XIXcvOGcuKvCITLGXy7WLJAyNp3Kdn1ncjxg91ihatBaPR+t63QOE99yHuIWn3UQ9w==} - engines: {node: '>= 10'} - cpu: [riscv64] - os: [linux] - libc: [glibc] - '@napi-rs/canvas-linux-riscv64-gnu@0.1.99': resolution: {integrity: sha512-jAnfOUv4IO1l8Levk5t85oVtEBOXLa07KnIUgWo1CDlPxiqpxS3uBfiE38Lvj/CQgHaNF6Nxk/SaemwLgsVJgw==} engines: {node: '>= 10'} @@ -2864,13 +2826,6 @@ packages: os: [linux] libc: [glibc] - '@napi-rs/canvas-linux-x64-gnu@0.1.92': - resolution: {integrity: sha512-tFd6MwbEhZ1g64iVY2asV+dOJC+GT3Yd6UH4G3Hp0/VHQ6qikB+nvXEULskFYZ0+wFqlGPtXjG1Jmv7sJy+3Ww==} - engines: {node: '>= 10'} - cpu: [x64] - os: [linux] - libc: [glibc] - '@napi-rs/canvas-linux-x64-gnu@0.1.99': resolution: {integrity: sha512-mIkXw3fGmbYyFjSmfWEvty4jN+rwEOmv0+Dy9bRvvTzLYWCgm3RMgUEQVfAKFw96nIRFnyNZiK83KNQaVVFjng==} engines: {node: '>= 10'} @@ -2878,13 +2833,6 @@ packages: os: [linux] libc: [glibc] - '@napi-rs/canvas-linux-x64-musl@0.1.92': - resolution: {integrity: sha512-uSuqeSveB/ZGd72VfNbHCSXO9sArpZTvznMVsb42nqPP7gBGEH6NJQ0+hmF+w24unEmxBhPYakP/Wiosm16KkA==} - engines: {node: '>= 10'} - cpu: [x64] - os: [linux] - libc: [musl] - '@napi-rs/canvas-linux-x64-musl@0.1.99': resolution: {integrity: sha512-f3Uz2P0RgrtBHISxZqr6yiYXJlTDyCVBumDacxo+4AmSg7z0HiqYZKGWC/gszq3fbPhyQUya1W2AEteKxT9Y6A==} engines: {node: '>= 10'} @@ -2892,34 +2840,18 @@ packages: os: [linux] libc: [musl] - '@napi-rs/canvas-win32-arm64-msvc@0.1.92': - resolution: {integrity: sha512-20SK5AU/OUNz9ZuoAPj5ekWai45EIBDh/XsdrVZ8le/pJVlhjFU3olbumSQUXRFn7lBRS+qwM8kA//uLaDx6iQ==} - engines: {node: '>= 10'} - cpu: [arm64] - os: [win32] - '@napi-rs/canvas-win32-arm64-msvc@0.1.99': resolution: {integrity: sha512-XE6KUkfqRsCNejcoRMiMr3RaUeObxNf6y7dut3hrq2rn7PzfRTZgrjF1F/B2C7FcdgqY/vSHWpQeMuNz1vTNHg==} engines: {node: '>= 10'} cpu: [arm64] os: [win32] - '@napi-rs/canvas-win32-x64-msvc@0.1.92': - resolution: {integrity: sha512-KEhyZLzq1MXCNlXybz4k25MJmHFp+uK1SIb8yJB0xfrQjz5aogAMhyseSzewo+XxAq3OAOdyKvfHGNzT3w1RPg==} - engines: {node: '>= 10'} - cpu: [x64] - os: [win32] - '@napi-rs/canvas-win32-x64-msvc@0.1.99': resolution: {integrity: sha512-plMYGVbc/vmmPF9MtmHbwNk1rL1Aj53vQZt+Gnv1oZn6gmd9jEHHJ0n9Nd2nxa5sKH7TS5IjkCDM6289O0d6PQ==} engines: {node: '>= 10'} cpu: [x64] os: [win32] - '@napi-rs/canvas@0.1.92': - resolution: {integrity: sha512-q7ZaUCJkEU5BeOdE7fBx1XWRd2T5Ady65nxq4brMf5L4cE1VV/ACq5w9Z5b/IVJs8CwSSIwc30nlthH0gFo4Ig==} - engines: {node: '>= 10'} - '@napi-rs/canvas@0.1.99': resolution: {integrity: sha512-zN4eQlK3eBf7aJBcTHZilpBH3tDekBzPMIWC8r0s94Ecl73XfOyFi4w7yKFMRVUT0lvNQjtOL8YSrwqQj6mZFg==} engines: {node: '>= 10'} @@ -9914,86 +9846,39 @@ snapshots: '@mozilla/readability@0.6.0': {} - '@napi-rs/canvas-android-arm64@0.1.92': - optional: true - '@napi-rs/canvas-android-arm64@0.1.99': optional: true - '@napi-rs/canvas-darwin-arm64@0.1.92': - optional: true - '@napi-rs/canvas-darwin-arm64@0.1.99': optional: true - '@napi-rs/canvas-darwin-x64@0.1.92': - optional: true - '@napi-rs/canvas-darwin-x64@0.1.99': optional: true - '@napi-rs/canvas-linux-arm-gnueabihf@0.1.92': - optional: true - '@napi-rs/canvas-linux-arm-gnueabihf@0.1.99': optional: true - '@napi-rs/canvas-linux-arm64-gnu@0.1.92': - optional: true - '@napi-rs/canvas-linux-arm64-gnu@0.1.99': optional: true - '@napi-rs/canvas-linux-arm64-musl@0.1.92': - optional: true - '@napi-rs/canvas-linux-arm64-musl@0.1.99': optional: true - '@napi-rs/canvas-linux-riscv64-gnu@0.1.92': - optional: true - '@napi-rs/canvas-linux-riscv64-gnu@0.1.99': optional: true - '@napi-rs/canvas-linux-x64-gnu@0.1.92': - optional: true - '@napi-rs/canvas-linux-x64-gnu@0.1.99': optional: true - '@napi-rs/canvas-linux-x64-musl@0.1.92': - optional: true - '@napi-rs/canvas-linux-x64-musl@0.1.99': optional: true - '@napi-rs/canvas-win32-arm64-msvc@0.1.92': - optional: true - '@napi-rs/canvas-win32-arm64-msvc@0.1.99': optional: true - '@napi-rs/canvas-win32-x64-msvc@0.1.92': - optional: true - '@napi-rs/canvas-win32-x64-msvc@0.1.99': optional: true - '@napi-rs/canvas@0.1.92': - optionalDependencies: - '@napi-rs/canvas-android-arm64': 0.1.92 - '@napi-rs/canvas-darwin-arm64': 0.1.92 - '@napi-rs/canvas-darwin-x64': 0.1.92 - '@napi-rs/canvas-linux-arm-gnueabihf': 0.1.92 - '@napi-rs/canvas-linux-arm64-gnu': 0.1.92 - '@napi-rs/canvas-linux-arm64-musl': 0.1.92 - '@napi-rs/canvas-linux-riscv64-gnu': 0.1.92 - '@napi-rs/canvas-linux-x64-gnu': 0.1.92 - '@napi-rs/canvas-linux-x64-musl': 0.1.92 - '@napi-rs/canvas-win32-arm64-msvc': 0.1.92 - '@napi-rs/canvas-win32-x64-msvc': 0.1.92 - '@napi-rs/canvas@0.1.99': optionalDependencies: '@napi-rs/canvas-android-arm64': 0.1.99 @@ -10007,7 +9892,6 @@ snapshots: '@napi-rs/canvas-linux-x64-musl': 0.1.99 '@napi-rs/canvas-win32-arm64-msvc': 0.1.99 '@napi-rs/canvas-win32-x64-msvc': 0.1.99 - optional: true '@napi-rs/wasm-runtime@1.1.4(@emnapi/core@1.10.0)(@emnapi/runtime@1.10.0)': dependencies: diff --git a/scripts/lib/dependency-ownership.json b/scripts/lib/dependency-ownership.json index c37484f1cd7..71ef2924ccc 100644 --- a/scripts/lib/dependency-ownership.json +++ b/scripts/lib/dependency-ownership.json @@ -48,8 +48,12 @@ "risk": ["parser", "untrusted-html"] }, "@napi-rs/canvas": { - "owner": "capability:document-and-image-rendering", - "class": "default-runtime-initially", + "owner": "plugin:document-extract", + "class": "optional-peer-runtime", + "activation": [ + "input_file.application_pdf.image_fallback", + "plugins.entries.document-extract.enabled" + ], "risk": ["native", "parser", "untrusted-files"] }, "@vincentkoc/qrcode-tui": { @@ -149,8 +153,9 @@ "risk": ["terminal-rendering"] }, "pdfjs-dist": { - "owner": "capability:document-extract", - "class": "default-runtime-initially", + "owner": "plugin:document-extract", + "class": "plugin-runtime", + "activation": ["input_file.application_pdf", "plugins.entries.document-extract.enabled"], "risk": ["parser", "untrusted-files"] }, "proxy-agent": { diff --git a/scripts/lib/plugin-sdk-entrypoints.json b/scripts/lib/plugin-sdk-entrypoints.json index 3154f09365a..6cfd101598d 100644 --- a/scripts/lib/plugin-sdk-entrypoints.json +++ b/scripts/lib/plugin-sdk-entrypoints.json @@ -266,6 +266,7 @@ "provider-stream", "provider-tools", "provider-usage", + "document-extractor", "web-content-extractor", "provider-web-fetch-contract", "provider-web-fetch", diff --git a/src/agents/tools/pdf-tool.ts b/src/agents/tools/pdf-tool.ts index e608ddfaed6..f0edc9f8096 100644 --- a/src/agents/tools/pdf-tool.ts +++ b/src/agents/tools/pdf-tool.ts @@ -432,6 +432,7 @@ export function createPdfTool(options?: { maxPixels: PDF_MAX_PIXELS, minTextChars: PDF_MIN_TEXT_CHARS, pageNumbers, + config: options?.config, }); extractedAll.push(extracted); } diff --git a/src/media-understanding/apply.ts b/src/media-understanding/apply.ts index c95d765155e..34e69ee8e70 100644 --- a/src/media-understanding/apply.ts +++ b/src/media-understanding/apply.ts @@ -343,10 +343,11 @@ function isBinaryMediaMime(mime?: string): boolean { async function extractFileBlocks(params: { attachments: ReturnType; cache: ReturnType; + cfg: OpenClawConfig; limits: ReturnType; skipAttachmentIndexes?: Set; }): Promise { - const { attachments, cache, limits, skipAttachmentIndexes } = params; + const { attachments, cache, cfg, limits, skipAttachmentIndexes } = params; if (!attachments || attachments.length === 0) { return []; } @@ -447,6 +448,7 @@ async function extractFileBlocks(params: { ...baseLimits, allowedMimes, }, + config: cfg, }); } catch (err) { if (shouldLogVerbose()) { @@ -565,6 +567,7 @@ export async function applyMediaUnderstanding(params: { const fileBlocks = await extractFileBlocks({ attachments, cache, + cfg, limits: resolveFileLimits(cfg), skipAttachmentIndexes: audioAttachmentIndexes.size > 0 ? audioAttachmentIndexes : undefined, }); diff --git a/src/media/document-extractors.runtime.test.ts b/src/media/document-extractors.runtime.test.ts new file mode 100644 index 00000000000..dda3afd3630 --- /dev/null +++ b/src/media/document-extractors.runtime.test.ts @@ -0,0 +1,81 @@ +import { beforeEach, describe, expect, it, vi } from "vitest"; + +const { resolvePluginDocumentExtractorsMock } = vi.hoisted(() => ({ + resolvePluginDocumentExtractorsMock: vi.fn(), +})); + +vi.mock("../plugins/document-extractors.runtime.js", () => ({ + resolvePluginDocumentExtractors: resolvePluginDocumentExtractorsMock, +})); + +import { extractDocumentContent } from "./document-extractors.runtime.js"; + +describe("extractDocumentContent", () => { + beforeEach(() => { + resolvePluginDocumentExtractorsMock.mockReset(); + }); + + it("passes only public extraction request fields to plugins", async () => { + const extract = vi.fn().mockResolvedValue({ text: "pdf text", images: [] }); + resolvePluginDocumentExtractorsMock.mockReturnValue([ + { + id: "pdf", + pluginId: "document-extract", + label: "PDF", + mimeTypes: ["application/pdf"], + extract, + }, + ]); + + await expect( + extractDocumentContent({ + buffer: Buffer.from("pdf"), + mimeType: "application/pdf", + maxPages: 1, + maxPixels: 100, + minTextChars: 10, + config: { + env: { + vars: { + SECRET_VALUE: "do-not-pass", + }, + }, + }, + }), + ).resolves.toMatchObject({ text: "pdf text", extractor: "pdf" }); + + expect(extract).toHaveBeenCalledWith({ + buffer: Buffer.from("pdf"), + mimeType: "application/pdf", + maxPages: 1, + maxPixels: 100, + minTextChars: 10, + }); + }); + + it("surfaces matching extractor failures instead of reporting disablement", async () => { + const cause = new Error("password required"); + resolvePluginDocumentExtractorsMock.mockReturnValue([ + { + id: "pdf", + pluginId: "document-extract", + label: "PDF", + mimeTypes: ["application/pdf"], + extract: vi.fn().mockRejectedValue(cause), + }, + ]); + + await expect( + extractDocumentContent({ + buffer: Buffer.from("pdf"), + mimeType: "application/pdf", + maxPages: 1, + maxPixels: 100, + minTextChars: 10, + }), + ).rejects.toMatchObject({ + message: "Document extraction failed for application/pdf", + cause, + }); + }); +}); diff --git a/src/media/document-extractors.runtime.ts b/src/media/document-extractors.runtime.ts new file mode 100644 index 00000000000..e7c0a426386 --- /dev/null +++ b/src/media/document-extractors.runtime.ts @@ -0,0 +1,76 @@ +import type { OpenClawConfig } from "../config/types.openclaw.js"; +import type { + DocumentExtractionRequest, + DocumentExtractionResult, +} from "../plugins/document-extractor-types.js"; +import { resolvePluginDocumentExtractors } from "../plugins/document-extractors.runtime.js"; +import { normalizeLowercaseStringOrEmpty } from "../shared/string-coerce.js"; + +let extractorPromise: Promise> | undefined; +const extractorPromisesByConfig = new WeakMap< + OpenClawConfig, + Promise> +>(); + +async function loadDocumentExtractors(config?: OpenClawConfig) { + if (config) { + const cached = extractorPromisesByConfig.get(config); + if (cached) { + return await cached; + } + const promise = Promise.resolve().then(() => resolvePluginDocumentExtractors({ config })); + extractorPromisesByConfig.set(config, promise); + void promise.catch(() => { + extractorPromisesByConfig.delete(config); + }); + return await promise; + } + extractorPromise ??= Promise.resolve(resolvePluginDocumentExtractors()); + return await extractorPromise; +} + +export async function extractDocumentContent( + params: DocumentExtractionRequest & { + config?: OpenClawConfig; + }, +): Promise<(DocumentExtractionResult & { extractor: string }) | null> { + const mimeType = normalizeLowercaseStringOrEmpty(params.mimeType); + const extractors = await loadDocumentExtractors(params.config); + const request: DocumentExtractionRequest = { + buffer: params.buffer, + mimeType: params.mimeType, + maxPages: params.maxPages, + maxPixels: params.maxPixels, + minTextChars: params.minTextChars, + ...(params.pageNumbers ? { pageNumbers: params.pageNumbers } : {}), + ...(params.onImageExtractionError + ? { onImageExtractionError: params.onImageExtractionError } + : {}), + }; + const errors: unknown[] = []; + + for (const extractor of extractors) { + if ( + !extractor.mimeTypes.map((entry) => normalizeLowercaseStringOrEmpty(entry)).includes(mimeType) + ) { + continue; + } + try { + const result = await extractor.extract(request); + if (result) { + return { + ...result, + extractor: extractor.id, + }; + } + } catch (error) { + errors.push(error); + } + } + if (errors.length > 0) { + throw new Error(`Document extraction failed for ${mimeType || "unknown MIME type"}`, { + cause: errors.length === 1 ? errors[0] : new AggregateError(errors), + }); + } + return null; +} diff --git a/src/media/input-files.ts b/src/media/input-files.ts index d7583aba3ea..e33f3beb7c6 100644 --- a/src/media/input-files.ts +++ b/src/media/input-files.ts @@ -1,3 +1,4 @@ +import type { OpenClawConfig } from "../config/types.openclaw.js"; import { fetchWithSsrFGuard } from "../infra/net/fetch-guard.js"; import type { SsrFPolicy } from "../infra/net/ssrf.js"; import { logWarn } from "../logger.js"; @@ -322,6 +323,7 @@ export async function extractImageContentFromSource( export async function extractFileContentFromSource(params: { source: InputFileSource; limits: InputFileLimits; + config?: OpenClawConfig; }): Promise { const { source, limits } = params; const filename = source.filename || "file"; @@ -378,6 +380,7 @@ export async function extractFileContentFromSource(params: { maxPages: limits.pdf.maxPages, maxPixels: limits.pdf.maxPixels, minTextChars: limits.pdf.minTextChars, + ...(params.config ? { config: params.config } : {}), onImageExtractionError: (err) => { logWarn(`media: PDF image extraction skipped, ${String(err)}`); }, diff --git a/src/media/pdf-extract.test.ts b/src/media/pdf-extract.test.ts new file mode 100644 index 00000000000..4aa56e5c417 --- /dev/null +++ b/src/media/pdf-extract.test.ts @@ -0,0 +1,54 @@ +import { beforeEach, describe, expect, it, vi } from "vitest"; + +const { extractDocumentContentMock } = vi.hoisted(() => ({ + extractDocumentContentMock: vi.fn(), +})); + +vi.mock("./document-extractors.runtime.js", () => ({ + extractDocumentContent: extractDocumentContentMock, +})); + +import { extractPdfContent } from "./pdf-extract.js"; + +describe("extractPdfContent", () => { + beforeEach(() => { + extractDocumentContentMock.mockReset(); + }); + + it("dispatches PDF extraction through document extractors", async () => { + extractDocumentContentMock.mockResolvedValue({ + text: "extracted pdf", + images: [], + extractor: "pdf", + }); + + await expect( + extractPdfContent({ + buffer: Buffer.from("%PDF-1.4"), + maxPages: 2, + maxPixels: 100, + minTextChars: 10, + }), + ).resolves.toEqual({ text: "extracted pdf", images: [] }); + expect(extractDocumentContentMock).toHaveBeenCalledWith({ + buffer: Buffer.from("%PDF-1.4"), + mimeType: "application/pdf", + maxPages: 2, + maxPixels: 100, + minTextChars: 10, + }); + }); + + it("throws a clear disabled error when no document extractor is available", async () => { + extractDocumentContentMock.mockResolvedValue(null); + + await expect( + extractPdfContent({ + buffer: Buffer.from("%PDF-1.4"), + maxPages: 2, + maxPixels: 100, + minTextChars: 10, + }), + ).rejects.toThrow("PDF extraction disabled or unavailable"); + }); +}); diff --git a/src/media/pdf-extract.ts b/src/media/pdf-extract.ts index 2a00700e622..8e9129ac8a8 100644 --- a/src/media/pdf-extract.ts +++ b/src/media/pdf-extract.ts @@ -1,81 +1,12 @@ -type CanvasLike = { - toBuffer(type: "image/png"): Buffer; -}; +import type { OpenClawConfig } from "../config/types.openclaw.js"; +import type { + DocumentExtractedImage, + DocumentExtractionResult, +} from "../plugins/document-extractor-types.js"; +import { extractDocumentContent } from "./document-extractors.runtime.js"; -type CanvasModule = { - createCanvas(width: number, height: number): CanvasLike; -}; - -type PdfTextItem = { - str: string; -}; - -type PdfTextContent = { - items: Array; -}; - -type PdfViewport = { - width: number; - height: number; -}; - -type PdfPage = { - getTextContent(): Promise; - getViewport(params: { scale: number }): PdfViewport; - render(params: { canvas: unknown; viewport: PdfViewport }): { promise: Promise }; -}; - -type PdfDocument = { - numPages: number; - getPage(pageNumber: number): Promise; -}; - -type PdfJsModule = { - getDocument(params: { data: Uint8Array; disableWorker?: boolean }): { - promise: Promise; - }; -}; - -const CANVAS_MODULE = "@napi-rs/canvas"; -const PDFJS_MODULE = "pdfjs-dist/legacy/build/pdf.mjs"; - -let canvasModulePromise: Promise | null = null; -let pdfJsModulePromise: Promise | null = null; - -async function loadCanvasModule(): Promise { - if (!canvasModulePromise) { - canvasModulePromise = (import(CANVAS_MODULE) as Promise).catch((err) => { - canvasModulePromise = null; - throw new Error( - `Optional dependency @napi-rs/canvas is required for PDF image extraction: ${String(err)}`, - ); - }); - } - return canvasModulePromise; -} - -async function loadPdfJsModule(): Promise { - if (!pdfJsModulePromise) { - pdfJsModulePromise = (import(PDFJS_MODULE) as Promise).catch((err) => { - pdfJsModulePromise = null; - throw new Error( - `Optional dependency pdfjs-dist is required for PDF extraction: ${String(err)}`, - ); - }); - } - return pdfJsModulePromise; -} - -export type PdfExtractedImage = { - type: "image"; - data: string; - mimeType: string; -}; - -export type PdfExtractedContent = { - text: string; - images: PdfExtractedImage[]; -}; +export type PdfExtractedImage = DocumentExtractedImage; +export type PdfExtractedContent = DocumentExtractionResult; export async function extractPdfContent(params: { buffer: Buffer; @@ -83,60 +14,28 @@ export async function extractPdfContent(params: { maxPixels: number; minTextChars: number; pageNumbers?: number[]; + config?: OpenClawConfig; onImageExtractionError?: (error: unknown) => void; }): Promise { - const { buffer, maxPages, maxPixels, minTextChars, pageNumbers, onImageExtractionError } = params; - const pdfJsModule = await loadPdfJsModule(); - const pdf = await pdfJsModule.getDocument({ data: new Uint8Array(buffer), disableWorker: true }) - .promise; - - const effectivePages: number[] = pageNumbers - ? pageNumbers.filter((p) => p >= 1 && p <= pdf.numPages).slice(0, maxPages) - : Array.from({ length: Math.min(pdf.numPages, maxPages) }, (_, i) => i + 1); - - const textParts: string[] = []; - for (const pageNum of effectivePages) { - const page = await pdf.getPage(pageNum); - const textContent = await page.getTextContent(); - const pageText = textContent.items - .map((item) => ("str" in item ? item.str : "")) - .filter(Boolean) - .join(" "); - if (pageText) { - textParts.push(pageText); - } + const extracted = await extractDocumentContent({ + buffer: params.buffer, + mimeType: "application/pdf", + maxPages: params.maxPages, + maxPixels: params.maxPixels, + minTextChars: params.minTextChars, + ...(params.pageNumbers ? { pageNumbers: params.pageNumbers } : {}), + ...(params.config ? { config: params.config } : {}), + ...(params.onImageExtractionError + ? { onImageExtractionError: params.onImageExtractionError } + : {}), + }); + if (!extracted) { + throw new Error( + "PDF extraction disabled or unavailable: enable the document-extract plugin to process application/pdf files.", + ); } - - const text = textParts.join("\n\n"); - if (text.trim().length >= minTextChars) { - return { text, images: [] }; - } - - let canvasModule: CanvasModule; - try { - canvasModule = await loadCanvasModule(); - } catch (err) { - onImageExtractionError?.(err); - return { text, images: [] }; - } - - const images: PdfExtractedImage[] = []; - const pixelBudget = Math.max(1, maxPixels); - - for (const pageNum of effectivePages) { - const page = await pdf.getPage(pageNum); - const viewport = page.getViewport({ scale: 1 }); - const pagePixels = viewport.width * viewport.height; - const scale = Math.min(1, Math.sqrt(pixelBudget / Math.max(1, pagePixels))); - const scaled = page.getViewport({ scale: Math.max(0.1, scale) }); - const canvas = canvasModule.createCanvas(Math.ceil(scaled.width), Math.ceil(scaled.height)); - await page.render({ - canvas: canvas as unknown as HTMLCanvasElement, - viewport: scaled, - }).promise; - const png = canvas.toBuffer("image/png"); - images.push({ type: "image", data: png.toString("base64"), mimeType: "image/png" }); - } - - return { text, images }; + return { + text: extracted.text, + images: extracted.images, + }; } diff --git a/src/plugin-sdk/document-extractor.ts b/src/plugin-sdk/document-extractor.ts new file mode 100644 index 00000000000..f93c4117a8b --- /dev/null +++ b/src/plugin-sdk/document-extractor.ts @@ -0,0 +1,6 @@ +export type { + DocumentExtractedImage, + DocumentExtractionRequest, + DocumentExtractionResult, + DocumentExtractorPlugin, +} from "../plugins/document-extractor-types.js"; diff --git a/src/plugins/contracts/inventory/bundled-capability-metadata.ts b/src/plugins/contracts/inventory/bundled-capability-metadata.ts index ba798c8614a..3bced92a5ac 100644 --- a/src/plugins/contracts/inventory/bundled-capability-metadata.ts +++ b/src/plugins/contracts/inventory/bundled-capability-metadata.ts @@ -20,6 +20,7 @@ export type BundledPluginContractSnapshot = { realtimeTranscriptionProviderIds: string[]; realtimeVoiceProviderIds: string[]; mediaUnderstandingProviderIds: string[]; + documentExtractorIds: string[]; imageGenerationProviderIds: string[]; videoGenerationProviderIds: string[]; musicGenerationProviderIds: string[]; @@ -116,6 +117,9 @@ export function buildBundledPluginContractSnapshot( manifest.contracts?.mediaUnderstandingProviders, (value) => value.trim(), ), + documentExtractorIds: uniqueStrings(manifest.contracts?.documentExtractors, (value) => + value.trim(), + ), imageGenerationProviderIds: uniqueStrings( manifest.contracts?.imageGenerationProviders, (value) => value.trim(), @@ -151,6 +155,7 @@ export function hasBundledPluginContractSnapshotCapabilities( entry.realtimeTranscriptionProviderIds.length > 0 || entry.realtimeVoiceProviderIds.length > 0 || entry.mediaUnderstandingProviderIds.length > 0 || + entry.documentExtractorIds.length > 0 || entry.imageGenerationProviderIds.length > 0 || entry.videoGenerationProviderIds.length > 0 || entry.musicGenerationProviderIds.length > 0 || diff --git a/src/plugins/contracts/registry.ts b/src/plugins/contracts/registry.ts index f4b5849c92e..961e127a419 100644 --- a/src/plugins/contracts/registry.ts +++ b/src/plugins/contracts/registry.ts @@ -64,6 +64,7 @@ type ManifestContractKey = | "realtimeTranscriptionProviders" | "realtimeVoiceProviders" | "mediaUnderstandingProviders" + | "documentExtractors" | "imageGenerationProviders" | "videoGenerationProviders" | "musicGenerationProviders" @@ -84,6 +85,7 @@ function resolveBundledManifestContracts(): PluginRegistrationContractEntry[] { realtimeTranscriptionProviderIds: [...entry.realtimeTranscriptionProviderIds], realtimeVoiceProviderIds: [...entry.realtimeVoiceProviderIds], mediaUnderstandingProviderIds: [...entry.mediaUnderstandingProviderIds], + documentExtractorIds: [...entry.documentExtractorIds], imageGenerationProviderIds: [...entry.imageGenerationProviderIds], videoGenerationProviderIds: [...entry.videoGenerationProviderIds], musicGenerationProviderIds: [...entry.musicGenerationProviderIds], @@ -103,6 +105,7 @@ function resolveBundledManifestContracts(): PluginRegistrationContractEntry[] { (plugin.contracts?.realtimeTranscriptionProviders?.length ?? 0) > 0 || (plugin.contracts?.realtimeVoiceProviders?.length ?? 0) > 0 || (plugin.contracts?.mediaUnderstandingProviders?.length ?? 0) > 0 || + (plugin.contracts?.documentExtractors?.length ?? 0) > 0 || (plugin.contracts?.imageGenerationProviders?.length ?? 0) > 0 || (plugin.contracts?.videoGenerationProviders?.length ?? 0) > 0 || (plugin.contracts?.musicGenerationProviders?.length ?? 0) > 0 || @@ -123,6 +126,7 @@ function resolveBundledManifestContracts(): PluginRegistrationContractEntry[] { mediaUnderstandingProviderIds: uniqueStrings( plugin.contracts?.mediaUnderstandingProviders ?? [], ), + documentExtractorIds: uniqueStrings(plugin.contracts?.documentExtractors ?? []), imageGenerationProviderIds: uniqueStrings(plugin.contracts?.imageGenerationProviders ?? []), videoGenerationProviderIds: uniqueStrings(plugin.contracts?.videoGenerationProviders ?? []), musicGenerationProviderIds: uniqueStrings(plugin.contracts?.musicGenerationProviders ?? []), @@ -175,6 +179,8 @@ function resolveBundledManifestPluginIdsForContract(contract: ManifestContractKe return entry.realtimeVoiceProviderIds.length > 0; case "mediaUnderstandingProviders": return entry.mediaUnderstandingProviderIds.length > 0; + case "documentExtractors": + return entry.documentExtractorIds.length > 0; case "imageGenerationProviders": return entry.imageGenerationProviderIds.length > 0; case "videoGenerationProviders": diff --git a/src/plugins/document-extractor-public-artifacts.test.ts b/src/plugins/document-extractor-public-artifacts.test.ts new file mode 100644 index 00000000000..4bf9b47128f --- /dev/null +++ b/src/plugins/document-extractor-public-artifacts.test.ts @@ -0,0 +1,55 @@ +import { beforeEach, describe, expect, it, vi } from "vitest"; + +const { publicArtifactModule } = vi.hoisted(() => ({ + publicArtifactModule: {} as Record, +})); + +vi.mock("./public-surface-loader.js", () => ({ + loadBundledPluginPublicArtifactModuleSync: vi.fn(() => publicArtifactModule), + resolveBundledPluginPublicArtifactPath: vi.fn( + () => "/repo/extensions/demo/document-extractor.ts", + ), +})); + +import { loadBundledDocumentExtractorEntriesFromDir } from "./document-extractor-public-artifacts.js"; + +describe("loadBundledDocumentExtractorEntriesFromDir", () => { + beforeEach(() => { + for (const key of Object.keys(publicArtifactModule)) { + delete publicArtifactModule[key]; + } + }); + + it("isolates a throwing factory when another extractor factory succeeds", () => { + publicArtifactModule.createBrokenDocumentExtractor = () => { + throw new Error("native probe failed"); + }; + publicArtifactModule.createPdfDocumentExtractor = () => ({ + id: "pdf", + label: "PDF", + mimeTypes: ["application/pdf"], + extract: vi.fn(), + }); + + expect( + loadBundledDocumentExtractorEntriesFromDir({ + dirName: "demo", + pluginId: "demo", + }), + ).toMatchObject([{ id: "pdf", pluginId: "demo" }]); + }); + + it("surfaces initialization failure when every matching factory throws", () => { + const cause = new Error("native probe failed"); + publicArtifactModule.createPdfDocumentExtractor = () => { + throw cause; + }; + + expect(() => + loadBundledDocumentExtractorEntriesFromDir({ + dirName: "demo", + pluginId: "demo", + }), + ).toThrow("Unable to initialize document extractors for plugin demo"); + }); +}); diff --git a/src/plugins/document-extractor-public-artifacts.ts b/src/plugins/document-extractor-public-artifacts.ts new file mode 100644 index 00000000000..69081c4e65d --- /dev/null +++ b/src/plugins/document-extractor-public-artifacts.ts @@ -0,0 +1,108 @@ +import type { + DocumentExtractorPlugin, + PluginDocumentExtractorEntry, +} from "./document-extractor-types.js"; +import { + loadBundledPluginPublicArtifactModuleSync, + resolveBundledPluginPublicArtifactPath, +} from "./public-surface-loader.js"; + +const DOCUMENT_EXTRACTOR_ARTIFACT_CANDIDATES = [ + "document-extractor.js", + "document-extractor-api.js", +] as const; + +function isRecord(value: unknown): value is Record { + return typeof value === "object" && value !== null && !Array.isArray(value); +} + +function isDocumentExtractorPlugin(value: unknown): value is DocumentExtractorPlugin { + return ( + isRecord(value) && + typeof value.id === "string" && + typeof value.label === "string" && + Array.isArray(value.mimeTypes) && + value.mimeTypes.every((mimeType) => typeof mimeType === "string" && mimeType.trim()) && + (value.autoDetectOrder === undefined || typeof value.autoDetectOrder === "number") && + typeof value.extract === "function" + ); +} + +function tryLoadBundledPublicArtifactModule(params: { + dirName: string; +}): Record | null { + for (const artifactBasename of DOCUMENT_EXTRACTOR_ARTIFACT_CANDIDATES) { + try { + return loadBundledPluginPublicArtifactModuleSync>({ + dirName: params.dirName, + artifactBasename, + }); + } catch (error) { + if ( + error instanceof Error && + error.message.startsWith("Unable to resolve bundled plugin public surface ") + ) { + continue; + } + throw error; + } + } + return null; +} + +function collectExtractorFactories(mod: Record): { + extractors: DocumentExtractorPlugin[]; + errors: unknown[]; +} { + const extractors: DocumentExtractorPlugin[] = []; + const errors: unknown[] = []; + for (const [name, exported] of Object.entries(mod).toSorted(([left], [right]) => + left.localeCompare(right), + )) { + if ( + typeof exported !== "function" || + exported.length !== 0 || + !name.startsWith("create") || + !name.endsWith("DocumentExtractor") + ) { + continue; + } + let candidate: unknown; + try { + candidate = exported(); + } catch (error) { + errors.push(error); + continue; + } + if (isDocumentExtractorPlugin(candidate)) { + extractors.push(candidate); + } + } + return { extractors, errors }; +} + +export function loadBundledDocumentExtractorEntriesFromDir(params: { + dirName: string; + pluginId: string; +}): PluginDocumentExtractorEntry[] | null { + const mod = tryLoadBundledPublicArtifactModule({ dirName: params.dirName }); + if (!mod) { + return null; + } + const { extractors, errors } = collectExtractorFactories(mod); + if (extractors.length === 0) { + if (errors.length > 0) { + throw new Error(`Unable to initialize document extractors for plugin ${params.pluginId}`, { + cause: errors.length === 1 ? errors[0] : new AggregateError(errors), + }); + } + return null; + } + return extractors.map((extractor) => Object.assign({}, extractor, { pluginId: params.pluginId })); +} + +export function hasBundledDocumentExtractorPublicArtifact(pluginId: string): boolean { + return DOCUMENT_EXTRACTOR_ARTIFACT_CANDIDATES.some((artifactBasename) => + Boolean(resolveBundledPluginPublicArtifactPath({ dirName: pluginId, artifactBasename })), + ); +} diff --git a/src/plugins/document-extractor-types.ts b/src/plugins/document-extractor-types.ts new file mode 100644 index 00000000000..94fda320c81 --- /dev/null +++ b/src/plugins/document-extractor-types.ts @@ -0,0 +1,32 @@ +export type DocumentExtractedImage = { + type: "image"; + data: string; + mimeType: string; +}; + +export type DocumentExtractionRequest = { + buffer: Buffer; + mimeType: string; + maxPages: number; + maxPixels: number; + minTextChars: number; + pageNumbers?: number[]; + onImageExtractionError?: (error: unknown) => void; +}; + +export type DocumentExtractionResult = { + text: string; + images: DocumentExtractedImage[]; +}; + +export type DocumentExtractorPlugin = { + id: string; + label: string; + mimeTypes: readonly string[]; + autoDetectOrder?: number; + extract: (request: DocumentExtractionRequest) => Promise; +}; + +export type PluginDocumentExtractorEntry = DocumentExtractorPlugin & { + pluginId: string; +}; diff --git a/src/plugins/document-extractors.runtime.test.ts b/src/plugins/document-extractors.runtime.test.ts new file mode 100644 index 00000000000..aef9aeee1f2 --- /dev/null +++ b/src/plugins/document-extractors.runtime.test.ts @@ -0,0 +1,28 @@ +import { describe, expect, it } from "vitest"; +import { resolvePluginDocumentExtractors } from "./document-extractors.runtime.js"; + +describe("resolvePluginDocumentExtractors", () => { + it("respects global plugin disablement", () => { + expect( + resolvePluginDocumentExtractors({ + config: { + plugins: { + enabled: false, + }, + }, + }), + ).toEqual([]); + }); + + it("does not expand an operator plugin allowlist", () => { + expect( + resolvePluginDocumentExtractors({ + config: { + plugins: { + allow: ["openai"], + }, + }, + }), + ).toEqual([]); + }); +}); diff --git a/src/plugins/document-extractors.runtime.ts b/src/plugins/document-extractors.runtime.ts new file mode 100644 index 00000000000..d54e6eebd85 --- /dev/null +++ b/src/plugins/document-extractors.runtime.ts @@ -0,0 +1,134 @@ +import type { OpenClawConfig } from "../config/types.openclaw.js"; +import { resolveBundledPluginCompatibleLoadValues } from "./activation-context.js"; +import { + createPluginActivationSource, + normalizePluginsConfig, + resolveEffectivePluginActivationState, +} from "./config-state.js"; +import { loadBundledDocumentExtractorEntriesFromDir } from "./document-extractor-public-artifacts.js"; +import type { PluginDocumentExtractorEntry } from "./document-extractor-types.js"; +import { loadPluginManifestRegistry } from "./manifest-registry.js"; +import type { PluginManifestRecord } from "./manifest-registry.js"; + +function compareExtractors( + left: PluginDocumentExtractorEntry, + right: PluginDocumentExtractorEntry, +): number { + const leftOrder = left.autoDetectOrder ?? Number.MAX_SAFE_INTEGER; + const rightOrder = right.autoDetectOrder ?? Number.MAX_SAFE_INTEGER; + if (leftOrder !== rightOrder) { + return leftOrder - rightOrder; + } + return left.id.localeCompare(right.id) || left.pluginId.localeCompare(right.pluginId); +} + +function resolveBundledDocumentExtractorCompatPluginIds(params: { + config?: OpenClawConfig; + workspaceDir?: string; + env?: NodeJS.ProcessEnv; + onlyPluginIds?: readonly string[]; +}): string[] { + const onlyPluginIdSet = + params.onlyPluginIds && params.onlyPluginIds.length > 0 ? new Set(params.onlyPluginIds) : null; + return loadPluginManifestRegistry({ + config: params.config, + workspaceDir: params.workspaceDir, + env: params.env, + }) + .plugins.filter( + (plugin) => + plugin.origin === "bundled" && + (!onlyPluginIdSet || onlyPluginIdSet.has(plugin.id)) && + (plugin.contracts?.documentExtractors?.length ?? 0) > 0, + ) + .map((plugin) => plugin.id) + .toSorted((left, right) => left.localeCompare(right)); +} + +function resolveEnabledBundledDocumentExtractorPlugins(params: { + config?: OpenClawConfig; + workspaceDir?: string; + env?: NodeJS.ProcessEnv; + onlyPluginIds?: readonly string[]; +}): PluginManifestRecord[] { + if (params.config?.plugins?.enabled === false) { + return []; + } + + const activation = resolveBundledPluginCompatibleLoadValues({ + rawConfig: params.config, + env: params.env, + workspaceDir: params.workspaceDir, + onlyPluginIds: params.onlyPluginIds, + applyAutoEnable: true, + compatMode: { + allowlist: false, + enablement: "allowlist", + vitest: true, + }, + resolveCompatPluginIds: resolveBundledDocumentExtractorCompatPluginIds, + }); + const normalizedPlugins = normalizePluginsConfig(activation.config?.plugins); + const activationSource = createPluginActivationSource({ + config: activation.activationSourceConfig, + }); + const onlyPluginIdSet = + params.onlyPluginIds && params.onlyPluginIds.length > 0 ? new Set(params.onlyPluginIds) : null; + return loadPluginManifestRegistry({ + config: activation.config, + workspaceDir: params.workspaceDir, + env: params.env, + }).plugins.filter((plugin) => { + if ( + plugin.origin !== "bundled" || + (onlyPluginIdSet && !onlyPluginIdSet.has(plugin.id)) || + (plugin.contracts?.documentExtractors?.length ?? 0) === 0 + ) { + return false; + } + return resolveEffectivePluginActivationState({ + id: plugin.id, + origin: plugin.origin, + config: normalizedPlugins, + rootConfig: activation.config, + enabledByDefault: plugin.enabledByDefault, + activationSource, + }).enabled; + }); +} + +export function resolvePluginDocumentExtractors(params?: { + config?: OpenClawConfig; + workspaceDir?: string; + env?: NodeJS.ProcessEnv; + onlyPluginIds?: readonly string[]; +}): PluginDocumentExtractorEntry[] { + const extractors: PluginDocumentExtractorEntry[] = []; + const loadErrors: unknown[] = []; + for (const plugin of resolveEnabledBundledDocumentExtractorPlugins({ + config: params?.config, + workspaceDir: params?.workspaceDir, + env: params?.env, + onlyPluginIds: params?.onlyPluginIds, + })) { + let loaded: PluginDocumentExtractorEntry[] | null; + try { + loaded = loadBundledDocumentExtractorEntriesFromDir({ + dirName: plugin.id, + pluginId: plugin.id, + }); + } catch (error) { + loadErrors.push(error); + continue; + } + if (loaded) { + extractors.push(...loaded); + } + } + if (extractors.length === 0 && loadErrors.length > 0) { + throw new Error("Unable to load document extractor plugins", { + cause: loadErrors.length === 1 ? loadErrors[0] : new AggregateError(loadErrors), + }); + } + return extractors.toSorted(compareExtractors); +} diff --git a/src/plugins/gateway-startup-plugin-ids.ts b/src/plugins/gateway-startup-plugin-ids.ts index 9df5bc25fbf..1b65be49cf4 100644 --- a/src/plugins/gateway-startup-plugin-ids.ts +++ b/src/plugins/gateway-startup-plugin-ids.ts @@ -52,6 +52,7 @@ function hasRuntimeContractSurface(plugin: PluginManifestRecord): boolean { plugin.cliBackends.length > 0 || plugin.contracts?.speechProviders?.length || plugin.contracts?.mediaUnderstandingProviders?.length || + plugin.contracts?.documentExtractors?.length || plugin.contracts?.imageGenerationProviders?.length || plugin.contracts?.videoGenerationProviders?.length || plugin.contracts?.musicGenerationProviders?.length || diff --git a/src/plugins/manifest-registry.ts b/src/plugins/manifest-registry.ts index 2e8378c46c0..07087d85fe0 100644 --- a/src/plugins/manifest-registry.ts +++ b/src/plugins/manifest-registry.ts @@ -67,6 +67,7 @@ type PluginManifestContractListKey = | "speechProviders" | "externalAuthProviders" | "mediaUnderstandingProviders" + | "documentExtractors" | "realtimeVoiceProviders" | "realtimeTranscriptionProviders" | "imageGenerationProviders" diff --git a/src/plugins/manifest.ts b/src/plugins/manifest.ts index 717a8712783..c912d5b2a5c 100644 --- a/src/plugins/manifest.ts +++ b/src/plugins/manifest.ts @@ -251,6 +251,7 @@ export type PluginManifestContracts = { realtimeTranscriptionProviders?: string[]; realtimeVoiceProviders?: string[]; mediaUnderstandingProviders?: string[]; + documentExtractors?: string[]; imageGenerationProviders?: string[]; videoGenerationProviders?: string[]; musicGenerationProviders?: string[]; @@ -443,6 +444,7 @@ function normalizeManifestContracts(value: unknown): PluginManifestContracts | u ); const realtimeVoiceProviders = normalizeTrimmedStringList(value.realtimeVoiceProviders); const mediaUnderstandingProviders = normalizeTrimmedStringList(value.mediaUnderstandingProviders); + const documentExtractors = normalizeTrimmedStringList(value.documentExtractors); const imageGenerationProviders = normalizeTrimmedStringList(value.imageGenerationProviders); const videoGenerationProviders = normalizeTrimmedStringList(value.videoGenerationProviders); const musicGenerationProviders = normalizeTrimmedStringList(value.musicGenerationProviders); @@ -459,6 +461,7 @@ function normalizeManifestContracts(value: unknown): PluginManifestContracts | u ...(realtimeTranscriptionProviders.length > 0 ? { realtimeTranscriptionProviders } : {}), ...(realtimeVoiceProviders.length > 0 ? { realtimeVoiceProviders } : {}), ...(mediaUnderstandingProviders.length > 0 ? { mediaUnderstandingProviders } : {}), + ...(documentExtractors.length > 0 ? { documentExtractors } : {}), ...(imageGenerationProviders.length > 0 ? { imageGenerationProviders } : {}), ...(videoGenerationProviders.length > 0 ? { videoGenerationProviders } : {}), ...(musicGenerationProviders.length > 0 ? { musicGenerationProviders } : {}), diff --git a/src/plugins/public-surface-runtime.test.ts b/src/plugins/public-surface-runtime.test.ts index db6461a2ce6..80bea427963 100644 --- a/src/plugins/public-surface-runtime.test.ts +++ b/src/plugins/public-surface-runtime.test.ts @@ -5,6 +5,7 @@ import { afterEach, describe, expect, it } from "vitest"; import { PUBLIC_SURFACE_SOURCE_EXTENSIONS, normalizeBundledPluginArtifactSubpath, + normalizeBundledPluginDirName, resolveBundledPluginPublicSurfacePath, resolveBundledPluginSourcePublicSurfacePath, } from "./public-surface-runtime.js"; @@ -96,4 +97,12 @@ describe("bundled plugin public surface runtime", () => { /must stay plugin-local/, ); }); + + it("rejects bundled plugin directory traversal", () => { + expect(normalizeBundledPluginDirName("document-extract")).toBe("document-extract"); + expect(() => normalizeBundledPluginDirName("../outside")).toThrow(/single directory/); + expect(() => normalizeBundledPluginDirName("nested/plugin")).toThrow(/single directory/); + expect(() => normalizeBundledPluginDirName("nested\\plugin")).toThrow(/single directory/); + expect(() => normalizeBundledPluginDirName("C:plugin")).toThrow(/single directory/); + }); }); diff --git a/src/plugins/public-surface-runtime.ts b/src/plugins/public-surface-runtime.ts index ed38114b664..1fd51293854 100644 --- a/src/plugins/public-surface-runtime.ts +++ b/src/plugins/public-surface-runtime.ts @@ -38,19 +38,31 @@ export function normalizeBundledPluginArtifactSubpath(artifactBasename: string): return normalized; } +export function normalizeBundledPluginDirName(dirName: string): string { + const normalized = dirName.trim(); + if ( + !normalized || + normalized === "." || + normalized === ".." || + normalized.includes("/") || + normalized.includes("\\") || + normalized.includes(":") + ) { + throw new Error(`Bundled plugin dirName must be a single directory: ${dirName}`); + } + return normalized; +} + export function resolveBundledPluginSourcePublicSurfacePath(params: { sourceRoot: string; dirName: string; artifactBasename: string; }): string | null { const artifactBasename = normalizeBundledPluginArtifactSubpath(params.artifactBasename); + const dirName = normalizeBundledPluginDirName(params.dirName); const sourceBaseName = artifactBasename.replace(/\.js$/u, ""); for (const ext of PUBLIC_SURFACE_SOURCE_EXTENSIONS) { - const sourceCandidate = path.resolve( - params.sourceRoot, - params.dirName, - `${sourceBaseName}${ext}`, - ); + const sourceCandidate = path.resolve(params.sourceRoot, dirName, `${sourceBaseName}${ext}`); if (fs.existsSync(sourceCandidate)) { return sourceCandidate; } @@ -88,11 +100,12 @@ export function resolveBundledPluginPublicSurfacePath(params: { bundledPluginsDir?: string; }): string | null { const artifactBasename = normalizeBundledPluginArtifactSubpath(params.artifactBasename); + const dirName = normalizeBundledPluginDirName(params.dirName); const explicitBundledPluginsDir = params.bundledPluginsDir ?? resolveBundledPluginsDir(params.env ?? process.env); if (explicitBundledPluginsDir) { - const explicitPluginDir = path.resolve(explicitBundledPluginsDir, params.dirName); + const explicitPluginDir = path.resolve(explicitBundledPluginsDir, dirName); const explicitBuiltCandidate = path.join(explicitPluginDir, artifactBasename); if (fs.existsSync(explicitBuiltCandidate)) { return explicitBuiltCandidate; @@ -100,21 +113,21 @@ export function resolveBundledPluginPublicSurfacePath(params: { return ( resolveBundledPluginSourcePublicSurfacePath({ sourceRoot: explicitBundledPluginsDir, - dirName: params.dirName, + dirName, artifactBasename, }) ?? resolvePackageSourceFallbackForBundledDir({ rootDir: params.rootDir, bundledPluginsDir: explicitBundledPluginsDir, - dirName: params.dirName, + dirName, artifactBasename, }) ); } for (const candidate of [ - path.resolve(params.rootDir, "dist", "extensions", params.dirName, artifactBasename), - path.resolve(params.rootDir, "dist-runtime", "extensions", params.dirName, artifactBasename), + path.resolve(params.rootDir, "dist", "extensions", dirName, artifactBasename), + path.resolve(params.rootDir, "dist-runtime", "extensions", dirName, artifactBasename), ]) { if (fs.existsSync(candidate)) { return candidate; @@ -123,7 +136,7 @@ export function resolveBundledPluginPublicSurfacePath(params: { return resolveBundledPluginSourcePublicSurfacePath({ sourceRoot: path.resolve(params.rootDir, "extensions"), - dirName: params.dirName, + dirName, artifactBasename, }); } diff --git a/src/types/napi-rs-canvas.d.ts b/src/types/napi-rs-canvas.d.ts deleted file mode 100644 index ab856f1a9b7..00000000000 --- a/src/types/napi-rs-canvas.d.ts +++ /dev/null @@ -1,7 +0,0 @@ -declare module "@napi-rs/canvas" { - export type Canvas = { - toBuffer(type?: string): Buffer; - }; - - export function createCanvas(width: number, height: number): Canvas; -} diff --git a/src/types/pdfjs-dist-legacy.d.ts b/src/types/pdfjs-dist-legacy.d.ts deleted file mode 100644 index 078e02e4918..00000000000 --- a/src/types/pdfjs-dist-legacy.d.ts +++ /dev/null @@ -1,33 +0,0 @@ -declare module "pdfjs-dist/legacy/build/pdf.mjs" { - export type TextItem = { - str: string; - }; - - export type TextMarkedContent = { - type?: string; - }; - - export type TextContent = { - items: Array; - }; - - export type Viewport = { - width: number; - height: number; - }; - - export type PDFPageProxy = { - getTextContent(): Promise; - getViewport(params: { scale: number }): Viewport; - render(params: { canvas: unknown; viewport: Viewport }): { promise: Promise }; - }; - - export type PDFDocumentProxy = { - numPages: number; - getPage(pageNumber: number): Promise; - }; - - export function getDocument(params: { data: Uint8Array; disableWorker?: boolean }): { - promise: Promise; - }; -}