diff --git a/CHANGELOG.md b/CHANGELOG.md index 2b360b5f1c0..1515d642805 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -51,6 +51,7 @@ Docs: https://docs.openclaw.ai - Claude CLI/sessions: classify `No conversation found with session ID` as `session_expired` so expired CLI-backed conversations clear the stale binding and recover on the next turn. (#65028) thanks @Ivan-Fn. - Context Engine: gracefully fall back to the legacy engine when a third-party context engine plugin fails at resolution time (unregistered id, factory throw, or contract violation), preventing a full gateway outage on every channel. (#66930) Thanks @openperf. - Control UI/chat: keep optimistic user message cards visible during active sends by deferring same-session history reloads until the active run ends, including aborted and errored runs. (#66997) Thanks @scotthuang and @vincentkoc. +- Media/Slack: allow host-local CSV and Markdown uploads only when the fallback buffer actually decodes as text, so real plain-text files work without letting opaque non-text blobs renamed to `.csv` or `.md` slip past the host-read guard. (#67047) Thanks @Unayung. 
## 2026.4.14

diff --git a/src/media/web-media.test.ts b/src/media/web-media.test.ts
index d1c78d86740..a320bef04a7 100644
--- a/src/media/web-media.test.ts
+++ b/src/media/web-media.test.ts
@@ -185,6 +185,248 @@ describe("loadWebMedia", () => {
     });
   });

+  it("allows host-read CSV files", async () => {
+    const csvFile = path.join(fixtureRoot, "data.csv");
+    await fs.writeFile(csvFile, "name,value\nfoo,1\nbar,2\n", "utf8");
+    const result = await loadWebMedia(csvFile, {
+      maxBytes: 1024 * 1024,
+      localRoots: "any",
+      readFile: async (filePath) => await fs.readFile(filePath),
+      hostReadCapability: true,
+    });
+    expect(result.kind).toBe("document");
+    expect(result.contentType).toBe("text/csv");
+  });
+
+  it("allows host-read Markdown files", async () => {
+    const mdFile = path.join(fixtureRoot, "notes.md");
+    await fs.writeFile(mdFile, "# Title\n\nSome **bold** text.\n", "utf8");
+    const result = await loadWebMedia(mdFile, {
+      maxBytes: 1024 * 1024,
+      localRoots: "any",
+      readFile: async (filePath) => await fs.readFile(filePath),
+      hostReadCapability: true,
+    });
+    expect(result.kind).toBe("document");
+    expect(result.contentType).toBe("text/markdown");
+  });
+
+  it("rejects binary data disguised as a CSV file", async () => {
+    const fakeCsv = path.join(fixtureRoot, "evil.csv");
+    // Write ZIP magic bytes — file-type detects application/zip (not image, not CSV),
+    // so it is rejected by the host-read policy rather than allowed as an image.
+    await fs.writeFile(fakeCsv, Buffer.from([0x50, 0x4b, 0x03, 0x04]));
+    await expect(
+      loadWebMedia(fakeCsv, {
+        maxBytes: 1024 * 1024,
+        localRoots: "any",
+        readFile: async (filePath) => await fs.readFile(filePath),
+        hostReadCapability: true,
+      }),
+    ).rejects.toMatchObject({
+      code: "path-not-allowed",
+    });
+  });
+
+  it.each([
+    { label: "CSV", fileName: "opaque.csv" },
+    { label: "Markdown", fileName: "opaque.md" },
+  ])("rejects opaque non-NUL binary data disguised as $label", async ({ fileName }) => {
+    const fakeTextFile = path.join(fixtureRoot, fileName);
+    const opaqueBinary = Buffer.alloc(9000);
+    for (let i = 0; i < opaqueBinary.length; i += 1) {
+      opaqueBinary[i] = (i % 255) + 1;
+    }
+    await fs.writeFile(fakeTextFile, opaqueBinary);
+    await expect(
+      loadWebMedia(fakeTextFile, {
+        maxBytes: 1024 * 1024,
+        localRoots: "any",
+        readFile: async (filePath) => await fs.readFile(filePath),
+        hostReadCapability: true,
+      }),
+    ).rejects.toMatchObject({
+      code: "path-not-allowed",
+    });
+  });
+
+  it.each([
+    { label: "CSV", fileName: "prefix-tail.csv" },
+    { label: "Markdown", fileName: "prefix-tail.md" },
+  ])(
+    "rejects $label files with a text prefix and binary tail after the old sample window",
+    async ({ fileName }) => {
+      const fakeTextFile = path.join(fixtureRoot, fileName);
+      const textPrefix = Buffer.from(`name,value\n${"row,1\n".repeat(1400)}`, "utf8");
+      expect(textPrefix.length).toBeGreaterThan(8192);
+      const binaryTail = Buffer.from([0x00, 0xff, 0x10, 0x80]);
+      await fs.writeFile(fakeTextFile, Buffer.concat([textPrefix, binaryTail]));
+      await expect(
+        loadWebMedia(fakeTextFile, {
+          maxBytes: 1024 * 1024,
+          localRoots: "any",
+          readFile: async (filePath) => await fs.readFile(filePath),
+          hostReadCapability: true,
+        }),
+      ).rejects.toMatchObject({
+        code: "path-not-allowed",
+      });
+    },
+  );
+
+  it.each([
+    {
+      label: "CSV",
+      fileName: "punctuation.csv",
+      contentType: "text/csv",
+      body: ",,,,,,,,,,\n",
+    },
+    {
+      label: "Markdown",
+      fileName: "punctuation.md",
+      contentType: "text/markdown",
+      body: "---\n***\n> > >\n",
+    },
+  ])(
+    "loads valid punctuation-heavy $label files when host-read capability is enabled",
+    async ({ fileName, contentType, body }) => {
+      const textFile = path.join(fixtureRoot, fileName);
+      await fs.writeFile(textFile, Buffer.from(body, "utf8"));
+      const result = await loadWebMedia(textFile, {
+        maxBytes: 1024 * 1024,
+        localRoots: "any",
+        readFile: async (filePath) => await fs.readFile(filePath),
+        hostReadCapability: true,
+      });
+      expect(result.kind).toBe("document");
+      expect(result.contentType).toBe(contentType);
+    },
+  );
+
+  it.each([
+    {
+      label: "CSV",
+      fileName: "legacy.csv",
+      contentType: "text/csv",
+      body: Buffer.from("caf\xe9,ni\xf1o\n", "latin1"),
+    },
+    {
+      label: "Markdown",
+      fileName: "legacy.md",
+      contentType: "text/markdown",
+      body: Buffer.from("R\xe9sum\xe9\nni\xf1o\n", "latin1"),
+    },
+  ])(
+    "loads valid single-byte encoded $label files when host-read capability is enabled",
+    async ({ fileName, contentType, body }) => {
+      const textFile = path.join(fixtureRoot, fileName);
+      await fs.writeFile(textFile, body);
+      const result = await loadWebMedia(textFile, {
+        maxBytes: 1024 * 1024,
+        localRoots: "any",
+        readFile: async (filePath) => await fs.readFile(filePath),
+        hostReadCapability: true,
+      });
+      expect(result.kind).toBe("document");
+      expect(result.contentType).toBe(contentType);
+    },
+  );
+
+  it.each([
+    { label: "CSV", fileName: "nul-padded.csv" },
+    { label: "Markdown", fileName: "nul-padded.md" },
+  ])("rejects NUL-padded binary data disguised as $label", async ({ fileName }) => {
+    const fakeTextFile = path.join(fixtureRoot, fileName);
+    // Alternating 0x00/0xFF — UTF-8 decode fails (0xFF is invalid UTF-8), then
+    // hasSingleByteTextShape rejects because 0x00 bytes are control chars (< 0x20).
+    const nulPadded = Buffer.alloc(9000);
+    for (let i = 0; i < nulPadded.length; i += 1) {
+      nulPadded[i] = i % 2 === 0 ? 0x00 : 0xff;
+    }
+    await fs.writeFile(fakeTextFile, nulPadded);
+    await expect(
+      loadWebMedia(fakeTextFile, {
+        maxBytes: 1024 * 1024,
+        localRoots: "any",
+        readFile: async (filePath) => await fs.readFile(filePath),
+        hostReadCapability: true,
+      }),
+    ).rejects.toMatchObject({
+      code: "path-not-allowed",
+    });
+  });
+
+  it.each([
+    { label: "CSV", fileName: "bom-binary.csv" },
+    { label: "Markdown", fileName: "bom-binary.md" },
+  ])("rejects UTF-16 BOM-prefixed binary data disguised as $label", async ({ fileName }) => {
+    const fakeTextFile = path.join(fixtureRoot, fileName);
+    // UTF-16LE BOM + repeating 0xFF bytes: if UTF-16 decoding were attempted,
+    // every byte pair would produce a printable code point and pass getTextStats.
+    // With UTF-16 decoding removed, falls through to UTF-8 strict decode (throws
+    // on 0xFF), then hasSingleByteTextShape rejects due to high-byte ratio > 30%.
+    const bom = Buffer.from([0xff, 0xfe]);
+    const garbage = Buffer.alloc(9000, 0xff);
+    await fs.writeFile(fakeTextFile, Buffer.concat([bom, garbage]));
+    await expect(
+      loadWebMedia(fakeTextFile, {
+        maxBytes: 1024 * 1024,
+        localRoots: "any",
+        readFile: async (filePath) => await fs.readFile(filePath),
+        hostReadCapability: true,
+      }),
+    ).rejects.toMatchObject({
+      code: "path-not-allowed",
+    });
+  });
+
+  it.each([
+    { label: "CSV", fileName: "alternating-high.csv" },
+    { label: "Markdown", fileName: "alternating-high.md" },
+  ])("rejects alternating ASCII/high-byte data disguised as $label", async ({ fileName }) => {
+    const fakeTextFile = path.join(fixtureRoot, fileName);
+    // Alternating 0x41 ('A') and 0xFF — exactly 50% ASCII, 50% high bytes.
+    // With the old 50% threshold hasSingleByteTextShape would accept this;
+    // the tightened 70%/30% thresholds must reject it.
+    const mixed = Buffer.alloc(9000);
+    for (let i = 0; i < mixed.length; i += 1) {
+      mixed[i] = i % 2 === 0 ? 0x41 : 0xff;
+    }
+    await fs.writeFile(fakeTextFile, mixed);
+    await expect(
+      loadWebMedia(fakeTextFile, {
+        maxBytes: 1024 * 1024,
+        localRoots: "any",
+        readFile: async (filePath) => await fs.readFile(filePath),
+        hostReadCapability: true,
+      }),
+    ).rejects.toMatchObject({
+      code: "path-not-allowed",
+    });
+  });
+
+  it.each([
+    { label: "CSV", fileName: "high-bytes.csv" },
+    { label: "Markdown", fileName: "high-bytes.md" },
+  ])("rejects high-byte opaque data disguised as $label", async ({ fileName }) => {
+    const fakeTextFile = path.join(fixtureRoot, fileName);
+    const opaqueBinary = Buffer.alloc(9000);
+    for (let i = 0; i < opaqueBinary.length; i += 1) {
+      opaqueBinary[i] = 0xa0 + (i % 96);
+    }
+    await fs.writeFile(fakeTextFile, opaqueBinary);
+    await expect(
+      loadWebMedia(fakeTextFile, {
+        maxBytes: 1024 * 1024,
+        localRoots: "any",
+        readFile: async (filePath) => await fs.readFile(filePath),
+        hostReadCapability: true,
+      }),
+    ).rejects.toMatchObject({
+      code: "path-not-allowed",
+    });
+  });
+
   it("rejects traversal-style canvas media paths before filesystem access", async () => {
     await expect(
       loadWebMedia(`${CANVAS_HOST_PATH}/documents/../collection.media/tiny.png`),
diff --git a/src/media/web-media.ts b/src/media/web-media.ts
index 9205d6fb273..bc286026e58 100644
--- a/src/media/web-media.ts
+++ b/src/media/web-media.ts
@@ -24,6 +24,7 @@ import {
   extensionForMime,
   getFileExtension,
   kindFromMime,
+  mimeTypeFromFilePath,
   normalizeMimeType,
 } from "./mime.js";

@@ -83,9 +84,95 @@ const HOST_READ_ALLOWED_DOCUMENT_MIMES = new Set([
   "application/vnd.openxmlformats-officedocument.presentationml.presentation",
   "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
   "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
+  "text/csv",
+  "text/markdown",
 ]);
+// file-type returns undefined (no magic bytes) for plain-text formats like CSV and
+// Markdown, so host-read needs an explicit "this really decodes as text" fallback.
+const HOST_READ_TEXT_PLAIN_ALIASES = new Set(["text/csv", "text/markdown"]);
 const MB = 1024 * 1024;

+function getTextStats(text: string): { printableRatio: number } {
+  if (!text) {
+    return { printableRatio: 0 };
+  }
+  let printable = 0;
+  let control = 0;
+  for (const char of text) {
+    const code = char.codePointAt(0) ?? 0;
+    if (code === 9 || code === 10 || code === 13 || code === 32) {
+      printable += 1;
+      continue;
+    }
+    if (code < 32 || (code >= 0x7f && code <= 0x9f)) {
+      control += 1;
+      continue;
+    }
+    printable += 1;
+  }
+  const total = printable + control;
+  if (total === 0) {
+    return { printableRatio: 0 };
+  }
+  return { printableRatio: printable / total };
+}
+
+function hasSingleByteTextShape(buffer: Buffer): boolean {
+  if (buffer.length === 0) {
+    return true;
+  }
+  let asciiText = 0;
+  let control = 0;
+  for (const byte of buffer) {
+    if (byte === 9 || byte === 10 || byte === 13 || (byte >= 0x20 && byte <= 0x7e)) {
+      asciiText += 1;
+      continue;
+    }
+    if (byte < 0x20 || byte === 0x7f) {
+      control += 1;
+    }
+  }
+  const total = buffer.length;
+  const highBytes = total - asciiText - control;
+  return control === 0 && asciiText / total >= 0.7 && highBytes / total <= 0.3;
+}
+
+function decodeHostReadText(buffer: Buffer): string | undefined {
+  if (buffer.length === 0) {
+    return "";
+  }
+  // UTF-16 decoding is intentionally omitted: a non-fatal TextDecoder("utf-16le"/"utf-16be")
+  // never throws, and almost any byte pair decodes to some code point, so an attacker
+  // could prepend a BOM and pass getTextStats with printableRatio≈1.0 on pure binary
+  // garbage. The single-byte fallback below already covers the most common non-UTF-8
+  // real-world case (Excel CSV exports with accented chars like é, ñ) while remaining
+  // safe because hasSingleByteTextShape gates on byte shape *before* any decode.
+  try {
+    return new TextDecoder("utf-8", { fatal: true }).decode(buffer);
+  } catch {
+    if (!hasSingleByteTextShape(buffer)) {
+      return undefined;
+    }
+    // Per WHATWG, the "latin1" label decodes as windows-1252 (common Excel-style exports).
+    return new TextDecoder("latin1").decode(buffer);
+  }
+}
+
+function isValidatedHostReadText(buffer?: Buffer): boolean {
+  if (!buffer) {
+    return false;
+  }
+  if (buffer.length === 0) {
+    return true;
+  }
+  const text = decodeHostReadText(buffer);
+  if (text === undefined) {
+    return false;
+  }
+  const { printableRatio } = getTextStats(text);
+  return printableRatio > 0.95;
+}
+
 function formatMb(bytes: number, digits = 2): string {
   return (bytes / MB).toFixed(digits);
 }
@@ -113,7 +200,23 @@ function assertHostReadMediaAllowed(params: {
   contentType?: string;
   filePath?: string;
   kind: MediaKind | undefined;
+  buffer?: Buffer;
 }): void {
+  const declaredMime = normalizeMimeType(mimeTypeFromFilePath(params.filePath));
+  const normalizedMime = normalizeMimeType(params.contentType);
+  // When the path's extension maps to a plain-text alias (.csv/.md), accept only the
+  // text-validator path: nothing sniffed AND the buffer validates as text. Opaque blobs
+  // can still produce bogus binary MIME hits (for example BOM-prefixed 0xFF data sniffing
+  // as audio/mpeg); those must throw here instead of returning early on the sniff below.
+  if (declaredMime && HOST_READ_TEXT_PLAIN_ALIASES.has(declaredMime)) {
+    if (!params.sniffedContentType && params.buffer && isValidatedHostReadText(params.buffer)) {
+      return;
+    }
+    throw new LocalMediaAccessError(
+      "path-not-allowed",
+      "hostReadCapability permits only validated plain-text CSV/Markdown documents for local reads",
+    );
+  }
   const sniffedKind = kindFromMime(params.sniffedContentType);
   if (sniffedKind === "image" || sniffedKind === "audio" || sniffedKind === "video") {
     return;
@@ -132,7 +235,20 @@ function assertHostReadMediaAllowed(params: {
   ) {
     return;
   }
-  const normalizedMime = normalizeMimeType(params.contentType);
+  // CSV / Markdown exception: file-type v22 returns undefined (not "text/plain") for
+  // plain-text buffers that have no binary magic bytes. Allow these formats when:
+  //   - sniffedMime is undefined (no binary signature detected by file-type)
+  //   - contentType is text/csv or text/markdown (declaredMime aliases never reach here)
+  //   - The buffer decodes as actual text instead of opaque binary bytes
+  if (
+    !sniffedMime &&
+    normalizedMime &&
+    HOST_READ_TEXT_PLAIN_ALIASES.has(normalizedMime) &&
+    params.buffer &&
+    isValidatedHostReadText(params.buffer)
+  ) {
+    return;
+  }
   if (
     params.kind === "document" &&
     normalizedMime &&
@@ -392,6 +508,7 @@ async function loadWebMediaInternal(
       contentType: mime,
       filePath: mediaUrl,
       kind,
+      buffer: data,
     });
   }
   let fileName = path.basename(mediaUrl) || undefined;