diff --git a/packages/opencode/src/tool/webfetch.ts b/packages/opencode/src/tool/webfetch.ts
index 634c68f4eea..bbe97ed3afc 100644
--- a/packages/opencode/src/tool/webfetch.ts
+++ b/packages/opencode/src/tool/webfetch.ts
@@ -2,6 +2,8 @@ import z from "zod"
 import { Tool } from "./tool"
 import TurndownService from "turndown"
 import DESCRIPTION from "./webfetch.txt"
+import stripAnsi from "strip-ansi"
+import { Identifier } from "../id/id"
 
 const MAX_RESPONSE_SIZE = 5 * 1024 * 1024 // 5MB
 const DEFAULT_TIMEOUT = 30 * 1000 // 30 seconds
@@ -83,16 +85,50 @@ export const WebFetchTool = Tool.define("webfetch", {
       throw new Error("Response too large (exceeds 5MB limit)")
     }
 
-    const content = new TextDecoder().decode(arrayBuffer)
     const contentType = response.headers.get("content-type") || ""
+    // Media type without parameters (e.g. "; charset=utf-8"), lowercased for comparisons
+    const type = contentType.split(";")[0].trim().toLowerCase()
+    const bytes = new Uint8Array(arrayBuffer)
+    const title = `${params.url} (${contentType || "unknown"})`
 
-    const title = `${params.url} (${contentType})`
+    // Images (except SVG) and PDFs are returned as file attachments instead of text
+    const attachment = createAttachment({
+      sessionID: ctx.sessionID,
+      messageID: ctx.messageID,
+      url: params.url,
+      type,
+      arrayBuffer,
+    })
+    if (attachment) {
+      return {
+        output: attachment.mime === "application/pdf" ? "PDF fetched successfully" : "Image fetched successfully",
+        title,
+        metadata: {},
+        attachments: [attachment],
+      }
+    }
+
+    // Never decode a binary payload into the output; return a short summary instead
+    if (isBinaryContentType(type) || (!isTextContentType(type) && !looksLikeText(bytes))) {
+      return {
+        output: formatBinarySummary({
+          url: params.url,
+          contentType,
+          byteLength: bytes.byteLength,
+        }),
+        title,
+        metadata: {},
+      }
+    }
+
+    // Strip ANSI escapes and control characters before the text is used any further
+    const content = sanitizeText(new TextDecoder().decode(arrayBuffer))
 
     // Handle content based on requested format and actual content type
     switch (params.format) {
       case "markdown":
-        if (contentType.includes("text/html")) {
-          const markdown = convertHTMLToMarkdown(content)
+        if (type === "text/html") {
+          const markdown = sanitizeText(convertHTMLToMarkdown(content))
           return {
             output: markdown,
             title,
@@ -107,7 +143,7 @@ export const WebFetchTool = Tool.define("webfetch", {
 
       case "text":
-        if (contentType.includes("text/html")) {
-          const text = await extractTextFromHTML(content)
+        if (type === "text/html") {
+          const text = sanitizeText(await extractTextFromHTML(content))
           return {
             output: text,
             title,
@@ -180,3 +216,152 @@ function convertHTMLToMarkdown(html: string): string {
   turndownService.remove(["script", "style", "meta", "link"])
   return turndownService.turndown(html)
 }
+
+/** Reports whether a normalized media type is one of the known binary formats. */
+function isBinaryContentType(type?: string) {
+  if (!type) return false
+  if (type.startsWith("audio/")) return true
+  if (type.startsWith("video/")) return true
+  if (type.startsWith("font/")) return true
+
+  return [
+    "application/pdf",
+    "application/zip",
+    "application/gzip",
+    "application/x-gzip",
+    "application/x-7z-compressed",
+    "application/x-rar-compressed",
+  ].includes(type)
+}
+
+/** Reports whether a normalized media type is one of the known textual formats. */
+function isTextContentType(type?: string) {
+  if (!type) return false
+  if (type.startsWith("text/")) return true
+
+  return [
+    "application/json",
+    "application/xml",
+    "application/xhtml+xml",
+    "application/javascript",
+    "application/x-javascript",
+    "application/yaml",
+    "application/x-yaml",
+    "application/toml",
+    "application/ld+json",
+    "application/problem+json",
+  ].includes(type)
+}
+
+/**
+ * Heuristic used when the media type is inconclusive: inspects at most the first
+ * 1024 bytes and rejects the payload if it contains a NUL byte or if more than 10%
+ * of the sampled bytes are control characters (DEL included; tab, LF and CR excluded).
+ */
+function looksLikeText(bytes: Uint8Array) {
+  const sample = bytes.subarray(0, Math.min(bytes.length, 1024))
+  const stats = { controls: 0, zeros: 0 }
+
+  for (const byte of sample) {
+    if (byte === 0) stats.zeros++
+    if (byte < 32 && byte !== 9 && byte !== 10 && byte !== 13) stats.controls++
+    if (byte === 127) stats.controls++
+  }
+
+  if (stats.zeros > 0) return false
+  // Math.max guards the division for an empty body, which is treated as text
+  return stats.controls / Math.max(sample.length, 1) <= 0.1
+}
+
+/**
+ * Removes ANSI escape sequences and non-printable control characters from text.
+ * Line endings are normalized to "\n" (CRLF and lone CR both become LF) so that
+ * CR-separated lines are not glued together. Tab and LF are preserved.
+ */
+function sanitizeText(text: string) {
+  return stripAnsi(text)
+    .replace(/\r\n?/g, "\n")
+    .replace(/[\x00-\x08\x0B\x0C\x0E-\x1F\x7F]/g, "") // remaining C0 controls and DEL
+    .replace(/[\u0080-\u009F]/g, "") // C1 controls
+}
+
+/** Builds the plain-text summary returned in place of a binary response body. */
+function formatBinarySummary(input: { url: string; contentType: string; byteLength: number }) {
+  const type = input.contentType || "unknown"
+  const size = input.byteLength.toLocaleString()
+  return [
+    "Binary response omitted to protect the TUI and keep context small.",
+    "",
+    `- url: ${input.url}`,
+    `- content-type: ${type}`,
+    `- bytes: ${size}`,
+  ].join("\n")
+}
+
+/**
+ * Wraps an image (other than SVG) or PDF response in a file attachment whose URL
+ * is a base64 data URL. Returns undefined for every other media type.
+ */
+function createAttachment(input: {
+  sessionID: string
+  messageID: string
+  url: string
+  type: string
+  arrayBuffer: ArrayBuffer
+}) {
+  if (input.type === "image/svg+xml") return
+  if (!input.type.startsWith("image/") && input.type !== "application/pdf") return
+
+  const b64 = Buffer.from(input.arrayBuffer).toString("base64")
+  return {
+    id: Identifier.ascending("part"),
+    sessionID: input.sessionID,
+    messageID: input.messageID,
+    type: "file" as const,
+    mime: input.type,
+    filename: filenameFromUrl(input.url, input.type),
+    url: `data:${input.type};base64,${b64}`,
+  }
+}
+
+/** Derives an attachment filename from the URL, appending an extension for the media type when the name has no dot. */
+function filenameFromUrl(raw: string, mime: string) {
+  const base = safeFilenameFromUrl(raw)
+  if (base.includes(".")) return base
+  return `${base}.${extensionFromMime(mime)}`
+}
+
+/**
+ * Returns the last path segment of the URL with every character outside
+ * [a-zA-Z0-9._-] replaced by "_" and truncated to 128 characters, or "webfetch"
+ * when the URL cannot be parsed or has no path segment.
+ */
+function safeFilenameFromUrl(raw: string) {
+  const fallback = "webfetch"
+  if (!URL.canParse(raw)) return fallback
+
+  const url = new URL(raw)
+  const parts = url.pathname.split("/").filter(Boolean)
+  const last = parts.at(-1)
+  if (!last) return fallback
+  const cleaned = last.replace(/[^a-zA-Z0-9._-]/g, "_").slice(0, 128)
+  return cleaned || fallback
+}
+
+/** Maps a media type to a file extension; unlisted types map to "bin". */
+function extensionFromMime(mime: string) {
+  switch (mime) {
+    case "image/jpeg":
+      return "jpg"
+    case "image/png":
+      return "png"
+    case "image/webp":
+      return "webp"
+    case "image/gif":
+      return "gif"
+    case "application/pdf":
+      return "pdf"
+    default:
+      return "bin"
+  }
+}
diff --git a/packages/opencode/test/tool/webfetch.test.ts b/packages/opencode/test/tool/webfetch.test.ts
new file mode 100644
index 00000000000..ed92804e01b
--- /dev/null
+++ b/packages/opencode/test/tool/webfetch.test.ts
@@ -0,0 +1,83 @@
+import { afterAll, beforeAll, describe, expect, test } from "bun:test"
+import { WebFetchTool } from "../../src/tool/webfetch"
+
+const ctx = {
+  sessionID: "test",
+  messageID: "",
+  callID: "",
+  agent: "build",
+  abort: AbortSignal.any([]),
+  metadata: () => {},
+  ask: async () => {},
+}
+
+describe("tool.webfetch", () => {
+  let server: ReturnType<typeof Bun.serve> | undefined
+  let base = ""
+
+  beforeAll(() => {
+    server = Bun.serve({
+      port: 0,
+      fetch(req) {
+        const url = new URL(req.url)
+
+        if (url.pathname === "/binary") {
+          const body = new Uint8Array([0x89, 0x50, 0x4e, 0x47, 0x0d, 0x0a, 0x1a, 0x0a, 0x1b, 0x5b, 0x3c])
+          return new Response(body, {
+            headers: {
+              "content-type": "image/png",
+              "content-length": String(body.byteLength),
+            },
+          })
+        }
+
+        if (url.pathname === "/text") {
+          return new Response("hello\u001b[31m world\u001b[0m\r\nok\u0000bad\n", {
+            headers: {
+              "content-type": "text/plain; charset=utf-8",
+            },
+          })
+        }
+
+        return new Response("not found", { status: 404 })
+      },
+    })
+    base = `http://127.0.0.1:${server.port}`
+  })
+
+  afterAll(() => server?.stop())
+
+  test("returns images as attachments", async () => {
+    const webfetch = await WebFetchTool.init()
+    const result = await webfetch.execute(
+      {
+        url: `${base}/binary`,
+        format: "markdown",
+      },
+      ctx,
+    )
+
+    expect(result.output).toBe("Image fetched successfully")
+    expect(result.attachments?.length).toBe(1)
+    expect(result.attachments?.[0].mime).toBe("image/png")
+    expect(result.attachments?.[0].filename).toBe("binary.png")
+    expect(result.attachments?.[0].url).toStartWith("data:image/png;base64,")
+  })
+
+  test("sanitizes control characters in text responses", async () => {
+    const webfetch = await WebFetchTool.init()
+    const result = await webfetch.execute(
+      {
+        url: `${base}/text`,
+        format: "text",
+      },
+      ctx,
+    )
+
+    expect(result.output).toContain("hello world")
+    expect(result.output).toContain("ok")
+    expect(result.output).not.toContain("\u001b")
+    expect(result.output).not.toContain("\r")
+    expect(result.output).not.toContain("\u0000")
+  })
+})