218 lines
8.2 KiB
TypeScript
218 lines
8.2 KiB
TypeScript
export * as WebFetchTool from "./webfetch"
|
|
|
|
import { ToolFailure, toolText } from "@opencode-ai/llm"
|
|
import { Duration, Effect, Layer, Schema, Stream } from "effect"
|
|
import { HttpClient, HttpClientRequest, HttpClientResponse } from "effect/unstable/http"
|
|
import { Parser } from "htmlparser2"
|
|
import TurndownService from "turndown"
|
|
import { PermissionV2 } from "../permission"
|
|
import { Tool } from "./tool"
|
|
import { Tools } from "./tools"
|
|
|
|
export const name = "webfetch"
|
|
export const MAX_RESPONSE_BYTES = 5 * 1024 * 1024
|
|
export const DEFAULT_TIMEOUT_SECONDS = 30
|
|
export const MAX_TIMEOUT_SECONDS = 120
|
|
|
|
export const description = `Fetch content from an HTTP or HTTPS URL and return it as text, markdown, or HTML. Markdown is the default.
|
|
|
|
Use a more targeted tool when one is available. This tool is read-only. Large text results may be replaced with a preview while the complete output is retained in managed storage.`
|
|
|
|
const Timeout = Schema.Number.check(Schema.isGreaterThan(0), Schema.isLessThanOrEqualTo(MAX_TIMEOUT_SECONDS))
|
|
|
|
export const Input = Schema.Struct({
|
|
url: Schema.String.annotate({ description: "The HTTP or HTTPS URL to fetch content from" }),
|
|
format: Schema.Literals(["text", "markdown", "html"])
|
|
.annotate({ description: "The format to return the content in. Defaults to markdown." })
|
|
.pipe(Schema.withDecodingDefault(Effect.succeed("markdown" as const))),
|
|
timeout: Timeout.pipe(Schema.optional).annotate({
|
|
description: `Optional timeout in seconds (maximum: ${MAX_TIMEOUT_SECONDS})`,
|
|
}),
|
|
})
|
|
|
|
const Output = Schema.Struct({
|
|
url: Schema.String,
|
|
contentType: Schema.String,
|
|
format: Input.fields.format,
|
|
output: Schema.String,
|
|
})
|
|
|
|
type Format = (typeof Input.Type)["format"]
|
|
|
|
const acceptHeader = (format: Format) => {
|
|
switch (format) {
|
|
case "markdown":
|
|
return "text/markdown;q=1.0, text/x-markdown;q=0.9, text/plain;q=0.8, text/html;q=0.7, */*;q=0.1"
|
|
case "text":
|
|
return "text/plain;q=1.0, text/markdown;q=0.9, text/html;q=0.8, */*;q=0.1"
|
|
case "html":
|
|
return "text/html;q=1.0, application/xhtml+xml;q=0.9, text/plain;q=0.8, text/markdown;q=0.7, */*;q=0.1"
|
|
}
|
|
return "*/*"
|
|
}
|
|
|
|
const headers = (format: Format, userAgent: string) => ({
|
|
"User-Agent": userAgent,
|
|
Accept: acceptHeader(format),
|
|
"Accept-Language": "en-US,en;q=0.9",
|
|
})
|
|
|
|
const browserUserAgent =
|
|
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/143.0.0.0 Safari/537.36"
|
|
|
|
const isCloudflareChallenge = (error: unknown) => {
|
|
if (!error || typeof error !== "object" || !("reason" in error)) return false
|
|
const reason = error.reason
|
|
if (
|
|
!reason ||
|
|
typeof reason !== "object" ||
|
|
!("_tag" in reason) ||
|
|
reason._tag !== "StatusCodeError" ||
|
|
!("response" in reason)
|
|
)
|
|
return false
|
|
const response = reason.response as HttpClientResponse.HttpClientResponse
|
|
return response.status === 403 && response.headers["cf-mitigated"] === "challenge"
|
|
}
|
|
|
|
const request = (url: string, format: Format, userAgent = browserUserAgent) =>
|
|
HttpClientRequest.get(url).pipe(HttpClientRequest.setHeaders(headers(format, userAgent)))
|
|
|
|
const assertHttpUrl = (url: URL) => {
|
|
if (url.protocol !== "http:" && url.protocol !== "https:") throw new Error("URL must use http:// or https://")
|
|
}
|
|
|
|
const execute = (http: HttpClient.HttpClient, url: string, format: Format, userAgent = browserUserAgent) =>
|
|
http.execute(request(url, format, userAgent)).pipe(Effect.flatMap(HttpClientResponse.filterStatusOk))
|
|
|
|
const collectBody = (response: HttpClientResponse.HttpClientResponse) =>
|
|
Effect.gen(function* () {
|
|
const contentLength = response.headers["content-length"]
|
|
if (contentLength && Number.parseInt(contentLength, 10) > MAX_RESPONSE_BYTES) {
|
|
return yield* Effect.fail(new Error(`Response too large (exceeds ${MAX_RESPONSE_BYTES} byte limit)`))
|
|
}
|
|
const chunks: Uint8Array[] = []
|
|
let size = 0
|
|
yield* Stream.runForEach(response.stream, (chunk) =>
|
|
Effect.gen(function* () {
|
|
size += chunk.byteLength
|
|
if (size > MAX_RESPONSE_BYTES)
|
|
return yield* Effect.fail(new Error(`Response too large (exceeds ${MAX_RESPONSE_BYTES} byte limit)`))
|
|
chunks.push(chunk)
|
|
return undefined
|
|
}),
|
|
)
|
|
return Buffer.concat(chunks, size)
|
|
})
|
|
|
|
const mimeFrom = (contentType: string) => contentType.split(";", 1)[0]?.trim().toLowerCase() ?? ""
|
|
const isImageAttachment = (mime: string) =>
|
|
mime.startsWith("image/") && mime !== "image/svg+xml" && mime !== "image/vnd.fastbidsheet"
|
|
const isTextualMime = (mime: string) =>
|
|
!mime ||
|
|
mime.startsWith("text/") ||
|
|
mime === "application/json" ||
|
|
mime.endsWith("+json") ||
|
|
mime === "application/xml" ||
|
|
mime.endsWith("+xml") ||
|
|
mime === "application/javascript" ||
|
|
mime === "application/x-javascript"
|
|
const convert = (content: string, contentType: string, format: Format) => {
|
|
if (!contentType.includes("text/html")) return content
|
|
if (format === "markdown") return convertHTMLToMarkdown(content)
|
|
if (format === "text") return extractTextFromHTML(content)
|
|
return content
|
|
}
|
|
|
|
export const layer = Layer.effectDiscard(
|
|
Effect.gen(function* () {
|
|
const tools = yield* Tools.Service
|
|
const http = yield* HttpClient.HttpClient
|
|
const permission = yield* PermissionV2.Service
|
|
|
|
yield* tools
|
|
.register({
|
|
[name]: Tool.make({
|
|
description,
|
|
input: Input,
|
|
output: Output,
|
|
toModelOutput: ({ output }) => [toolText({ type: "text", text: output.output })],
|
|
execute: (input, context) =>
|
|
Effect.gen(function* () {
|
|
yield* Effect.try({
|
|
try: () => assertHttpUrl(new URL(input.url)),
|
|
catch: (error) => error,
|
|
})
|
|
|
|
yield* permission.assert({
|
|
action: name,
|
|
resources: [input.url],
|
|
save: ["*"],
|
|
metadata: input,
|
|
sessionID: context.sessionID,
|
|
agent: context.agent,
|
|
source: { type: "tool", messageID: context.assistantMessageID, callID: context.toolCallID },
|
|
})
|
|
|
|
const { body, contentType } = yield* Effect.gen(function* () {
|
|
const response = yield* execute(http, input.url, input.format).pipe(
|
|
Effect.catchIf(isCloudflareChallenge, () => execute(http, input.url, input.format, "opencode")),
|
|
)
|
|
const contentType = response.headers["content-type"] || ""
|
|
const mime = mimeFrom(contentType)
|
|
if (isImageAttachment(mime))
|
|
return yield* Effect.fail(new Error(`Unsupported fetched image content type: ${mime}`))
|
|
if (!isTextualMime(mime))
|
|
return yield* Effect.fail(new Error(`Unsupported fetched file content type: ${mime}`))
|
|
return { body: yield* collectBody(response), contentType }
|
|
}).pipe(
|
|
Effect.timeoutOrElse({
|
|
duration: Duration.seconds(input.timeout ?? DEFAULT_TIMEOUT_SECONDS),
|
|
orElse: () => Effect.fail(new Error("Request timed out")),
|
|
}),
|
|
)
|
|
const content = convert(new TextDecoder().decode(body), contentType, input.format)
|
|
return {
|
|
url: input.url,
|
|
contentType,
|
|
format: input.format,
|
|
output: content,
|
|
}
|
|
}).pipe(Effect.mapError(() => new ToolFailure({ message: `Unable to fetch ${input.url}` }))),
|
|
}),
|
|
})
|
|
.pipe(Effect.orDie)
|
|
}),
|
|
)
|
|
|
|
export function extractTextFromHTML(html: string) {
|
|
let text = ""
|
|
let skipDepth = 0
|
|
const parser = new Parser({
|
|
onopentag(name) {
|
|
if (skipDepth > 0 || ["script", "style", "noscript", "iframe", "object", "embed"].includes(name)) skipDepth++
|
|
},
|
|
ontext(input) {
|
|
if (skipDepth === 0) text += input
|
|
},
|
|
onclosetag() {
|
|
if (skipDepth > 0) skipDepth--
|
|
},
|
|
})
|
|
parser.write(html)
|
|
parser.end()
|
|
return text.trim()
|
|
}
|
|
|
|
export function convertHTMLToMarkdown(html: string) {
|
|
const turndown = new TurndownService({
|
|
headingStyle: "atx",
|
|
hr: "---",
|
|
bulletListMarker: "-",
|
|
codeBlockStyle: "fenced",
|
|
emDelimiter: "*",
|
|
})
|
|
turndown.remove(["script", "style", "meta", "link"])
|
|
return turndown.turndown(html)
|
|
}
|