/**
 * fetch_url — curl-first page fetcher, Tavily Extract fallback.
 *
 * Default path: spawn `curl` via pi.exec (fast, free, no API).
 * Fallback:     POST https://api.tavily.com/extract (handles JS walls,
 *               Cloudflare challenges, SPA shells, etc.) when the curl
 *               result looks bad.
 *
 * Pairs with web-search.ts. Requires TAVILY_API_KEY only for the fallback.
 */
import type { ExtensionAPI } from "@earendil-works/pi-coding-agent";
import { StringEnum } from "@earendil-works/pi-ai";
import { Type } from "typebox";

/** Browser-like User-Agent sent by curl. */
const DEFAULT_UA =
  "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0 Safari/537.36";

/**
 * Lower-case phrases that mark a page as an interstitial rather than content.
 * Matched against the lower-cased start of the visible text in `looksBad`.
 */
const BAD_SENTINELS = [
  "just a moment",
  "checking your browser",
  "enable javascript",
  "please enable js",
  "attention required | cloudflare",
  "access denied",
  "captcha",
  "are you a robot",
];

/** Pages with less visible text than this are treated as junk. */
const MIN_VISIBLE_CHARS = 500;

/** Only this many leading characters of visible text are scanned for sentinels. */
const SENTINEL_SCAN_CHARS = 4000;

/** Marker curl prints (via `-w`) between the response body and the metadata trailer. */
const CURL_META_MARKER = "\n---PI_CURL_META---\n";

type FetchSource = "curl" | "tavily-extract";

type FetchOutcome = {
  source: FetchSource;
  status?: number;
  contentType?: string;
  url: string;
  finalUrl?: string;
  text: string;
  truncated: boolean;
  reasonForFallback?: string;
};

/**
 * Cheap visible-text extraction. Good enough for the "is this page junk?"
 * heuristic; it does not try to replace a real readability pipeline.
 *
 * @param html Raw markup.
 * @returns The text with non-visible elements, comments and tags removed and
 *   all whitespace runs collapsed to single spaces.
 */
function stripHtml(html: string): string {
  return (
    html
      // Drop elements whose content is never rendered as text.
      .replace(/<script\b[^>]*>[\s\S]*?<\/script\s*>/gi, " ")
      .replace(/<style\b[^>]*>[\s\S]*?<\/style\s*>/gi, " ")
      .replace(/<noscript\b[^>]*>[\s\S]*?<\/noscript\s*>/gi, " ")
      // Drop HTML comments.
      .replace(/<!--[\s\S]*?-->/g, " ")
      // Drop every remaining tag, keeping the text between tags.
      .replace(/<[^>]+>/g, " ")
      // The most common entity in boilerplate; other entities are left as-is.
      .replace(/&nbsp;/gi, " ")
      .replace(/\s+/g, " ")
      .trim()
  );
}

/**
 * Decide whether a curl response is unusable and the fallback should run.
 *
 * @param status HTTP status code, or `undefined` if none was obtained.
 * @param contentType Value of the Content-Type header, if any.
 * @param body Raw response body.
 * @returns A short human-readable reason when the response looks bad,
 *   or `null` when it looks like real content.
 */
function looksBad(
  status: number | undefined,
  contentType: string | undefined,
  body: string,
): string | null {
  if (status === undefined) return "no HTTP status";
  if (status >= 400) return `HTTP ${status}`;

  const ct = (contentType ?? "").toLowerCase();
  // A missing content type and text/plain are both run through stripHtml below.
  const isHtml = ct.includes("html") || ct === "" || ct.includes("text/plain");
  if (!isHtml && !ct.startsWith("application/json") && !ct.startsWith("application/xml")) {
    return `non-text content-type: ${contentType}`;
  }

  const visible = isHtml ? stripHtml(body) : body;
  if (visible.length < MIN_VISIBLE_CHARS) {
    return `only ${visible.length} chars of visible text`;
  }

  const lower = visible.slice(0, SENTINEL_SCAN_CHARS).toLowerCase();
  for (const needle of BAD_SENTINELS) {
    if (lower.includes(needle)) return `sentinel match: "${needle}"`;
  }
  return null;
}

/**
 * Clip `text` to at most `maxChars` UTF-16 code units.
 *
 * @param text Text to clip.
 * @param maxChars Maximum length of the returned text.
 * @returns The (possibly shortened) text and whether anything was cut off.
 */
function truncate(text: string, maxChars: number): { text: string; truncated: boolean } {
  if (text.length <= maxChars) return { text, truncated: false };

  let end = maxChars;
  if (end > 0) {
    // Never end on a lone high surrogate: that would split an astral
    // character (emoji, etc.) and leave invalid UTF-16 in the output.
    const last = text.charCodeAt(end - 1);
    if (last >= 0xd800 && last <= 0xdbff) end -= 1;
  }
  return { text: text.slice(0, end), truncated: true };
}

/**
 * Parse the newline-separated trailer written by curl's `-w` option:
 * HTTP status, content type, effective URL, in that order.
 *
 * @param meta Everything curl printed after `CURL_META_MARKER`.
 * @returns The parsed fields; a field is `undefined` when curl left it empty.
 *   A status of `000` (curl got no HTTP response) is reported as `undefined`.
 */
function parseCurlMeta(meta: string): {
  status?: number;
  contentType?: string;
  finalUrl?: string;
} {
  const [codeStr = "", contentType = "", finalUrl = ""] = meta
    .trim()
    .split("\n")
    .map((field) => field.trim());
  const code = Number.parseInt(codeStr, 10);
  return {
    status: Number.isFinite(code) && code > 0 ? code : undefined,
    contentType: contentType || undefined,
    finalUrl: finalUrl || undefined,
  };
}

/**
 * Fetch `url` by running `curl` through `pi.exec`.
 *
 * @param pi Extension API used to run the curl process.
 * @param url Absolute http(s) URL to fetch.
 * @param signal Optional abort signal forwarded to `pi.exec`.
 * @param timeoutMs Transfer timeout in milliseconds (rounded up to whole seconds for curl).
 * @returns The response body plus status, content type and post-redirect URL
 *   when available. `error` is set when curl produced nothing usable.
 */
async function fetchWithCurl(
  pi: ExtensionAPI,
  url: string,
  signal: AbortSignal | undefined,
  timeoutMs: number,
): Promise<{
  status?: number;
  contentType?: string;
  body: string;
  finalUrl?: string;
  error?: string;
}> {
  // `-w` writes a trailer we parse off the end so we get status + content-type
  // + effective URL without a second request. Fields are newline-separated
  // because a content type such as "text/html; charset=utf-8" contains spaces.
  const result = await pi.exec(
    "curl",
    [
      "-sSL",
      "--compressed",
      // The URL is caller-supplied: stop curl from expanding {} and [] in it.
      "--globoff",
      "--max-time",
      String(Math.ceil(timeoutMs / 1000)),
      "-A",
      DEFAULT_UA,
      "-H",
      "Accept: text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
      "-H",
      "Accept-Language: en-US,en;q=0.9",
      "-w",
      `${CURL_META_MARKER}%{http_code}\n%{content_type}\n%{url_effective}`,
      url,
    ],
    // Slightly longer than --max-time so curl's own timeout can fire first.
    { timeout: timeoutMs + 2_000, signal },
  );

  const stdout = result.stdout ?? "";
  const idx = stdout.lastIndexOf(CURL_META_MARKER);
  const body = idx === -1 ? stdout : stdout.slice(0, idx);

  // curl prints the trailer even when the transfer fails, so judge failure by
  // the body rather than by stdout being empty.
  if (result.code !== 0 && body.trim() === "") {
    return { body: "", error: `curl exit ${result.code}: ${result.stderr.trim()}` };
  }
  if (idx === -1) {
    return { body, error: "missing curl meta trailer" };
  }

  const meta = parseCurlMeta(stdout.slice(idx + CURL_META_MARKER.length));
  return {
    status: meta.status,
    contentType: meta.contentType,
    body,
    finalUrl: meta.finalUrl,
  };
}

/**
 * Fetch `url` through the Tavily Extract API.
 *
 * Never throws: every failure (missing key, network error, non-2xx response,
 * unparseable or empty result) is reported through `error`.
 *
 * @param url Absolute URL to extract.
 * @param signal Optional abort signal forwarded to `fetch`.
 * @param format Output format requested from Tavily.
 * @returns The extracted text, or an empty text plus `error`.
 */
async function fetchWithTavilyExtract(
  url: string,
  signal: AbortSignal | undefined,
  format: "markdown" | "text",
): Promise<{ text: string; error?: string }> {
  const apiKey = process.env.TAVILY_API_KEY;
  if (!apiKey) {
    return {
      text: "",
      error:
        "TAVILY_API_KEY is not set; cannot fall back to Tavily Extract. " +
        "Get a key at https://tavily.com/ and export it before launching pi.",
    };
  }

  let resp: Response;
  try {
    resp = await fetch("https://api.tavily.com/extract", {
      method: "POST",
      headers: {
        "Content-Type": "application/json",
        Authorization: `Bearer ${apiKey}`,
      },
      body: JSON.stringify({
        urls: [url],
        extract_depth: "advanced",
        format, // "markdown" or "text"
      }),
      signal,
    });
  } catch (err) {
    const msg = err instanceof Error ? err.message : String(err);
    return { text: "", error: `Tavily Extract network error: ${msg}` };
  }

  if (!resp.ok) {
    const body = await resp.text().catch(() => "");
    return { text: "", error: `Tavily Extract ${resp.status}: ${body.slice(0, 500)}` };
  }

  let data: {
    results?: Array<{ url: string; raw_content?: string }>;
    failed_results?: Array<{ url: string; error?: string }>;
  };
  try {
    data = (await resp.json()) as typeof data;
  } catch (err) {
    // A 2xx response with a malformed body must not escape as an exception.
    const msg = err instanceof Error ? err.message : String(err);
    return { text: "", error: `Tavily Extract returned invalid JSON: ${msg}` };
  }

  const first = data?.results?.[0];
  if (!first?.raw_content) {
    const failure = data?.failed_results?.[0]?.error ?? "no content returned";
    return { text: "", error: `Tavily Extract returned no content: ${failure}` };
  }
  return { text: first.raw_content };
}

/**
 * Register the `fetch_url` tool.
 *
 * The tool fetches a page with curl and, unless forced otherwise, falls back
 * to Tavily Extract when curl fails or its result looks like a bot wall,
 * JS shell or error page.
 *
 * @param pi Extension API the tool is registered on and curl is run through.
 */
export default function fetchUrlExtension(pi: ExtensionAPI) {
  pi.registerTool({
    name: "fetch_url",
    label: "Fetch URL",
    description:
      "Fetch a web page as text. Tries curl first; if the page looks bot-walled, " +
      "JS-only, or errors out, falls back to Tavily Extract (requires TAVILY_API_KEY).",
    promptSnippet: "Fetch a URL as text (curl first, Tavily Extract fallback for JS/bot walls)",
    promptGuidelines: [
      "Use fetch_url after web_search when the snippet/answer is not enough and you need the page body.",
      "Prefer fetch_url over `curl` in bash for web pages: it handles JS walls and anti-bot pages transparently.",
    ],
    parameters: Type.Object({
      url: Type.String({ description: "Absolute URL to fetch (http/https)" }),
      max_chars: Type.Optional(
        Type.Integer({
          description: "Max characters of extracted text to return (default 20000)",
          minimum: 500,
          maximum: 200_000,
        }),
      ),
      format: Type.Optional(
        StringEnum(["markdown", "text"] as const, {
          description:
            "Output format for the Tavily Extract fallback. Ignored by curl path. Default 'markdown'.",
        }),
      ),
      force: Type.Optional(
        StringEnum(["auto", "curl", "tavily"] as const, {
          description:
            "'auto' (default) = curl then Tavily fallback. 'curl' = curl only. 'tavily' = skip curl.",
        }),
      ),
      timeout_ms: Type.Optional(
        Type.Integer({
          description: "curl timeout in ms (default 15000)",
          minimum: 1000,
          maximum: 60_000,
        }),
      ),
    }),

    async execute(_toolCallId, params, signal) {
      const url = params.url;
      const maxChars = params.max_chars ?? 20_000;
      const format = params.format ?? "markdown";
      const force = params.force ?? "auto";
      const timeoutMs = params.timeout_ms ?? 15_000;

      // Also guarantees the URL cannot start with "-" and be read as a curl option.
      if (!/^https?:\/\//i.test(url)) {
        return {
          content: [{ type: "text", text: `Refusing to fetch non-http(s) URL: ${url}` }],
          isError: true,
          details: { url },
        };
      }

      let outcome: FetchOutcome | null = null;
      let curlError: string | undefined;
      let fallbackReason: string | undefined;

      // --- curl path ---
      if (force !== "tavily") {
        const curl = await fetchWithCurl(pi, url, signal, timeoutMs);
        if (curl.error) {
          curlError = curl.error;
          fallbackReason = curl.error;
        } else {
          const bad = looksBad(curl.status, curl.contentType, curl.body);
          // In curl-only mode a bad-looking page is still returned: there is
          // nothing to fall back to.
          if (!bad || force === "curl") {
            const isHtml = (curl.contentType ?? "").toLowerCase().includes("html");
            const text = isHtml ? stripHtml(curl.body) : curl.body;
            const { text: clipped, truncated } = truncate(text, maxChars);
            outcome = {
              source: "curl",
              status: curl.status,
              contentType: curl.contentType,
              url,
              finalUrl: curl.finalUrl,
              text: clipped,
              truncated,
            };
          } else {
            fallbackReason = bad;
          }
        }
      }

      // --- Tavily Extract fallback ---
      if (!outcome && force !== "curl") {
        const tv = await fetchWithTavilyExtract(url, signal, format);
        if (tv.error) {
          const msg = [
            curlError && `curl: ${curlError}`,
            fallbackReason && `fallback trigger: ${fallbackReason}`,
            `tavily: ${tv.error}`,
          ]
            .filter(Boolean)
            .join("\n");
          return {
            content: [{ type: "text", text: `Failed to fetch ${url}\n${msg}` }],
            isError: true,
            details: { url, curlError, fallbackReason, tavilyError: tv.error },
          };
        }
        const { text: clipped, truncated } = truncate(tv.text, maxChars);
        outcome = {
          source: "tavily-extract",
          url,
          text: clipped,
          truncated,
          reasonForFallback: fallbackReason,
        };
      }

      // Only reachable in curl-only mode when curl itself failed.
      if (!outcome) {
        return {
          content: [
            {
              type: "text",
              text: `Failed to fetch ${url}: ${curlError ?? "unknown error"}`,
            },
          ],
          isError: true,
          details: { url, curlError, fallbackReason },
        };
      }

      const header = [
        `URL: ${outcome.finalUrl ?? outcome.url}`,
        `Source: ${outcome.source}${
          outcome.reasonForFallback ? ` (fallback: ${outcome.reasonForFallback})` : ""
        }`,
        outcome.status !== undefined ? `HTTP: ${outcome.status}` : undefined,
        outcome.contentType ? `Content-Type: ${outcome.contentType}` : undefined,
        outcome.truncated ? `(truncated to ${maxChars} chars)` : undefined,
      ]
        .filter(Boolean)
        .join("\n");

      return {
        content: [{ type: "text", text: `${header}\n\n${outcome.text}` }],
        details: outcome,
      };
    },
  });
}