340 lines
9.8 KiB
TypeScript
340 lines
9.8 KiB
TypeScript
/**
 * fetch_url — curl-first page fetcher, Tavily Extract fallback.
 *
 * Default path: spawn `curl` via pi.exec (fast, free, no API).
 * Fallback: POST https://api.tavily.com/extract (handles JS walls,
 *           Cloudflare challenges, SPA shells, etc.) when the curl
 *           result looks bad.
 *
 * Pairs with web-search.ts. Requires TAVILY_API_KEY only for the fallback.
 */
|
|
|
|
import type { ExtensionAPI } from "@earendil-works/pi-coding-agent";
|
|
import { StringEnum } from "@earendil-works/pi-ai";
|
|
import { Type } from "typebox";
|
|
|
|
/** User-Agent string handed to curl via `-A` so requests present as a desktop Chrome browser. */
const DEFAULT_UA =
  "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0 Safari/537.36";
|
|
|
|
/**
 * Substrings that mark a page as a bot wall / JS-only shell.
 *
 * `looksBad` searches for these in the lower-cased start of the page's visible
 * text, so every entry must itself be lower-case.
 */
const BAD_SENTINELS = [
  "just a moment",
  "checking your browser",
  "enable javascript",
  "please enable js",
  "attention required | cloudflare",
  "access denied",
  "captcha",
  "are you a robot",
];
|
|
|
|
/** Which fetch path produced the returned text. */
type FetchSource = "curl" | "tavily-extract";
|
|
|
|
/** Result of a successful fetch; also returned verbatim as the tool's `details`. */
type FetchOutcome = {
  /** Path that produced `text`. */
  source: FetchSource;
  /** HTTP status reported by curl; not set for Tavily results. */
  status?: number;
  /** Content-Type reported by curl; not set for Tavily results. */
  contentType?: string;
  /** URL exactly as requested by the caller. */
  url: string;
  /** curl's effective URL after redirects; not set for Tavily results. */
  finalUrl?: string;
  /** Page text, already clipped to the caller's character limit. */
  text: string;
  /** True when `text` was clipped. */
  truncated: boolean;
  /** Why the curl result was rejected; only set on Tavily results. */
  reasonForFallback?: string;
};
|
|
|
|
/**
 * Reduce an HTML document to its approximate visible text.
 *
 * Cheap regex-based extraction. Good enough for the "is this page junk?"
 * heuristic and for returning readable page text; it does not try to replace
 * a real readability pipeline.
 *
 * @param html Raw HTML markup.
 * @returns Text with script/style/noscript blocks, comments and tags removed,
 *   a handful of common entities decoded, and whitespace collapsed.
 */
function stripHtml(html: string): string {
  return (
    html
      .replace(/<script[\s\S]*?<\/script>/gi, " ")
      .replace(/<style[\s\S]*?<\/style>/gi, " ")
      .replace(/<noscript[\s\S]*?<\/noscript>/gi, " ")
      .replace(/<!--[\s\S]*?-->/g, " ")
      .replace(/<[^>]+>/g, " ")
      // Non-breaking spaces must be matched as the literal entity text; a
      // pattern containing an actual space character is a no-op.
      .replace(/&(?:nbsp|#160|#xa0);/gi, " ")
      .replace(/&lt;/gi, "<")
      .replace(/&gt;/gi, ">")
      .replace(/&quot;/gi, '"')
      .replace(/&(?:#39|apos);/gi, "'")
      // Decode "&amp;" last so "&amp;lt;" yields the text "&lt;", not "<".
      .replace(/&amp;/gi, "&")
      .replace(/\s+/g, " ")
      .trim()
  );
}
|
|
|
|
/**
 * Decide whether a curl response is unusable and should trigger the fallback.
 *
 * @param status HTTP status code, or undefined when none could be parsed.
 * @param contentType Raw Content-Type header value (may include parameters).
 * @param body Response body as received.
 * @returns A short human-readable reason when the response looks bad, or
 *   `null` when it looks like real page content.
 */
function looksBad(
  status: number | undefined,
  contentType: string | undefined,
  body: string,
): string | null {
  // curl reports 000 when no HTTP response was received; treat that like a
  // missing status instead of letting it pass as a "successful" code.
  if (status === undefined || status < 100) return "no HTTP status";
  if (status >= 400) return `HTTP ${status}`;

  // Compare on the bare media type: drop parameters such as "; charset=utf-8".
  const mediaType = ((contentType ?? "").split(";")[0] ?? "").trim().toLowerCase();

  // A missing type and text/plain are run through the HTML stripper too, since
  // servers frequently mislabel HTML.
  const isHtml = mediaType === "" || mediaType.includes("html") || mediaType === "text/plain";
  const isOtherText =
    mediaType.startsWith("text/") ||
    mediaType.startsWith("application/json") ||
    mediaType.startsWith("application/xml") ||
    mediaType.endsWith("+json") ||
    mediaType.endsWith("+xml");
  if (!isHtml && !isOtherText) {
    return `non-text content-type: ${contentType}`;
  }

  const visible = isHtml ? stripHtml(body) : body;
  if (visible.length < 500) return `only ${visible.length} chars of visible text`;

  // Challenge pages announce themselves early; only scan the start of the text.
  const lower = visible.slice(0, 4000).toLowerCase();
  for (const needle of BAD_SENTINELS) {
    if (lower.includes(needle)) return `sentinel match: "${needle}"`;
  }

  return null;
}
|
|
|
|
/**
 * Clip text to at most `maxChars` UTF-16 code units.
 *
 * @param text Text to clip.
 * @param maxChars Maximum length of the returned text.
 * @returns The (possibly shortened) text and whether anything was cut. The cut
 *   never lands inside a surrogate pair, so the result may be one unit shorter
 *   than `maxChars`.
 */
function truncate(text: string, maxChars: number): { text: string; truncated: boolean } {
  if (text.length <= maxChars) return { text, truncated: false };

  let end = Math.max(0, Math.floor(maxChars));
  if (end > 0) {
    // A high surrogate as the last kept unit means its partner was cut off;
    // drop it too rather than emit a lone surrogate.
    const last = text.charCodeAt(end - 1);
    if (last >= 0xd800 && last <= 0xdbff) end -= 1;
  }
  return { text: text.slice(0, end), truncated: true };
}
|
|
|
|
/**
 * Fetch a URL by spawning `curl` through `pi.exec`.
 *
 * @param pi Extension API used to run the curl process.
 * @param url Absolute URL to fetch.
 * @param signal Optional abort signal forwarded to `pi.exec`.
 * @param timeoutMs Transfer timeout in milliseconds (rounded up to whole
 *   seconds for curl; the process itself gets two extra seconds).
 * @returns The response body plus status, Content-Type and post-redirect URL
 *   when curl reported them. `error` is set when curl produced no usable HTTP
 *   response or its output could not be parsed.
 */
async function fetchWithCurl(
  pi: ExtensionAPI,
  url: string,
  signal: AbortSignal | undefined,
  timeoutMs: number,
): Promise<{
  status?: number;
  contentType?: string;
  body: string;
  finalUrl?: string;
  error?: string;
}> {
  // `-w` writes a trailer we parse off the end so we get status + content-type
  // + effective URL without a second request. The three fields are separated
  // by newlines, not spaces: a Content-Type such as "text/html; charset=utf-8"
  // contains a space and would otherwise shift into the URL slot.
  const TRAILER = "\n---PI_CURL_META---\n";
  const result = await pi.exec(
    "curl",
    [
      "-sSL",
      "--compressed",
      "--max-time",
      String(Math.ceil(timeoutMs / 1000)),
      "-A",
      DEFAULT_UA,
      "-H",
      "Accept: text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
      "-H",
      "Accept-Language: en-US,en;q=0.9",
      "-w",
      `${TRAILER}%{http_code}\n%{content_type}\n%{url_effective}`,
      url,
    ],
    { timeout: timeoutMs + 2_000, signal },
  );

  const curlFailure = (): string => `curl exit ${result.code}: ${result.stderr.trim()}`;

  if (result.code !== 0 && !result.stdout) {
    return { body: "", error: curlFailure() };
  }

  const idx = result.stdout.lastIndexOf(TRAILER);
  if (idx === -1) {
    return { body: result.stdout, error: "missing curl meta trailer" };
  }

  const body = result.stdout.slice(0, idx);
  const [codeLine = "", typeLine = "", urlLine = ""] = result.stdout
    .slice(idx + TRAILER.length)
    .split("\n");

  // curl prints 000 when it never received an HTTP response (DNS failure,
  // refused connection, timeout before headers); that is not a real status.
  const code = Number.parseInt(codeLine.trim(), 10);
  const status = code >= 100 && code <= 599 ? code : undefined;

  // The trailer is written even when the transfer fails, so a failed run with
  // no HTTP status must be reported as an error rather than as an empty page.
  // A non-zero exit that still carries a status (e.g. a timeout mid-body) is
  // passed through so callers can use the partial response.
  if (result.code !== 0 && status === undefined) {
    return { body, error: curlFailure() };
  }

  return {
    status,
    contentType: typeLine.trim() || undefined,
    body,
    finalUrl: urlLine.trim() || undefined,
  };
}
|
|
|
|
/**
 * Fetch a page's content through the Tavily Extract API.
 *
 * Every failure (missing key, network error, non-2xx response, unparseable or
 * empty payload) is reported through the returned `error` field rather than by
 * throwing.
 *
 * @param url Absolute URL to extract.
 * @param signal Optional abort signal forwarded to `fetch`.
 * @param format Output format requested from Tavily.
 * @returns The extracted content in `text`, or an empty `text` plus `error`.
 */
async function fetchWithTavilyExtract(
  url: string,
  signal: AbortSignal | undefined,
  format: "markdown" | "text",
): Promise<{ text: string; error?: string }> {
  const apiKey = process.env.TAVILY_API_KEY;
  if (!apiKey) {
    return {
      text: "",
      error:
        "TAVILY_API_KEY is not set; cannot fall back to Tavily Extract. " +
        "Get a key at https://tavily.com/ and export it before launching pi.",
    };
  }

  let resp: Response;
  try {
    resp = await fetch("https://api.tavily.com/extract", {
      method: "POST",
      headers: {
        "Content-Type": "application/json",
        Authorization: `Bearer ${apiKey}`,
      },
      body: JSON.stringify({
        urls: [url],
        extract_depth: "advanced",
        format, // "markdown" or "text"
      }),
      signal,
    });
  } catch (err) {
    const msg = err instanceof Error ? err.message : String(err);
    return { text: "", error: `Tavily Extract network error: ${msg}` };
  }

  if (!resp.ok) {
    const body = await resp.text().catch(() => "");
    return { text: "", error: `Tavily Extract ${resp.status}: ${body.slice(0, 500)}` };
  }

  // A 2xx response is not guaranteed to carry valid JSON (proxy error pages,
  // truncated bodies); keep that failure inside the error-return contract.
  let data: {
    results?: Array<{ url: string; raw_content?: string }>;
    failed_results?: Array<{ url: string; error?: string }>;
  } | null;
  try {
    data = (await resp.json()) as typeof data;
  } catch (err) {
    const msg = err instanceof Error ? err.message : String(err);
    return { text: "", error: `Tavily Extract returned invalid JSON: ${msg}` };
  }

  // `data` itself may be null if the payload was the JSON literal `null`.
  const content = data?.results?.[0]?.raw_content;
  if (!content) {
    const failure = data?.failed_results?.[0]?.error ?? "no content returned";
    return { text: "", error: `Tavily Extract returned no content: ${failure}` };
  }
  return { text: content };
}
|
|
|
|
/**
 * Extension entry point: registers the `fetch_url` tool.
 *
 * The tool tries curl first and, unless curl is forced, falls back to Tavily
 * Extract when curl fails or its result looks like a bot wall / JS shell.
 */
export default function fetchUrlExtension(pi: ExtensionAPI) {
  pi.registerTool({
    name: "fetch_url",
    label: "Fetch URL",
    description:
      "Fetch a web page as text. Tries curl first; if the page looks bot-walled, " +
      "JS-only, or errors out, falls back to Tavily Extract (requires TAVILY_API_KEY).",
    promptSnippet:
      "Fetch a URL as text (curl first, Tavily Extract fallback for JS/bot walls)",
    promptGuidelines: [
      "Use fetch_url after web_search when the snippet/answer is not enough and you need the page body.",
      "Prefer fetch_url over `curl` in bash for web pages: it handles JS walls and anti-bot pages transparently.",
    ],
    parameters: Type.Object({
      url: Type.String({ description: "Absolute URL to fetch (http/https)" }),
      max_chars: Type.Optional(
        Type.Integer({
          description: "Max characters of extracted text to return (default 20000)",
          minimum: 500,
          maximum: 200_000,
        }),
      ),
      format: Type.Optional(
        StringEnum(["markdown", "text"] as const, {
          description:
            "Output format for the Tavily Extract fallback. Ignored by curl path. Default 'markdown'.",
        }),
      ),
      force: Type.Optional(
        StringEnum(["auto", "curl", "tavily"] as const, {
          description:
            "'auto' (default) = curl then Tavily fallback. 'curl' = curl only. 'tavily' = skip curl.",
        }),
      ),
      timeout_ms: Type.Optional(
        Type.Integer({
          description: "curl timeout in ms (default 15000)",
          minimum: 1000,
          maximum: 60_000,
        }),
      ),
    }),
    async execute(_toolCallId, params, signal) {
      const target = params.url;
      const charLimit = params.max_chars ?? 20_000;
      const tavilyFormat = params.format ?? "markdown";
      const mode = params.force ?? "auto";
      const curlTimeoutMs = params.timeout_ms ?? 15_000;

      // Only plain web URLs are accepted.
      const isWebUrl = /^https?:\/\//i.test(target);
      if (!isWebUrl) {
        return {
          content: [{ type: "text", text: `Refusing to fetch non-http(s) URL: ${target}` }],
          isError: true,
          details: { url: target },
        };
      }

      const curlAllowed = mode !== "tavily";
      const tavilyAllowed = mode !== "curl";

      let fetched: FetchOutcome | null = null;
      let curlFailure: string | undefined;
      let whyFallback: string | undefined;

      // Stage 1: curl.
      if (curlAllowed) {
        const page = await fetchWithCurl(pi, target, signal, curlTimeoutMs);
        if (page.error) {
          curlFailure = page.error;
          whyFallback = page.error;
        } else {
          const verdict = looksBad(page.status, page.contentType, page.body);
          if (verdict && tavilyAllowed) {
            // Suspicious page and a fallback is available: hand over to Tavily.
            whyFallback = verdict;
          } else {
            // Either the page looks fine, or curl is forced and we return
            // whatever it produced.
            const htmlResponse = (page.contentType ?? "").toLowerCase().includes("html");
            const plain = htmlResponse ? stripHtml(page.body) : page.body;
            const clipped = truncate(plain, charLimit);
            fetched = {
              source: "curl",
              status: page.status,
              contentType: page.contentType,
              url: target,
              finalUrl: page.finalUrl,
              text: clipped.text,
              truncated: clipped.truncated,
            };
          }
        }
      }

      // Stage 2: Tavily Extract, when curl produced nothing usable.
      if (fetched === null && tavilyAllowed) {
        const extracted = await fetchWithTavilyExtract(target, signal, tavilyFormat);
        if (extracted.error) {
          const lines: string[] = [];
          if (curlFailure) lines.push(`curl: ${curlFailure}`);
          if (whyFallback) lines.push(`fallback trigger: ${whyFallback}`);
          lines.push(`tavily: ${extracted.error}`);
          return {
            content: [{ type: "text", text: `Failed to fetch ${target}\n${lines.join("\n")}` }],
            isError: true,
            details: {
              url: target,
              curlError: curlFailure,
              fallbackReason: whyFallback,
              tavilyError: extracted.error,
            },
          };
        }
        const clipped = truncate(extracted.text, charLimit);
        fetched = {
          source: "tavily-extract",
          url: target,
          text: clipped.text,
          truncated: clipped.truncated,
          reasonForFallback: whyFallback,
        };
      }

      // Neither stage produced content (curl-only mode with a curl error).
      if (fetched === null) {
        return {
          content: [
            {
              type: "text",
              text: `Failed to fetch ${target}: ${curlFailure ?? "unknown error"}`,
            },
          ],
          isError: true,
          details: { url: target, curlError: curlFailure, fallbackReason: whyFallback },
        };
      }

      // Metadata preamble placed above the page text.
      const fallbackNote = fetched.reasonForFallback
        ? ` (fallback: ${fetched.reasonForFallback})`
        : "";
      const headerLines: string[] = [
        `URL: ${fetched.finalUrl ?? fetched.url}`,
        `Source: ${fetched.source}${fallbackNote}`,
      ];
      if (fetched.status !== undefined) headerLines.push(`HTTP: ${fetched.status}`);
      if (fetched.contentType) headerLines.push(`Content-Type: ${fetched.contentType}`);
      if (fetched.truncated) headerLines.push(`(truncated to ${charLimit} chars)`);

      return {
        content: [{ type: "text", text: `${headerLines.join("\n")}\n\n${fetched.text}` }],
        details: fetched,
      };
    },
  });
}
|