Add first version of dev sandbox
This commit is contained in:
parent
883b0df070
commit
7974e725d0
10 changed files with 989 additions and 0 deletions
340
.pi/agent/extensions/fetch-url.ts
Normal file
340
.pi/agent/extensions/fetch-url.ts
Normal file
|
|
@ -0,0 +1,340 @@
|
|||
/**
|
||||
* fetch_url — curl-first page fetcher, Tavily Extract fallback.
|
||||
*
|
||||
* Default path: spawn `curl` via pi.exec (fast, free, no API).
|
||||
* Fallback: POST https://api.tavily.com/extract (handles JS walls,
|
||||
* Cloudflare challenges, SPA shells, etc.) when the curl
|
||||
* result looks bad.
|
||||
*
|
||||
* Pairs with web-search.ts. Requires TAVILY_API_KEY only for the fallback.
|
||||
*/
|
||||
|
||||
import type { ExtensionAPI } from "@earendil-works/pi-coding-agent";
|
||||
import { StringEnum } from "@earendil-works/pi-ai";
|
||||
import { Type } from "typebox";
|
||||
|
||||
/** Browser-like User-Agent string (desktop Chrome on Linux) used for curl requests. */
const DEFAULT_UA = [
  "Mozilla/5.0 (X11; Linux x86_64)",
  "AppleWebKit/537.36 (KHTML, like Gecko)",
  "Chrome/124.0",
  "Safari/537.36",
].join(" ");

/**
 * Lower-case phrases that mark a fetched page as an interstitial (bot check,
 * JS wall, captcha, access-denied page) rather than real content.
 * Order matters: the first phrase found is the one reported.
 */
const BAD_SENTINELS = [
  "just a moment",
  "checking your browser",
  "enable javascript",
  "please enable js",
  "attention required | cloudflare",
  "access denied",
  "captcha",
  "are you a robot",
];
|
||||
|
||||
/** Which backend produced the returned text. */
type FetchSource = "curl" | "tavily-extract";

/** Normalized result of a successful fetch, regardless of backend. */
type FetchOutcome = {
  /** Backend that produced `text`. */
  source: FetchSource;
  /** URL as requested by the caller. */
  url: string;
  /** Effective URL reported by curl; not set for Tavily results. */
  finalUrl?: string;
  /** HTTP status reported by curl; not set for Tavily results. */
  status?: number;
  /** Content-Type reported by curl; not set for Tavily results. */
  contentType?: string;
  /** Extracted text, already clipped to the requested maximum length. */
  text: string;
  /** True when `text` was clipped. */
  truncated: boolean;
  /** Why the curl result was rejected, when the fallback backend was used. */
  reasonForFallback?: string;
};
|
||||
|
||||
/** Replacement text for the character references that `stripHtml` decodes. */
const HTML_ENTITY_TEXT: Record<string, string> = {
  nbsp: " ",
  amp: "&",
  lt: "<",
  gt: ">",
  quot: '"',
  apos: "'",
  "#39": "'",
};

/**
 * Cheap visible-text extraction from an HTML document.
 *
 * Good enough for the "is this page junk?" heuristic and for handing page text
 * to a model; it does not try to replace a real readability pipeline.
 *
 * Steps: drop `<script>`, `<style>`, `<noscript>` elements and comments, drop
 * all remaining tags, decode the most common character references
 * (`&nbsp;`, `&amp;`, `&lt;`, `&gt;`, `&quot;`, `&apos;`, `&#39;`), then
 * collapse whitespace runs to a single space and trim.
 *
 * @param html Raw HTML source.
 * @returns Whitespace-normalized visible text.
 */
function stripHtml(html: string): string {
  return (
    html
      .replace(/<script[\s\S]*?<\/script>/gi, " ")
      .replace(/<style[\s\S]*?<\/style>/gi, " ")
      .replace(/<noscript[\s\S]*?<\/noscript>/gi, " ")
      .replace(/<!--[\s\S]*?-->/g, " ")
      .replace(/<[^>]+>/g, " ")
      // Decode entities only after tags are gone so a decoded "<" can never be
      // mistaken for markup, and in a single pass so "&amp;lt;" stays "&lt;".
      .replace(
        /&(nbsp|amp|lt|gt|quot|apos|#39);/gi,
        (entity: string, name: string) => HTML_ENTITY_TEXT[name.toLowerCase()] ?? entity,
      )
      .replace(/\s+/g, " ")
      .trim()
  );
}
|
||||
|
||||
/**
 * Decides whether a curl response is unusable and a fallback should be tried.
 *
 * Checks, in order: missing status, HTTP error status (>= 400), a content type
 * that is not textual, too little visible text (< 500 chars), and known
 * bot-wall / JS-wall phrases in the first 4000 visible characters.
 *
 * Textual content types are HTML, an empty/unknown type, any `text/*` type,
 * `application/json`, `application/xml`, and structured-syntax types with a
 * `+json` or `+xml` suffix (for example feeds served as `application/rss+xml`).
 *
 * @param status HTTP status code, or undefined when none was obtained.
 * @param contentType Raw Content-Type value, possibly with parameters.
 * @param body Response body as received.
 * @returns A short human-readable reason when the response looks bad, or
 *          `null` when it is acceptable.
 */
function looksBad(
  status: number | undefined,
  contentType: string | undefined,
  body: string,
): string | null {
  if (status === undefined) return "no HTTP status";
  if (status >= 400) return `HTTP ${status}`;

  const ct = (contentType ?? "").toLowerCase();
  // These types get tag-stripped before the length/sentinel checks.
  const isHtml = ct.includes("html") || ct === "" || ct.includes("text/plain");
  const isOtherText =
    ct.startsWith("text/") ||
    ct.startsWith("application/json") ||
    ct.startsWith("application/xml") ||
    /\+(?:json|xml)(?:\s*;|\s*$)/.test(ct);
  if (!isHtml && !isOtherText) {
    return `non-text content-type: ${contentType}`;
  }

  const visible = isHtml ? stripHtml(body) : body;
  if (visible.length < 500) return `only ${visible.length} chars of visible text`;

  // Interstitial pages announce themselves early; only scan the leading text.
  const lower = visible.slice(0, 4000).toLowerCase();
  for (const needle of BAD_SENTINELS) {
    if (lower.includes(needle)) return `sentinel match: "${needle}"`;
  }

  return null;
}
|
||||
|
||||
/**
 * Clips `text` to at most `maxChars` UTF-16 code units.
 *
 * The cut never lands between the two halves of a surrogate pair: if it would,
 * the result is one unit shorter so no lone surrogate is emitted. A negative
 * `maxChars` is treated as 0.
 *
 * @param text Text to clip.
 * @param maxChars Maximum length of the returned text.
 * @returns The (possibly clipped) text and whether clipping happened.
 */
function truncate(text: string, maxChars: number): { text: string; truncated: boolean } {
  if (text.length <= maxChars) return { text, truncated: false };

  let end = Math.max(0, Math.floor(maxChars));
  if (end > 0) {
    const last = text.charCodeAt(end - 1);
    const next = text.charCodeAt(end);
    const splitsPair = last >= 0xd800 && last <= 0xdbff && next >= 0xdc00 && next <= 0xdfff;
    if (splitsPair) end -= 1;
  }
  return { text: text.slice(0, end), truncated: true };
}
|
||||
|
||||
/**
 * Fetches `url` by spawning `curl` through `pi.exec`.
 *
 * curl is asked (via `-w`) to append a metadata trailer after the body so the
 * status code, content type and effective URL come back from the same request.
 * The three fields are newline-separated because a Content-Type value such as
 * `text/html; charset=utf-8` contains spaces.
 *
 * @param pi Extension API used to run the `curl` process.
 * @param url Absolute http(s) URL to fetch.
 * @param signal Optional abort signal forwarded to `pi.exec`.
 * @param timeoutMs Transfer timeout; rounded up to whole seconds for curl, with
 *                  2 s of extra headroom on the process timeout.
 * @returns The body plus parsed metadata, or `error` when curl produced no
 *          usable HTTP response. `status`, `contentType` and `finalUrl` are
 *          omitted when curl reported them as empty.
 */
async function fetchWithCurl(
  pi: ExtensionAPI,
  url: string,
  signal: AbortSignal | undefined,
  timeoutMs: number,
): Promise<{
  status?: number;
  contentType?: string;
  body: string;
  finalUrl?: string;
  error?: string;
}> {
  const TRAILER = "\n---PI_CURL_META---\n";
  const FIELD_SEP = "\n";
  const result = await pi.exec(
    "curl",
    [
      "-sSL",
      "--compressed",
      // Treat {} and [] in the URL literally instead of expanding them into
      // several requests.
      "--globoff",
      "--max-time",
      String(Math.ceil(timeoutMs / 1000)),
      "-A",
      DEFAULT_UA,
      "-H",
      "Accept: text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
      "-H",
      "Accept-Language: en-US,en;q=0.9",
      "-w",
      `${TRAILER}%{http_code}${FIELD_SEP}%{content_type}${FIELD_SEP}%{url_effective}`,
      url,
    ],
    { timeout: timeoutMs + 2_000, signal },
  );

  const failure = `curl exit ${result.code}: ${result.stderr.trim()}`;
  if (result.code !== 0 && !result.stdout) {
    return { body: "", error: failure };
  }

  const idx = result.stdout.lastIndexOf(TRAILER);
  if (idx === -1) {
    return { body: result.stdout, error: "missing curl meta trailer" };
  }

  const body = result.stdout.slice(0, idx);
  const [codeStr = "", rawContentType = "", rawFinalUrl = ""] = result.stdout
    .slice(idx + TRAILER.length)
    .split(FIELD_SEP)
    .map((field) => field.trim());

  // curl prints 000 when it never received an HTTP response.
  const parsed = Number.parseInt(codeStr, 10);
  const status = Number.isFinite(parsed) && parsed > 0 ? parsed : undefined;

  // The trailer is written even when the transfer fails (DNS error, refused
  // connection, timeout before headers), so a non-zero exit with no status is
  // a curl failure and must be reported as one. A non-zero exit after a real
  // status (e.g. timeout mid-body) still returns the partial body.
  if (result.code !== 0 && status === undefined) {
    return { body: "", error: failure };
  }

  return {
    status,
    contentType: rawContentType || undefined,
    body,
    finalUrl: rawFinalUrl || undefined,
  };
}
|
||||
|
||||
/**
 * Fetches the content of `url` through the Tavily Extract HTTP API.
 *
 * Never throws for expected failures: a missing API key, network errors,
 * non-2xx responses, unparseable response bodies and empty results are all
 * reported through the `error` field with `text` set to "".
 *
 * @param url Page to extract.
 * @param signal Optional abort signal forwarded to `fetch`.
 * @param format Output format requested from Tavily.
 * @returns The extracted content, or an `error` describing why none is available.
 */
async function fetchWithTavilyExtract(
  url: string,
  signal: AbortSignal | undefined,
  format: "markdown" | "text",
): Promise<{ text: string; error?: string }> {
  type TavilyExtractResponse = {
    results?: Array<{ url: string; raw_content?: string }>;
    failed_results?: Array<{ url: string; error?: string }>;
  };

  const apiKey = process.env.TAVILY_API_KEY;
  if (!apiKey) {
    return {
      text: "",
      error:
        "TAVILY_API_KEY is not set; cannot fall back to Tavily Extract. " +
        "Get a key at https://tavily.com/ and export it before launching pi.",
    };
  }

  let resp: Response;
  try {
    resp = await fetch("https://api.tavily.com/extract", {
      method: "POST",
      headers: {
        "Content-Type": "application/json",
        Authorization: `Bearer ${apiKey}`,
      },
      body: JSON.stringify({
        urls: [url],
        extract_depth: "advanced",
        format, // "markdown" or "text"
      }),
      signal,
    });
  } catch (err) {
    const msg = err instanceof Error ? err.message : String(err);
    return { text: "", error: `Tavily Extract network error: ${msg}` };
  }

  if (!resp.ok) {
    const body = await resp.text().catch(() => "");
    return { text: "", error: `Tavily Extract ${resp.status}: ${body.slice(0, 500)}` };
  }

  // A 2xx response is not guaranteed to carry JSON (proxy error pages, aborted
  // body reads), so parsing failures become a structured error too.
  let data: TavilyExtractResponse | null;
  try {
    data = (await resp.json()) as TavilyExtractResponse | null;
  } catch (err) {
    const msg = err instanceof Error ? err.message : String(err);
    return { text: "", error: `Tavily Extract returned invalid JSON: ${msg}` };
  }

  const results = data?.results;
  const rawContent = Array.isArray(results) ? results[0]?.raw_content : undefined;
  if (typeof rawContent !== "string" || rawContent === "") {
    const failed = data?.failed_results;
    const reason = Array.isArray(failed) ? failed[0]?.error : undefined;
    const failure = reason ?? "no content returned";
    return { text: "", error: `Tavily Extract returned no content: ${failure}` };
  }
  return { text: rawContent };
}
|
||||
|
||||
/**
 * Extension entry point: registers the `fetch_url` tool.
 *
 * The tool fetches a page with curl and, unless told otherwise through
 * `force`, falls back to Tavily Extract when curl fails or its result looks
 * like a bot wall / JS shell. The reply is a small header (URL, source, HTTP
 * status, content type, truncation note) followed by the page text.
 */
export default function fetchUrlExtension(pi: ExtensionAPI) {
  const parameters = Type.Object({
    url: Type.String({ description: "Absolute URL to fetch (http/https)" }),
    max_chars: Type.Optional(
      Type.Integer({
        description: "Max characters of extracted text to return (default 20000)",
        minimum: 500,
        maximum: 200_000,
      }),
    ),
    format: Type.Optional(
      StringEnum(["markdown", "text"] as const, {
        description:
          "Output format for the Tavily Extract fallback. Ignored by curl path. Default 'markdown'.",
      }),
    ),
    force: Type.Optional(
      StringEnum(["auto", "curl", "tavily"] as const, {
        description:
          "'auto' (default) = curl then Tavily fallback. 'curl' = curl only. 'tavily' = skip curl.",
      }),
    ),
    timeout_ms: Type.Optional(
      Type.Integer({
        description: "curl timeout in ms (default 15000)",
        minimum: 1000,
        maximum: 60_000,
      }),
    ),
  });

  pi.registerTool({
    name: "fetch_url",
    label: "Fetch URL",
    description:
      "Fetch a web page as text. Tries curl first; if the page looks bot-walled, " +
      "JS-only, or errors out, falls back to Tavily Extract (requires TAVILY_API_KEY).",
    promptSnippet:
      "Fetch a URL as text (curl first, Tavily Extract fallback for JS/bot walls)",
    promptGuidelines: [
      "Use fetch_url after web_search when the snippet/answer is not enough and you need the page body.",
      "Prefer fetch_url over `curl` in bash for web pages: it handles JS walls and anti-bot pages transparently.",
    ],
    parameters,
    async execute(_toolCallId, params, signal) {
      const { url } = params;
      const maxChars = params.max_chars ?? 20_000;
      const format = params.format ?? "markdown";
      const force = params.force ?? "auto";
      const timeoutMs = params.timeout_ms ?? 15_000;

      // Only http(s) is ever handed to curl or Tavily.
      if (!/^https?:\/\//i.test(url)) {
        return {
          content: [{ type: "text", text: `Refusing to fetch non-http(s) URL: ${url}` }],
          isError: true,
          details: { url },
        };
      }

      let outcome: FetchOutcome | null = null;
      let curlError: string | undefined;
      let fallbackReason: string | undefined;

      // Step 1: curl, unless the caller asked to go straight to Tavily.
      if (force !== "tavily") {
        const curl = await fetchWithCurl(pi, url, signal, timeoutMs);
        if (curl.error) {
          curlError = curl.error;
          fallbackReason = curl.error;
        } else {
          const verdict = looksBad(curl.status, curl.contentType, curl.body);
          if (verdict && force !== "curl") {
            fallbackReason = verdict;
          } else {
            // force === "curl" keeps the curl result even when it looks bad.
            const servedAsHtml = (curl.contentType ?? "").toLowerCase().includes("html");
            const clipped = truncate(servedAsHtml ? stripHtml(curl.body) : curl.body, maxChars);
            outcome = {
              source: "curl",
              status: curl.status,
              contentType: curl.contentType,
              url,
              finalUrl: curl.finalUrl,
              text: clipped.text,
              truncated: clipped.truncated,
            };
          }
        }
      }

      // Step 2: Tavily Extract, unless curl already delivered or was forced.
      if (outcome === null && force !== "curl") {
        const tv = await fetchWithTavilyExtract(url, signal, format);
        if (tv.error) {
          const reasons: string[] = [];
          if (curlError) reasons.push(`curl: ${curlError}`);
          if (fallbackReason) reasons.push(`fallback trigger: ${fallbackReason}`);
          reasons.push(`tavily: ${tv.error}`);
          return {
            content: [{ type: "text", text: `Failed to fetch ${url}\n${reasons.join("\n")}` }],
            isError: true,
            details: { url, curlError, fallbackReason, tavilyError: tv.error },
          };
        }
        const clipped = truncate(tv.text, maxChars);
        outcome = {
          source: "tavily-extract",
          url,
          text: clipped.text,
          truncated: clipped.truncated,
          reasonForFallback: fallbackReason,
        };
      }

      // Only reachable without an outcome when curl was forced and failed.
      if (outcome === null) {
        return {
          content: [
            {
              type: "text",
              text: `Failed to fetch ${url}: ${curlError ?? "unknown error"}`,
            },
          ],
          isError: true,
          details: { url, curlError, fallbackReason },
        };
      }

      const fallbackNote = outcome.reasonForFallback
        ? ` (fallback: ${outcome.reasonForFallback})`
        : "";
      const headerLines: string[] = [
        `URL: ${outcome.finalUrl ?? outcome.url}`,
        `Source: ${outcome.source}${fallbackNote}`,
      ];
      if (outcome.status !== undefined) headerLines.push(`HTTP: ${outcome.status}`);
      if (outcome.contentType) headerLines.push(`Content-Type: ${outcome.contentType}`);
      if (outcome.truncated) headerLines.push(`(truncated to ${maxChars} chars)`);
      const header = headerLines.join("\n");

      return {
        content: [{ type: "text", text: `${header}\n\n${outcome.text}` }],
        details: outcome,
      };
    },
  });
}
|
||||
Loading…
Add table
Add a link
Reference in a new issue