Add first version of dev sandbox

This commit is contained in:
Alex Selimov 2026-05-13 20:51:24 -04:00
parent 883b0df070
commit 7974e725d0
10 changed files with 989 additions and 0 deletions

View file

@ -0,0 +1,340 @@
/**
* fetch_url: curl-first page fetcher with a Tavily Extract fallback.
*
* Default path: spawn `curl` via pi.exec (fast, free, no API).
* Fallback: POST https://api.tavily.com/extract (handles JS walls,
* Cloudflare challenges, SPA shells, etc.) when the curl
* result looks bad.
*
* Pairs with web-search.ts. Requires TAVILY_API_KEY only for the fallback.
*/
import type { ExtensionAPI } from "@earendil-works/pi-coding-agent";
import { StringEnum } from "@earendil-works/pi-ai";
import { Type } from "typebox";
/** Desktop-Chrome-style User-Agent string; passed to curl via `-A` in fetchWithCurl. */
const DEFAULT_UA =
"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0 Safari/537.36";
/**
 * Lowercase substrings that mark a page as a bot wall / JS shell.
 * looksBad() searches for them in the lowercased start of the visible text,
 * so every entry here must itself be lowercase.
 */
const BAD_SENTINELS = [
"just a moment",
"checking your browser",
"enable javascript",
"please enable js",
"attention required | cloudflare",
"access denied",
"captcha",
"are you a robot",
];
/** Which backend produced the returned text. */
type FetchSource = "curl" | "tavily-extract";
/** Result of a successful fetch; also returned verbatim as the tool's `details`. */
type FetchOutcome = {
// Backend that produced `text`.
source: FetchSource;
// HTTP status reported by curl (only populated on the curl path).
status?: number;
// Content-Type reported by curl (only populated on the curl path).
contentType?: string;
// URL as requested by the caller.
url: string;
// Effective URL reported by curl (only populated on the curl path).
finalUrl?: string;
// Extracted text, already clipped to the caller's max_chars.
text: string;
// True when `text` was clipped.
truncated: boolean;
// Why the curl result was rejected (only populated on the Tavily path).
reasonForFallback?: string;
};
/**
 * Cheap visible-text extraction from an HTML document.
 *
 * Drops `<script>`, `<style>`, `<noscript>` blocks and comments, removes all
 * remaining tags, decodes the most common character references, and collapses
 * whitespace. This is not a readability pipeline; it is only meant to be good
 * enough for the "is this page junk?" heuristic and for a plain-text dump.
 *
 * @param html Raw HTML markup.
 * @returns The visible text with single-space separators and no leading or
 *   trailing whitespace.
 */
function stripHtml(html: string): string {
  const namedEntities = new Map<string, string>([
    ["amp", "&"],
    ["lt", "<"],
    ["gt", ">"],
    ["quot", '"'],
    ["apos", "'"],
    ["nbsp", " "],
  ]);

  // Decodes one character reference; unknown or invalid references are left
  // untouched so no content is ever silently dropped.
  const decodeEntity = (match: string, ref: string): string => {
    if (ref.startsWith("#")) {
      const isHex = ref[1] === "x" || ref[1] === "X";
      const codePoint = Number.parseInt(ref.slice(isHex ? 2 : 1), isHex ? 16 : 10);
      const isValid =
        Number.isFinite(codePoint) &&
        codePoint > 0 &&
        codePoint <= 0x10ffff &&
        !(codePoint >= 0xd800 && codePoint <= 0xdfff);
      return isValid ? String.fromCodePoint(codePoint) : match;
    }
    return namedEntities.get(ref) ?? match;
  };

  return (
    html
      .replace(/<script[\s\S]*?<\/script>/gi, " ")
      .replace(/<style[\s\S]*?<\/style>/gi, " ")
      .replace(/<noscript[\s\S]*?<\/noscript>/gi, " ")
      .replace(/<!--[\s\S]*?-->/g, " ")
      .replace(/<[^>]+>/g, " ")
      // Decode AFTER tag removal, in a single pass, so "&lt;b&gt;" becomes the
      // literal text "<b>" and "&amp;lt;" is not double-decoded.
      .replace(/&(#[xX][0-9a-fA-F]+|#\d+|[a-zA-Z]+);/g, decodeEntity)
      .replace(/\s+/g, " ")
      .trim()
  );
}
/**
 * Heuristic check for whether a curl response is unusable and a fallback
 * fetcher should be tried instead.
 *
 * @param status HTTP status code, or undefined when none could be parsed.
 * @param contentType Raw Content-Type header value, if any.
 * @param body Response body as text.
 * @returns A short human-readable reason when the response looks bad, or
 *   `null` when it looks usable.
 */
function looksBad(
  status: number | undefined,
  contentType: string | undefined,
  body: string,
): string | null {
  if (status === undefined) return "no HTTP status";
  if (status >= 400) return `HTTP ${status}`;

  const ct = (contentType ?? "").toLowerCase();
  // Media type without parameters, e.g. "text/html; charset=utf-8" -> "text/html".
  const mediaType = (ct.split(";")[0] ?? "").trim();

  // A missing content type is treated as HTML so the body is still judged
  // on its visible text.
  const isHtml = ct.includes("html") || ct === "" || ct.includes("text/plain");
  // Other textual payloads: any text/* type (xml, markdown, csv, ...) and
  // JSON/XML including structured-syntax suffixes (rss+xml, ld+json, ...).
  const isOtherText =
    mediaType.startsWith("text/") ||
    ct.startsWith("application/json") ||
    ct.startsWith("application/xml") ||
    mediaType.endsWith("+json") ||
    mediaType.endsWith("+xml");
  if (!isHtml && !isOtherText) {
    return `non-text content-type: ${contentType}`;
  }

  const visible = isHtml ? stripHtml(body) : body;
  if (visible.length < 500) return `only ${visible.length} chars of visible text`;

  // Challenge / bot-wall pages announce themselves early, so only the start
  // of the text is scanned.
  const lower = visible.slice(0, 4000).toLowerCase();
  for (const needle of BAD_SENTINELS) {
    if (lower.includes(needle)) return `sentinel match: "${needle}"`;
  }
  return null;
}
/**
 * Clip `text` to at most `maxChars` UTF-16 code units.
 *
 * The cut never lands between the two halves of a surrogate pair: if it
 * would, the dangling high surrogate is dropped as well, so the result can be
 * one unit shorter than `maxChars` but never ends in a broken character.
 *
 * @param text Text to clip.
 * @param maxChars Maximum length of the returned text.
 * @returns The (possibly clipped) text and whether clipping happened.
 */
function truncate(text: string, maxChars: number): { text: string; truncated: boolean } {
  if (text.length <= maxChars) return { text, truncated: false };

  let end = maxChars;
  if (end > 0) {
    const lastUnit = text.charCodeAt(end - 1);
    const isHighSurrogate = lastUnit >= 0xd800 && lastUnit <= 0xdbff;
    if (isHighSurrogate) end -= 1;
  }
  return { text: text.slice(0, end), truncated: true };
}
/**
 * Fetch `url` by spawning `curl` through `pi.exec`.
 *
 * curl's `-w` option appends a metadata trailer (status, content type,
 * effective URL) after the body, so everything is obtained from one request.
 *
 * @param pi Extension API used to spawn the process.
 * @param url URL to fetch.
 * @param signal Optional abort signal forwarded to `pi.exec`.
 * @param timeoutMs Transfer timeout in milliseconds (rounded up to whole
 *   seconds for curl's `--max-time`).
 * @returns The body plus parsed metadata, or an `error` string when curl
 *   failed without producing a body or its output could not be parsed.
 */
async function fetchWithCurl(
  pi: ExtensionAPI,
  url: string,
  signal: AbortSignal | undefined,
  timeoutMs: number,
): Promise<{
  status?: number;
  contentType?: string;
  body: string;
  finalUrl?: string;
  error?: string;
}> {
  // `-w` writes a trailer we parse off the end so we get status + content-type
  // + effective URL without a second request.
  const TRAILER = "\n---PI_CURL_META---\n";
  // The fields are newline-separated: a Content-Type such as
  // "text/html; charset=utf-8" contains spaces, so a space separator would
  // shift the charset into the URL slot.
  const FIELD_SEP = "\n";
  const result = await pi.exec(
    "curl",
    [
      "-sSL",
      "--compressed",
      "--max-time",
      String(Math.ceil(timeoutMs / 1000)),
      "-A",
      DEFAULT_UA,
      "-H",
      "Accept: text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
      "-H",
      "Accept-Language: en-US,en;q=0.9",
      "-w",
      `${TRAILER}%{http_code}${FIELD_SEP}%{content_type}${FIELD_SEP}%{url_effective}`,
      url,
    ],
    // Give the process a little slack beyond curl's own timeout.
    { timeout: timeoutMs + 2_000, signal },
  );

  const exitError = `curl exit ${result.code}: ${result.stderr.trim()}`;
  if (result.code !== 0 && !result.stdout) {
    return { body: "", error: exitError };
  }

  const idx = result.stdout.lastIndexOf(TRAILER);
  if (idx === -1) {
    return { body: result.stdout, error: "missing curl meta trailer" };
  }

  const body = result.stdout.slice(0, idx);
  // curl still prints the `-w` trailer when the transfer fails (DNS error,
  // refused connection, timeout, ...), so stdout is not empty in that case.
  // A non-zero exit with no body is a failure and is reported as one.
  if (result.code !== 0 && body.trim() === "") {
    return { body: "", error: exitError };
  }

  const [codeStr = "", rawContentType = "", rawFinalUrl = ""] = result.stdout
    .slice(idx + TRAILER.length)
    .split(FIELD_SEP);
  const status = Number.parseInt(codeStr.trim(), 10);
  return {
    // curl reports "000" when no HTTP response was received.
    status: Number.isFinite(status) && status > 0 ? status : undefined,
    contentType: rawContentType.trim() || undefined,
    body,
    finalUrl: rawFinalUrl.trim() || undefined,
  };
}
/**
 * Fetch page content through the Tavily Extract API.
 *
 * Never throws: every failure (missing key, network error, non-2xx response,
 * malformed payload, no extracted content) is reported through `error`.
 *
 * @param url Page to extract.
 * @param signal Optional abort signal forwarded to `fetch`.
 * @param format Output format requested from Tavily.
 * @returns The extracted text, or an empty `text` together with `error`.
 */
async function fetchWithTavilyExtract(
  url: string,
  signal: AbortSignal | undefined,
  format: "markdown" | "text",
): Promise<{ text: string; error?: string }> {
  const apiKey = process.env.TAVILY_API_KEY;
  if (!apiKey) {
    return {
      text: "",
      error:
        "TAVILY_API_KEY is not set; cannot fall back to Tavily Extract. " +
        "Get a key at https://tavily.com/ and export it before launching pi.",
    };
  }

  let resp: Response;
  try {
    resp = await fetch("https://api.tavily.com/extract", {
      method: "POST",
      headers: {
        "Content-Type": "application/json",
        Authorization: `Bearer ${apiKey}`,
      },
      body: JSON.stringify({
        urls: [url],
        extract_depth: "advanced",
        format, // "markdown" or "text"
      }),
      signal,
    });
  } catch (err) {
    const msg = err instanceof Error ? err.message : String(err);
    return { text: "", error: `Tavily Extract network error: ${msg}` };
  }

  if (!resp.ok) {
    const body = await resp.text().catch(() => "");
    // Cap the echoed body so an HTML error page cannot flood the tool output.
    return { text: "", error: `Tavily Extract ${resp.status}: ${body.slice(0, 500)}` };
  }

  type ExtractPayload = {
    results?: Array<{ url: string; raw_content?: string }>;
    failed_results?: Array<{ url: string; error?: string }>;
  } | null;
  let data: ExtractPayload;
  try {
    // A 2xx response is not guaranteed to carry valid JSON (proxies, captive
    // portals); a parse failure is reported like every other failure here.
    data = (await resp.json()) as ExtractPayload;
  } catch (err) {
    const msg = err instanceof Error ? err.message : String(err);
    return { text: "", error: `Tavily Extract returned invalid JSON: ${msg}` };
  }

  const first = data?.results?.[0];
  if (!first?.raw_content) {
    const failure = data?.failed_results?.[0]?.error ?? "no content returned";
    return { text: "", error: `Tavily Extract returned no content: ${failure}` };
  }
  return { text: first.raw_content };
}
/**
 * Registers the `fetch_url` tool.
 *
 * The tool fetches a page with curl and, unless the caller forces a single
 * backend, falls back to Tavily Extract when curl fails or the page looks
 * unusable (see looksBad). The returned text is prefixed with a small header
 * describing the source, HTTP status, content type and truncation.
 *
 * @param pi Extension API used to register the tool and to spawn curl.
 */
export default function fetchUrlExtension(pi: ExtensionAPI) {
  pi.registerTool({
    name: "fetch_url",
    label: "Fetch URL",
    description:
      "Fetch a web page as text. Tries curl first; if the page looks bot-walled, " +
      "JS-only, or errors out, falls back to Tavily Extract (requires TAVILY_API_KEY).",
    promptSnippet:
      "Fetch a URL as text (curl first, Tavily Extract fallback for JS/bot walls)",
    promptGuidelines: [
      "Use fetch_url after web_search when the snippet/answer is not enough and you need the page body.",
      "Prefer fetch_url over `curl` in bash for web pages: it handles JS walls and anti-bot pages transparently.",
    ],
    parameters: Type.Object({
      url: Type.String({ description: "Absolute URL to fetch (http/https)" }),
      max_chars: Type.Optional(
        Type.Integer({
          description: "Max characters of extracted text to return (default 20000)",
          minimum: 500,
          maximum: 200_000,
        }),
      ),
      format: Type.Optional(
        StringEnum(["markdown", "text"] as const, {
          description:
            "Output format for the Tavily Extract fallback. Ignored by curl path. Default 'markdown'.",
        }),
      ),
      force: Type.Optional(
        StringEnum(["auto", "curl", "tavily"] as const, {
          description:
            "'auto' (default) = curl then Tavily fallback. 'curl' = curl only. 'tavily' = skip curl.",
        }),
      ),
      timeout_ms: Type.Optional(
        Type.Integer({
          description: "curl timeout in ms (default 15000)",
          minimum: 1000,
          maximum: 60_000,
        }),
      ),
    }),
    async execute(_toolCallId, params, signal) {
      const url = params.url;
      const maxChars = params.max_chars ?? 20_000;
      const format = params.format ?? "markdown";
      const force = params.force ?? "auto";
      const timeoutMs = params.timeout_ms ?? 15_000;

      // Only http(s) is allowed; anything else (file:, ftp:, ...) is refused
      // before it can reach curl.
      if (!/^https?:\/\//i.test(url)) {
        return {
          content: [{ type: "text", text: `Refusing to fetch non-http(s) URL: ${url}` }],
          isError: true,
          details: { url },
        };
      }

      let outcome: FetchOutcome | null = null;
      let curlError: string | undefined;
      let fallbackReason: string | undefined;
      // Set when a forced-curl result is returned even though it looks bad.
      let qualityWarning: string | undefined;

      // --- curl path ---
      if (force !== "tavily") {
        const curl = await fetchWithCurl(pi, url, signal, timeoutMs);
        if (curl.error) {
          curlError = curl.error;
          fallbackReason = curl.error;
        } else {
          const bad = looksBad(curl.status, curl.contentType, curl.body);
          if (!bad || force === "curl") {
            const ct = (curl.contentType ?? "").toLowerCase();
            // looksBad judges a response without a Content-Type as HTML, so
            // the output must do the same; otherwise the caller gets raw
            // markup. The body is sniffed so content-type-less plain text
            // passes through untouched.
            const isHtml =
              ct.includes("html") ||
              (ct === "" && /^\s*(?:<!doctype\s+html|<html[\s>])/i.test(curl.body));
            const text = isHtml ? stripHtml(curl.body) : curl.body;
            const { text: clipped, truncated } = truncate(text, maxChars);
            // With force === "curl" there is no fallback, so the caller is
            // told the page was flagged instead of getting it silently.
            qualityWarning = bad ?? undefined;
            outcome = {
              source: "curl",
              status: curl.status,
              contentType: curl.contentType,
              url,
              finalUrl: curl.finalUrl,
              text: clipped,
              truncated,
            };
          } else {
            fallbackReason = bad;
          }
        }
      }

      // --- Tavily Extract fallback ---
      if (!outcome && force !== "curl") {
        const tv = await fetchWithTavilyExtract(url, signal, format);
        if (tv.error) {
          const msg = [
            curlError && `curl: ${curlError}`,
            fallbackReason && `fallback trigger: ${fallbackReason}`,
            `tavily: ${tv.error}`,
          ]
            .filter(Boolean)
            .join("\n");
          return {
            content: [{ type: "text", text: `Failed to fetch ${url}\n${msg}` }],
            isError: true,
            details: { url, curlError, fallbackReason, tavilyError: tv.error },
          };
        }
        const { text: clipped, truncated } = truncate(tv.text, maxChars);
        outcome = {
          source: "tavily-extract",
          url,
          text: clipped,
          truncated,
          reasonForFallback: fallbackReason,
        };
      }

      // Only reachable with force === "curl" when curl itself failed.
      if (!outcome) {
        return {
          content: [
            {
              type: "text",
              text: `Failed to fetch ${url}: ${curlError ?? "unknown error"}`,
            },
          ],
          isError: true,
          details: { url, curlError, fallbackReason },
        };
      }

      const header = [
        `URL: ${outcome.finalUrl ?? outcome.url}`,
        `Source: ${outcome.source}${
          outcome.reasonForFallback ? ` (fallback: ${outcome.reasonForFallback})` : ""
        }`,
        outcome.status !== undefined ? `HTTP: ${outcome.status}` : undefined,
        outcome.contentType ? `Content-Type: ${outcome.contentType}` : undefined,
        qualityWarning ? `Warning: page may be unusable (${qualityWarning})` : undefined,
        outcome.truncated ? `(truncated to ${maxChars} chars)` : undefined,
      ]
        .filter(Boolean)
        .join("\n");

      return {
        content: [{ type: "text", text: `${header}\n\n${outcome.text}` }],
        details: outcome,
      };
    },
  });
}

View file

@ -0,0 +1,162 @@
/**
* Web Search Extension (Tavily API)
*
* Registers a `web_search` tool the LLM can call for current/up-to-date info.
*
* Setup:
* 1. Get an API key at https://tavily.com/ (email signup, free 1k queries/month)
* 2. export TAVILY_API_KEY=tvly-...
*/
import type { ExtensionAPI } from "@earendil-works/pi-coding-agent";
import { StringEnum } from "@earendil-works/pi-ai";
import { Type } from "typebox";
/** One search hit as read from the Tavily search response. */
type TavilyResult = {
title: string;
url: string;
// Text shown under each hit in the formatted tool output.
content: string;
// Not read by this extension; passed through in `details` only.
score?: number;
// Appended in parentheses after the snippet when present.
published_date?: string;
};
/** The subset of the Tavily search response body this extension reads. */
type TavilyResponse = {
query: string;
// Synthesized answer; printed first in the tool output when present.
answer?: string;
results: TavilyResult[];
};
/**
 * Registers the `web_search` tool, backed by the Tavily search API.
 *
 * The tool returns Tavily's synthesized answer (when present) followed by a
 * numbered list of results. Every failure (missing key, network error,
 * non-2xx response, malformed payload) is returned as an `isError` tool
 * result rather than thrown.
 *
 * @param pi Extension API used to register the tool.
 */
export default function webSearchExtension(pi: ExtensionAPI) {
  pi.registerTool({
    name: "web_search",
    label: "Web Search",
    description:
      "Search the web via Tavily and return a synthesized answer plus result titles, URLs, and snippets. " +
      "Use for current events, docs lookups, or any fact that may be newer than training data.",
    promptSnippet: "Search the web (Tavily) for up-to-date information",
    promptGuidelines: [
      "Use web_search when the user asks about current events, recent releases, or facts that may be outdated in training data.",
      "After web_search returns results, call fetch_url on the most relevant URLs if the snippets are insufficient. Prefer fetch_url over raw curl/wget in bash — it handles JS walls and anti-bot pages automatically.",
    ],
    parameters: Type.Object({
      query: Type.String({ description: "Search query" }),
      max_results: Type.Optional(
        Type.Integer({
          description: "Max results to return (1-20, default 5)",
          minimum: 1,
          maximum: 20,
        }),
      ),
      search_depth: Type.Optional(
        StringEnum(["basic", "advanced"] as const, {
          description:
            "'basic' is faster/cheaper; 'advanced' does deeper crawling for harder queries. Default 'basic'.",
        }),
      ),
      topic: Type.Optional(
        StringEnum(["general", "news"] as const, {
          description: "Use 'news' for recent-events queries. Default 'general'.",
        }),
      ),
      include_answer: Type.Optional(
        Type.Boolean({
          description: "Ask Tavily to return a short synthesized answer. Default true.",
        }),
      ),
    }),
    async execute(_toolCallId, params, signal) {
      const apiKey = process.env.TAVILY_API_KEY;
      if (!apiKey) {
        return {
          content: [
            {
              type: "text",
              text:
                "TAVILY_API_KEY is not set. Get a key at https://tavily.com/ " +
                "and `export TAVILY_API_KEY=...` before launching pi.",
            },
          ],
          isError: true,
          details: {},
        };
      }

      const body = {
        query: params.query,
        max_results: params.max_results ?? 5,
        search_depth: params.search_depth ?? "basic",
        topic: params.topic ?? "general",
        include_answer: params.include_answer ?? true,
      };

      let resp: Response;
      try {
        resp = await fetch("https://api.tavily.com/search", {
          method: "POST",
          headers: {
            "Content-Type": "application/json",
            Authorization: `Bearer ${apiKey}`,
          },
          body: JSON.stringify(body),
          signal,
        });
      } catch (err) {
        const msg = err instanceof Error ? err.message : String(err);
        return {
          content: [{ type: "text", text: `Network error calling Tavily: ${msg}` }],
          isError: true,
          details: {},
        };
      }

      if (!resp.ok) {
        const errBody = await resp.text().catch(() => "");
        return {
          content: [
            {
              type: "text",
              // Capped like the fetch_url Tavily error so an HTML error page
              // cannot flood the tool output.
              text: `Tavily search failed: ${resp.status} ${resp.statusText}\n${errBody.slice(0, 500)}`,
            },
          ],
          isError: true,
          details: { status: resp.status },
        };
      }

      let data: Partial<TavilyResponse> | null;
      try {
        // A 2xx response is not guaranteed to carry valid JSON; a parse
        // failure is reported like every other failure instead of escaping
        // as an exception.
        data = (await resp.json()) as Partial<TavilyResponse> | null;
      } catch (err) {
        const msg = err instanceof Error ? err.message : String(err);
        return {
          content: [{ type: "text", text: `Tavily search returned invalid JSON: ${msg}` }],
          isError: true,
          details: { status: resp.status },
        };
      }

      const rawResults = data?.results;
      const results = Array.isArray(rawResults) ? rawResults : [];
      const answer = data?.answer;

      if (results.length === 0 && !answer) {
        return {
          content: [{ type: "text", text: `No results for: ${params.query}` }],
          details: { query: params.query, results: [] },
        };
      }

      const parts: string[] = [];
      if (answer) {
        parts.push(`Answer: ${answer}`);
      }
      if (results.length > 0) {
        const formatted = results
          .map(
            (r, i) =>
              `${i + 1}. ${r.title}\n ${r.url}\n ${r.content}${
                r.published_date ? ` (${r.published_date})` : ""
              }`,
          )
          .join("\n\n");
        parts.push(`Results:\n${formatted}`);
      }

      return {
        content: [{ type: "text", text: parts.join("\n\n") }],
        details: {
          query: params.query,
          answer,
          results,
        },
      };
    },
  });
}