/**
 * fetch_url — curl-first page fetcher, Tavily Extract fallback.
 *
 * Default path:  spawn `curl` via pi.exec (fast, free, no API).
 * Fallback:      POST https://api.tavily.com/extract (handles JS walls,
 *                Cloudflare challenges, SPA shells, etc.) when the curl
 *                result looks bad.
 *
 * Pairs with web-search.ts. Requires TAVILY_API_KEY only for the fallback.
 */

import type { ExtensionAPI } from "@earendil-works/pi-coding-agent";
import { StringEnum } from "@earendil-works/pi-ai";
import { Type } from "typebox";

// User-Agent string handed to curl's `-A` flag in fetchWithCurl. It presents
// the request as desktop Chrome on Linux rather than curl's own default agent.
const DEFAULT_UA =
	"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0 Safari/537.36";

// Phrases that mark a fetched page as a bot wall, JS-only shell, or block page.
// looksBad() lowercases the first 4000 chars of visible text and does a plain
// substring search for each entry, so every entry here must itself be lowercase.
const BAD_SENTINELS = [
	"just a moment",
	"checking your browser",
	"enable javascript",
	"please enable js",
	"attention required | cloudflare",
	"access denied",
	"captcha",
	"are you a robot",
];

// Which backend produced the returned text.
type FetchSource = "curl" | "tavily-extract";

// Result of a successful fetch; also returned verbatim as the tool's `details`.
type FetchOutcome = {
	// Backend that produced `text`.
	source: FetchSource;
	// HTTP status reported by curl; not set on the Tavily path.
	status?: number;
	// Content-Type reported by curl; not set on the Tavily path.
	contentType?: string;
	// URL exactly as requested by the caller.
	url: string;
	// Effective URL reported by curl after redirects; not set on the Tavily path.
	finalUrl?: string;
	// Extracted page text, already clipped to the caller's max_chars.
	text: string;
	// True when `text` was clipped.
	truncated: boolean;
	// Why curl's result was rejected (or its error); only set on the Tavily path,
	// and undefined there when curl was skipped entirely.
	reasonForFallback?: string;
};

// Named character references decoded by decodeHtmlEntities. `nbsp` maps to a
// plain space so it folds into the whitespace collapsing done by stripHtml.
const NAMED_HTML_ENTITIES = new Map<string, string>([
	["amp", "&"],
	["lt", "<"],
	["gt", ">"],
	["quot", '"'],
	["apos", "'"],
	["nbsp", " "],
]);

/**
 * Decode the common named entities plus decimal/hex numeric character
 * references in a single pass, so `&amp;lt;` becomes `&lt;` and is not decoded
 * a second time. Unknown names and invalid code points are left untouched.
 */
function decodeHtmlEntities(text: string): string {
	return text.replace(
		/&(#[xX][0-9a-fA-F]+|#[0-9]+|[a-zA-Z]+);/g,
		(match: string, ref: string) => {
			if (ref.charAt(0) !== "#") {
				return NAMED_HTML_ENTITIES.get(ref) ?? match;
			}
			const marker = ref.charAt(1);
			const isHex = marker === "x" || marker === "X";
			const codePoint = Number.parseInt(ref.slice(isHex ? 2 : 1), isHex ? 16 : 10);
			// Reject NUL, out-of-range values, and lone surrogates; fromCodePoint
			// would throw or emit ill-formed text for these.
			const valid =
				codePoint > 0 &&
				codePoint <= 0x10ffff &&
				!(codePoint >= 0xd800 && codePoint <= 0xdfff);
			return valid ? String.fromCodePoint(codePoint) : match;
		},
	);
}

/**
 * Cheap visible-text extraction from an HTML document.
 *
 * Drops script/style/noscript elements and comments, removes all remaining
 * tags, decodes common character references, and collapses whitespace. This is
 * a heuristic, not a readability pipeline: it feeds the "is this page junk?"
 * check and produces the plain text returned for curl-fetched HTML.
 *
 * @param html Raw HTML markup.
 * @returns Single-line visible text with no leading or trailing whitespace.
 */
function stripHtml(html: string): string {
	const withoutTags = html
		.replace(/<script[\s\S]*?<\/script>/gi, " ")
		.replace(/<style[\s\S]*?<\/style>/gi, " ")
		.replace(/<noscript[\s\S]*?<\/noscript>/gi, " ")
		.replace(/<!--[\s\S]*?-->/g, " ")
		.replace(/<[^>]+>/g, " ");
	// Decode only after tags are gone so a decoded "<" or ">" is kept as text
	// instead of being mistaken for markup.
	return decodeHtmlEntities(withoutTags).replace(/\s+/g, " ").trim();
}

/**
 * Decide whether a curl response is unusable and should trigger the fallback.
 *
 * A response is rejected when it has no HTTP status, a status of 400 or above,
 * a non-textual content type, fewer than 500 characters of visible text, or a
 * known bot-wall phrase within the first 4000 characters of visible text.
 *
 * @param status HTTP status code, or undefined when none was obtained.
 * @param contentType Raw Content-Type value, possibly with parameters.
 * @param body Response body as text.
 * @returns A short human-readable reason, or null when the response looks fine.
 */
function looksBad(
	status: number | undefined,
	contentType: string | undefined,
	body: string,
): string | null {
	if (status === undefined) return "no HTTP status";
	if (status >= 400) return `HTTP ${status}`;

	const ct = (contentType ?? "").toLowerCase();
	// Media type without parameters such as "; charset=utf-8".
	const mime = (ct.split(";")[0] ?? "").trim();
	// A missing content type and text/plain are run through the HTML stripper
	// too, since servers often mislabel HTML.
	const isHtml = ct.includes("html") || ct === "" || ct.includes("text/plain");
	// Other textual payloads: any text/* type, JSON/XML, and structured-syntax
	// suffixes such as application/rss+xml or application/ld+json.
	const isOtherText =
		mime.startsWith("text/") ||
		ct.startsWith("application/json") ||
		ct.startsWith("application/xml") ||
		mime.endsWith("+json") ||
		mime.endsWith("+xml");
	if (!isHtml && !isOtherText) {
		return `non-text content-type: ${contentType}`;
	}

	const visible = isHtml ? stripHtml(body) : body;
	if (visible.length < 500) return `only ${visible.length} chars of visible text`;

	// Challenge pages announce themselves early, so only the head is scanned.
	const lower = visible.slice(0, 4000).toLowerCase();
	for (const needle of BAD_SENTINELS) {
		if (lower.includes(needle)) return `sentinel match: "${needle}"`;
	}

	return null;
}

/**
 * Clip `text` to at most `maxChars` UTF-16 code units.
 *
 * The cut never lands between the two halves of a surrogate pair: if it would,
 * the dangling high surrogate is dropped so the result stays well-formed.
 * A negative limit is treated as zero.
 *
 * @param text Text to clip.
 * @param maxChars Maximum length of the returned text.
 * @returns The (possibly shortened) text and whether anything was removed.
 */
function truncate(text: string, maxChars: number): { text: string; truncated: boolean } {
	if (text.length <= maxChars) return { text, truncated: false };

	let end = Math.max(0, Math.trunc(maxChars));
	if (end > 0 && end < text.length) {
		const lastKept = text.charCodeAt(end - 1);
		const firstDropped = text.charCodeAt(end);
		const splitsPair =
			lastKept >= 0xd800 &&
			lastKept <= 0xdbff &&
			firstDropped >= 0xdc00 &&
			firstDropped <= 0xdfff;
		if (splitsPair) end -= 1;
	}
	return { text: text.slice(0, end), truncated: true };
}

/**
 * Fetch `url` by spawning `curl` through `pi.exec`.
 *
 * curl follows redirects, accepts compressed responses, and is told via `-w`
 * to append a trailer holding the HTTP status, content type, and effective URL
 * so all three come back from the single request.
 *
 * @param pi Extension API used to spawn the process.
 * @param url Absolute URL to fetch; passed to curl with globbing disabled.
 * @param signal Optional abort signal forwarded to `pi.exec`.
 * @param timeoutMs Transfer timeout; rounded up to whole seconds for curl, and
 *   the spawned process is given two extra seconds before `pi.exec` times out.
 * @returns The body plus whatever metadata could be parsed. `error` is set when
 *   curl produced nothing usable; `status` is undefined when no HTTP response
 *   was received.
 */
async function fetchWithCurl(
	pi: ExtensionAPI,
	url: string,
	signal: AbortSignal | undefined,
	timeoutMs: number,
): Promise<{
	status?: number;
	contentType?: string;
	body: string;
	finalUrl?: string;
	error?: string;
}> {
	// `-w` writes a trailer we parse off the end so we get status + content-type
	// + effective URL without a second request.
	const TRAILER = "\n---PI_CURL_META---\n";
	const result = await pi.exec(
		"curl",
		[
			"-sSL",
			// Without this curl expands [] and {} in the URL as glob patterns.
			"--globoff",
			"--compressed",
			"--max-time",
			String(Math.ceil(timeoutMs / 1000)),
			"-A",
			DEFAULT_UA,
			"-H",
			"Accept: text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
			"-H",
			"Accept-Language: en-US,en;q=0.9",
			"-w",
			// One field per line: a content type such as "text/html; charset=utf-8"
			// contains a space, so space-separated fields cannot be split reliably.
			`${TRAILER}%{http_code}\n%{content_type}\n%{url_effective}`,
			url,
		],
		{ timeout: timeoutMs + 2_000, signal },
	);

	if (result.code !== 0 && !result.stdout) {
		return { body: "", error: `curl exit ${result.code}: ${result.stderr.trim()}` };
	}

	// lastIndexOf: the page body itself could contain the marker text.
	const idx = result.stdout.lastIndexOf(TRAILER);
	if (idx === -1) {
		return { body: result.stdout, error: "missing curl meta trailer" };
	}
	const body = result.stdout.slice(0, idx);
	const meta = result.stdout.slice(idx + TRAILER.length);
	const [codeLine = "", typeLine = "", urlLine = ""] = meta.split("\n");

	// curl prints 000 when it never received an HTTP response (DNS failure,
	// refused connection, timeout before headers); treat that as "no status".
	const parsedCode = Number.parseInt(codeLine.trim(), 10);
	const status = Number.isFinite(parsedCode) && parsedCode > 0 ? parsedCode : undefined;

	// The trailer is written even when the transfer fails, so stdout is never
	// empty and the exit-code check above cannot catch transport errors. Report
	// them here, but keep a partial body that arrived with a real HTTP status.
	if (result.code !== 0 && (status === undefined || body.length === 0)) {
		return { body, error: `curl exit ${result.code}: ${result.stderr.trim()}` };
	}

	return {
		status,
		contentType: typeLine.trim() || undefined,
		body,
		finalUrl: urlLine.trim() || undefined,
	};
}

/**
 * Fetch page content through the Tavily Extract API.
 *
 * Never rejects for expected failures: a missing API key, a network error, a
 * non-2xx response, an unparseable body, or an empty extraction all come back
 * as `{ text: "", error }`.
 *
 * @param url Page to extract.
 * @param signal Optional abort signal forwarded to `fetch`.
 * @param format Output format requested from Tavily.
 * @returns The extracted content, or an empty text with an error description.
 */
async function fetchWithTavilyExtract(
	url: string,
	signal: AbortSignal | undefined,
	format: "markdown" | "text",
): Promise<{ text: string; error?: string }> {
	const apiKey = process.env.TAVILY_API_KEY;
	if (!apiKey) {
		return {
			text: "",
			error:
				"TAVILY_API_KEY is not set; cannot fall back to Tavily Extract. " +
				"Get a key at https://tavily.com/ and export it before launching pi.",
		};
	}

	let resp: Response;
	try {
		resp = await fetch("https://api.tavily.com/extract", {
			method: "POST",
			headers: {
				"Content-Type": "application/json",
				Authorization: `Bearer ${apiKey}`,
			},
			body: JSON.stringify({
				urls: [url],
				extract_depth: "advanced",
				format, // "markdown" or "text"
			}),
			signal,
		});
	} catch (err) {
		const msg = err instanceof Error ? err.message : String(err);
		return { text: "", error: `Tavily Extract network error: ${msg}` };
	}

	if (!resp.ok) {
		const body = await resp.text().catch(() => "");
		return { text: "", error: `Tavily Extract ${resp.status}: ${body.slice(0, 500)}` };
	}

	// A 2xx response is not guaranteed to carry valid JSON (proxy error pages,
	// truncated bodies); report that instead of letting the rejection escape.
	let payload: unknown;
	try {
		payload = await resp.json();
	} catch (err) {
		const msg = err instanceof Error ? err.message : String(err);
		return { text: "", error: `Tavily Extract returned invalid JSON: ${msg}` };
	}

	// `?? {}` guards against a literal JSON `null` payload.
	const data = (payload ?? {}) as {
		results?: Array<{ url: string; raw_content?: string }>;
		failed_results?: Array<{ url: string; error?: string }>;
	};
	const rawContent = data.results?.[0]?.raw_content;
	if (typeof rawContent !== "string" || rawContent.length === 0) {
		const failure = data.failed_results?.[0]?.error ?? "no content returned";
		return { text: "", error: `Tavily Extract returned no content: ${failure}` };
	}
	return { text: rawContent };
}

/**
 * Extension entry point: registers the `fetch_url` tool on the given API.
 *
 * The tool rejects anything that is not an http(s) URL, runs curl unless
 * `force` is "tavily", and hands over to Tavily Extract (unless `force` is
 * "curl") when curl failed or looksBad() rejected its response. A successful
 * result is a short metadata header followed by the page text, clipped to
 * `max_chars`.
 */
export default function fetchUrlExtension(pi: ExtensionAPI) {
	// Input schema; only `url` is required.
	const parameters = Type.Object({
		url: Type.String({ description: "Absolute URL to fetch (http/https)" }),
		max_chars: Type.Optional(
			Type.Integer({
				description: "Max characters of extracted text to return (default 20000)",
				minimum: 500,
				maximum: 200_000,
			}),
		),
		format: Type.Optional(
			StringEnum(["markdown", "text"] as const, {
				description:
					"Output format for the Tavily Extract fallback. Ignored by curl path. Default 'markdown'.",
			}),
		),
		force: Type.Optional(
			StringEnum(["auto", "curl", "tavily"] as const, {
				description:
					"'auto' (default) = curl then Tavily fallback. 'curl' = curl only. 'tavily' = skip curl.",
			}),
		),
		timeout_ms: Type.Optional(
			Type.Integer({
				description: "curl timeout in ms (default 15000)",
				minimum: 1000,
				maximum: 60_000,
			}),
		),
	});

	pi.registerTool({
		name: "fetch_url",
		label: "Fetch URL",
		description:
			"Fetch a web page as text. Tries curl first; if the page looks bot-walled, " +
			"JS-only, or errors out, falls back to Tavily Extract (requires TAVILY_API_KEY).",
		promptSnippet:
			"Fetch a URL as text (curl first, Tavily Extract fallback for JS/bot walls)",
		promptGuidelines: [
			"Use fetch_url after web_search when the snippet/answer is not enough and you need the page body.",
			"Prefer fetch_url over `curl` in bash for web pages: it handles JS walls and anti-bot pages transparently.",
		],
		parameters,
		async execute(_toolCallId, params, signal) {
			const { url } = params;
			const maxChars = params.max_chars ?? 20_000;
			const format = params.format ?? "markdown";
			const force = params.force ?? "auto";
			const timeoutMs = params.timeout_ms ?? 15_000;

			// Only plain web URLs are accepted; everything else is refused up front.
			const isWebUrl = /^https?:\/\//i.test(url);
			if (!isWebUrl) {
				return {
					content: [{ type: "text", text: `Refusing to fetch non-http(s) URL: ${url}` }],
					isError: true,
					details: { url },
				};
			}

			let outcome: FetchOutcome | null = null;
			let curlError: string | undefined;
			let fallbackReason: string | undefined;

			// --- curl attempt (skipped when the caller forces Tavily) ---
			if (force !== "tavily") {
				const fetched = await fetchWithCurl(pi, url, signal, timeoutMs);
				if (fetched.error) {
					curlError = fetched.error;
					fallbackReason = fetched.error;
				} else {
					const verdict = looksBad(fetched.status, fetched.contentType, fetched.body);
					if (verdict && force !== "curl") {
						// Rejected by the heuristic and a fallback is allowed.
						fallbackReason = verdict;
					} else {
						// Accepted, or the caller insisted on curl's result regardless.
						const mime = (fetched.contentType ?? "").toLowerCase();
						const readable = mime.includes("html") ? stripHtml(fetched.body) : fetched.body;
						const clip = truncate(readable, maxChars);
						outcome = {
							source: "curl",
							status: fetched.status,
							contentType: fetched.contentType,
							url,
							finalUrl: fetched.finalUrl,
							text: clip.text,
							truncated: clip.truncated,
						};
					}
				}
			}

			// --- Tavily Extract (only when curl gave nothing and it is permitted) ---
			if (outcome === null && force !== "curl") {
				const extracted = await fetchWithTavilyExtract(url, signal, format);
				if (extracted.error) {
					// Report every stage that contributed to the failure.
					const lines: string[] = [];
					if (curlError) lines.push(`curl: ${curlError}`);
					if (fallbackReason) lines.push(`fallback trigger: ${fallbackReason}`);
					lines.push(`tavily: ${extracted.error}`);
					const msg = lines.join("\n");
					return {
						content: [{ type: "text", text: `Failed to fetch ${url}\n${msg}` }],
						isError: true,
						details: { url, curlError, fallbackReason, tavilyError: extracted.error },
					};
				}
				const clip = truncate(extracted.text, maxChars);
				outcome = {
					source: "tavily-extract",
					url,
					text: clip.text,
					truncated: clip.truncated,
					reasonForFallback: fallbackReason,
				};
			}

			// Reached only when curl errored and the fallback was not permitted.
			if (outcome === null) {
				return {
					content: [
						{
							type: "text",
							text: `Failed to fetch ${url}: ${curlError ?? "unknown error"}`,
						},
					],
					isError: true,
					details: { url, curlError, fallbackReason },
				};
			}

			// Metadata header; optional lines appear only when they carry a value.
			const fallbackNote = outcome.reasonForFallback
				? ` (fallback: ${outcome.reasonForFallback})`
				: "";
			const headerLines: string[] = [
				`URL: ${outcome.finalUrl ?? outcome.url}`,
				`Source: ${outcome.source}${fallbackNote}`,
			];
			if (outcome.status !== undefined) headerLines.push(`HTTP: ${outcome.status}`);
			if (outcome.contentType) headerLines.push(`Content-Type: ${outcome.contentType}`);
			if (outcome.truncated) headerLines.push(`(truncated to ${maxChars} chars)`);
			const header = headerLines.join("\n");

			return {
				content: [{ type: "text", text: `${header}\n\n${outcome.text}` }],
				details: outcome,
			};
		},
	});
}