"""
|
|
tools/web_tool.py — Tiered web access.
|
|
|
|
Tier 1: Domains in WEB_TIER1_WHITELIST — always allowed.
|
|
Tier 2: Any other domain — allowed only when web_tier2_enabled is True
|
|
in the current execution context (set by the agent loop when the
|
|
user explicitly requests external web research) or when running
|
|
as a scheduled task that declared web access.
|
|
|
|
DuckDuckGo search uses the HTML endpoint (no API key required).
|
|
"""
|
|
from __future__ import annotations
|
|
|
|
import re
|
|
from urllib.parse import quote_plus, urlparse
|
|
|
|
import httpx
|
|
from bs4 import BeautifulSoup
|
|
|
|
from ..context_vars import current_task_id, web_tier2_enabled
|
|
from ..security import SecurityError, assert_domain_tier1, sanitize_external_content
|
|
from ..security_screening import get_content_limit, is_option_enabled
|
|
from .base import BaseTool, ToolResult
|
|
|
|
MAX_RESPONSE_BYTES = 50 * 1024 # 50 KB (legacy fallback when truncation option disabled)
|
|
_DEFAULT_MAX_WEB_CHARS = 20_000 # default when truncation option is enabled
|
|
REQUEST_TIMEOUT = 10 # seconds
|
|
MAX_SEARCH_RESULTS = 10
|
|
|
|
_HEADERS = {
|
|
"User-Agent": (
|
|
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
|
|
"AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
|
|
),
|
|
"Accept-Language": "en-US,en;q=0.9,nb;q=0.8",
|
|
}
|
|
|
|
|
|
class WebTool(BaseTool):
|
|
name = "web"
|
|
description = (
|
|
"Fetch web pages and search the web. "
|
|
"Operations: fetch_page (retrieve and extract text from a URL), "
|
|
"search (DuckDuckGo search, returns titles, URLs and snippets). "
|
|
"Commonly used sites (Wikipedia, yr.no, etc.) are always available. "
|
|
"Other sites require the user to have initiated a web research task."
|
|
)
|
|
input_schema = {
|
|
"type": "object",
|
|
"properties": {
|
|
"operation": {
|
|
"type": "string",
|
|
"enum": ["fetch_page", "search"],
|
|
"description": "fetch_page retrieves a URL; search queries DuckDuckGo",
|
|
},
|
|
"url": {
|
|
"type": "string",
|
|
"description": "URL to fetch (required for fetch_page)",
|
|
},
|
|
"query": {
|
|
"type": "string",
|
|
"description": "Search query (required for search)",
|
|
},
|
|
"num_results": {
|
|
"type": "integer",
|
|
"description": f"Max search results to return (default 5, max {MAX_SEARCH_RESULTS})",
|
|
},
|
|
},
|
|
"required": ["operation"],
|
|
}
|
|
requires_confirmation = False
|
|
allowed_in_scheduled_tasks = True
|
|
|
|
async def execute(
|
|
self,
|
|
operation: str,
|
|
url: str = "",
|
|
query: str = "",
|
|
num_results: int = 5,
|
|
**kwargs,
|
|
) -> ToolResult:
|
|
if operation == "fetch_page":
|
|
if not url:
|
|
return ToolResult(success=False, error="url is required for fetch_page")
|
|
return await self._fetch_page(url)
|
|
|
|
if operation == "search":
|
|
if not query:
|
|
return ToolResult(success=False, error="query is required for search")
|
|
return await self._search(query, min(num_results, MAX_SEARCH_RESULTS))
|
|
|
|
return ToolResult(success=False, error=f"Unknown operation: {operation!r}")
|
|
|
|
# ── Tier check ────────────────────────────────────────────────────────────
|
|
|
|
async def _check_tier(self, url: str) -> ToolResult | None:
|
|
"""
|
|
Returns a ToolResult(success=False) if access is denied, None if allowed.
|
|
Tier 1 is always allowed. Tier 2 requires context flag or scheduled task.
|
|
"""
|
|
if await assert_domain_tier1(url):
|
|
return None # Tier 1 — always allowed
|
|
|
|
# Tier 2 — check context
|
|
task_id = current_task_id.get()
|
|
tier2 = web_tier2_enabled.get()
|
|
|
|
if task_id is not None:
|
|
# Scheduled tasks that declared web access can use Tier 2
|
|
return None
|
|
|
|
if tier2:
|
|
return None # User explicitly initiated web research
|
|
|
|
parsed = urlparse(url)
|
|
return ToolResult(
|
|
success=False,
|
|
error=(
|
|
f"Domain '{parsed.hostname}' is not in the Tier 1 whitelist. "
|
|
"To access it, ask me to search the web or fetch a specific external page — "
|
|
"I'll enable Tier 2 access for your request."
|
|
),
|
|
)
|
|
|
|
# ── fetch_page ────────────────────────────────────────────────────────────
|
|
|
|
async def _fetch_page(self, url: str) -> ToolResult:
|
|
denied = await self._check_tier(url)
|
|
if denied:
|
|
return denied
|
|
|
|
try:
|
|
async with httpx.AsyncClient(
|
|
timeout=REQUEST_TIMEOUT,
|
|
follow_redirects=True,
|
|
headers=_HEADERS,
|
|
) as client:
|
|
resp = await client.get(url)
|
|
resp.raise_for_status()
|
|
|
|
content_type = resp.headers.get("content-type", "")
|
|
if "text" not in content_type and "html" not in content_type:
|
|
return ToolResult(
|
|
success=False,
|
|
error=f"Non-text content type: {content_type}. Only text/HTML pages are supported.",
|
|
)
|
|
|
|
raw = resp.content[:MAX_RESPONSE_BYTES * 2] # read more, truncate after parse
|
|
text = _extract_text(raw)
|
|
|
|
if await is_option_enabled("system:security_truncation_enabled"):
|
|
max_chars = await get_content_limit("system:security_max_web_chars", _DEFAULT_MAX_WEB_CHARS)
|
|
if len(text) > max_chars:
|
|
text = text[:max_chars]
|
|
text += f"\n\n[Content truncated at {max_chars:,} chars]"
|
|
elif len(text.encode()) > MAX_RESPONSE_BYTES:
|
|
text = text[: MAX_RESPONSE_BYTES // 4 * 4] # char-safe truncation
|
|
text += f"\n\n[Content truncated at {MAX_RESPONSE_BYTES // 1024} KB]"
|
|
|
|
text = await sanitize_external_content(text, source="web")
|
|
|
|
return ToolResult(
|
|
success=True,
|
|
data={
|
|
"url": str(resp.url),
|
|
"content": text,
|
|
"status_code": resp.status_code,
|
|
},
|
|
)
|
|
|
|
except httpx.TimeoutException:
|
|
return ToolResult(success=False, error=f"Request timed out after {REQUEST_TIMEOUT}s: {url}")
|
|
except httpx.HTTPStatusError as e:
|
|
return ToolResult(success=False, error=f"HTTP {e.response.status_code}: {url}")
|
|
except Exception as e:
|
|
return ToolResult(success=False, error=f"Fetch error: {e}")
|
|
|
|
# ── search ────────────────────────────────────────────────────────────────
|
|
|
|
async def _search(self, query: str, num_results: int) -> ToolResult:
|
|
# DuckDuckGo is Tier 1 — always allowed, no tier check needed
|
|
ddg_url = f"https://html.duckduckgo.com/html/?q={quote_plus(query)}"
|
|
|
|
try:
|
|
async with httpx.AsyncClient(
|
|
timeout=REQUEST_TIMEOUT,
|
|
follow_redirects=True,
|
|
headers={**_HEADERS, "Accept": "text/html"},
|
|
) as client:
|
|
resp = await client.post(
|
|
"https://html.duckduckgo.com/html/",
|
|
data={"q": query, "b": "", "kl": ""},
|
|
headers=_HEADERS,
|
|
)
|
|
resp.raise_for_status()
|
|
|
|
results = _parse_ddg_results(resp.text, num_results)
|
|
|
|
if not results:
|
|
# Fallback: try GET
|
|
async with httpx.AsyncClient(timeout=REQUEST_TIMEOUT, headers=_HEADERS) as client:
|
|
resp = await client.get(ddg_url)
|
|
results = _parse_ddg_results(resp.text, num_results)
|
|
|
|
# Sanitise snippets
|
|
for r in results:
|
|
r["snippet"] = await sanitize_external_content(r["snippet"], source="web")
|
|
|
|
return ToolResult(
|
|
success=True,
|
|
data={"query": query, "results": results, "count": len(results)},
|
|
)
|
|
|
|
except httpx.TimeoutException:
|
|
return ToolResult(success=False, error=f"Search timed out after {REQUEST_TIMEOUT}s")
|
|
except Exception as e:
|
|
return ToolResult(success=False, error=f"Search error: {e}")
|
|
|
|
|
|
# ── HTML helpers ──────────────────────────────────────────────────────────────
|
|
|
|
def _extract_text(raw: bytes) -> str:
|
|
"""Strip HTML, scripts, styles and return clean readable text."""
|
|
soup = BeautifulSoup(raw, "html.parser")
|
|
|
|
# Remove noise elements
|
|
for tag in soup(["script", "style", "nav", "footer", "header", "aside", "noscript"]):
|
|
tag.decompose()
|
|
|
|
# Get text with spacing
|
|
text = soup.get_text(separator="\n")
|
|
|
|
# Collapse whitespace
|
|
lines = [line.strip() for line in text.splitlines()]
|
|
lines = [l for l in lines if l]
|
|
text = "\n".join(lines)
|
|
|
|
# Collapse multiple blank lines
|
|
text = re.sub(r"\n{3,}", "\n\n", text)
|
|
return text
|
|
|
|
|
|
def _parse_ddg_results(html: str, limit: int) -> list[dict]:
|
|
"""Parse DuckDuckGo HTML results page."""
|
|
soup = BeautifulSoup(html, "html.parser")
|
|
results = []
|
|
|
|
for result in soup.select(".result__body, .result"):
|
|
if len(results) >= limit:
|
|
break
|
|
|
|
title_el = result.select_one(".result__title, .result__a")
|
|
url_el = result.select_one(".result__url, a.result__a")
|
|
snippet_el = result.select_one(".result__snippet")
|
|
|
|
title = title_el.get_text(strip=True) if title_el else ""
|
|
url = ""
|
|
if url_el:
|
|
href = url_el.get("href", "")
|
|
# DDG wraps URLs — extract real URL
|
|
if "uddg=" in href:
|
|
from urllib.parse import unquote, parse_qs
|
|
qs = parse_qs(urlparse(href).query)
|
|
url = unquote(qs.get("uddg", [""])[0])
|
|
elif href.startswith("http"):
|
|
url = href
|
|
else:
|
|
url = url_el.get_text(strip=True)
|
|
|
|
snippet = snippet_el.get_text(strip=True) if snippet_el else ""
|
|
|
|
if title and (url or snippet):
|
|
results.append({
|
|
"title": title,
|
|
"url": url,
|
|
"snippet": snippet,
|
|
})
|
|
|
|
return results
|