# oai-web/server/tools/web_tool.py
"""
tools/web_tool.py — Tiered web access.
Tier 1: Domains in WEB_TIER1_WHITELIST — always allowed.
Tier 2: Any other domain — allowed only when web_tier2_enabled is True
in the current execution context (set by the agent loop when the
user explicitly requests external web research) or when running
as a scheduled task that declared web access.
DuckDuckGo search uses the HTML endpoint (no API key required).
"""
from __future__ import annotations
import re
from urllib.parse import quote_plus, urlparse
import httpx
from bs4 import BeautifulSoup
from ..context_vars import current_task_id, web_tier2_enabled
from ..security import SecurityError, assert_domain_tier1, sanitize_external_content
from ..security_screening import get_content_limit, is_option_enabled
from .base import BaseTool, ToolResult
# Hard cap on returned page text when the security truncation option is OFF.
MAX_RESPONSE_BYTES = 50 * 1024  # 50 KB (legacy fallback when truncation option disabled)
# Character cap used when the truncation option IS enabled (overridable via
# the "system:security_max_web_chars" content limit).
_DEFAULT_MAX_WEB_CHARS = 20_000  # default when truncation option is enabled
REQUEST_TIMEOUT = 10  # seconds
MAX_SEARCH_RESULTS = 10
# Browser-like request headers — presumably to avoid sites rejecting the
# default httpx User-Agent (NOTE(review): assumption, confirm if needed).
_HEADERS = {
    "User-Agent": (
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
        "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
    ),
    "Accept-Language": "en-US,en;q=0.9,nb;q=0.8",
}
class WebTool(BaseTool):
    """Tiered web access: page fetching and DuckDuckGo search.

    Tier 1 (whitelisted domains, checked via ``assert_domain_tier1``) is
    always allowed. Tier 2 (any other domain) is allowed only when the
    ``web_tier2_enabled`` context flag is set, or when running inside a
    scheduled task (``current_task_id`` is set).
    """

    name = "web"
    description = (
        "Fetch web pages and search the web. "
        "Operations: fetch_page (retrieve and extract text from a URL), "
        "search (DuckDuckGo search, returns titles, URLs and snippets). "
        "Commonly used sites (Wikipedia, yr.no, etc.) are always available. "
        "Other sites require the user to have initiated a web research task."
    )
    input_schema = {
        "type": "object",
        "properties": {
            "operation": {
                "type": "string",
                "enum": ["fetch_page", "search"],
                "description": "fetch_page retrieves a URL; search queries DuckDuckGo",
            },
            "url": {
                "type": "string",
                "description": "URL to fetch (required for fetch_page)",
            },
            "query": {
                "type": "string",
                "description": "Search query (required for search)",
            },
            "num_results": {
                "type": "integer",
                "description": f"Max search results to return (default 5, max {MAX_SEARCH_RESULTS})",
            },
        },
        "required": ["operation"],
    }
    requires_confirmation = False
    allowed_in_scheduled_tasks = True

    async def execute(
        self,
        operation: str,
        url: str = "",
        query: str = "",
        num_results: int = 5,
        **kwargs,
    ) -> ToolResult:
        """Dispatch to fetch_page/search after validating required args.

        Returns a failed ToolResult for an unknown operation or a missing
        required argument.
        """
        if operation == "fetch_page":
            if not url:
                return ToolResult(success=False, error="url is required for fetch_page")
            return await self._fetch_page(url)
        if operation == "search":
            if not query:
                return ToolResult(success=False, error="query is required for search")
            # Clamp to [1, MAX_SEARCH_RESULTS]: the previous min()-only clamp
            # let a zero/negative num_results silently return no results.
            return await self._search(query, max(1, min(num_results, MAX_SEARCH_RESULTS)))
        return ToolResult(success=False, error=f"Unknown operation: {operation!r}")

    # ── Tier check ────────────────────────────────────────────────────────────
    async def _check_tier(self, url: str) -> ToolResult | None:
        """
        Returns a ToolResult(success=False) if access is denied, None if allowed.
        Tier 1 is always allowed. Tier 2 requires context flag or scheduled task.
        """
        if await assert_domain_tier1(url):
            return None  # Tier 1 — always allowed
        # Tier 2 — check context
        if current_task_id.get() is not None:
            # Scheduled tasks that declared web access can use Tier 2
            return None
        if web_tier2_enabled.get():
            return None  # User explicitly initiated web research
        parsed = urlparse(url)
        return ToolResult(
            success=False,
            error=(
                f"Domain '{parsed.hostname}' is not in the Tier 1 whitelist. "
                "To access it, ask me to search the web or fetch a specific external page — "
                "I'll enable Tier 2 access for your request."
            ),
        )

    # ── fetch_page ────────────────────────────────────────────────────────────
    async def _fetch_page(self, url: str) -> ToolResult:
        """Fetch *url*, extract readable text, truncate and sanitise it.

        Only text/HTML content types are accepted. Truncation uses the
        configurable char limit when the security truncation option is on,
        otherwise the legacy MAX_RESPONSE_BYTES byte cap.
        """
        denied = await self._check_tier(url)
        if denied:
            return denied
        try:
            async with httpx.AsyncClient(
                timeout=REQUEST_TIMEOUT,
                follow_redirects=True,
                headers=_HEADERS,
            ) as client:
                resp = await client.get(url)
                resp.raise_for_status()
                content_type = resp.headers.get("content-type", "")
                if "text" not in content_type and "html" not in content_type:
                    return ToolResult(
                        success=False,
                        error=f"Non-text content type: {content_type}. Only text/HTML pages are supported.",
                    )
                raw = resp.content[:MAX_RESPONSE_BYTES * 2]  # read more, truncate after parse
                text = _extract_text(raw)
                if await is_option_enabled("system:security_truncation_enabled"):
                    max_chars = await get_content_limit("system:security_max_web_chars", _DEFAULT_MAX_WEB_CHARS)
                    if len(text) > max_chars:
                        text = text[:max_chars]
                        text += f"\n\n[Content truncated at {max_chars:,} chars]"
                else:
                    # Legacy byte cap. Truncate the *encoded* bytes — the old
                    # char-based slice (MAX_RESPONSE_BYTES // 4 * 4 chars) was
                    # a no-op computation and could leave multi-byte text over
                    # the byte limit. errors="ignore" drops any multi-byte
                    # character split at the cut point.
                    encoded = text.encode()
                    if len(encoded) > MAX_RESPONSE_BYTES:
                        text = encoded[:MAX_RESPONSE_BYTES].decode("utf-8", "ignore")
                        text += f"\n\n[Content truncated at {MAX_RESPONSE_BYTES // 1024} KB]"
                text = await sanitize_external_content(text, source="web")
                return ToolResult(
                    success=True,
                    data={
                        "url": str(resp.url),
                        "content": text,
                        "status_code": resp.status_code,
                    },
                )
        except httpx.TimeoutException:
            return ToolResult(success=False, error=f"Request timed out after {REQUEST_TIMEOUT}s: {url}")
        except httpx.HTTPStatusError as e:
            return ToolResult(success=False, error=f"HTTP {e.response.status_code}: {url}")
        except Exception as e:
            return ToolResult(success=False, error=f"Fetch error: {e}")

    # ── search ────────────────────────────────────────────────────────────────
    async def _search(self, query: str, num_results: int) -> ToolResult:
        """Query DuckDuckGo's HTML endpoint (POST, with a GET fallback)."""
        # DuckDuckGo is Tier 1 — always allowed, no tier check needed
        ddg_url = f"https://html.duckduckgo.com/html/?q={quote_plus(query)}"
        try:
            async with httpx.AsyncClient(
                timeout=REQUEST_TIMEOUT,
                follow_redirects=True,
                headers={**_HEADERS, "Accept": "text/html"},
            ) as client:
                # Client headers already carry _HEADERS (+ Accept); the old
                # per-request headers=_HEADERS was redundant.
                resp = await client.post(
                    "https://html.duckduckgo.com/html/",
                    data={"q": query, "b": "", "kl": ""},
                )
                resp.raise_for_status()
                results = _parse_ddg_results(resp.text, num_results)
                if not results:
                    # Fallback: plain GET, reusing the configured client so
                    # timeout/redirect/header behaviour stays consistent
                    # (the old code built a second client without redirects).
                    resp = await client.get(ddg_url)
                    results = _parse_ddg_results(resp.text, num_results)
                # Sanitise snippets
                for r in results:
                    r["snippet"] = await sanitize_external_content(r["snippet"], source="web")
                return ToolResult(
                    success=True,
                    data={"query": query, "results": results, "count": len(results)},
                )
        except httpx.TimeoutException:
            return ToolResult(success=False, error=f"Search timed out after {REQUEST_TIMEOUT}s")
        except Exception as e:
            return ToolResult(success=False, error=f"Search error: {e}")
# ── HTML helpers ──────────────────────────────────────────────────────────────
def _extract_text(raw: bytes) -> str:
    """Turn raw HTML bytes into compact, readable plain text.

    Removes script/style and page-chrome elements, then normalises
    whitespace: each line is stripped, empty lines are dropped, and any
    remaining runs of blank lines are collapsed.
    """
    soup = BeautifulSoup(raw, "html.parser")
    # Drop elements that carry no readable content
    for noise in soup(["script", "style", "nav", "footer", "header", "aside", "noscript"]):
        noise.decompose()
    stripped = (line.strip() for line in soup.get_text(separator="\n").splitlines())
    cleaned = "\n".join(line for line in stripped if line)
    # Squash any residual runs of blank lines down to a single gap
    return re.sub(r"\n{3,}", "\n\n", cleaned)
def _parse_ddg_results(html: str, limit: int) -> list[dict]:
    """Parse a DuckDuckGo HTML results page into title/url/snippet dicts.

    Returns at most *limit* entries. ``.result__body`` elements are nested
    inside ``.result`` containers, so the combined selector can match the
    same logical result twice — entries are deduplicated on (title, url).
    """
    # Hoisted: the original re-imported these inside the loop on every hit.
    from urllib.parse import parse_qs, unquote

    soup = BeautifulSoup(html, "html.parser")
    results: list[dict] = []
    seen: set[tuple[str, str]] = set()
    for result in soup.select(".result__body, .result"):
        if len(results) >= limit:
            break
        title_el = result.select_one(".result__title, .result__a")
        url_el = result.select_one(".result__url, a.result__a")
        snippet_el = result.select_one(".result__snippet")
        title = title_el.get_text(strip=True) if title_el else ""
        url = ""
        if url_el:
            href = url_el.get("href", "")
            # DDG wraps URLs — extract real URL from the `uddg` query param
            if "uddg=" in href:
                qs = parse_qs(urlparse(href).query)
                url = unquote(qs.get("uddg", [""])[0])
            elif href.startswith("http"):
                url = href
            else:
                url = url_el.get_text(strip=True)
        snippet = snippet_el.get_text(strip=True) if snippet_el else ""
        if title and (url or snippet):
            key = (title, url)
            if key in seen:
                # Same hit matched via both .result and its .result__body
                continue
            seen.add(key)
            results.append({
                "title": title,
                "url": url,
                "snippet": snippet,
            })
    return results