Initial commit
This commit is contained in:
279
server/tools/web_tool.py
Normal file
279
server/tools/web_tool.py
Normal file
@@ -0,0 +1,279 @@
|
||||
"""
|
||||
tools/web_tool.py — Tiered web access.
|
||||
|
||||
Tier 1: Domains in WEB_TIER1_WHITELIST — always allowed.
|
||||
Tier 2: Any other domain — allowed only when web_tier2_enabled is True
|
||||
in the current execution context (set by the agent loop when the
|
||||
user explicitly requests external web research) or when running
|
||||
as a scheduled task that declared web access.
|
||||
|
||||
DuckDuckGo search uses the HTML endpoint (no API key required).
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import re
|
||||
from urllib.parse import quote_plus, urlparse
|
||||
|
||||
import httpx
|
||||
from bs4 import BeautifulSoup
|
||||
|
||||
from ..context_vars import current_task_id, web_tier2_enabled
|
||||
from ..security import SecurityError, assert_domain_tier1, sanitize_external_content
|
||||
from ..security_screening import get_content_limit, is_option_enabled
|
||||
from .base import BaseTool, ToolResult
|
||||
|
||||
# Hard cap on page content when the security truncation option is disabled.
MAX_RESPONSE_BYTES = 50 * 1024  # 50 KB (legacy fallback when truncation option disabled)
# Character limit used when the truncation option IS enabled (overridable
# via the "system:security_max_web_chars" content limit).
_DEFAULT_MAX_WEB_CHARS = 20_000  # default when truncation option is enabled
REQUEST_TIMEOUT = 10  # seconds
MAX_SEARCH_RESULTS = 10

# Browser-like request headers; sent on every fetch and search request.
_HEADERS = {
    "User-Agent": (
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
        "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
    ),
    "Accept-Language": "en-US,en;q=0.9,nb;q=0.8",
}
|
||||
|
||||
|
||||
class WebTool(BaseTool):
    """Fetch web pages and run DuckDuckGo searches under the two-tier domain policy.

    Tier 1 domains (per ``assert_domain_tier1``) are always allowed; any other
    domain requires either a scheduled-task context or the ``web_tier2_enabled``
    context flag. All external text is passed through
    ``sanitize_external_content`` before being returned.
    """

    name = "web"
    description = (
        "Fetch web pages and search the web. "
        "Operations: fetch_page (retrieve and extract text from a URL), "
        "search (DuckDuckGo search, returns titles, URLs and snippets). "
        "Commonly used sites (Wikipedia, yr.no, etc.) are always available. "
        "Other sites require the user to have initiated a web research task."
    )
    input_schema = {
        "type": "object",
        "properties": {
            "operation": {
                "type": "string",
                "enum": ["fetch_page", "search"],
                "description": "fetch_page retrieves a URL; search queries DuckDuckGo",
            },
            "url": {
                "type": "string",
                "description": "URL to fetch (required for fetch_page)",
            },
            "query": {
                "type": "string",
                "description": "Search query (required for search)",
            },
            "num_results": {
                "type": "integer",
                "description": f"Max search results to return (default 5, max {MAX_SEARCH_RESULTS})",
            },
        },
        "required": ["operation"],
    }
    requires_confirmation = False
    allowed_in_scheduled_tasks = True

    async def execute(
        self,
        operation: str,
        url: str = "",
        query: str = "",
        num_results: int = 5,
        **kwargs,
    ) -> ToolResult:
        """Dispatch to fetch_page or search; failed ToolResult on bad arguments."""
        if operation == "fetch_page":
            if not url:
                return ToolResult(success=False, error="url is required for fetch_page")
            return await self._fetch_page(url)

        if operation == "search":
            if not query:
                return ToolResult(success=False, error="query is required for search")
            # Clamp to [1, MAX_SEARCH_RESULTS]: previously a zero/negative
            # num_results silently produced an empty result set.
            return await self._search(query, max(1, min(num_results, MAX_SEARCH_RESULTS)))

        return ToolResult(success=False, error=f"Unknown operation: {operation!r}")

    # ── Tier check ────────────────────────────────────────────────────────────

    async def _check_tier(self, url: str) -> ToolResult | None:
        """
        Returns a ToolResult(success=False) if access is denied, None if allowed.
        Tier 1 is always allowed. Tier 2 requires context flag or scheduled task.
        """
        if await assert_domain_tier1(url):
            return None  # Tier 1 — always allowed

        # Tier 2 — check context
        task_id = current_task_id.get()
        tier2 = web_tier2_enabled.get()

        if task_id is not None:
            # Scheduled tasks that declared web access can use Tier 2.
            # NOTE(review): only the presence of a task id is checked here; the
            # "declared web access" part is presumably enforced where the task
            # is scheduled — confirm.
            return None

        if tier2:
            return None  # User explicitly initiated web research

        parsed = urlparse(url)
        return ToolResult(
            success=False,
            error=(
                f"Domain '{parsed.hostname}' is not in the Tier 1 whitelist. "
                "To access it, ask me to search the web or fetch a specific external page — "
                "I'll enable Tier 2 access for your request."
            ),
        )

    # ── fetch_page ────────────────────────────────────────────────────────────

    async def _fetch_page(self, url: str) -> ToolResult:
        """Fetch *url*, extract readable text, truncate, sanitise, and return it.

        Returns a failed ToolResult on tier denial, non-text content, timeout,
        HTTP error status, or any other fetch/parse failure.
        """
        denied = await self._check_tier(url)
        if denied:
            return denied

        try:
            async with httpx.AsyncClient(
                timeout=REQUEST_TIMEOUT,
                follow_redirects=True,
                headers=_HEADERS,
            ) as client:
                resp = await client.get(url)
                resp.raise_for_status()

                content_type = resp.headers.get("content-type", "")
                if "text" not in content_type and "html" not in content_type:
                    return ToolResult(
                        success=False,
                        error=f"Non-text content type: {content_type}. Only text/HTML pages are supported.",
                    )

                raw = resp.content[:MAX_RESPONSE_BYTES * 2]  # read more, truncate after parse
                text = _extract_text(raw)

                if await is_option_enabled("system:security_truncation_enabled"):
                    max_chars = await get_content_limit("system:security_max_web_chars", _DEFAULT_MAX_WEB_CHARS)
                    if len(text) > max_chars:
                        text = text[:max_chars]
                        text += f"\n\n[Content truncated at {max_chars:,} chars]"
                elif len(text.encode()) > MAX_RESPONSE_BYTES:
                    # Approximate byte cap: slices by characters, so multi-byte
                    # text may still slightly exceed MAX_RESPONSE_BYTES encoded.
                    # (Was `MAX_RESPONSE_BYTES // 4 * 4`, which equals
                    # MAX_RESPONSE_BYTES since 50*1024 is divisible by 4.)
                    text = text[:MAX_RESPONSE_BYTES]
                    text += f"\n\n[Content truncated at {MAX_RESPONSE_BYTES // 1024} KB]"

                text = await sanitize_external_content(text, source="web")

                return ToolResult(
                    success=True,
                    data={
                        "url": str(resp.url),
                        "content": text,
                        "status_code": resp.status_code,
                    },
                )

        except httpx.TimeoutException:
            return ToolResult(success=False, error=f"Request timed out after {REQUEST_TIMEOUT}s: {url}")
        except httpx.HTTPStatusError as e:
            return ToolResult(success=False, error=f"HTTP {e.response.status_code}: {url}")
        except Exception as e:
            return ToolResult(success=False, error=f"Fetch error: {e}")

    # ── search ────────────────────────────────────────────────────────────────

    async def _search(self, query: str, num_results: int) -> ToolResult:
        """Query the DuckDuckGo HTML endpoint; returns up to *num_results* hits.

        POSTs to the HTML endpoint first, falling back to a GET if no results
        were parsed. Snippets are sanitised before returning.
        """
        # DuckDuckGo is Tier 1 — always allowed, no tier check needed
        ddg_url = f"https://html.duckduckgo.com/html/?q={quote_plus(query)}"

        try:
            async with httpx.AsyncClient(
                timeout=REQUEST_TIMEOUT,
                follow_redirects=True,
                headers={**_HEADERS, "Accept": "text/html"},
            ) as client:
                # Client already carries _HEADERS (plus Accept); no need to
                # repeat them per-request as the original did.
                resp = await client.post(
                    "https://html.duckduckgo.com/html/",
                    data={"q": query, "b": "", "kl": ""},
                )
                resp.raise_for_status()

                results = _parse_ddg_results(resp.text, num_results)

                if not results:
                    # Fallback: try GET
                    async with httpx.AsyncClient(timeout=REQUEST_TIMEOUT, headers=_HEADERS) as client:
                        resp = await client.get(ddg_url)
                        # Surface HTTP errors instead of silently parsing an
                        # error page into an empty "successful" result.
                        resp.raise_for_status()
                        results = _parse_ddg_results(resp.text, num_results)

                # Sanitise snippets
                for r in results:
                    r["snippet"] = await sanitize_external_content(r["snippet"], source="web")

                return ToolResult(
                    success=True,
                    data={"query": query, "results": results, "count": len(results)},
                )

        except httpx.TimeoutException:
            return ToolResult(success=False, error=f"Search timed out after {REQUEST_TIMEOUT}s")
        except Exception as e:
            return ToolResult(success=False, error=f"Search error: {e}")
|
||||
|
||||
|
||||
# ── HTML helpers ──────────────────────────────────────────────────────────────
|
||||
|
||||
def _extract_text(raw: bytes) -> str:
    """Strip HTML, scripts, styles and return clean readable text."""
    soup = BeautifulSoup(raw, "html.parser")

    # Drop boilerplate elements that carry no readable content.
    for element in soup(["script", "style", "nav", "footer", "header", "aside", "noscript"]):
        element.decompose()

    # Extract text line-by-line, trimming each line and discarding empties.
    stripped_lines = (line.strip() for line in soup.get_text(separator="\n").splitlines())
    cleaned = "\n".join(line for line in stripped_lines if line)

    # Squash any remaining runs of 3+ newlines down to a single blank line.
    return re.sub(r"\n{3,}", "\n\n", cleaned)
|
||||
|
||||
|
||||
def _parse_ddg_results(html: str, limit: int) -> list[dict]:
    """Parse a DuckDuckGo HTML results page.

    Returns up to *limit* dicts with "title", "url" and "snippet" keys.
    Entries with no title, or with neither URL nor snippet, are skipped.
    """
    # Hoisted out of the per-result loop — the original re-ran this import
    # statement on every iteration that hit a wrapped URL.
    from urllib.parse import parse_qs, unquote

    soup = BeautifulSoup(html, "html.parser")
    results: list[dict] = []

    for result in soup.select(".result__body, .result"):
        if len(results) >= limit:
            break

        title_el = result.select_one(".result__title, .result__a")
        url_el = result.select_one(".result__url, a.result__a")
        snippet_el = result.select_one(".result__snippet")

        title = title_el.get_text(strip=True) if title_el else ""
        url = ""
        if url_el:
            href = url_el.get("href", "")
            # DDG wraps outbound links in a redirect; the real URL is in the
            # "uddg" query parameter.
            if "uddg=" in href:
                qs = parse_qs(urlparse(href).query)
                # NOTE(review): parse_qs already percent-decodes values, so the
                # extra unquote double-decodes URLs containing literal '%'
                # sequences. Kept as-is to preserve behavior — confirm intent.
                url = unquote(qs.get("uddg", [""])[0])
            elif href.startswith("http"):
                url = href
            else:
                # No usable href — fall back to the element's visible text.
                url = url_el.get_text(strip=True)

        snippet = snippet_el.get_text(strip=True) if snippet_el else ""

        if title and (url or snippet):
            results.append({
                "title": title,
                "url": url,
                "snippet": snippet,
            })

    return results
|
||||
Reference in New Issue
Block a user