""" tools/web_tool.py — Tiered web access. Tier 1: Domains in WEB_TIER1_WHITELIST — always allowed. Tier 2: Any other domain — allowed only when web_tier2_enabled is True in the current execution context (set by the agent loop when the user explicitly requests external web research) or when running as a scheduled task that declared web access. DuckDuckGo search uses the HTML endpoint (no API key required). """ from __future__ import annotations import re from urllib.parse import quote_plus, urlparse import httpx from bs4 import BeautifulSoup from ..context_vars import current_task_id, web_tier2_enabled from ..security import SecurityError, assert_domain_tier1, sanitize_external_content from ..security_screening import get_content_limit, is_option_enabled from .base import BaseTool, ToolResult MAX_RESPONSE_BYTES = 50 * 1024 # 50 KB (legacy fallback when truncation option disabled) _DEFAULT_MAX_WEB_CHARS = 20_000 # default when truncation option is enabled REQUEST_TIMEOUT = 10 # seconds MAX_SEARCH_RESULTS = 10 _HEADERS = { "User-Agent": ( "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) " "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36" ), "Accept-Language": "en-US,en;q=0.9,nb;q=0.8", } class WebTool(BaseTool): name = "web" description = ( "Fetch web pages and search the web. " "Operations: fetch_page (retrieve and extract text from a URL), " "search (DuckDuckGo search, returns titles, URLs and snippets). " "Commonly used sites (Wikipedia, yr.no, etc.) are always available. " "Other sites require the user to have initiated a web research task." ) input_schema = { "type": "object", "properties": { "operation": { "type": "string", "enum": ["fetch_page", "search"], "description": "fetch_page retrieves a URL; search queries DuckDuckGo", }, "url": { "type": "string", "description": "URL to fetch (required for fetch_page)", }, "query": { "type": "string", "description": "Search query (required for search)", }, "num_results": { "type": "integer", "description": f"Max search results to return (default 5, max {MAX_SEARCH_RESULTS})", }, }, "required": ["operation"], } requires_confirmation = False allowed_in_scheduled_tasks = True async def execute( self, operation: str, url: str = "", query: str = "", num_results: int = 5, **kwargs, ) -> ToolResult: if operation == "fetch_page": if not url: return ToolResult(success=False, error="url is required for fetch_page") return await self._fetch_page(url) if operation == "search": if not query: return ToolResult(success=False, error="query is required for search") return await self._search(query, min(num_results, MAX_SEARCH_RESULTS)) return ToolResult(success=False, error=f"Unknown operation: {operation!r}") # ── Tier check ──────────────────────────────────────────────────────────── async def _check_tier(self, url: str) -> ToolResult | None: """ Returns a ToolResult(success=False) if access is denied, None if allowed. Tier 1 is always allowed. Tier 2 requires context flag or scheduled task. """ if await assert_domain_tier1(url): return None # Tier 1 — always allowed # Tier 2 — check context task_id = current_task_id.get() tier2 = web_tier2_enabled.get() if task_id is not None: # Scheduled tasks that declared web access can use Tier 2 return None if tier2: return None # User explicitly initiated web research parsed = urlparse(url) return ToolResult( success=False, error=( f"Domain '{parsed.hostname}' is not in the Tier 1 whitelist. " "To access it, ask me to search the web or fetch a specific external page — " "I'll enable Tier 2 access for your request." ), ) # ── fetch_page ──────────────────────────────────────────────────────────── async def _fetch_page(self, url: str) -> ToolResult: denied = await self._check_tier(url) if denied: return denied try: async with httpx.AsyncClient( timeout=REQUEST_TIMEOUT, follow_redirects=True, headers=_HEADERS, ) as client: resp = await client.get(url) resp.raise_for_status() content_type = resp.headers.get("content-type", "") if "text" not in content_type and "html" not in content_type: return ToolResult( success=False, error=f"Non-text content type: {content_type}. Only text/HTML pages are supported.", ) raw = resp.content[:MAX_RESPONSE_BYTES * 2] # read more, truncate after parse text = _extract_text(raw) if await is_option_enabled("system:security_truncation_enabled"): max_chars = await get_content_limit("system:security_max_web_chars", _DEFAULT_MAX_WEB_CHARS) if len(text) > max_chars: text = text[:max_chars] text += f"\n\n[Content truncated at {max_chars:,} chars]" elif len(text.encode()) > MAX_RESPONSE_BYTES: text = text[: MAX_RESPONSE_BYTES // 4 * 4] # char-safe truncation text += f"\n\n[Content truncated at {MAX_RESPONSE_BYTES // 1024} KB]" text = await sanitize_external_content(text, source="web") return ToolResult( success=True, data={ "url": str(resp.url), "content": text, "status_code": resp.status_code, }, ) except httpx.TimeoutException: return ToolResult(success=False, error=f"Request timed out after {REQUEST_TIMEOUT}s: {url}") except httpx.HTTPStatusError as e: return ToolResult(success=False, error=f"HTTP {e.response.status_code}: {url}") except Exception as e: return ToolResult(success=False, error=f"Fetch error: {e}") # ── search ──────────────────────────────────────────────────────────────── async def _search(self, query: str, num_results: int) -> ToolResult: # DuckDuckGo is Tier 1 — always allowed, no tier check needed ddg_url = f"https://html.duckduckgo.com/html/?q={quote_plus(query)}" try: async with httpx.AsyncClient( timeout=REQUEST_TIMEOUT, follow_redirects=True, headers={**_HEADERS, "Accept": "text/html"}, ) as client: resp = await client.post( "https://html.duckduckgo.com/html/", data={"q": query, "b": "", "kl": ""}, headers=_HEADERS, ) resp.raise_for_status() results = _parse_ddg_results(resp.text, num_results) if not results: # Fallback: try GET async with httpx.AsyncClient(timeout=REQUEST_TIMEOUT, headers=_HEADERS) as client: resp = await client.get(ddg_url) results = _parse_ddg_results(resp.text, num_results) # Sanitise snippets for r in results: r["snippet"] = await sanitize_external_content(r["snippet"], source="web") return ToolResult( success=True, data={"query": query, "results": results, "count": len(results)}, ) except httpx.TimeoutException: return ToolResult(success=False, error=f"Search timed out after {REQUEST_TIMEOUT}s") except Exception as e: return ToolResult(success=False, error=f"Search error: {e}") # ── HTML helpers ────────────────────────────────────────────────────────────── def _extract_text(raw: bytes) -> str: """Strip HTML, scripts, styles and return clean readable text.""" soup = BeautifulSoup(raw, "html.parser") # Remove noise elements for tag in soup(["script", "style", "nav", "footer", "header", "aside", "noscript"]): tag.decompose() # Get text with spacing text = soup.get_text(separator="\n") # Collapse whitespace lines = [line.strip() for line in text.splitlines()] lines = [l for l in lines if l] text = "\n".join(lines) # Collapse multiple blank lines text = re.sub(r"\n{3,}", "\n\n", text) return text def _parse_ddg_results(html: str, limit: int) -> list[dict]: """Parse DuckDuckGo HTML results page.""" soup = BeautifulSoup(html, "html.parser") results = [] for result in soup.select(".result__body, .result"): if len(results) >= limit: break title_el = result.select_one(".result__title, .result__a") url_el = result.select_one(".result__url, a.result__a") snippet_el = result.select_one(".result__snippet") title = title_el.get_text(strip=True) if title_el else "" url = "" if url_el: href = url_el.get("href", "") # DDG wraps URLs — extract real URL if "uddg=" in href: from urllib.parse import unquote, parse_qs qs = parse_qs(urlparse(href).query) url = unquote(qs.get("uddg", [""])[0]) elif href.startswith("http"): url = href else: url = url_el.get_text(strip=True) snippet = snippet_el.get_text(strip=True) if snippet_el else "" if title and (url or snippet): results.append({ "title": title, "url": url, "snippet": snippet, }) return results