Initial commit
This commit is contained in:
279
server/tools/web_tool.py
Normal file
279
server/tools/web_tool.py
Normal file
@@ -0,0 +1,279 @@
|
||||
"""
|
||||
tools/web_tool.py — Tiered web access.
|
||||
|
||||
Tier 1: Domains in WEB_TIER1_WHITELIST — always allowed.
|
||||
Tier 2: Any other domain — allowed only when web_tier2_enabled is True
|
||||
in the current execution context (set by the agent loop when the
|
||||
user explicitly requests external web research) or when running
|
||||
as a scheduled task that declared web access.
|
||||
|
||||
DuckDuckGo search uses the HTML endpoint (no API key required).
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import re
|
||||
from urllib.parse import quote_plus, urlparse
|
||||
|
||||
import httpx
|
||||
from bs4 import BeautifulSoup
|
||||
|
||||
from ..context_vars import current_task_id, web_tier2_enabled
|
||||
from ..security import SecurityError, assert_domain_tier1, sanitize_external_content
|
||||
from ..security_screening import get_content_limit, is_option_enabled
|
||||
from .base import BaseTool, ToolResult
|
||||
|
||||
# Hard cap on page content when the security truncation option is disabled.
MAX_RESPONSE_BYTES = 50 * 1024  # 50 KB (legacy fallback when truncation option disabled)
# Character limit used when the truncation option IS enabled (overridable
# via the "system:security_max_web_chars" content limit).
_DEFAULT_MAX_WEB_CHARS = 20_000  # default when truncation option is enabled
REQUEST_TIMEOUT = 10  # seconds
MAX_SEARCH_RESULTS = 10

# Browser-like request headers; sent on every fetch and search request.
_HEADERS = {
    "User-Agent": (
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
        "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
    ),
    "Accept-Language": "en-US,en;q=0.9,nb;q=0.8",
}
|
||||
|
||||
|
||||
class WebTool(BaseTool):
    """Fetch web pages and run DuckDuckGo searches under the two-tier domain policy.

    Tier 1 domains (per ``assert_domain_tier1``) are always allowed; any other
    domain requires either a scheduled-task context or the ``web_tier2_enabled``
    context flag. All external text is passed through
    ``sanitize_external_content`` before being returned.
    """

    name = "web"
    description = (
        "Fetch web pages and search the web. "
        "Operations: fetch_page (retrieve and extract text from a URL), "
        "search (DuckDuckGo search, returns titles, URLs and snippets). "
        "Commonly used sites (Wikipedia, yr.no, etc.) are always available. "
        "Other sites require the user to have initiated a web research task."
    )
    input_schema = {
        "type": "object",
        "properties": {
            "operation": {
                "type": "string",
                "enum": ["fetch_page", "search"],
                "description": "fetch_page retrieves a URL; search queries DuckDuckGo",
            },
            "url": {
                "type": "string",
                "description": "URL to fetch (required for fetch_page)",
            },
            "query": {
                "type": "string",
                "description": "Search query (required for search)",
            },
            "num_results": {
                "type": "integer",
                "description": f"Max search results to return (default 5, max {MAX_SEARCH_RESULTS})",
            },
        },
        "required": ["operation"],
    }
    requires_confirmation = False
    allowed_in_scheduled_tasks = True

    async def execute(
        self,
        operation: str,
        url: str = "",
        query: str = "",
        num_results: int = 5,
        **kwargs,
    ) -> ToolResult:
        """Dispatch to fetch_page or search; failed ToolResult on bad arguments."""
        if operation == "fetch_page":
            if not url:
                return ToolResult(success=False, error="url is required for fetch_page")
            return await self._fetch_page(url)

        if operation == "search":
            if not query:
                return ToolResult(success=False, error="query is required for search")
            # Clamp to [1, MAX_SEARCH_RESULTS]: previously a zero/negative
            # num_results silently produced an empty result set.
            return await self._search(query, max(1, min(num_results, MAX_SEARCH_RESULTS)))

        return ToolResult(success=False, error=f"Unknown operation: {operation!r}")

    # ── Tier check ────────────────────────────────────────────────────────────

    async def _check_tier(self, url: str) -> ToolResult | None:
        """
        Returns a ToolResult(success=False) if access is denied, None if allowed.
        Tier 1 is always allowed. Tier 2 requires context flag or scheduled task.
        """
        if await assert_domain_tier1(url):
            return None  # Tier 1 — always allowed

        # Tier 2 — check context
        task_id = current_task_id.get()
        tier2 = web_tier2_enabled.get()

        if task_id is not None:
            # Scheduled tasks that declared web access can use Tier 2.
            # NOTE(review): only the presence of a task id is checked here; the
            # "declared web access" part is presumably enforced where the task
            # is scheduled — confirm.
            return None

        if tier2:
            return None  # User explicitly initiated web research

        parsed = urlparse(url)
        return ToolResult(
            success=False,
            error=(
                f"Domain '{parsed.hostname}' is not in the Tier 1 whitelist. "
                "To access it, ask me to search the web or fetch a specific external page — "
                "I'll enable Tier 2 access for your request."
            ),
        )

    # ── fetch_page ────────────────────────────────────────────────────────────

    async def _fetch_page(self, url: str) -> ToolResult:
        """Fetch *url*, extract readable text, truncate, sanitise, and return it.

        Returns a failed ToolResult on tier denial, non-text content, timeout,
        HTTP error status, or any other fetch/parse failure.
        """
        denied = await self._check_tier(url)
        if denied:
            return denied

        try:
            async with httpx.AsyncClient(
                timeout=REQUEST_TIMEOUT,
                follow_redirects=True,
                headers=_HEADERS,
            ) as client:
                resp = await client.get(url)
                resp.raise_for_status()

                content_type = resp.headers.get("content-type", "")
                if "text" not in content_type and "html" not in content_type:
                    return ToolResult(
                        success=False,
                        error=f"Non-text content type: {content_type}. Only text/HTML pages are supported.",
                    )

                raw = resp.content[:MAX_RESPONSE_BYTES * 2]  # read more, truncate after parse
                text = _extract_text(raw)

                if await is_option_enabled("system:security_truncation_enabled"):
                    max_chars = await get_content_limit("system:security_max_web_chars", _DEFAULT_MAX_WEB_CHARS)
                    if len(text) > max_chars:
                        text = text[:max_chars]
                        text += f"\n\n[Content truncated at {max_chars:,} chars]"
                elif len(text.encode()) > MAX_RESPONSE_BYTES:
                    # Approximate byte cap: slices by characters, so multi-byte
                    # text may still slightly exceed MAX_RESPONSE_BYTES encoded.
                    # (Was `MAX_RESPONSE_BYTES // 4 * 4`, which equals
                    # MAX_RESPONSE_BYTES since 50*1024 is divisible by 4.)
                    text = text[:MAX_RESPONSE_BYTES]
                    text += f"\n\n[Content truncated at {MAX_RESPONSE_BYTES // 1024} KB]"

                text = await sanitize_external_content(text, source="web")

                return ToolResult(
                    success=True,
                    data={
                        "url": str(resp.url),
                        "content": text,
                        "status_code": resp.status_code,
                    },
                )

        except httpx.TimeoutException:
            return ToolResult(success=False, error=f"Request timed out after {REQUEST_TIMEOUT}s: {url}")
        except httpx.HTTPStatusError as e:
            return ToolResult(success=False, error=f"HTTP {e.response.status_code}: {url}")
        except Exception as e:
            return ToolResult(success=False, error=f"Fetch error: {e}")

    # ── search ────────────────────────────────────────────────────────────────

    async def _search(self, query: str, num_results: int) -> ToolResult:
        """Query the DuckDuckGo HTML endpoint; returns up to *num_results* hits.

        POSTs to the HTML endpoint first, falling back to a GET if no results
        were parsed. Snippets are sanitised before returning.
        """
        # DuckDuckGo is Tier 1 — always allowed, no tier check needed
        ddg_url = f"https://html.duckduckgo.com/html/?q={quote_plus(query)}"

        try:
            async with httpx.AsyncClient(
                timeout=REQUEST_TIMEOUT,
                follow_redirects=True,
                headers={**_HEADERS, "Accept": "text/html"},
            ) as client:
                # Client already carries _HEADERS (plus Accept); no need to
                # repeat them per-request as the original did.
                resp = await client.post(
                    "https://html.duckduckgo.com/html/",
                    data={"q": query, "b": "", "kl": ""},
                )
                resp.raise_for_status()

                results = _parse_ddg_results(resp.text, num_results)

                if not results:
                    # Fallback: try GET
                    async with httpx.AsyncClient(timeout=REQUEST_TIMEOUT, headers=_HEADERS) as client:
                        resp = await client.get(ddg_url)
                        # Surface HTTP errors instead of silently parsing an
                        # error page into an empty "successful" result.
                        resp.raise_for_status()
                        results = _parse_ddg_results(resp.text, num_results)

                # Sanitise snippets
                for r in results:
                    r["snippet"] = await sanitize_external_content(r["snippet"], source="web")

                return ToolResult(
                    success=True,
                    data={"query": query, "results": results, "count": len(results)},
                )

        except httpx.TimeoutException:
            return ToolResult(success=False, error=f"Search timed out after {REQUEST_TIMEOUT}s")
        except Exception as e:
            return ToolResult(success=False, error=f"Search error: {e}")
|
||||
|
||||
|
||||
# ── HTML helpers ──────────────────────────────────────────────────────────────
|
||||
|
||||
def _extract_text(raw: bytes) -> str:
    """Strip HTML, scripts, styles and return clean readable text."""
    soup = BeautifulSoup(raw, "html.parser")

    # Drop boilerplate elements that carry no readable content.
    for element in soup(["script", "style", "nav", "footer", "header", "aside", "noscript"]):
        element.decompose()

    # Extract text line-by-line, trimming each line and discarding empties.
    stripped_lines = (line.strip() for line in soup.get_text(separator="\n").splitlines())
    cleaned = "\n".join(line for line in stripped_lines if line)

    # Squash any remaining runs of 3+ newlines down to a single blank line.
    return re.sub(r"\n{3,}", "\n\n", cleaned)
|
||||
|
||||
|
||||
def _parse_ddg_results(html: str, limit: int) -> list[dict]:
    """Parse a DuckDuckGo HTML results page.

    Returns up to *limit* dicts with "title", "url" and "snippet" keys.
    Entries with no title, or with neither URL nor snippet, are skipped.
    """
    # Hoisted out of the per-result loop — the original re-ran this import
    # statement on every iteration that hit a wrapped URL.
    from urllib.parse import parse_qs, unquote

    soup = BeautifulSoup(html, "html.parser")
    results: list[dict] = []

    for result in soup.select(".result__body, .result"):
        if len(results) >= limit:
            break

        title_el = result.select_one(".result__title, .result__a")
        url_el = result.select_one(".result__url, a.result__a")
        snippet_el = result.select_one(".result__snippet")

        title = title_el.get_text(strip=True) if title_el else ""
        url = ""
        if url_el:
            href = url_el.get("href", "")
            # DDG wraps outbound links in a redirect; the real URL is in the
            # "uddg" query parameter.
            if "uddg=" in href:
                qs = parse_qs(urlparse(href).query)
                # NOTE(review): parse_qs already percent-decodes values, so the
                # extra unquote double-decodes URLs containing literal '%'
                # sequences. Kept as-is to preserve behavior — confirm intent.
                url = unquote(qs.get("uddg", [""])[0])
            elif href.startswith("http"):
                url = href
            else:
                # No usable href — fall back to the element's visible text.
                url = url_el.get_text(strip=True)

        snippet = snippet_el.get_text(strip=True) if snippet_el else ""

        if title and (url or snippet):
            results.append({
                "title": title,
                "url": url,
                "snippet": snippet,
            })

    return results
|
||||
Reference in New Issue
Block a user