# oai-web/server/tools/web_tool.py
"""
tools/web_tool.py — Tiered web access.
Tier 1: Domains in WEB_TIER1_WHITELIST — always allowed.
Tier 2: Any other domain — allowed only when web_tier2_enabled is True
in the current execution context (set by the agent loop when the
user explicitly requests external web research) or when running
as a scheduled task that declared web access.
DuckDuckGo search uses the HTML endpoint (no API key required).
"""
from __future__ import annotations
import re
from urllib.parse import quote_plus, urlparse
import httpx
from bs4 import BeautifulSoup
from ..context_vars import current_task_id, web_tier2_enabled
from ..security import SecurityError, assert_domain_tier1, sanitize_external_content
from ..security_screening import get_content_limit, is_option_enabled
from .base import BaseTool, ToolResult
# Hard cap on returned page text when the security truncation option is OFF.
MAX_RESPONSE_BYTES = 50 * 1024  # 50 KB (legacy fallback when truncation option disabled)
# Character cap used when the truncation option IS enabled (overridable via
# the "system:security_max_web_chars" content limit).
_DEFAULT_MAX_WEB_CHARS = 20_000  # default when truncation option is enabled
REQUEST_TIMEOUT = 10  # seconds
MAX_SEARCH_RESULTS = 10
# Browser-like request headers — presumably to avoid sites rejecting the
# default httpx User-Agent (NOTE(review): assumption, confirm if needed).
_HEADERS = {
    "User-Agent": (
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
        "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
    ),
    "Accept-Language": "en-US,en;q=0.9,nb;q=0.8",
}
class WebTool(BaseTool):
    """Tiered web access: page fetching and DuckDuckGo search.

    Tier 1 (whitelisted domains, checked via ``assert_domain_tier1``) is
    always allowed. Tier 2 (any other domain) is allowed only when the
    ``web_tier2_enabled`` context flag is set, or when running inside a
    scheduled task (``current_task_id`` is set).
    """

    name = "web"
    description = (
        "Fetch web pages and search the web. "
        "Operations: fetch_page (retrieve and extract text from a URL), "
        "search (DuckDuckGo search, returns titles, URLs and snippets). "
        "Commonly used sites (Wikipedia, yr.no, etc.) are always available. "
        "Other sites require the user to have initiated a web research task."
    )
    input_schema = {
        "type": "object",
        "properties": {
            "operation": {
                "type": "string",
                "enum": ["fetch_page", "search"],
                "description": "fetch_page retrieves a URL; search queries DuckDuckGo",
            },
            "url": {
                "type": "string",
                "description": "URL to fetch (required for fetch_page)",
            },
            "query": {
                "type": "string",
                "description": "Search query (required for search)",
            },
            "num_results": {
                "type": "integer",
                "description": f"Max search results to return (default 5, max {MAX_SEARCH_RESULTS})",
            },
        },
        "required": ["operation"],
    }
    requires_confirmation = False
    allowed_in_scheduled_tasks = True

    async def execute(
        self,
        operation: str,
        url: str = "",
        query: str = "",
        num_results: int = 5,
        **kwargs,
    ) -> ToolResult:
        """Dispatch to fetch_page/search after validating required args.

        Returns a failed ToolResult for an unknown operation or a missing
        required argument.
        """
        if operation == "fetch_page":
            if not url:
                return ToolResult(success=False, error="url is required for fetch_page")
            return await self._fetch_page(url)
        if operation == "search":
            if not query:
                return ToolResult(success=False, error="query is required for search")
            # Clamp to [1, MAX_SEARCH_RESULTS]: the previous min()-only clamp
            # let a zero/negative num_results silently return no results.
            return await self._search(query, max(1, min(num_results, MAX_SEARCH_RESULTS)))
        return ToolResult(success=False, error=f"Unknown operation: {operation!r}")

    # ── Tier check ────────────────────────────────────────────────────────────
    async def _check_tier(self, url: str) -> ToolResult | None:
        """
        Returns a ToolResult(success=False) if access is denied, None if allowed.
        Tier 1 is always allowed. Tier 2 requires context flag or scheduled task.
        """
        if await assert_domain_tier1(url):
            return None  # Tier 1 — always allowed
        # Tier 2 — check context
        if current_task_id.get() is not None:
            # Scheduled tasks that declared web access can use Tier 2
            return None
        if web_tier2_enabled.get():
            return None  # User explicitly initiated web research
        parsed = urlparse(url)
        return ToolResult(
            success=False,
            error=(
                f"Domain '{parsed.hostname}' is not in the Tier 1 whitelist. "
                "To access it, ask me to search the web or fetch a specific external page — "
                "I'll enable Tier 2 access for your request."
            ),
        )

    # ── fetch_page ────────────────────────────────────────────────────────────
    async def _fetch_page(self, url: str) -> ToolResult:
        """Fetch *url*, extract readable text, truncate and sanitise it.

        Only text/HTML content types are accepted. Truncation uses the
        configurable char limit when the security truncation option is on,
        otherwise the legacy MAX_RESPONSE_BYTES byte cap.
        """
        denied = await self._check_tier(url)
        if denied:
            return denied
        try:
            async with httpx.AsyncClient(
                timeout=REQUEST_TIMEOUT,
                follow_redirects=True,
                headers=_HEADERS,
            ) as client:
                resp = await client.get(url)
                resp.raise_for_status()
                content_type = resp.headers.get("content-type", "")
                if "text" not in content_type and "html" not in content_type:
                    return ToolResult(
                        success=False,
                        error=f"Non-text content type: {content_type}. Only text/HTML pages are supported.",
                    )
                raw = resp.content[:MAX_RESPONSE_BYTES * 2]  # read more, truncate after parse
                text = _extract_text(raw)
                if await is_option_enabled("system:security_truncation_enabled"):
                    max_chars = await get_content_limit("system:security_max_web_chars", _DEFAULT_MAX_WEB_CHARS)
                    if len(text) > max_chars:
                        text = text[:max_chars]
                        text += f"\n\n[Content truncated at {max_chars:,} chars]"
                else:
                    # Legacy byte cap. Truncate the *encoded* bytes — the old
                    # char-based slice (MAX_RESPONSE_BYTES // 4 * 4 chars) was
                    # a no-op computation and could leave multi-byte text over
                    # the byte limit. errors="ignore" drops any multi-byte
                    # character split at the cut point.
                    encoded = text.encode()
                    if len(encoded) > MAX_RESPONSE_BYTES:
                        text = encoded[:MAX_RESPONSE_BYTES].decode("utf-8", "ignore")
                        text += f"\n\n[Content truncated at {MAX_RESPONSE_BYTES // 1024} KB]"
                text = await sanitize_external_content(text, source="web")
                return ToolResult(
                    success=True,
                    data={
                        "url": str(resp.url),
                        "content": text,
                        "status_code": resp.status_code,
                    },
                )
        except httpx.TimeoutException:
            return ToolResult(success=False, error=f"Request timed out after {REQUEST_TIMEOUT}s: {url}")
        except httpx.HTTPStatusError as e:
            return ToolResult(success=False, error=f"HTTP {e.response.status_code}: {url}")
        except Exception as e:
            return ToolResult(success=False, error=f"Fetch error: {e}")

    # ── search ────────────────────────────────────────────────────────────────
    async def _search(self, query: str, num_results: int) -> ToolResult:
        """Query DuckDuckGo's HTML endpoint (POST, with a GET fallback)."""
        # DuckDuckGo is Tier 1 — always allowed, no tier check needed
        ddg_url = f"https://html.duckduckgo.com/html/?q={quote_plus(query)}"
        try:
            async with httpx.AsyncClient(
                timeout=REQUEST_TIMEOUT,
                follow_redirects=True,
                headers={**_HEADERS, "Accept": "text/html"},
            ) as client:
                # Client headers already carry _HEADERS (+ Accept); the old
                # per-request headers=_HEADERS was redundant.
                resp = await client.post(
                    "https://html.duckduckgo.com/html/",
                    data={"q": query, "b": "", "kl": ""},
                )
                resp.raise_for_status()
                results = _parse_ddg_results(resp.text, num_results)
                if not results:
                    # Fallback: plain GET, reusing the configured client so
                    # timeout/redirect/header behaviour stays consistent
                    # (the old code built a second client without redirects).
                    resp = await client.get(ddg_url)
                    results = _parse_ddg_results(resp.text, num_results)
                # Sanitise snippets
                for r in results:
                    r["snippet"] = await sanitize_external_content(r["snippet"], source="web")
                return ToolResult(
                    success=True,
                    data={"query": query, "results": results, "count": len(results)},
                )
        except httpx.TimeoutException:
            return ToolResult(success=False, error=f"Search timed out after {REQUEST_TIMEOUT}s")
        except Exception as e:
            return ToolResult(success=False, error=f"Search error: {e}")
# ── HTML helpers ──────────────────────────────────────────────────────────────
def _extract_text(raw: bytes) -> str:
    """Turn raw HTML bytes into compact, readable plain text.

    Removes script/style and page-chrome elements, then normalises
    whitespace: each line is stripped, empty lines are dropped, and any
    remaining runs of blank lines are collapsed.
    """
    soup = BeautifulSoup(raw, "html.parser")
    # Drop elements that carry no readable content
    for noise in soup(["script", "style", "nav", "footer", "header", "aside", "noscript"]):
        noise.decompose()
    stripped = (line.strip() for line in soup.get_text(separator="\n").splitlines())
    cleaned = "\n".join(line for line in stripped if line)
    # Squash any residual runs of blank lines down to a single gap
    return re.sub(r"\n{3,}", "\n\n", cleaned)
def _parse_ddg_results(html: str, limit: int) -> list[dict]:
    """Parse a DuckDuckGo HTML results page into title/url/snippet dicts.

    Returns at most *limit* entries. ``.result__body`` elements are nested
    inside ``.result`` containers, so the combined selector can match the
    same logical result twice — entries are deduplicated on (title, url).
    """
    # Hoisted: the original re-imported these inside the loop on every hit.
    from urllib.parse import parse_qs, unquote

    soup = BeautifulSoup(html, "html.parser")
    results: list[dict] = []
    seen: set[tuple[str, str]] = set()
    for result in soup.select(".result__body, .result"):
        if len(results) >= limit:
            break
        title_el = result.select_one(".result__title, .result__a")
        url_el = result.select_one(".result__url, a.result__a")
        snippet_el = result.select_one(".result__snippet")
        title = title_el.get_text(strip=True) if title_el else ""
        url = ""
        if url_el:
            href = url_el.get("href", "")
            # DDG wraps URLs — extract real URL from the `uddg` query param
            if "uddg=" in href:
                qs = parse_qs(urlparse(href).query)
                url = unquote(qs.get("uddg", [""])[0])
            elif href.startswith("http"):
                url = href
            else:
                url = url_el.get_text(strip=True)
        snippet = snippet_el.get_text(strip=True) if snippet_el else ""
        if title and (url or snippet):
            key = (title, url)
            if key in seen:
                # Same hit matched via both .result and its .result__body
                continue
            seen.add(key)
            results.append({
                "title": title,
                "url": url,
                "snippet": snippet,
            })
    return results