""" monitors/page_monitor.py — Page change monitor. Polls watched URLs on a cron schedule, hashes the content, and dispatches an agent (or Pushover notification) when the page content changes. """ from __future__ import annotations import hashlib import logging import httpx from apscheduler.schedulers.asyncio import AsyncIOScheduler from apscheduler.triggers.cron import CronTrigger from ..config import settings from . import store logger = logging.getLogger(__name__) _DEFAULT_HEADERS = { "User-Agent": "Mozilla/5.0 (compatible; oAI-Web page-monitor/1.0)", } async def _fetch_page_content(url: str, css_selector: str | None = None) -> str: """Fetch URL and return plain text (optionally filtered by CSS selector).""" async with httpx.AsyncClient( timeout=30.0, follow_redirects=True, headers=_DEFAULT_HEADERS, ) as client: resp = await client.get(url) resp.raise_for_status() html = resp.text if css_selector: try: from bs4 import BeautifulSoup soup = BeautifulSoup(html, "html.parser") elements = soup.select(css_selector) return "\n".join(el.get_text(separator=" ", strip=True) for el in elements) except Exception as e: logger.warning("[page-monitor] CSS selector '%s' failed: %s", css_selector, e) try: from bs4 import BeautifulSoup soup = BeautifulSoup(html, "html.parser") for tag in soup(["script", "style", "nav", "footer", "header"]): tag.decompose() return soup.get_text(separator="\n", strip=True) except Exception: return html def _content_hash(text: str) -> str: return hashlib.sha256(text.encode("utf-8", errors="replace")).hexdigest() class PageMonitorManager: """Manages APScheduler jobs for all watched_pages entries.""" def __init__(self) -> None: self._scheduler: AsyncIOScheduler | None = None def init(self, scheduler: AsyncIOScheduler) -> None: """Share the AgentRunner's scheduler.""" self._scheduler = scheduler async def start_all(self) -> None: """Load all enabled watched pages and register APScheduler jobs.""" pages = await store.list_watched_pages() for page in pages: if page["enabled"]: self._add_job(page) logger.info("[page-monitor] Registered %d page monitor jobs", len([p for p in pages if p["enabled"]])) def _add_job(self, page: dict) -> None: if not self._scheduler: return try: self._scheduler.add_job( self._check_page, trigger=CronTrigger.from_crontab(page["schedule"], timezone=settings.timezone), id=f"page:{page['id']}", args=[str(page["id"])], replace_existing=True, misfire_grace_time=300, ) except Exception as e: logger.error("[page-monitor] Failed to schedule page '%s': %s", page["name"], e) def reschedule(self, page: dict) -> None: if not self._scheduler: return job_id = f"page:{page['id']}" try: self._scheduler.remove_job(job_id) except Exception: pass if page.get("enabled"): self._add_job(page) def remove(self, page_id: str) -> None: if not self._scheduler: return try: self._scheduler.remove_job(f"page:{page_id}") except Exception: pass async def check_now(self, page_id: str) -> dict: """Force-check a page immediately (UI-triggered). Returns status dict.""" return await self._check_page(page_id) async def _check_page(self, page_id: str) -> dict: page = await store.get_watched_page(page_id) if not page: return {"error": "Page not found"} logger.info("[page-monitor] Checking '%s' (%s)", page["name"], page["url"]) try: content = await _fetch_page_content(page["url"], page.get("css_selector")) except Exception as e: error_msg = str(e)[:200] logger.warning("[page-monitor] Failed to fetch '%s': %s", page["url"], error_msg) await store.update_page_check_result(page_id, None, False, error=error_msg) return {"error": error_msg} new_hash = _content_hash(content) old_hash = page.get("last_content_hash") changed = old_hash is not None and new_hash != old_hash await store.update_page_check_result(page_id, new_hash, changed) if changed: logger.info("[page-monitor] Change detected on '%s'", page["name"]) await self._dispatch_change(page, content) return {"changed": changed, "hash": new_hash, "first_check": old_hash is None} async def _dispatch_change(self, page: dict, content: str) -> None: mode = page.get("notification_mode", "agent") message = ( f"Page change detected: {page['name']}\n" f"URL: {page['url']}\n\n" f"Current content (first 2000 chars):\n{content[:2000]}" ) if mode in ("pushover", "both"): try: from ..tools.pushover_tool import PushoverTool await PushoverTool().execute( title=f"Page changed: {page['name']}", message=f"{page['url']} has new content.", priority=0, ) except Exception as e: logger.warning("[page-monitor] Pushover notify failed: %s", e) if mode in ("agent", "both"): agent_id = page.get("agent_id") if agent_id: try: from ..agents.runner import agent_runner await agent_runner.run_agent_now( agent_id=agent_id, override_message=message, ) except Exception as e: logger.warning("[page-monitor] Agent dispatch failed: %s", e) page_monitor = PageMonitorManager()