# oai-web/server/agents/runner.py

"""
agents/runner.py — Agent execution and APScheduler integration (async).

Owns the AsyncIOScheduler — schedules and runs all agents (cron + manual).
Each run is tracked in the agent_runs table with token counts.
"""
from __future__ import annotations

import asyncio
import logging
from datetime import datetime, timezone

from apscheduler.schedulers.asyncio import AsyncIOScheduler
from apscheduler.triggers.cron import CronTrigger

from ..agent.agent import Agent, DoneEvent, ErrorEvent
from ..config import settings
from ..database import credential_store
from . import tasks as agent_store

# Module-level logger; messages emitted from this file carry an "[agent-runner]" prefix.
logger = logging.getLogger(__name__)


# Priority levels for the run queue (lower number = higher priority).
# NOTE(review): none of these three constants is referenced by the code in this
# module — presumably they are consumed by callers; confirm before relying on them.
PRIORITY_HIGH   = 0  # User-initiated chat runs
PRIORITY_NORMAL = 1  # Webhook / inbox / Telegram triggers
PRIORITY_LOW    = 2  # Background monitors

# Fallback concurrency limit, used when the "system:max_concurrent_runs"
# credential is unset or cannot be parsed as an integer.
_DEFAULT_MAX_CONCURRENT = 3


class AgentRunner:
    """Schedule agents with APScheduler and execute their runs as asyncio tasks.

    Every run gets a record from ``agent_store.create_run`` and, while its
    task is alive, an entry in ``self._running`` (run_id → task). The number
    of runs executing at once is capped by a semaphore created in
    :meth:`start`; tasks beyond that cap wait on the semaphore.
    """

    def __init__(self) -> None:
        """Build the scheduler; no jobs are registered until :meth:`start`."""
        self._agent: Agent | None = None
        self._scheduler = AsyncIOScheduler(timezone=settings.timezone)
        self._running: dict[str, asyncio.Task] = {}  # run_id → asyncio.Task
        # Concurrency semaphore — initialised in start() once event loop is running
        self._semaphore: asyncio.Semaphore | None = None
        self._max_concurrent: int = _DEFAULT_MAX_CONCURRENT

    @property
    def scheduler(self) -> AsyncIOScheduler:
        """The underlying APScheduler instance."""
        return self._scheduler

    @property
    def queue_status(self) -> dict:
        """Return ``{"running", "queued", "max_concurrent"}`` counts.

        ``_running`` holds every unfinished task, including those still
        waiting for the semaphore, so anything above the concurrency cap is
        reported as queued.
        """
        active = sum(1 for task in self._running.values() if not task.done())
        running = min(active, self._max_concurrent)
        return {
            "running": running,
            "queued": active - running,
            "max_concurrent": self._max_concurrent,
        }

    def init(self, agent: Agent) -> None:
        """Attach the Agent used to execute runs; required before any run."""
        self._agent = agent

    async def _load_max_concurrent(self) -> int:
        """Read ``system:max_concurrent_runs``; fall back to the default.

        Values below 1 are clamped to 1; unset or unparsable values yield
        ``_DEFAULT_MAX_CONCURRENT``.
        """
        val = await credential_store.get("system:max_concurrent_runs")
        try:
            return max(1, int(val)) if val else _DEFAULT_MAX_CONCURRENT
        except (ValueError, TypeError):
            return _DEFAULT_MAX_CONCURRENT

    async def start(self) -> None:
        """Load all enabled agents with schedules into APScheduler and start it."""
        # Initialise concurrency semaphore (must happen inside a running event loop)
        self._max_concurrent = await self._load_max_concurrent()
        self._semaphore = asyncio.Semaphore(self._max_concurrent)

        for agent in await agent_store.list_agents():
            if agent["enabled"] and agent["schedule"]:
                self._add_job(agent)
        # Daily audit log rotation at 03:00
        self._scheduler.add_job(
            self._rotate_audit_log,
            trigger=CronTrigger(hour=3, minute=0, timezone=settings.timezone),
            id="system:audit-rotation",
            replace_existing=True,
            misfire_grace_time=3600,
        )
        self._scheduler.start()
        logger.info(
            "[agent-runner] Scheduler started, max_concurrent=%d", self._max_concurrent
        )

    def shutdown(self) -> None:
        """Stop the scheduler without waiting for running jobs."""
        if self._scheduler.running:
            self._scheduler.shutdown(wait=False)
        logger.info("[agent-runner] Scheduler stopped")

    def _add_job(self, agent: dict) -> None:
        """Register (or replace) the cron job for ``agent``.

        Failures — e.g. an invalid crontab expression — are logged and
        swallowed so that one bad agent does not prevent the others from
        being scheduled.
        """
        try:
            self._scheduler.add_job(
                self._run_agent_scheduled,
                trigger=CronTrigger.from_crontab(
                    agent["schedule"], timezone=settings.timezone
                ),
                id=f"agent:{agent['id']}",
                args=[agent["id"]],
                replace_existing=True,
                misfire_grace_time=300,
            )
            logger.info(
                "[agent-runner] Scheduled agent '%s' (%s)",
                agent["name"],
                agent["schedule"],
            )
        except Exception as e:
            logger.error(
                "[agent-runner] Failed to schedule agent '%s': %s", agent["name"], e
            )

    def _remove_job(self, job_id: str) -> None:
        """Remove ``job_id`` from the scheduler; a missing job is not an error."""
        from apscheduler.jobstores.base import JobLookupError

        try:
            self._scheduler.remove_job(job_id)
        except JobLookupError:
            pass  # Agent had no scheduled job — nothing to remove
        except Exception:
            # Still best-effort, but leave a trace instead of failing silently
            logger.exception("[agent-runner] Failed to remove job %s", job_id)

    def reschedule(self, agent: dict) -> None:
        """Drop the agent's existing job and re-add it if enabled and scheduled."""
        self._remove_job(f"agent:{agent['id']}")
        if agent["enabled"] and agent["schedule"]:
            self._add_job(agent)

    def remove(self, agent_id: str) -> None:
        """Unschedule the agent's cron job, if it has one."""
        self._remove_job(f"agent:{agent_id}")

    # ── Execution ─────────────────────────────────────────────────────────────

    async def run_agent_now(self, agent_id: str, override_message: str | None = None) -> dict:
        """UI-triggered run — bypasses schedule, returns run dict.

        Also runs agents that are disabled. On refusal the returned dict has
        an ``"error"`` key instead of a run record.
        """
        return await self._run_agent(agent_id, ignore_rate_limit=True, override_message=override_message)

    async def run_agent_and_wait(
        self,
        agent_id: str,
        override_message: str,
        session_id: str | None = None,
        extra_tools: list | None = None,
        force_only_extra_tools: bool = False,
        timeout: float = 300,
    ) -> str:
        """Run an agent, wait for it to finish, and return the final response text.

        Waits at most ``timeout`` seconds. The run itself is shielded, so it
        keeps going after a timeout; in that case (or when no result was
        stored) ``"(no response)"`` is returned. If the run could not be
        started, a ``"Could not run agent: …"`` message is returned.
        """
        run = await self._run_agent(
            agent_id,
            ignore_rate_limit=True,
            override_message=override_message,
            session_id=session_id,
            extra_tools=extra_tools,
            force_only_extra_tools=force_only_extra_tools,
        )
        if "id" not in run:
            logger.warning("[agent-runner] run_agent_and_wait failed for agent %s: %s", agent_id, run.get("error"))
            return f"Could not run agent: {run.get('error', 'unknown error')}"
        run_id = run["id"]
        task = self._running.get(run_id)
        if task is not None:
            try:
                await asyncio.wait_for(asyncio.shield(task), timeout=timeout)
            except asyncio.TimeoutError:
                logger.warning(
                    "[agent-runner] Gave up waiting for run %s after %ss",
                    run_id[:8],
                    timeout,
                )
            except asyncio.CancelledError:
                # Only swallow cancellation that came from the run's own task.
                # If the task was not cancelled, it is *this* coroutine that is
                # being cancelled, and that must propagate to the caller.
                if not task.cancelled():
                    raise
        row = await agent_store.get_run(run_id)
        return (row.get("result") or "(no response)") if row else "(no response)"

    async def _rotate_audit_log(self) -> None:
        """Called daily by APScheduler. Purges audit entries older than the configured retention.

        Does nothing when ``system:audit_retention_days`` is unset, not a
        valid integer, or not positive.
        """
        from ..audit import audit_log
        days_str = await credential_store.get("system:audit_retention_days")
        try:
            days = int(days_str) if days_str else 0
        except (ValueError, TypeError):
            # A malformed setting must not make the scheduled job raise every night
            logger.warning(
                "[agent-runner] Audit rotation skipped: invalid retention value %r",
                days_str,
            )
            return
        if days <= 0:
            return
        deleted = await audit_log.purge(older_than_days=days)
        logger.info("[agent-runner] Audit rotation: deleted %d entries older than %d days", deleted, days)

    async def _run_agent_scheduled(self, agent_id: str) -> None:
        """Called by APScheduler — fire and forget."""
        await self._run_agent(agent_id, ignore_rate_limit=False)

    async def _run_agent(
        self,
        agent_id: str,
        ignore_rate_limit: bool = False,
        override_message: str | None = None,
        session_id: str | None = None,
        extra_tools: list | None = None,
        force_only_extra_tools: bool = False,
    ) -> dict:
        """Validate, create a run record and start the run as a background task.

        Returns the run dict from ``agent_store.get_run`` as soon as the task
        has been created (it does not wait for completion), or
        ``{"error": <reason>}`` if the run was refused. ``ignore_rate_limit``
        lets a disabled agent run. A truthy ``override_message`` replaces the
        agent's own prompt as the user message.
        """
        agent_data = await agent_store.get_agent(agent_id)
        if not agent_data:
            logger.warning("[agent-runner] _run_agent: agent %s not found", agent_id)
            return {"error": "Agent not found"}
        if not agent_data["enabled"] and not ignore_rate_limit:
            logger.warning("[agent-runner] _run_agent: agent %s is disabled", agent_id)
            return {"error": "Agent is disabled"}

        # Kill switch
        if await credential_store.get("system:paused") == "1":
            logger.warning("[agent-runner] _run_agent: system is paused")
            return {"error": "Agent is paused"}

        if self._agent is None:
            logger.warning("[agent-runner] _run_agent: agent runner not initialized")
            return {"error": "Agent not initialized"}

        # allowed_tools is JSONB, normalised to list|None in _agent_row()
        raw = agent_data.get("allowed_tools")
        allowed_tools: list[str] | None = raw if raw else None

        # Resolve agent owner's admin status — bash is never available to non-admin owners
        # Also block execution if the owner account has been deactivated.
        owner_is_admin = True
        owner_id = agent_data.get("owner_user_id")
        if owner_id:
            from ..users import get_user_by_id as _get_user
            owner = await _get_user(owner_id)
            if owner and not owner.get("is_active", True):
                logger.warning(
                    "[agent-runner] Skipping agent '%s' — owner account is deactivated",
                    agent_data["name"],
                )
                return {"error": "Owner account is deactivated"}
            # NOTE(review): an owner_id whose user record is missing is treated as
            # admin (fail-open, keeps bash available) — confirm this is intended.
            owner_is_admin = (owner.get("role") == "admin") if owner else True

        if not owner_is_admin:
            if allowed_tools is None:
                candidates = [t.name for t in self._agent._registry.all_tools()]
            else:
                candidates = allowed_tools
            # NOTE(review): this can yield an empty list — verify Agent.run does
            # not interpret an empty allow-list as "no restriction".
            allowed_tools = [name for name in candidates if name != "bash"]

        # Create run record
        run = await agent_store.create_run(agent_id)
        run_id = run["id"]

        logger.info(
            "[agent-runner] Running agent '%s' run=%s", agent_data["name"], run_id[:8]
        )

        # Per-agent max_tool_calls override (None = use system default)
        max_tool_calls: int | None = agent_data.get("max_tool_calls") or None

        async def _execute() -> None:
            """Drive the agent stream to the end and record the outcome."""
            from ..agent.agent import _build_system_prompt

            prompt_mode = agent_data.get("prompt_mode") or "combined"
            agent_prompt = agent_data["prompt"]
            # Without an override the agent's own prompt doubles as the user message
            run_message = override_message or agent_prompt

            system_override: str | None = None
            if prompt_mode == "agent_only":
                system_override = agent_prompt
            elif prompt_mode == "combined":
                system_override = agent_prompt + "\n\n---\n\n" + await _build_system_prompt(user_id=owner_id)

            stream = await self._agent.run(
                message=run_message,
                session_id=session_id or f"agent:{run_id}",
                task_id=run_id,
                allowed_tools=allowed_tools,
                model=agent_data.get("model") or None,
                max_tool_calls=max_tool_calls,
                system_override=system_override,
                user_id=owner_id,
                extra_tools=extra_tools,
                force_only_extra_tools=force_only_extra_tools,
            )

            input_tokens = 0
            output_tokens = 0
            final_text = ""
            failure: str | None = None  # message of the ErrorEvent, if it was the last outcome
            async for event in stream:
                if isinstance(event, DoneEvent):
                    final_text = event.text or "Done"
                    input_tokens = event.usage.input_tokens
                    output_tokens = event.usage.output_tokens
                    failure = None
                elif isinstance(event, ErrorEvent):
                    final_text = f"Error: {event.message}"
                    failure = str(event.message)

            if failure is None:
                await agent_store.finish_run(
                    run_id,
                    status="success",
                    input_tokens=input_tokens,
                    output_tokens=output_tokens,
                    result=final_text,
                )
                logger.info(
                    "[agent-runner] Agent '%s' run=%s completed OK",
                    agent_data["name"],
                    run_id[:8],
                )
            else:
                # An error reported by the agent must not be stored as a success.
                # The "Error: …" text is still kept as the result so that
                # run_agent_and_wait() returns it unchanged.
                await agent_store.finish_run(
                    run_id,
                    status="error",
                    input_tokens=input_tokens,
                    output_tokens=output_tokens,
                    result=final_text,
                    error=failure,
                )
                logger.warning(
                    "[agent-runner] Agent '%s' run=%s ended with error: %s",
                    agent_data["name"],
                    run_id[:8],
                    failure,
                )

        async def _execute_with_semaphore() -> None:
            """Wait for a concurrency slot, run, and always close out the run.

            The handlers live here rather than in ``_execute`` so that a run
            cancelled while still queued for the semaphore is also marked
            "stopped" and removed from ``_running``.
            """
            try:
                if self._semaphore is not None:
                    async with self._semaphore:
                        await _execute()
                else:
                    await _execute()
            except asyncio.CancelledError:
                await agent_store.finish_run(run_id, status="stopped")
                logger.info("[agent-runner] Run %s stopped", run_id[:8])
            except Exception as e:
                logger.exception("[agent-runner] Run %s failed: %s", run_id[:8], e)
                await agent_store.finish_run(run_id, status="error", error=str(e))
            finally:
                self._running.pop(run_id, None)

        task = asyncio.create_task(_execute_with_semaphore())
        self._running[run_id] = task
        return await agent_store.get_run(run_id)

    def stop_run(self, run_id: str) -> bool:
        """Cancel the run's task; return True if a live task was cancelled."""
        task = self._running.get(run_id)
        if task and not task.done():
            task.cancel()
            return True
        return False

    def is_running(self, run_id: str) -> bool:
        """Return True while the run has a tracked task that has not finished."""
        task = self._running.get(run_id)
        return task is not None and not task.done()

    async def find_active_run(self, agent_id: str) -> str | None:
        """Return run_id of an in-progress run for this agent, or None."""
        # Iterate over a snapshot: get_run() yields to the event loop, and runs
        # finishing in the meantime pop themselves from _running, which would
        # raise "dictionary changed size during iteration" on the live dict.
        for run_id, task in list(self._running.items()):
            if task.done():
                continue
            run = await agent_store.get_run(run_id)
            if run and run["agent_id"] == agent_id:
                return run_id
        return None


# Module-level singleton. Constructing it only builds the scheduler object:
# init() must be called before any run can execute (otherwise _run_agent
# returns an "Agent not initialized" error), and start() registers the
# schedules and creates the concurrency semaphore.
agent_runner = AgentRunner()