# oai-web/server/brain/metadata.py

"""
brain/metadata.py — LLM-based metadata extraction.

Extracts structured metadata from a thought using a fast model (gpt-4o-mini
via OpenRouter). Returns type classification, tags, people, and action items.
"""
from __future__ import annotations

import json
import logging

# Module-level logger; extract_metadata() reports extraction failures through it.
logger = logging.getLogger(__name__)

# Model identifier passed as `model=` on the chat-completions request.
_MODEL = "openai/gpt-4o-mini"

# System message sent ahead of the user's thought. It asks the model for a bare
# JSON object with the keys "type", "tags", "people", and "action_items" — the
# same keys extract_metadata() reads back out of the reply.
_SYSTEM_PROMPT = """\
You are a metadata extractor for a personal knowledge base. Given a thought,
extract structured metadata and return ONLY valid JSON — no explanation, no markdown.

JSON schema:
{
  "type": "<one of: insight | person_note | task | reference | idea | other>",
  "tags": ["<2-5 lowercase topic tags>"],
  "people": ["<names of people mentioned, if any>"],
  "action_items": ["<concrete next actions, if any>"]
}

Rules:
- type: insight = general knowledge/observation, person_note = about a specific person,
  task = something to do, reference = link/resource/tool, idea = creative/speculative
- tags: short lowercase words, no spaces (use underscores if needed)
- people: first name or full name as written
- action_items: concrete, actionable phrases only — omit if none
- Keep all lists concise (max 5 items each)
"""


# Type labels the system prompt allows; anything else is normalized to "other".
_ALLOWED_TYPES = frozenset(
    {"insight", "person_note", "task", "reference", "idea", "other"}
)


def _fallback_metadata() -> dict:
    """Return the minimal metadata used when extraction is skipped or fails."""
    return {"type": "other", "tags": [], "people": [], "action_items": []}


def _normalize_type(value: object) -> str:
    """Map a model-supplied type onto one of the allowed labels."""
    if not isinstance(value, str):
        return "other"
    label = value.strip().lower()
    return label if label in _ALLOWED_TYPES else "other"


def _as_str_list(value: object) -> list[str]:
    """Coerce a model-supplied field into a list of non-empty strings."""
    if value is None:
        # An explicit JSON null means "nothing", not a reason to fail.
        return []
    if isinstance(value, str):
        # A bare string is a single item; iterating it would split it into
        # individual characters.
        value = [value]
    elif not isinstance(value, (list, tuple)):
        return []
    cleaned = (str(item).strip() for item in value if item is not None)
    return [item for item in cleaned if item]


async def extract_metadata(text: str) -> dict:
    """
    Extract type, tags, people, and action_items from a thought.

    Args:
        text: The raw thought to classify.

    Returns:
        A dict with the keys "type" (str), "tags", "people", and
        "action_items" (lists of str). The minimal fallback
        ({"type": "other"} with empty lists) is returned when the text is
        blank, when no OpenRouter API key is stored, or when the request or
        response parsing fails.

    Raises:
        Whatever the credential store lookup raises; only the model request
        and response handling are covered by the fallback.
    """
    # Nothing to classify: skip the credential lookup and the API call.
    if not text or not text.strip():
        return _fallback_metadata()

    from openai import AsyncOpenAI
    from ..database import credential_store

    api_key = await credential_store.get("system:openrouter_api_key")
    if not api_key:
        return _fallback_metadata()

    try:
        # The context manager closes the client's HTTP connection pool even
        # when the request raises.
        async with AsyncOpenAI(
            api_key=api_key,
            base_url="https://openrouter.ai/api/v1",
            default_headers={
                "HTTP-Referer": "https://mac.oai.pm",
                "X-Title": "oAI-Web",
            },
        ) as client:
            response = await client.chat.completions.create(
                model=_MODEL,
                messages=[
                    {"role": "system", "content": _SYSTEM_PROMPT},
                    {"role": "user", "content": text},
                ],
                temperature=0,
                max_tokens=256,
                response_format={"type": "json_object"},
            )

        raw = response.choices[0].message.content or "{}"
        data = json.loads(raw)
        if not isinstance(data, dict):
            raise ValueError(
                f"expected a JSON object, got {type(data).__name__}"
            )

        # Each field is normalized independently so that one malformed field
        # does not discard the valid ones.
        return {
            "type": _normalize_type(data.get("type")),
            "tags": _as_str_list(data.get("tags")),
            "people": _as_str_list(data.get("people")),
            "action_items": _as_str_list(data.get("action_items")),
        }
    except Exception as e:
        # Deliberately best-effort: metadata is optional enrichment, so any
        # failure degrades to the fallback instead of propagating.
        logger.warning("Metadata extraction failed: %s", e)
        return _fallback_metadata()