Files
oai-web/server/brain/database.py
2026-04-08 12:43:24 +02:00

241 lines
7.5 KiB
Python

"""
brain/database.py — PostgreSQL + pgvector connection pool and schema.
Manages the asyncpg connection pool and initialises the thoughts table +
match_thoughts function on first startup.
"""
from __future__ import annotations

import json
import logging
import os
from typing import Any

import asyncpg
logger = logging.getLogger(__name__)
_pool: asyncpg.Pool | None = None
# ── Schema ────────────────────────────────────────────────────────────────────
# Idempotent DDL executed once at startup by init_brain_db():
#   * enables the pgvector extension,
#   * creates the `thoughts` table (1536-dim embeddings, JSONB metadata),
#   * builds an IVFFlat cosine ANN index (guarded by a DO block because
#     CREATE INDEX has no IF NOT EXISTS on older PostgreSQL),
#   * defines match_thoughts(), the similarity-search helper used by callers.
# NOTE(review): inside match_thoughts the unqualified names (id, content, …)
# collide with the RETURNS TABLE output parameters; in LANGUAGE sql functions
# the query's column names take precedence, so these resolve to thoughts.* —
# confirm against the deployed PostgreSQL version.
_SCHEMA_SQL = """
-- pgvector extension
CREATE EXTENSION IF NOT EXISTS vector;
-- Main thoughts table
CREATE TABLE IF NOT EXISTS thoughts (
id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
content TEXT NOT NULL,
embedding vector(1536),
metadata JSONB NOT NULL DEFAULT '{}',
created_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
updated_at TIMESTAMPTZ NOT NULL DEFAULT NOW()
);
-- IVFFlat index for fast approximate nearest-neighbour search.
-- Created only if it doesn't exist (pg doesn't support IF NOT EXISTS for indexes).
DO $$
BEGIN
IF NOT EXISTS (
SELECT 1 FROM pg_indexes
WHERE tablename = 'thoughts' AND indexname = 'thoughts_embedding_idx'
) THEN
CREATE INDEX thoughts_embedding_idx
ON thoughts USING ivfflat (embedding vector_cosine_ops)
WITH (lists = 100);
END IF;
END$$;
-- Semantic similarity search function
CREATE OR REPLACE FUNCTION match_thoughts(
query_embedding vector(1536),
match_threshold FLOAT DEFAULT 0.7,
match_count INT DEFAULT 10
)
RETURNS TABLE (
id UUID,
content TEXT,
metadata JSONB,
similarity FLOAT,
created_at TIMESTAMPTZ
)
LANGUAGE sql STABLE AS $$
SELECT
id,
content,
metadata,
1 - (embedding <=> query_embedding) AS similarity,
created_at
FROM thoughts
WHERE 1 - (embedding <=> query_embedding) > match_threshold
ORDER BY similarity DESC
LIMIT match_count;
$$;
"""
# ── Pool lifecycle ────────────────────────────────────────────────────────────
async def init_brain_db() -> None:
    """
    Create the connection pool and initialise the schema.

    Called from main.py lifespan. No-ops gracefully if BRAIN_DB_URL is unset.
    On any failure the partially created pool is closed and `_pool` is reset
    to None so the 2nd Brain feature is cleanly disabled instead of leaking
    open connections.
    """
    global _pool
    url = os.getenv("BRAIN_DB_URL")
    if not url:
        logger.info("BRAIN_DB_URL not set — 2nd Brain disabled")
        return
    try:
        _pool = await asyncpg.create_pool(url, min_size=1, max_size=5)
        async with _pool.acquire() as conn:
            await conn.execute(_SCHEMA_SQL)
            # Per-user brain namespace (3-G): add user_id column if it doesn't exist yet
            await conn.execute(
                "ALTER TABLE thoughts ADD COLUMN IF NOT EXISTS user_id TEXT"
            )
        logger.info("Brain DB initialised")
    except Exception as e:
        logger.error("Brain DB init failed: %s", e)
        # BUGFIX: if create_pool succeeded but schema init failed, the old code
        # dropped the pool reference without closing it, leaking connections.
        if _pool is not None:
            try:
                await _pool.close()
            except Exception:
                pass  # best-effort cleanup; the feature is being disabled anyway
        _pool = None
async def close_brain_db() -> None:
    """Shut down the connection pool (if any) and mark the feature disabled."""
    global _pool
    if _pool is None:
        return
    pool, _pool = _pool, None
    await pool.close()
def get_pool() -> asyncpg.Pool | None:
    """Return the live asyncpg pool, or None when the Brain DB is disabled."""
    return _pool
# ── CRUD helpers ──────────────────────────────────────────────────────────────
async def insert_thought(
    content: str,
    embedding: list[float],
    metadata: dict,
    user_id: str | None = None,
) -> str:
    """
    Insert a thought and return its UUID as text.

    Args:
        content: Raw text of the thought.
        embedding: Embedding vector (expected 1536-dim per the schema);
            serialised via str() to "[x, y, ...]", which pgvector accepts
            as a vector literal.
        metadata: JSON-serialisable dict stored in the JSONB column.
        user_id: Optional per-user namespace; None stores a global thought.

    Raises:
        RuntimeError: If the Brain DB pool is not available.
    """
    pool = get_pool()
    if pool is None:
        raise RuntimeError("Brain DB not available")
    async with pool.acquire() as conn:
        row = await conn.fetchrow(
            """
            INSERT INTO thoughts (content, embedding, metadata, user_id)
            VALUES ($1, $2::vector, $3::jsonb, $4)
            RETURNING id::text
            """,
            content,
            str(embedding),
            # Was `__import__("json").dumps(...)` — replaced with the normal
            # top-level `import json` for readability; behaviour is identical.
            json.dumps(metadata),
            user_id,
        )
    return row["id"]
async def search_thoughts(
    query_embedding: list[float],
    threshold: float = 0.7,
    limit: int = 10,
    user_id: str | None = None,
) -> list[dict]:
    """
    Return thoughts ranked by semantic similarity, scoped to user_id if set.

    BUGFIX: the previous implementation post-filtered the output of
    match_thoughts() by user_id, i.e. AFTER its internal LIMIT — so a user
    could receive fewer than `limit` results even when more of their rows
    matched — and the outer JOIN query had no ORDER BY, so similarity order
    was not guaranteed. The similarity query is now run inline with the user
    filter applied before LIMIT and an explicit ORDER BY.

    Args:
        query_embedding: Query vector (1536-dim per the schema).
        threshold: Minimum cosine similarity (1 - cosine distance) to include.
        limit: Maximum number of rows returned.
        user_id: If set, only that user's thoughts are searched.

    Raises:
        RuntimeError: If the Brain DB pool is not available.
    """
    pool = get_pool()
    if pool is None:
        raise RuntimeError("Brain DB not available")
    async with pool.acquire() as conn:
        rows = await conn.fetch(
            """
            SELECT id, content, metadata,
                   1 - (embedding <=> $1::vector) AS similarity,
                   created_at
            FROM thoughts
            WHERE ($4::text IS NULL OR user_id = $4::text)
              AND 1 - (embedding <=> $1::vector) > $2
            ORDER BY embedding <=> $1::vector  -- ascending distance == descending similarity
            LIMIT $3
            """,
            str(query_embedding),
            threshold,
            limit,
            user_id,
        )
    return [
        {
            "id": str(r["id"]),
            "content": r["content"],
            # asyncpg may hand JSONB back as str or as a mapping depending on codecs.
            "metadata": json.loads(r["metadata"]) if isinstance(r["metadata"], str) else dict(r["metadata"]),
            "similarity": round(float(r["similarity"]), 4),
            "created_at": r["created_at"].isoformat(),
        }
        for r in rows
    ]
async def browse_thoughts(
    limit: int = 20,
    type_filter: str | None = None,
    user_id: str | None = None,
) -> list[dict]:
    """Return recent thoughts, optionally filtered by metadata type and user."""
    import json as _json

    pool = get_pool()
    if pool is None:
        raise RuntimeError("Brain DB not available")
    async with pool.acquire() as conn:
        rows = await conn.fetch(
            """
            SELECT id::text, content, metadata, created_at
            FROM thoughts
            WHERE ($1::text IS NULL OR user_id = $1::text)
            AND ($2::text IS NULL OR metadata->>'type' = $2::text)
            ORDER BY created_at DESC
            LIMIT $3
            """,
            user_id,
            type_filter,
            limit,
        )
    results: list[dict] = []
    for rec in rows:
        meta = rec["metadata"]
        # asyncpg may return JSONB as a str or as a mapping depending on codecs.
        parsed = _json.loads(meta) if isinstance(meta, str) else dict(meta)
        results.append(
            {
                "id": str(rec["id"]),
                "content": rec["content"],
                "metadata": parsed,
                "created_at": rec["created_at"].isoformat(),
            }
        )
    return results
async def get_stats(user_id: str | None = None) -> dict:
    """Return aggregate stats about the thoughts database, scoped to user_id if set."""
    pool = get_pool()
    if pool is None:
        raise RuntimeError("Brain DB not available")
    async with pool.acquire() as conn:
        # Overall row count for this namespace (all rows when user_id is None).
        total = await conn.fetchval(
            "SELECT COUNT(*) FROM thoughts WHERE ($1::text IS NULL OR user_id = $1::text)",
            user_id,
        )
        # Breakdown by the metadata 'type' key, most common first.
        type_rows = await conn.fetch(
            """
            SELECT metadata->>'type' AS type, COUNT(*) AS count
            FROM thoughts
            WHERE ($1::text IS NULL OR user_id = $1::text)
            GROUP BY metadata->>'type'
            ORDER BY count DESC
            """,
            user_id,
        )
        # Timestamp of the newest thought, or None when the namespace is empty.
        latest = await conn.fetchval(
            "SELECT created_at FROM thoughts WHERE ($1::text IS NULL OR user_id = $1::text) ORDER BY created_at DESC LIMIT 1",
            user_id,
        )
    breakdown = []
    for rec in type_rows:
        breakdown.append({"type": rec["type"] or "unknown", "count": rec["count"]})
    return {
        "total": total,
        "by_type": breakdown,
        "most_recent": latest.isoformat() if latest else None,
    }