pg_orrery/search/alembic/versions/001_baseline.py
Ryan Malloy 317f74b33b Add search backend: FastAPI + FastMCP + pgvector for docs Q&A and live SQL
Search stack replicates the Hamilton site pattern with pg_orrery-specific
additions:

- FastAPI REST API (chat SSE streaming, semantic search, health check)
- FastMCP server at /mcp with doc search and live SQL query tools
- pgvector + pgai vectorizer for 1024-dim document embeddings
- Hybrid search (semantic cosine + text ILIKE with pg_trgm GIN)
- Dual LLM backend: self-hosted qwen3 via GPU gateway or Anthropic Claude
- Live read-only pg_orrery SQL execution with safety guardrails
  (SELECT-only validation, read-only transaction, 5s timeout, 100-row cap)
- Convenience MCP tools: planet_position, sky_survey, satellite_pass
- MDX content ingestion from docs/src/content/docs/ (50 pages)
- Docker Compose: pg_orrery+pgvector DB, pgai, vectorizer-worker, API
- Alembic async migrations, Makefile, .env.example
2026-03-01 15:42:14 -07:00

84 lines
2.7 KiB
Python

"""document table with pgai vectorizer
Revision ID: 001_baseline
Revises: None
Create Date: 2026-03-01
"""
import sqlalchemy as sa
from alembic import op
# Revision identifiers; the values mirror the "Revision ID" and "Revises"
# fields in the module docstring above.
revision = "001_baseline"
down_revision = None  # no parent revision ("Revises: None")
branch_labels = None
depends_on = None
def upgrade():
    """Create the ``document`` table, its trigram indexes, and the vectorizer.

    Steps, in execution order:

    1. Create ``document``. ``slug`` is unique; ``content_type`` and
       ``section`` are declared with ``index=True``.
    2. Ensure the ``pg_trgm`` extension exists, then add GIN trigram indexes
       on ``title`` and ``body``.
    3. Register the ``document_embedder`` pgai vectorizer, which loads the
       ``search_text`` column and is configured for 1024 dimensions.
    """
    document_columns = [
        sa.Column("id", sa.Integer, primary_key=True, autoincrement=True),
        sa.Column("content_type", sa.String(20), nullable=False, index=True),
        sa.Column("slug", sa.String(300), nullable=False, unique=True),
        sa.Column("title", sa.String(300), nullable=False),
        sa.Column("section", sa.String(200), nullable=False, index=True),
        sa.Column("description", sa.Text, nullable=True),
        sa.Column("body", sa.Text, nullable=False),
        sa.Column("search_text", sa.Text, nullable=True),
        sa.Column("url", sa.String(300), nullable=False),
        sa.Column("word_count", sa.Integer, nullable=False, server_default="0"),
        sa.Column(
            "updated_at",
            sa.DateTime(timezone=True),
            nullable=False,
            server_default=sa.func.now(),
        ),
    ]
    op.create_table("document", *document_columns)

    # pg_trgm provides gin_trgm_ops, which the two GIN indexes below rely on
    # (the intent being fast ILIKE matching on title and body).
    op.execute("CREATE EXTENSION IF NOT EXISTS pg_trgm")
    trigram_index_statements = (
        "CREATE INDEX ix_document_title_trgm ON document USING gin (title gin_trgm_ops)",
        "CREATE INDEX ix_document_body_trgm ON document USING gin (body gin_trgm_ops)",
    )
    for statement in trigram_index_statements:
        op.execute(statement)

    # pgai vectorizer: loads search_text, splits it into 400-character chunks
    # with a 50-character overlap, and embeds each chunk with
    # 'mxbai-embed-large' at 1024 dimensions. (The original author's note says
    # the model is served through the GPU embedding gateway.)
    # The statement text is kept flush-left so it is sent exactly as written.
    vectorizer_sql = """
SELECT ai.create_vectorizer(
'document'::regclass,
name => 'document_embedder',
loading => ai.loading_column(column_name => 'search_text'),
embedding => ai.embedding_openai(
model => 'mxbai-embed-large',
dimensions => 1024
),
chunking => ai.chunking_recursive_character_text_splitter(
chunk_size => 400,
chunk_overlap => 50,
separators => array[E'\\n\\n', E'\\n', '. ', ' ']
),
formatting => ai.formatting_python_template(
template => '$chunk'
)
)
"""
    op.execute(vectorizer_sql)
def downgrade():
    """Reverse ``upgrade()``: drop the vectorizer, trigram indexes, and table.

    The vectorizer drop is best-effort. Any error raised by
    ``ai.drop_vectorizer`` is trapped inside the ``DO`` block and reported as
    a NOTICE so the remaining statements still run. The NOTICE carries the
    server's own error text (``SQLERRM``) instead of assuming the vectorizer
    was simply missing, so permission or dependency failures are not
    mislabelled.

    The ``pg_trgm`` extension is left installed.
    """
    # RAISE ... USING MESSAGE builds the text by concatenation, so the SQL
    # contains no '%' placeholder for any driver layer to reinterpret.
    op.execute("""
        DO $$ BEGIN
            PERFORM ai.drop_vectorizer('document_embedder', drop_all => true);
        EXCEPTION WHEN OTHERS THEN
            RAISE NOTICE USING MESSAGE = 'skipping document_embedder drop: ' || SQLERRM;
        END $$
    """)
    op.execute("DROP INDEX IF EXISTS ix_document_title_trgm")
    op.execute("DROP INDEX IF EXISTS ix_document_body_trgm")
    op.drop_table("document")