Search stack replicates the Hamilton site pattern with pg_orrery-specific additions: - FastAPI REST API (chat SSE streaming, semantic search, health check) - FastMCP server at /mcp with doc search and live SQL query tools - pgvector + pgai vectorizer for 1024-dim document embeddings - Hybrid search (semantic cosine + text ILIKE with pg_trgm GIN) - Dual LLM backend: self-hosted qwen3 via GPU gateway or Anthropic Claude - Live read-only pg_orrery SQL execution with safety guardrails (SELECT-only validation, read-only transaction, 5s timeout, 100-row cap) - Convenience MCP tools: planet_position, sky_survey, satellite_pass - MDX content ingestion from docs/src/content/docs/ (50 pages) - Docker Compose: pg_orrery+pgvector DB, pgai, vectorizer-worker, API - Alembic async migrations, Makefile, .env.example
84 lines
2.7 KiB
Python
84 lines
2.7 KiB
Python
"""document table with pgai vectorizer
|
|
|
|
Revision ID: 001_baseline
|
|
Revises: None
|
|
Create Date: 2026-03-01
|
|
"""
|
|
|
|
import sqlalchemy as sa
|
|
from alembic import op
|
|
|
|
# Revision identifiers read by Alembic.
revision = "001_baseline"
down_revision = None  # "Revises: None" -- no earlier revision precedes this one
branch_labels = None
depends_on = None
|
|
|
|
|
|
def upgrade():
    """Create the ``document`` table, its trigram indexes, and the vectorizer.

    Statements run in this order:

    1. ``document`` table (with the indexes/unique constraint declared on
       its columns).
    2. ``pg_trgm`` extension, required by the ``gin_trgm_ops`` operator class.
    3. GIN trigram indexes on ``title`` and ``body``.
    4. pgai vectorizer ``document_embedder`` over ``search_text``.
    """
    columns = [
        sa.Column("id", sa.Integer, primary_key=True, autoincrement=True),
        sa.Column("content_type", sa.String(20), nullable=False, index=True),
        sa.Column("slug", sa.String(300), nullable=False, unique=True),
        sa.Column("title", sa.String(300), nullable=False),
        sa.Column("section", sa.String(200), nullable=False, index=True),
        sa.Column("description", sa.Text, nullable=True),
        sa.Column("body", sa.Text, nullable=False),
        sa.Column("search_text", sa.Text, nullable=True),
        sa.Column("url", sa.String(300), nullable=False),
        sa.Column("word_count", sa.Integer, nullable=False, server_default="0"),
        sa.Column(
            "updated_at",
            sa.DateTime(timezone=True),
            server_default=sa.func.now(),
            nullable=False,
        ),
    ]
    op.create_table("document", *columns)

    # Enable pg_trgm for fast ILIKE with GIN indexes; the extension must
    # exist before any index can reference gin_trgm_ops.
    op.execute("CREATE EXTENSION IF NOT EXISTS pg_trgm")

    trigram_index_ddl = (
        "CREATE INDEX ix_document_title_trgm ON document USING gin (title gin_trgm_ops)",
        "CREATE INDEX ix_document_body_trgm ON document USING gin (body gin_trgm_ops)",
    )
    for ddl in trigram_index_ddl:
        op.execute(ddl)

    # pgai vectorizer -- reads search_text, generates 1024-dim embeddings.
    # Uses mxbai-embed-large via the GPU embedding gateway.
    # The doubled backslashes are intentional: Python turns ``\\n`` into
    # ``\n``, which PostgreSQL's E'' string syntax then reads as a newline.
    vectorizer_sql = """
        SELECT ai.create_vectorizer(
            'document'::regclass,
            name => 'document_embedder',
            loading => ai.loading_column(column_name => 'search_text'),
            embedding => ai.embedding_openai(
                model => 'mxbai-embed-large',
                dimensions => 1024
            ),
            chunking => ai.chunking_recursive_character_text_splitter(
                chunk_size => 400,
                chunk_overlap => 50,
                separators => array[E'\\n\\n', E'\\n', '. ', ' ']
            ),
            formatting => ai.formatting_python_template(
                template => '$chunk'
            )
        )
    """
    op.execute(vectorizer_sql)
|
|
|
|
|
|
def downgrade():
    """Undo ``upgrade()``: drop the vectorizer, the trigram indexes, and the table.

    The vectorizer drop is best-effort: any error raised by
    ``ai.drop_vectorizer`` is downgraded to a NOTICE so the rest of the
    downgrade still runs. The NOTICE carries the actual error text
    (``SQLERRM``) rather than assuming the vectorizer was simply missing,
    because the handler catches every error class, not only "not found".

    The ``pg_trgm`` extension is not dropped here.
    """
    # The message is built with ``||`` and RAISE ... USING MESSAGE so the SQL
    # contains no ``%`` placeholder, which avoids any percent-escaping
    # concerns when the statement is passed through the database driver.
    op.execute("""
        DO $$ BEGIN
            PERFORM ai.drop_vectorizer('document_embedder', drop_all => true);
        EXCEPTION WHEN OTHERS THEN
            RAISE NOTICE USING MESSAGE =
                'skipping document_embedder drop - ' || SQLERRM;
        END $$
    """)

    for index_name in ("ix_document_title_trgm", "ix_document_body_trgm"):
        op.execute(f"DROP INDEX IF EXISTS {index_name}")

    op.drop_table("document")
|