"""document table with pgai vectorizer Revision ID: 001_baseline Revises: None Create Date: 2026-03-01 """ import sqlalchemy as sa from alembic import op revision = "001_baseline" down_revision = None branch_labels = None depends_on = None def upgrade(): op.create_table( "document", sa.Column("id", sa.Integer, primary_key=True, autoincrement=True), sa.Column("content_type", sa.String(20), nullable=False, index=True), sa.Column("slug", sa.String(300), nullable=False, unique=True), sa.Column("title", sa.String(300), nullable=False), sa.Column("section", sa.String(200), nullable=False, index=True), sa.Column("description", sa.Text, nullable=True), sa.Column("body", sa.Text, nullable=False), sa.Column("search_text", sa.Text, nullable=True), sa.Column("url", sa.String(300), nullable=False), sa.Column("word_count", sa.Integer, nullable=False, server_default="0"), sa.Column( "updated_at", sa.DateTime(timezone=True), server_default=sa.func.now(), nullable=False, ), ) # Enable pg_trgm for fast ILIKE with GIN indexes op.execute("CREATE EXTENSION IF NOT EXISTS pg_trgm") op.execute( "CREATE INDEX ix_document_title_trgm ON document USING gin (title gin_trgm_ops)" ) op.execute( "CREATE INDEX ix_document_body_trgm ON document USING gin (body gin_trgm_ops)" ) # pgai vectorizer — reads search_text, generates 1024-dim embeddings # Uses mxbai-embed-large via the GPU embedding gateway. op.execute(""" SELECT ai.create_vectorizer( 'document'::regclass, name => 'document_embedder', loading => ai.loading_column(column_name => 'search_text'), embedding => ai.embedding_openai( model => 'mxbai-embed-large', dimensions => 1024 ), chunking => ai.chunking_recursive_character_text_splitter( chunk_size => 400, chunk_overlap => 50, separators => array[E'\\n\\n', E'\\n', '. ', ' '] ), formatting => ai.formatting_python_template( template => '$chunk' ) ) """) def downgrade(): op.execute(""" DO $$ BEGIN PERFORM ai.drop_vectorizer('document_embedder', drop_all => true); EXCEPTION WHEN OTHERS THEN RAISE NOTICE 'document_embedder not found, skipping drop'; END $$ """) op.execute("DROP INDEX IF EXISTS ix_document_title_trgm") op.execute("DROP INDEX IF EXISTS ix_document_body_trgm") op.drop_table("document")