Ryan Malloy 33787e03da Fix ruff lint, Pyright type ambiguity, and MCP tool SQL queries
- Fix 28 ruff errors: E501 line length, B904 raise-from, F401 unused import
- Fix SQLAlchemy Row.count() ambiguity with tuple indexing (Pyright)
- Replace composite column notation with accessor functions in MCP tools
  (topocentric/equatorial/pass_event are C-level base types, not composites)
- Fix satellite_pass: use time window (start + end) not count parameter
  to match predict_passes(tle, observer, start_ts, end_ts, min_el) signature
2026-03-01 16:41:07 -07:00

183 lines
6.0 KiB
Python

"""Ingestion orchestrator — walks content directories, classifies, and upserts.
Run: docker compose exec api-dev python -m orrery_search.ingest
"""
import sys
from pathlib import Path
from sqlalchemy import select
from orrery_search.db import async_session
from orrery_search.ingest.mdx_parser import strip_mdx
from orrery_search.models.document import Document
from orrery_search.services.search_text import build_search_text
# Content location used when the docs are mounted into the Docker container.
CONTENT_DIR = Path("/data/content")


def _resolve_paths() -> Path:
    """Locate the docs content directory.

    The Docker mount (``CONTENT_DIR``) wins when it exists. Otherwise up to
    ten ancestors of this file are probed for a ``docs/src`` directory, and
    ``docs/src/content/docs`` below the last ancestor visited is returned.
    The returned path is not guaranteed to exist.
    """
    if CONTENT_DIR.exists():
        return CONTENT_DIR

    marker = Path("docs") / "src"
    candidate = Path(__file__).resolve()
    hops_left = 10
    while hops_left > 0:
        hops_left -= 1
        candidate = candidate.parent
        if (candidate / marker).exists():
            break

    return candidate / marker / "content" / "docs"
def _classify_content_type(rel_path: str) -> str:
    """Map a path relative to content/docs/ onto its content type.

    Only the first ``/``-separated component is inspected; anything that is
    not one of the known top-level directories is classified as ``"page"``.
    """
    top_level, _, _ = rel_path.partition("/")
    known_types = {
        "getting-started": "getting_started",
        "guides": "guide",
        "workflow": "workflow",
        "reference": "reference",
        "architecture": "architecture",
        "performance": "performance",
    }
    return known_types.get(top_level, "page")
def _mdx_path_to_url(rel_path: str) -> str:
    """Convert relative .mdx path to Starlight page URL.

    Args:
        rel_path: ``/``-separated path of the page relative to the content
            root, e.g. ``"guides/index.mdx"``.

    Returns:
        The page URL with leading and trailing slashes. A directory's
        ``index.mdx`` maps to the directory URL, and the root-level
        ``index.mdx`` maps to ``"/"``.
    """
    slug = rel_path.removesuffix(".mdx")
    # The root index has no "/index" suffix to strip, so it needs its own
    # case; otherwise the site home page would be emitted as "/index/".
    if slug == "index":
        return "/"
    slug = slug.removesuffix("/index")
    return f"/{slug}/"
def _mdx_path_to_section(rel_path: str) -> str:
    """Return the directory portion of a relative path, joined with ``/``.

    A path with no directory component yields the empty string.
    """
    # Dropping the final component leaves an empty tuple for bare filenames,
    # which joins to "".
    directories = Path(rel_path).parts[:-1]
    return "/".join(directories)
def _mdx_path_to_slug(rel_path: str) -> str:
    """Derive the dedup slug: the path minus ``.mdx`` and a trailing ``/index``."""
    without_extension = rel_path.removesuffix(".mdx")
    slug = without_extension.removesuffix("/index")
    return slug
def _collect_mdx_pages(content_dir: Path) -> list[dict]:
    """Walk the content directory and parse all .mdx files.

    Args:
        content_dir: Root directory searched recursively for ``*.mdx`` files.

    Returns:
        One dict per readable file, in sorted path order, with the keys
        ``content_type``, ``slug``, ``title``, ``section``, ``description``,
        ``body``, ``url`` and ``word_count``. Files that cannot be read or
        decoded as UTF-8 are reported on stderr and skipped.
    """
    pages = []
    for mdx_path in sorted(content_dir.rglob("*.mdx")):
        try:
            raw = mdx_path.read_text(encoding="utf-8")
        except (OSError, UnicodeDecodeError) as exc:
            print(f" SKIP {mdx_path}: {exc}", file=sys.stderr)
            continue

        frontmatter, body = strip_mdx(raw)

        # The path helpers split and strip on "/", so normalise the separator
        # here instead of relying on the platform's native one.
        rel_path = mdx_path.relative_to(content_dir).as_posix()

        # Fall back to a title derived from the file name when the
        # frontmatter has no "title" key.
        title = frontmatter.get("title", mdx_path.stem.replace("-", " ").title())
        description = frontmatter.get("description")
        content_type = _classify_content_type(rel_path)
        word_count = len(body.split())

        pages.append({
            "content_type": content_type,
            "slug": _mdx_path_to_slug(rel_path),
            "title": title,
            "section": _mdx_path_to_section(rel_path),
            "description": description,
            "body": body,
            "url": _mdx_path_to_url(rel_path),
            "word_count": word_count,
        })
    return pages
async def ingest():
    """Main ingestion: read docs content, upsert into document table.

    Resolves the content directory, parses every ``.mdx`` page, then inserts
    or updates one ``Document`` per page, matched on ``slug``. Each page is
    written inside its own nested transaction, the session is committed
    every 50 pages and once more at the end, and progress plus a final
    summary are printed to stderr.

    A page that raises is reported, counted as an error and skipped; it is
    not counted as inserted or updated.

    Raises:
        SystemExit: With status 1 when the content directory does not exist.
    """
    content_dir = _resolve_paths()
    print(f"Content dir: {content_dir}", file=sys.stderr)

    if not content_dir.exists():
        print(f"Content directory not found: {content_dir}", file=sys.stderr)
        sys.exit(1)

    pages = _collect_mdx_pages(content_dir)
    print(f"Found {len(pages)} published pages", file=sys.stderr)

    async with async_session() as db:
        inserted = 0
        updated = 0
        errors = 0

        for i, page_data in enumerate(pages):
            try:
                search_text = build_search_text(
                    title=page_data["title"],
                    section=page_data["section"],
                    content_type=page_data["content_type"],
                    description=page_data["description"],
                    body=page_data["body"],
                )

                async with db.begin_nested():
                    stmt = select(Document).where(
                        Document.slug == page_data["slug"]
                    )
                    result = await db.execute(stmt)
                    existing = result.scalar_one_or_none()
                    is_update = bool(existing)

                    if is_update:
                        existing.title = page_data["title"]
                        existing.section = page_data["section"]
                        existing.description = page_data["description"]
                        existing.body = page_data["body"]
                        existing.search_text = search_text
                        existing.url = page_data["url"]
                        existing.content_type = page_data["content_type"]
                        existing.word_count = page_data["word_count"]
                    else:
                        db.add(Document(
                            content_type=page_data["content_type"],
                            slug=page_data["slug"],
                            title=page_data["title"],
                            section=page_data["section"],
                            description=page_data["description"],
                            body=page_data["body"],
                            search_text=search_text,
                            url=page_data["url"],
                            word_count=page_data["word_count"],
                        ))

                # Count only once the nested transaction has exited cleanly:
                # pending changes are flushed on exit, so a failure there must
                # not be tallied as both a success and an error.
                if is_update:
                    updated += 1
                else:
                    inserted += 1

                if (i + 1) % 50 == 0:
                    await db.commit()
                    print(
                        f" progress: {i + 1}/{len(pages)}",
                        file=sys.stderr,
                    )
            except Exception as exc:
                # Deliberate best-effort boundary: one bad page must not
                # abort the whole run.
                print(
                    f" ERROR on {page_data['slug']}: {exc}",
                    file=sys.stderr,
                )
                errors += 1

        await db.commit()

    print(
        f"Ingestion complete: {inserted} inserted, {updated} updated, "
        f"{errors} errors ({inserted + updated} total)",
        file=sys.stderr,
    )