gr-mcp-docs/scripts/integrate-extracted-docs.py
Ryan Malloy 41114373b9 init: Astro/Starlight docs site for gr-sarsat-modern
Diátaxis-structured documentation for 406 MHz SARSAT beacon reception:
- Tutorials: signal chain walkthrough
- Guides: antenna setup, message decoding
- Reference: block API, signal format
- Explanation: Cospas-Sarsat system overview

Includes extracted images from official Cospas-Sarsat specifications (LFS).
2026-02-13 05:01:21 -07:00

220 lines
7.4 KiB
Python

#!/usr/bin/env python3
"""
Integrate extracted PDF content into Starlight documentation site.
Transforms extracted markdown files:
1. Adds Starlight-compatible frontmatter
2. Fixes image paths from pdf-image:// to relative paths
3. Copies images to appropriate locations
4. Creates index pages for each series
"""
import re
import shutil
from pathlib import Path
# Paths
# Source tree of PDF-extracted markdown + images (one folder per document).
EXTRACTED_ROOT = Path("/home/rpm/claude/sdr/gr-sarsat-modern/docs/extracted")
# Root of the Astro/Starlight documentation site.
DOCS_SITE_ROOT = Path("/home/rpm/claude/sdr/gr-sarsat-modern/docs-site")
# Destination for transformed content pages (Starlight content collection).
CONTENT_DIR = DOCS_SITE_ROOT / "src/content/docs/cospas-sarsat"
# Destination for copied images, served statically by Astro from /public.
PUBLIC_DIR = DOCS_SITE_ROOT / "public/images/cospas-sarsat"
# Series metadata
# One entry per Cospas-Sarsat document-series letter. "name" feeds index
# page titles; "description" feeds frontmatter and index body copy.
SERIES_INFO = {
    "T": {"name": "Technical", "description": "Beacon specifications, LUT standards, MEOLUT requirements"},
    "S": {"name": "Secretariat", "description": "Country registration guides and administrative procedures"},
    "R": {"name": "Reports", "description": "System status reports and performance analyses"},
    "P": {"name": "Programme", "description": "International agreements and programme documentation"},
    "G": {"name": "General", "description": "System overview documents in multiple languages"},
    "A": {"name": "Operational", "description": "Alert distribution and SPOC protocols"},
    "D": {"name": "IBRD", "description": "International Beacon Registration Database standards"},
}
def extract_title_from_metadata(content: str) -> str:
    """Pull the document title out of an extracted markdown file.

    Prefers the extractor's ``**Title:** ...`` metadata line; falls back
    to the first markdown heading, then to a generic placeholder.
    """
    candidates = (
        (r"\*\*Title:\*\*\s*(.+)", 0),
        (r"^#\s+(.+)$", re.MULTILINE),  # fallback: first heading
    )
    for pattern, flags in candidates:
        found = re.search(pattern, content, flags)
        if found:
            return found.group(1).strip()
    return "Untitled Document"
def fix_image_paths(content: str, doc_id: str, series: str) -> str:
    """Rewrite ``pdf-image://`` markdown image URIs to site-relative paths.

    Image refs that do not carry page/img numbers are left untouched.
    """
    image_link = re.compile(r"!\[([^\]]*)\]\(pdf-image://([^)]+)\)")
    page_img = re.compile(r"page_(\d+)_img_(\d+)")

    def rewrite(m):
        alt, ref = m.group(1), m.group(2)
        nums = page_img.search(ref)
        if nums is None:
            # No recognizable page/img numbering; keep the original link.
            return m.group(0)
        page, img = nums.groups()
        return (
            f"![{alt}](/images/cospas-sarsat/{series}-series/"
            f"{doc_id}/{doc_id}_page_{page}_img_{img}.png)"
        )

    return image_link.sub(rewrite, content)
def add_frontmatter(content: str, title: str, doc_id: str, series: str) -> str:
    """Prepend Starlight YAML frontmatter to a transformed document.

    Strips the extractor's leading "# Document Metadata" block (everything
    up to its terminating ``---`` line) before prepending the frontmatter.

    Args:
        content: Markdown body of the document.
        title: Human-readable document title (goes into the YAML ``title``).
        doc_id: Document identifier, e.g. "CS-T001".
        series: Single series letter, e.g. "T".

    Returns:
        Frontmatter + cleaned content.
    """
    # Remove existing metadata block emitted by the PDF extractor.
    content = re.sub(r"^# Document Metadata\n(?:.*\n)*?---\n+", "", content, flags=re.MULTILINE)
    # Escape backslashes BEFORE quotes: escaping quotes alone produces
    # invalid YAML for titles that already contain a backslash.
    clean_title = title.replace("\\", "\\\\").replace('"', '\\"')
    frontmatter = f'''---
title: "{clean_title}"
description: "Cospas-Sarsat {series}-series document {doc_id}"
sidebar:
  badge:
    text: "{series}"
    variant: "note"
---
'''
    return frontmatter + content
def copy_images(src_dir: Path, dest_dir: Path, doc_id: str) -> int:
    """Copy extracted PNGs into *dest_dir*, renaming to a canonical scheme.

    Filenames containing ``page_N_img_M`` become
    ``{doc_id}_page_N_img_M.png``; others keep their original name.

    Returns:
        Number of images copied; 0 when the source has no images folder.
    """
    source = src_dir / "images"
    if not source.exists():
        return 0
    dest_dir.mkdir(parents=True, exist_ok=True)
    copied = 0
    for png in source.glob("*.png"):
        numbered = re.search(r"page_(\d+)_img_(\d+)", png.name)
        target_name = (
            f"{doc_id}_page_{numbered.group(1)}_img_{numbered.group(2)}.png"
            if numbered
            else png.name
        )
        shutil.copy2(png, dest_dir / target_name)
        copied += 1
    return copied
def process_series(series_letter: str) -> dict:
    """Transform every extracted document in one series into site content.

    Reads each document's markdown, rewrites image links, prepends
    frontmatter, writes the result under CONTENT_DIR, and copies images
    under PUBLIC_DIR.

    Returns:
        Stats dict with "docs" and "images" counts (zeros when the
        series folder does not exist).
    """
    stats = {"docs": 0, "images": 0}
    series_dir = EXTRACTED_ROOT / "cospas-sarsat" / f"{series_letter}-series"
    if not series_dir.exists():
        return stats
    output_dir = CONTENT_DIR / f"{series_letter.lower()}-series"
    output_dir.mkdir(parents=True, exist_ok=True)
    images_output = PUBLIC_DIR / f"{series_letter}-series"
    for doc_dir in sorted(series_dir.iterdir()):
        if not doc_dir.is_dir():
            continue
        doc_id = doc_dir.name
        md_file = doc_dir / f"{doc_id}.md"
        if not md_file.exists():
            print(f" Warning: No markdown file in {doc_dir}")
            continue
        raw = md_file.read_text(encoding="utf-8")
        title = extract_title_from_metadata(raw)
        transformed = add_frontmatter(
            fix_image_paths(raw, doc_id, series_letter),
            title,
            doc_id,
            series_letter,
        )
        # Written as .md, not .mdx: raw PDF text contains characters that
        # break JSX parsing.
        (output_dir / f"{doc_id.lower()}.md").write_text(transformed, encoding="utf-8")
        stats["docs"] += 1
        img_count = copy_images(doc_dir, images_output / doc_id, doc_id)
        stats["images"] += img_count
        suffix = '...' if len(title) > 50 else ''
        print(f" {doc_id}: {title[:50]}{suffix} ({img_count} images)")
    return stats
def create_series_index(series_letter: str) -> None:
    """Write an index.mdx landing page listing every document in a series.

    Builds a Starlight page with a <CardGrid> of <LinkCard> entries, one
    per extracted document that has a markdown file.
    """
    info = SERIES_INFO.get(series_letter, {"name": series_letter, "description": ""})
    output_dir = CONTENT_DIR / f"{series_letter.lower()}-series"
    output_dir.mkdir(parents=True, exist_ok=True)
    parts = [f'''---
title: "{series_letter}-Series: {info["name"]}"
description: "{info["description"]}"
---
import {{ LinkCard, CardGrid }} from '@astrojs/starlight/components';
{info["description"]}
## Documents
<CardGrid>
''']
    # One LinkCard per document that actually has extracted markdown.
    series_dir = EXTRACTED_ROOT / "cospas-sarsat" / f"{series_letter}-series"
    if series_dir.exists():
        for doc_dir in sorted(series_dir.iterdir()):
            if not doc_dir.is_dir():
                continue
            doc_id = doc_dir.name
            md_file = doc_dir / f"{doc_id}.md"
            if not md_file.exists():
                continue
            title = extract_title_from_metadata(md_file.read_text(encoding="utf-8"))
            clean_title = title.replace('"', '\\"')
            parts.append(
                f' <LinkCard title="{doc_id}" description="{clean_title[:80]}"'
                f' href="/cospas-sarsat/{series_letter.lower()}-series/{doc_id.lower()}/" />\n'
            )
    parts.append('</CardGrid>\n')
    (output_dir / "index.mdx").write_text("".join(parts), encoding="utf-8")
def main():
    """Run the full integration: transform docs, copy images, build indexes."""
    print("=== Integrating extracted Cospas-Sarsat documents ===\n")
    # Ensure output roots exist before any series is processed.
    CONTENT_DIR.mkdir(parents=True, exist_ok=True)
    PUBLIC_DIR.mkdir(parents=True, exist_ok=True)
    totals = {"docs": 0, "images": 0}
    for series in "TSRPGAD":
        print(f"\n{series}-Series ({SERIES_INFO[series]['name']}):")
        series_stats = process_series(series)
        create_series_index(series)
        for key in totals:
            totals[key] += series_stats[key]
        print(f"{series_stats['docs']} documents, {series_stats['images']} images")
    print("\n=== Complete ===")
    print(f"Total: {totals['docs']} documents, {totals['images']} images")
    print(f"Content: {CONTENT_DIR}")
    print(f"Images: {PUBLIC_DIR}")
if __name__ == "__main__":
    main()