gr-mcp-docs/scripts/integrate-extracted-docs.py
Ryan Malloy 41114373b9 init: Astro/Starlight docs site for gr-sarsat-modern
Diátaxis-structured documentation for 406 MHz SARSAT beacon reception:
- Tutorials: signal chain walkthrough
- Guides: antenna setup, message decoding
- Reference: block API, signal format
- Explanation: Cospas-Sarsat system overview

Includes extracted images from official Cospas-Sarsat specifications (LFS).
2026-02-13 05:01:21 -07:00

220 lines
7.4 KiB
Python

#!/usr/bin/env python3
"""
Integrate extracted PDF content into Starlight documentation site.
Transforms extracted markdown files:
1. Adds Starlight-compatible frontmatter
2. Fixes image paths from pdf-image:// to relative paths
3. Copies images to appropriate locations
4. Creates index pages for each series
"""
import re
import shutil
from pathlib import Path
# Paths
# Source tree of PDF-extracted markdown + images (one folder per document).
EXTRACTED_ROOT = Path("/home/rpm/claude/sdr/gr-sarsat-modern/docs/extracted")
# Root of the Astro/Starlight documentation site.
DOCS_SITE_ROOT = Path("/home/rpm/claude/sdr/gr-sarsat-modern/docs-site")
# Destination for transformed content pages (Starlight content collection).
CONTENT_DIR = DOCS_SITE_ROOT / "src/content/docs/cospas-sarsat"
# Destination for copied images, served statically by Astro from /public.
PUBLIC_DIR = DOCS_SITE_ROOT / "public/images/cospas-sarsat"
# Series metadata
# One entry per Cospas-Sarsat document-series letter. "name" feeds index
# page titles; "description" feeds frontmatter and index body copy.
SERIES_INFO = {
    "T": {"name": "Technical", "description": "Beacon specifications, LUT standards, MEOLUT requirements"},
    "S": {"name": "Secretariat", "description": "Country registration guides and administrative procedures"},
    "R": {"name": "Reports", "description": "System status reports and performance analyses"},
    "P": {"name": "Programme", "description": "International agreements and programme documentation"},
    "G": {"name": "General", "description": "System overview documents in multiple languages"},
    "A": {"name": "Operational", "description": "Alert distribution and SPOC protocols"},
    "D": {"name": "IBRD", "description": "International Beacon Registration Database standards"},
}
def extract_title_from_metadata(content: str) -> str:
    """Pull the document title out of an extracted markdown file.

    Prefers the extractor's ``**Title:** ...`` metadata line; falls back
    to the first markdown heading, then to a generic placeholder.
    """
    candidates = (
        (r"\*\*Title:\*\*\s*(.+)", 0),
        (r"^#\s+(.+)$", re.MULTILINE),  # fallback: first heading
    )
    for pattern, flags in candidates:
        found = re.search(pattern, content, flags)
        if found:
            return found.group(1).strip()
    return "Untitled Document"
def fix_image_paths(content: str, doc_id: str, series: str) -> str:
    """Rewrite ``pdf-image://`` markdown image URIs to site-relative paths.

    Image refs that do not carry page/img numbers are left untouched.
    """
    image_link = re.compile(r"!\[([^\]]*)\]\(pdf-image://([^)]+)\)")
    page_img = re.compile(r"page_(\d+)_img_(\d+)")

    def rewrite(m):
        alt, ref = m.group(1), m.group(2)
        nums = page_img.search(ref)
        if nums is None:
            # No recognizable page/img numbering; keep the original link.
            return m.group(0)
        page, img = nums.groups()
        return (
            f"![{alt}](/images/cospas-sarsat/{series}-series/"
            f"{doc_id}/{doc_id}_page_{page}_img_{img}.png)"
        )

    return image_link.sub(rewrite, content)
def add_frontmatter(content: str, title: str, doc_id: str, series: str) -> str:
    """Prepend Starlight YAML frontmatter to a transformed document.

    Strips the extractor's leading "# Document Metadata" block (everything
    up to its terminating ``---`` line) before prepending the frontmatter.

    Args:
        content: Markdown body of the document.
        title: Human-readable document title (goes into the YAML ``title``).
        doc_id: Document identifier, e.g. "CS-T001".
        series: Single series letter, e.g. "T".

    Returns:
        Frontmatter + cleaned content.
    """
    # Remove existing metadata block emitted by the PDF extractor.
    content = re.sub(r"^# Document Metadata\n(?:.*\n)*?---\n+", "", content, flags=re.MULTILINE)
    # Escape backslashes BEFORE quotes: escaping quotes alone produces
    # invalid YAML for titles that already contain a backslash.
    clean_title = title.replace("\\", "\\\\").replace('"', '\\"')
    frontmatter = f'''---
title: "{clean_title}"
description: "Cospas-Sarsat {series}-series document {doc_id}"
sidebar:
  badge:
    text: "{series}"
    variant: "note"
---
'''
    return frontmatter + content
def copy_images(src_dir: Path, dest_dir: Path, doc_id: str) -> int:
    """Copy extracted PNGs into *dest_dir*, renaming to a canonical scheme.

    Filenames containing ``page_N_img_M`` become
    ``{doc_id}_page_N_img_M.png``; others keep their original name.

    Returns:
        Number of images copied; 0 when the source has no images folder.
    """
    source = src_dir / "images"
    if not source.exists():
        return 0
    dest_dir.mkdir(parents=True, exist_ok=True)
    copied = 0
    for png in source.glob("*.png"):
        numbered = re.search(r"page_(\d+)_img_(\d+)", png.name)
        target_name = (
            f"{doc_id}_page_{numbered.group(1)}_img_{numbered.group(2)}.png"
            if numbered
            else png.name
        )
        shutil.copy2(png, dest_dir / target_name)
        copied += 1
    return copied
def process_series(series_letter: str) -> dict:
    """Transform every extracted document in one series into site content.

    Reads each document's markdown, rewrites image links, prepends
    frontmatter, writes the result under CONTENT_DIR, and copies images
    under PUBLIC_DIR.

    Returns:
        Stats dict with "docs" and "images" counts (zeros when the
        series folder does not exist).
    """
    stats = {"docs": 0, "images": 0}
    series_dir = EXTRACTED_ROOT / "cospas-sarsat" / f"{series_letter}-series"
    if not series_dir.exists():
        return stats
    output_dir = CONTENT_DIR / f"{series_letter.lower()}-series"
    output_dir.mkdir(parents=True, exist_ok=True)
    images_output = PUBLIC_DIR / f"{series_letter}-series"
    for doc_dir in sorted(series_dir.iterdir()):
        if not doc_dir.is_dir():
            continue
        doc_id = doc_dir.name
        md_file = doc_dir / f"{doc_id}.md"
        if not md_file.exists():
            print(f" Warning: No markdown file in {doc_dir}")
            continue
        raw = md_file.read_text(encoding="utf-8")
        title = extract_title_from_metadata(raw)
        transformed = add_frontmatter(
            fix_image_paths(raw, doc_id, series_letter),
            title,
            doc_id,
            series_letter,
        )
        # Written as .md, not .mdx: raw PDF text contains characters that
        # break JSX parsing.
        (output_dir / f"{doc_id.lower()}.md").write_text(transformed, encoding="utf-8")
        stats["docs"] += 1
        img_count = copy_images(doc_dir, images_output / doc_id, doc_id)
        stats["images"] += img_count
        suffix = '...' if len(title) > 50 else ''
        print(f" {doc_id}: {title[:50]}{suffix} ({img_count} images)")
    return stats
def create_series_index(series_letter: str) -> None:
    """Write an index.mdx landing page listing every document in a series.

    Builds a Starlight page with a <CardGrid> of <LinkCard> entries, one
    per extracted document that has a markdown file.
    """
    info = SERIES_INFO.get(series_letter, {"name": series_letter, "description": ""})
    output_dir = CONTENT_DIR / f"{series_letter.lower()}-series"
    output_dir.mkdir(parents=True, exist_ok=True)
    parts = [f'''---
title: "{series_letter}-Series: {info["name"]}"
description: "{info["description"]}"
---
import {{ LinkCard, CardGrid }} from '@astrojs/starlight/components';
{info["description"]}
## Documents
<CardGrid>
''']
    # One LinkCard per document that actually has extracted markdown.
    series_dir = EXTRACTED_ROOT / "cospas-sarsat" / f"{series_letter}-series"
    if series_dir.exists():
        for doc_dir in sorted(series_dir.iterdir()):
            if not doc_dir.is_dir():
                continue
            doc_id = doc_dir.name
            md_file = doc_dir / f"{doc_id}.md"
            if not md_file.exists():
                continue
            title = extract_title_from_metadata(md_file.read_text(encoding="utf-8"))
            clean_title = title.replace('"', '\\"')
            parts.append(
                f' <LinkCard title="{doc_id}" description="{clean_title[:80]}"'
                f' href="/cospas-sarsat/{series_letter.lower()}-series/{doc_id.lower()}/" />\n'
            )
    parts.append('</CardGrid>\n')
    (output_dir / "index.mdx").write_text("".join(parts), encoding="utf-8")
def main():
    """Run the full integration: transform docs, copy images, build indexes."""
    print("=== Integrating extracted Cospas-Sarsat documents ===\n")
    # Ensure output roots exist before any series is processed.
    CONTENT_DIR.mkdir(parents=True, exist_ok=True)
    PUBLIC_DIR.mkdir(parents=True, exist_ok=True)
    totals = {"docs": 0, "images": 0}
    for series in "TSRPGAD":
        print(f"\n{series}-Series ({SERIES_INFO[series]['name']}):")
        series_stats = process_series(series)
        create_series_index(series)
        for key in totals:
            totals[key] += series_stats[key]
        print(f"{series_stats['docs']} documents, {series_stats['images']} images")
    print("\n=== Complete ===")
    print(f"Total: {totals['docs']} documents, {totals['images']} images")
    print(f"Content: {CONTENT_DIR}")
    print(f"Images: {PUBLIC_DIR}")
if __name__ == "__main__":
    main()