#!/usr/bin/env python3
"""Integrate extracted PDF content into Starlight documentation site.

Transforms extracted markdown files:
1. Adds Starlight-compatible frontmatter
2. Fixes image paths from pdf-image:// to relative paths
3. Copies images to appropriate locations
4. Creates index pages for each series
"""

import re
import shutil
from pathlib import Path

# Paths (hard-coded to this machine's checkout layout)
EXTRACTED_ROOT = Path("/home/rpm/claude/sdr/gr-sarsat-modern/docs/extracted")
DOCS_SITE_ROOT = Path("/home/rpm/claude/sdr/gr-sarsat-modern/docs-site")
CONTENT_DIR = DOCS_SITE_ROOT / "src/content/docs/cospas-sarsat"
PUBLIC_DIR = DOCS_SITE_ROOT / "public/images/cospas-sarsat"

# Series metadata: one entry per Cospas-Sarsat document series letter.
SERIES_INFO = {
    "T": {"name": "Technical", "description": "Beacon specifications, LUT standards, MEOLUT requirements"},
    "S": {"name": "Secretariat", "description": "Country registration guides and administrative procedures"},
    "R": {"name": "Reports", "description": "System status reports and performance analyses"},
    "P": {"name": "Programme", "description": "International agreements and programme documentation"},
    "G": {"name": "General", "description": "System overview documents in multiple languages"},
    "A": {"name": "Operational", "description": "Alert distribution and SPOC protocols"},
    "D": {"name": "IBRD", "description": "International Beacon Registration Database standards"},
}


def extract_title_from_metadata(content: str) -> str:
    """Extract the document title from the extracted-markdown metadata header.

    Looks for a ``**Title:** ...`` line first; falls back to the first
    markdown heading, and finally to a placeholder.

    NOTE(review): the heading fallback may match the "# Document Metadata"
    header itself when no **Title:** line exists — confirm against the
    extractor's output format.
    """
    match = re.search(r"\*\*Title:\*\*\s*(.+)", content)
    if match:
        return match.group(1).strip()
    # Fallback: look for first heading
    match = re.search(r"^#\s+(.+)$", content, re.MULTILINE)
    if match:
        return match.group(1).strip()
    return "Untitled Document"


def fix_image_paths(content: str, doc_id: str, series: str) -> str:
    """Replace pdf-image:// URIs with site-relative image paths.

    Rewrites ``![alt](pdf-image://...page_N_img_M...)`` to point at the
    normalized filename that copy_images() produces under /public.
    References that don't contain a page/img pattern are left untouched.
    """

    def replace_image(match: "re.Match[str]") -> str:
        alt_text = match.group(1)
        image_ref = match.group(2)
        # Extract page and image number from the opaque reference
        page_match = re.search(r"page_(\d+)_img_(\d+)", image_ref)
        if page_match:
            page_num = page_match.group(1)
            img_num = page_match.group(2)
            # Use site-root-relative path into the public images folder
            return f"![{alt_text}](/images/cospas-sarsat/{series}-series/{doc_id}/{doc_id}_page_{page_num}_img_{img_num}.png)"
        return match.group(0)

    return re.sub(r"!\[([^\]]*)\]\(pdf-image://([^)]+)\)", replace_image, content)


def add_frontmatter(content: str, title: str, doc_id: str, series: str) -> str:
    """Prepend Starlight YAML frontmatter, removing the extractor's metadata block."""
    # Remove existing "# Document Metadata ... ---" block emitted by the extractor
    content = re.sub(r"^# Document Metadata\n(?:.*\n)*?---\n+", "", content, flags=re.MULTILINE)
    # Escape double quotes so the YAML title stays a valid quoted scalar
    clean_title = title.replace('"', '\\"')
    frontmatter = f'''---
title: "{clean_title}"
description: "Cospas-Sarsat {series}-series document {doc_id}"
sidebar:
  badge:
    text: "{series}"
    variant: "note"
---

'''
    return frontmatter + content


def copy_images(src_dir: Path, dest_dir: Path, doc_id: str) -> int:
    """Copy extracted PNGs into the public folder, normalizing filenames.

    Returns the number of images copied (0 if the source has no images/ dir).
    """
    images_src = src_dir / "images"
    if not images_src.exists():
        return 0
    dest_dir.mkdir(parents=True, exist_ok=True)
    count = 0
    for img in images_src.glob("*.png"):
        # Normalize image filename to match what fix_image_paths() links to
        new_name = img.name
        match = re.search(r"page_(\d+)_img_(\d+)", img.name)
        if match:
            new_name = f"{doc_id}_page_{match.group(1)}_img_{match.group(2)}.png"
        dest_path = dest_dir / new_name
        shutil.copy2(img, dest_path)
        count += 1
    return count


def process_series(series_letter: str) -> dict:
    """Process all documents in one series; returns {"docs": n, "images": m}."""
    series_dir = EXTRACTED_ROOT / "cospas-sarsat" / f"{series_letter}-series"
    if not series_dir.exists():
        return {"docs": 0, "images": 0}

    output_dir = CONTENT_DIR / f"{series_letter.lower()}-series"
    output_dir.mkdir(parents=True, exist_ok=True)
    images_output = PUBLIC_DIR / f"{series_letter}-series"

    stats = {"docs": 0, "images": 0}
    for doc_dir in sorted(series_dir.iterdir()):
        if not doc_dir.is_dir():
            continue
        doc_id = doc_dir.name
        md_file = doc_dir / f"{doc_id}.md"
        if not md_file.exists():
            print(f" Warning: No markdown file in {doc_dir}")
            continue

        # Read and transform content
        content = md_file.read_text(encoding="utf-8")
        title = extract_title_from_metadata(content)
        content = fix_image_paths(content, doc_id, series_letter)
        content = add_frontmatter(content, title, doc_id, series_letter)

        # Write transformed MD file (not MDX - PDF content has chars that break JSX parsing)
        output_file = output_dir / f"{doc_id.lower()}.md"
        output_file.write_text(content, encoding="utf-8")
        stats["docs"] += 1

        # Copy images alongside
        img_count = copy_images(doc_dir, images_output / doc_id, doc_id)
        stats["images"] += img_count
        print(f" {doc_id}: {title[:50]}{'...' if len(title) > 50 else ''} ({img_count} images)")
    return stats


def create_series_index(series_letter: str) -> None:
    """Create an index.mdx landing page listing every document in a series."""
    info = SERIES_INFO.get(series_letter, {"name": series_letter, "description": ""})
    output_dir = CONTENT_DIR / f"{series_letter.lower()}-series"
    output_dir.mkdir(parents=True, exist_ok=True)

    index_content = f'''---
title: "{series_letter}-Series: {info["name"]}"
description: "{info["description"]}"
---

import {{ LinkCard, CardGrid }} from '@astrojs/starlight/components';

{info["description"]}

## Documents

<CardGrid>
'''

    # List documents in the series
    series_dir = EXTRACTED_ROOT / "cospas-sarsat" / f"{series_letter}-series"
    if series_dir.exists():
        for doc_dir in sorted(series_dir.iterdir()):
            if doc_dir.is_dir():
                doc_id = doc_dir.name
                md_file = doc_dir / f"{doc_id}.md"
                if md_file.exists():
                    content = md_file.read_text(encoding="utf-8")
                    title = extract_title_from_metadata(content)
                    clean_title = title.replace('"', '\\"')
                    # BUG FIX: the original computed clean_title but appended
                    # only whitespace, so the generated index imported
                    # LinkCard/CardGrid and never used them. Emit a LinkCard
                    # pointing at the page process_series() writes
                    # ({series}-series/{doc_id}.md, both lowercased).
                    index_content += (
                        f'  <LinkCard title="{clean_title}" '
                        f'href="/cospas-sarsat/{series_letter.lower()}-series/{doc_id.lower()}/" />\n'
                    )

    index_content += '</CardGrid>\n'
    index_file = output_dir / "index.mdx"
    index_file.write_text(index_content, encoding="utf-8")


def main() -> None:
    """Run the full integration across every known series and print a summary."""
    print("=== Integrating extracted Cospas-Sarsat documents ===\n")

    # Create output directories up front
    CONTENT_DIR.mkdir(parents=True, exist_ok=True)
    PUBLIC_DIR.mkdir(parents=True, exist_ok=True)

    total_docs = 0
    total_images = 0
    for series in ["T", "S", "R", "P", "G", "A", "D"]:
        print(f"\n{series}-Series ({SERIES_INFO[series]['name']}):")
        stats = process_series(series)
        create_series_index(series)
        total_docs += stats["docs"]
        total_images += stats["images"]
        print(f" → {stats['docs']} documents, {stats['images']} images")

    print(f"\n=== Complete ===")
    print(f"Total: {total_docs} documents, {total_images} images")
    print(f"Content: {CONTENT_DIR}")
    print(f"Images: {PUBLIC_DIR}")


if __name__ == "__main__":
    main()