#!/usr/bin/env python3
"""
Integrate extracted PDF content into Starlight documentation site.

Version 2: Produces cleaner content with proper structured metadata.

Transforms extracted markdown files:
1. Parses document metadata (Issue, Revision, Date)
2. Adds extended Starlight frontmatter with Cospas-Sarsat schema
3. Cleans up page-by-page structure
4. Fixes image paths
5. Creates summary headers
"""

import re
import shutil
from datetime import datetime
from pathlib import Path
from typing import Optional

# Paths
EXTRACTED_ROOT = Path("/home/rpm/claude/sdr/gr-sarsat-modern/docs/extracted")
DOCS_SITE_ROOT = Path("/home/rpm/claude/sdr/gr-sarsat-modern/docs-site")
CONTENT_DIR = DOCS_SITE_ROOT / "src/content/docs/cospas-sarsat"
PUBLIC_DIR = DOCS_SITE_ROOT / "public/images/cospas-sarsat"

# Series metadata with document type classification
SERIES_INFO = {
    "T": {
        "name": "Technical",
        "description": "Beacon specifications, LUT standards, MEOLUT requirements",
        "documentType": "specification",
    },
    "S": {
        "name": "Secretariat",
        "description": "Country registration guides and administrative procedures",
        "documentType": "procedure",
    },
    "R": {
        "name": "Reports",
        "description": "System status reports and performance analyses",
        "documentType": "report",
    },
    "P": {
        "name": "Programme",
        "description": "International agreements and programme documentation",
        "documentType": "programme",
    },
    "G": {
        "name": "General",
        "description": "System overview documents in multiple languages",
        "documentType": "overview",
    },
    "A": {
        "name": "Operational",
        "description": "Alert distribution and SPOC protocols",
        "documentType": "operational",
    },
    "D": {
        "name": "IBRD",
        "description": "International Beacon Registration Database standards",
        "documentType": "database",
    },
}

# Month-name alternation shared by the date parser and the footer cleaner.
_MONTH_NAMES = (
    "January|February|March|April|May|June|July|August|"
    "September|October|November|December"
)


def _yaml_dq_escape(text: str) -> str:
    """Escape *text* for use inside a YAML double-quoted scalar.

    The backslash is escaped first so that the backslashes introduced for
    quotes are not doubled afterwards.
    """
    return text.replace("\\", "\\\\").replace('"', '\\"')


def parse_version_info(content: str, filename: str) -> dict:
    """
    Extract version information from document content.

    Args:
        content: Raw extracted markdown text of the document.
        filename: Accepted for interface compatibility; not used.

    Returns:
        dict with keys ``issue`` and ``revision`` (int or None) and
        ``documentDate`` and ``originalTitle`` (str or None).
    """
    info = {
        "issue": None,
        "revision": None,
        "documentDate": None,
        "originalTitle": None,
    }

    # Try to find "Issue X - Rev. Y" or "Issue X – Revision Y" patterns.
    # Common patterns: "Issue 4 - Rev. 13", "Issue 8 – Revision 1", and the
    # footer form "Issue 4 – Draft Rev. 13" (optional "Draft" prefix).
    issue_match = re.search(
        r"Issue\s+(\d+)\s*[-–]\s*(?:(?:Draft\s+)?Rev(?:ision)?\.?\s*)?(\d+)?",
        content,
        re.IGNORECASE,
    )
    if issue_match:
        info["issue"] = int(issue_match.group(1))
        if issue_match.group(2):
            info["revision"] = int(issue_match.group(2))

    # Try to find date (Month Year format); the first occurrence wins.
    date_match = re.search(rf"({_MONTH_NAMES})\s+(\d{{4}})", content)
    if date_match:
        info["documentDate"] = f"{date_match.group(1)} {date_match.group(2)}"

    # Try to get title from metadata block or first heading
    title_match = re.search(r"\*\*Title:\*\*\s*(.+)", content)
    if title_match:
        info["originalTitle"] = title_match.group(1).strip()
    else:
        # Look for document title in content (uppercase lines at start)
        for line in content.split("\n")[:30]:
            stripped = line.strip()
            # Skip metadata, page markers, headers
            if not stripped or line.startswith(("#", "**")):
                continue
            if re.match(r"^[A-Z][A-Z\s\-]+$", stripped):
                # Found uppercase title
                info["originalTitle"] = stripped.title()
                break

    return info


def extract_document_title(content: str, doc_id: str) -> str:
    """Extract a clean, readable single-line title for the document.

    Tries the ``**Title:**`` metadata line first (with the document id and
    issue/revision prefix stripped when something remains), then the first
    sufficiently long uppercase run in the first 2000 characters, and
    finally falls back to ``"Document <doc_id>"``.
    """
    title = None

    # Try metadata title first
    title_match = re.search(r"\*\*Title:\*\*\s*(.+)", content)
    if title_match:
        title = title_match.group(1).strip()
        # Clean up "C/S T.001 - Issue 4 - Rev. 13" style titles
        clean = re.sub(r"C/S\s+[A-Z]\.\d+\s*[-–]?\s*", "", title)
        clean = re.sub(
            r"Issue\s+\d+\s*[-–]?\s*(?:Rev(?:ision)?\.?\s*\d+)?", "", clean
        )
        # Keep the raw title if stripping the prefixes leaves nothing.
        if clean.strip():
            title = clean.strip()

    # Look for descriptive title in first few lines
    if not title:
        for match in re.finditer(
            r"(?:SPECIFICATION FOR|INTRODUCTION TO|GUIDE FOR|PLAN FOR)?\s*([A-Z][A-Z\s\-,]+)",
            content[:2000],
        ):
            candidate = match.group(0).strip()
            if len(candidate) > 10 and doc_id.upper() not in candidate:
                title = candidate.title()
                break

    if not title:
        title = f"Document {doc_id}"

    # IMPORTANT: Clean up newlines and excessive whitespace for valid YAML
    title = re.sub(r"[\r\n]+", " ", title)
    title = re.sub(r"\s+", " ", title)

    return title.strip()


def clean_content(content: str, doc_id: str) -> str:
    """
    Clean up raw PDF-extracted content:
    - Remove Document Metadata block
    - Remove redundant page markers
    - Clean up repeated headers/footers
    - Fix spacing issues

    Args:
        content: Raw extracted markdown.
        doc_id: Document identifier; only its first character (the series
            letter) is used, to recognise "C/S <letter>.NNN" footers.

    Returns:
        The cleaned text with surrounding whitespace stripped.
    """
    # Remove Document Metadata block
    content = re.sub(
        r"^# Document Metadata\n(?:.*\n)*?---\n+",
        "",
        content,
        flags=re.MULTILINE,
    )

    # Remove "## Page X" markers but keep content
    content = re.sub(r"^## Page \d+\n+", "", content, flags=re.MULTILINE)

    # Remove repeated document ID footers
    # (e.g., "C/S T.001 – Issue 4 – Draft Rev. 13").
    # The series letter is escaped so an unusual id cannot break the pattern,
    # and the slice avoids an IndexError on an empty id.
    series_letter = re.escape(doc_id[:1])
    footer_pattern = (
        rf"(?:^|\n)[-–\s]*(?:i+v?|v?i*|x+|[0-9]+)?\s*C/S\s+{series_letter}"
        rf"\.\d+\s*[-–]\s*Issue\s+\d+.*?(?:\n|$)"
    )
    content = re.sub(footer_pattern, "\n", content, flags=re.IGNORECASE)

    # Remove standalone page numbers
    content = re.sub(
        r"^\s*[-–]?\s*(?:[ivx]+|\d+)\s*[-–]?\s*$",
        "",
        content,
        flags=re.MULTILINE,
    )

    # Remove date-only lines that are footers
    content = re.sub(
        rf"^({_MONTH_NAMES})\s+\d{{4}}\s*$",
        "",
        content,
        flags=re.MULTILINE,
    )

    # Clean up excessive blank lines
    content = re.sub(r"\n{4,}", "\n\n\n", content)

    # strip() removes both leading and trailing whitespace.
    return content.strip()


def fix_image_paths(content: str, doc_id: str, series: str) -> str:
    """Replace pdf-image:// URIs with actual image paths.

    References that do not contain a ``page_<n>_img_<m>`` token are left
    untouched.
    """

    def replace_image(match):
        alt_text = match.group(1)
        image_ref = match.group(2)
        # Extract page and image number
        page_match = re.search(r"page_(\d+)_img_(\d+)", image_ref)
        if not page_match:
            return match.group(0)
        page_num, img_num = page_match.groups()
        return (
            f"![{alt_text}](/images/cospas-sarsat/{series}-series/"
            f"{doc_id}/{doc_id}_page_{page_num}_img_{img_num}.png)"
        )

    return re.sub(
        r"!\[([^\]]*)\]\(pdf-image://([^)]+)\)", replace_image, content
    )


def create_frontmatter(
    doc_id: str, series: str, version_info: dict, title: str
) -> str:
    """Create extended Starlight frontmatter with Cospas-Sarsat schema.

    Args:
        doc_id: Document identifier, used in the title and ``documentId``.
        series: Series letter; unknown letters fall back to the letter as
            the series name and ``"specification"`` as the document type.
        version_info: dict as returned by ``parse_version_info``.
        title: Human-readable title; escaped for a YAML double-quoted scalar.

    Returns:
        The frontmatter block including both ``---`` fences and a trailing
        blank line.
    """
    series_info = SERIES_INFO.get(
        series, {"name": series, "documentType": "specification"}
    )

    # Clean title for YAML: escape backslashes and quotes, and replace colons
    # as the original cleaning step did.
    clean_title = _yaml_dq_escape(title).replace(":", " -")
    display_title = f"{doc_id}: {clean_title}"

    lines = [
        "---",
        f'title: "{display_title}"',
        f'description: "Official Cospas-Sarsat {series}-series document {doc_id}"',
        "sidebar:",
        "  badge:",
        f'    text: "{series}"',
        '    variant: "note"',
        "# Extended Cospas-Sarsat metadata",
        f'documentId: "{doc_id}"',
        f'series: "{series}"',
        f'seriesName: "{series_info["name"]}"',
        f'documentType: "{series_info["documentType"]}"',
        "isLatest: true",
    ]

    # Optional fields are emitted only when a value was parsed.
    if version_info["issue"]:
        lines.append(f'issue: {version_info["issue"]}')
    if version_info["revision"]:
        lines.append(f'revision: {version_info["revision"]}')
    if version_info["documentDate"]:
        lines.append(f'documentDate: "{version_info["documentDate"]}"')
    if version_info["originalTitle"]:
        orig_title = _yaml_dq_escape(version_info["originalTitle"])
        lines.append(f'originalTitle: "{orig_title}"')

    lines.append("---")
    return "\n".join(lines) + "\n\n"


def create_document_header(doc_id: str, series: str, version_info: dict) -> str:
    """Create a document header using plain markdown (compatible with .md files).

    ``doc_id`` is accepted for interface symmetry and is not used in the
    generated text.
    """
    series_info = SERIES_INFO.get(series, {"name": series, "description": ""})

    # Use blockquote for info box (works in plain markdown)
    header = (
        "> **📋 Document Information**\n"
        ">\n"
        f"> **Series:** {series}-Series ({series_info['name']})\n"
    )

    if version_info["issue"]:
        header += f'> **Version:** Issue {version_info["issue"]}'
        if version_info["revision"]:
            header += f' - Revision {version_info["revision"]}'
        header += "\n"

    if version_info["documentDate"]:
        header += f'> **Date:** {version_info["documentDate"]}\n'

    header += (
        "> **Source:** [Cospas-Sarsat Official Documents]"
        "(https://www.cospas-sarsat.int/en/documents-pro/system-documents)\n"
        "\n"
        "---\n"
        "\n"
    )
    return header


def copy_images(src_dir: Path, dest_dir: Path, doc_id: str) -> int:
    """Copy images from extracted folder to public folder.

    Only ``*.png`` files in ``src_dir / "images"`` are copied. Files whose
    name contains ``page_<n>_img_<m>`` are renamed to
    ``<doc_id>_page_<n>_img_<m>.png``; others keep their name.

    Returns:
        Number of files copied (0 when the images folder does not exist).
    """
    images_src = src_dir / "images"
    if not images_src.exists():
        return 0

    dest_dir.mkdir(parents=True, exist_ok=True)

    count = 0
    for img in images_src.glob("*.png"):
        # Normalize image filename
        match = re.search(r"page_(\d+)_img_(\d+)", img.name)
        if match:
            new_name = f"{doc_id}_page_{match.group(1)}_img_{match.group(2)}.png"
        else:
            new_name = img.name
        shutil.copy2(img, dest_dir / new_name)
        count += 1

    return count


def process_document(
    doc_dir: Path, series: str, output_dir: Path, images_output: Path
) -> Optional[dict]:
    """Process a single document.

    Reads ``<doc_dir>/<doc_dir.name>.md``, writes the transformed page to
    ``output_dir`` and copies its images below ``images_output``.

    Returns:
        A summary dict with ``doc_id``, ``title`` (truncated to 50
        characters for display), ``version`` and ``images``; or ``None``
        when the markdown file does not exist.
    """
    doc_id = doc_dir.name
    md_file = doc_dir / f"{doc_id}.md"

    if not md_file.exists():
        return None

    # Read content
    content = md_file.read_text(encoding="utf-8")

    # Parse metadata from the raw text, before cleaning removes the
    # metadata block.
    version_info = parse_version_info(content, doc_id)
    title = extract_document_title(content, doc_id)

    # Clean and transform content
    content = clean_content(content, doc_id)
    content = fix_image_paths(content, doc_id, series)

    # Build final document
    frontmatter = create_frontmatter(doc_id, series, version_info, title)
    header = create_document_header(doc_id, series, version_info)
    final_content = frontmatter + header + content

    # Write output (use .md to avoid MDX parsing issues with ASCII art)
    output_file = output_dir / f"{doc_id.lower()}.md"
    output_file.write_text(final_content, encoding="utf-8")

    # Copy images
    img_count = copy_images(doc_dir, images_output / doc_id, doc_id)

    return {
        "doc_id": doc_id,
        "title": title[:50] + "..." if len(title) > 50 else title,
        "version": (
            f"Issue {version_info['issue']}"
            if version_info["issue"]
            else "Unknown"
        ),
        "images": img_count,
    }


def process_series(series_letter: str) -> dict:
    """Process all documents in a series.

    Returns:
        dict with the number of processed ``docs`` and copied ``images``;
        both are 0 when the series folder does not exist.
    """
    series_dir = EXTRACTED_ROOT / "cospas-sarsat" / f"{series_letter}-series"
    if not series_dir.exists():
        return {"docs": 0, "images": 0}

    output_dir = CONTENT_DIR / f"{series_letter.lower()}-series"
    output_dir.mkdir(parents=True, exist_ok=True)
    images_output = PUBLIC_DIR / f"{series_letter}-series"

    stats = {"docs": 0, "images": 0}

    for doc_dir in sorted(series_dir.iterdir()):
        if not doc_dir.is_dir():
            continue
        result = process_document(doc_dir, series_letter, output_dir, images_output)
        if result:
            stats["docs"] += 1
            stats["images"] += result["images"]
            print(
                f"  {result['doc_id']}: {result['title']} "
                f"({result['version']}, {result['images']} images)"
            )

    return stats


def create_series_index(series_letter: str) -> None:
    """Create an index page (``index.mdx``) for a series.

    The page lists every document folder of the series that contains a
    matching markdown file, with its title, issue and date.
    """
    info = SERIES_INFO.get(
        series_letter, {"name": series_letter, "description": ""}
    )
    output_dir = CONTENT_DIR / f"{series_letter.lower()}-series"
    output_dir.mkdir(parents=True, exist_ok=True)

    # Collect document info
    docs = []
    series_dir = EXTRACTED_ROOT / "cospas-sarsat" / f"{series_letter}-series"
    if series_dir.exists():
        for doc_dir in sorted(series_dir.iterdir()):
            if not doc_dir.is_dir():
                continue
            md_file = doc_dir / f"{doc_dir.name}.md"
            if not md_file.exists():
                continue
            content = md_file.read_text(encoding="utf-8")
            title = extract_document_title(content, doc_dir.name)
            version_info = parse_version_info(content, doc_dir.name)
            docs.append({
                "id": doc_dir.name,
                "title": title[:80],
                "version": (
                    f"Issue {version_info['issue']}"
                    if version_info["issue"]
                    else ""
                ),
                "date": version_info["documentDate"] or "",
            })

    # Generate index content
    index_content = (
        "---\n"
        f'title: "{series_letter}-Series: {info["name"]}"\n'
        f'description: "{info["description"]}"\n'
        "---\n"
        "\n"
        "import { LinkCard, CardGrid, Badge } from '@astrojs/starlight/components';\n"
        "\n"
        f'{info["description"]}\n'
        "\n"
        f"## Documents ({len(docs)} total)\n"
        "\n"
        "| Document | Description | Version | Date |\n"
        "|----------|-------------|---------|------|\n"
    )

    series_slug = f"{series_letter.lower()}-series"
    for doc in docs:
        # Pipes would split the table cell; double quotes are normalised.
        title_escaped = doc["title"].replace("|", "\\|").replace('"', "'")
        index_content += (
            f'| [{doc["id"]}](/cospas-sarsat/{series_slug}/{doc["id"].lower()}/) '
            f'| {title_escaped} | {doc["version"]} | {doc["date"]} |\n'
        )

    index_content += "\n"

    index_file = output_dir / "index.mdx"
    index_file.write_text(index_content, encoding="utf-8")


def main():
    """Process every known series and print a summary."""
    print("=== Integrating Cospas-Sarsat documents (v2) ===\n")

    # Create output directories
    CONTENT_DIR.mkdir(parents=True, exist_ok=True)
    PUBLIC_DIR.mkdir(parents=True, exist_ok=True)

    total_docs = 0
    total_images = 0

    # SERIES_INFO preserves insertion order: T, S, R, P, G, A, D.
    for series in SERIES_INFO:
        print(f"\n{series}-Series ({SERIES_INFO[series]['name']}):")
        stats = process_series(series)
        create_series_index(series)
        total_docs += stats["docs"]
        total_images += stats["images"]
        print(f"  → {stats['docs']} documents, {stats['images']} images")

    print("\n=== Complete ===")
    print(f"Total: {total_docs} documents, {total_images} images")
    print(f"Content: {CONTENT_DIR}")
    print(f"Images: {PUBLIC_DIR}")


if __name__ == "__main__":
    main()