gr-mcp-docs/scripts/integrate-extracted-docs-v2.py
Ryan Malloy 41114373b9 init: Astro/Starlight docs site for gr-sarsat-modern
Diátaxis-structured documentation for 406 MHz SARSAT beacon reception:
- Tutorials: signal chain walkthrough
- Guides: antenna setup, message decoding
- Reference: block API, signal format
- Explanation: Cospas-Sarsat system overview

Includes extracted images from official Cospas-Sarsat specifications (LFS).
2026-02-13 05:01:21 -07:00

446 lines
15 KiB
Python
Executable File
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/env python3
"""
Integrate extracted PDF content into Starlight documentation site.
Version 2: Produces cleaner content with proper structured metadata.
Transforms extracted markdown files:
1. Parses document metadata (Issue, Revision, Date)
2. Adds extended Starlight frontmatter with Cospas-Sarsat schema
3. Cleans up page-by-page structure
4. Fixes image paths
5. Creates summary headers
"""
import re
import shutil
from pathlib import Path
from datetime import datetime
# Paths
# Input: per-document markdown + images produced by the PDF-extraction step.
EXTRACTED_ROOT = Path("/home/rpm/claude/sdr/gr-sarsat-modern/docs/extracted")
# Root of the Astro/Starlight documentation site that receives the output.
DOCS_SITE_ROOT = Path("/home/rpm/claude/sdr/gr-sarsat-modern/docs-site")
# Generated markdown pages are written here, under per-series subdirectories.
CONTENT_DIR = DOCS_SITE_ROOT / "src/content/docs/cospas-sarsat"
# Images are published here so pages can reference them by absolute URL.
PUBLIC_DIR = DOCS_SITE_ROOT / "public/images/cospas-sarsat"
# Series metadata with document type classification
# Keyed by the single-letter Cospas-Sarsat series code. "name" and
# "description" are reused on index pages; "documentType" feeds the
# extended frontmatter schema emitted by create_frontmatter().
SERIES_INFO = {
    "T": {
        "name": "Technical",
        "description": "Beacon specifications, LUT standards, MEOLUT requirements",
        "documentType": "specification",
    },
    "S": {
        "name": "Secretariat",
        "description": "Country registration guides and administrative procedures",
        "documentType": "procedure",
    },
    "R": {
        "name": "Reports",
        "description": "System status reports and performance analyses",
        "documentType": "report",
    },
    "P": {
        "name": "Programme",
        "description": "International agreements and programme documentation",
        "documentType": "programme",
    },
    "G": {
        "name": "General",
        "description": "System overview documents in multiple languages",
        "documentType": "overview",
    },
    "A": {
        "name": "Operational",
        "description": "Alert distribution and SPOC protocols",
        "documentType": "operational",
    },
    "D": {
        "name": "IBRD",
        "description": "International Beacon Registration Database standards",
        "documentType": "database",
    },
}
def parse_version_info(content: str, filename: str) -> dict:
    """
    Extract version information from document content.

    Args:
        content: Raw markdown text extracted from a PDF.
        filename: Document identifier (currently unused; retained for
            interface compatibility with existing callers).

    Returns:
        Dict with keys "issue" (int | None), "revision" (int | None),
        "documentDate" (str | None, "Month YYYY" form) and
        "originalTitle" (str | None).
    """
    info = {
        "issue": None,
        "revision": None,
        "documentDate": None,
        "originalTitle": None,
    }

    # Find the issue number. Handles both "Issue 4 - Rev. 13" and
    # "Issue 8 Revision 1"; the previous pattern required a literal "-"
    # and silently missed the dash-less form.
    issue_match = re.search(r"Issue\s+(\d+)", content, re.IGNORECASE)
    if issue_match:
        info["issue"] = int(issue_match.group(1))
        # Require an explicit Rev/Revision token close to the issue number
        # so a stray number (e.g. a year) is never mistaken for a revision.
        rev_match = re.search(
            r"Rev(?:ision)?\.?\s*(\d+)",
            content[issue_match.end():issue_match.end() + 80],
            re.IGNORECASE,
        )
        if rev_match:
            info["revision"] = int(rev_match.group(1))

    # Date in "Month Year" format (first occurrence wins).
    date_match = re.search(
        r"(January|February|March|April|May|June|July|August|September|October|November|December)\s+(\d{4})",
        content,
    )
    if date_match:
        info["documentDate"] = f"{date_match.group(1)} {date_match.group(2)}"

    # Title: prefer an explicit "**Title:**" metadata line, otherwise the
    # first all-uppercase line near the top of the document.
    title_match = re.search(r"\*\*Title:\*\*\s*(.+)", content)
    if title_match:
        info["originalTitle"] = title_match.group(1).strip()
    else:
        for line in content.split("\n")[:30]:
            stripped = line.strip()
            # Skip headings, bold metadata lines and blank lines.
            if not stripped or line.startswith("#") or line.startswith("**"):
                continue
            if re.match(r"^[A-Z][A-Z\s\-]+$", stripped):
                info["originalTitle"] = stripped.title()
                break

    return info
def extract_document_title(content: str, doc_id: str) -> str:
    """Derive a clean, human-readable title for a document.

    Tries the "**Title:**" metadata line first (stripping document-id and
    issue/revision boilerplate), then scans the opening text for an
    uppercase descriptive phrase, and finally falls back to "Document <id>".
    The result is whitespace-collapsed so it is safe inside quoted YAML.
    """
    title = None

    meta = re.search(r"\*\*Title:\*\*\s*(.+)", content)
    if meta:
        title = meta.group(1).strip()
        # Strip "C/S T.001 - Issue 4 - Rev. 13"-style boilerplate; keep the
        # raw metadata title if nothing meaningful would remain.
        candidate = re.sub(r"C/S\s+[A-Z]\.\d+\s*[-]?\s*", "", title)
        candidate = re.sub(
            r"Issue\s+\d+\s*[-]?\s*(?:Rev(?:ision)?\.?\s*\d+)?", "", candidate
        )
        if candidate.strip():
            title = candidate.strip()

    if not title:
        # Scan the first ~2000 characters for an uppercase descriptive phrase.
        for found in re.finditer(
            r"(?:SPECIFICATION FOR|INTRODUCTION TO|GUIDE FOR|PLAN FOR)?\s*([A-Z][A-Z\s\-,]+)",
            content[:2000],
        ):
            phrase = found.group(0).strip()
            if len(phrase) > 10 and doc_id.upper() not in phrase:
                title = phrase.title()
                break

    if not title:
        title = f"Document {doc_id}"

    # IMPORTANT: collapse newlines and runs of whitespace for valid YAML.
    collapsed = re.sub(r"[\r\n]+", " ", title)
    return re.sub(r"\s+", " ", collapsed).strip()
def clean_content(content: str, doc_id: str) -> str:
    """
    Clean up raw PDF-extracted content:
    - Remove the Document Metadata block
    - Remove redundant "## Page X" markers
    - Remove repeated document-ID footers and page numbers
    - Remove date-only footer lines
    - Collapse excessive blank lines
    """
    # Applied strictly in order; later rules assume earlier ones have run.
    # NOTE: doc_id[0] is taken as the series letter embedded in footers
    # (e.g. "T" in "C/S T.001 Issue 4 ...").
    substitutions = (
        # Document Metadata block at the top, up to its closing "---".
        (r"^# Document Metadata\n(?:.*\n)*?---\n+", "", re.MULTILINE),
        # "## Page X" markers (content below them is kept).
        (r"^## Page \d+\n+", "", re.MULTILINE),
        # Repeated document-ID footers, e.g. "C/S T.001 - Issue 4 ...".
        (
            rf"(?:^|\n)[-\s]*(?:i+v?|v?i*|x+|[0-9]+)?\s*C/S\s+{doc_id[0]}\.\d+\s*[-]\s*Issue\s+\d+.*?(?:\n|$)",
            "\n",
            re.IGNORECASE,
        ),
        # Standalone (roman or arabic) page numbers.
        (r"^\s*[-]?\s*(?:[ivx]+|\d+)\s*[-]?\s*$", "", re.MULTILINE),
        # Date-only footer lines.
        (
            r"^(January|February|March|April|May|June|July|August|September|October|November|December)\s+\d{4}\s*$",
            "",
            re.MULTILINE,
        ),
        # Collapse runs of 4+ newlines.
        (r"\n{4,}", "\n\n\n", 0),
        # Drop leading whitespace.
        (r"^\s+", "", 0),
    )
    for pattern, replacement, flags in substitutions:
        content = re.sub(pattern, replacement, content, flags=flags)
    return content.strip()
def fix_image_paths(content: str, doc_id: str, series: str) -> str:
    """Rewrite pdf-image:// URIs to real paths under the public images dir.

    References without a recognizable "page_N_img_M" marker are left as-is.
    """
    link_pattern = re.compile(r"!\[([^\]]*)\]\(pdf-image://([^)]+)\)")
    marker_pattern = re.compile(r"page_(\d+)_img_(\d+)")

    def _rewrite(link):
        alt, ref = link.group(1), link.group(2)
        marker = marker_pattern.search(ref)
        if marker is None:
            return link.group(0)
        page, img = marker.group(1), marker.group(2)
        return f"![{alt}](/images/cospas-sarsat/{series}-series/{doc_id}/{doc_id}_page_{page}_img_{img}.png)"

    return link_pattern.sub(_rewrite, content)
def create_frontmatter(doc_id: str, series: str, version_info: dict, title: str) -> str:
    """Create extended Starlight frontmatter with the Cospas-Sarsat schema.

    Args:
        doc_id: Document identifier, e.g. "T001".
        series: Single-letter series code; looked up in SERIES_INFO.
        version_info: Dict from parse_version_info() (issue/revision/date/title).
        title: Cleaned display title from extract_document_title().

    Returns:
        A YAML frontmatter string ending in "---\\n\\n".

    Note: the old implementation built a version string and then chose
    between two identical branches for the display title — both the dead
    branch and the unused version string have been removed; output is
    unchanged.
    """
    series_info = SERIES_INFO.get(series, {"name": series, "documentType": "specification"})

    # Escape characters that would break a double-quoted YAML scalar.
    clean_title = title.replace('"', '\\"').replace(":", " -")
    display_title = f"{doc_id}: {clean_title}"

    # sidebar.badge must be nested for Starlight to pick it up.
    frontmatter = f'''---
title: "{display_title}"
description: "Official Cospas-Sarsat {series}-series document {doc_id}"
sidebar:
  badge:
    text: "{series}"
    variant: "note"
# Extended Cospas-Sarsat metadata
documentId: "{doc_id}"
series: "{series}"
seriesName: "{series_info['name']}"
documentType: "{series_info['documentType']}"
isLatest: true
'''
    # Optional fields are appended only when parse_version_info found them.
    if version_info["issue"]:
        frontmatter += f'issue: {version_info["issue"]}\n'
    if version_info["revision"]:
        frontmatter += f'revision: {version_info["revision"]}\n'
    if version_info["documentDate"]:
        frontmatter += f'documentDate: "{version_info["documentDate"]}"\n'
    if version_info["originalTitle"]:
        orig_title = version_info["originalTitle"].replace('"', '\\"')
        frontmatter += f'originalTitle: "{orig_title}"\n'
    frontmatter += "---\n\n"
    return frontmatter
def create_document_header(doc_id: str, series: str, version_info: dict) -> str:
    """Render the document-information blockquote placed under the frontmatter.

    Uses plain-markdown constructs only, so it stays compatible with .md
    output files (no MDX components).
    """
    series_info = SERIES_INFO.get(series, {"name": series, "description": ""})

    header = (
        "> **📋 Document Information**\n"
        ">\n"
        f"> **Series:** {series}-Series ({series_info['name']})\n"
    )

    # Version line only when an issue number is known; revision is appended
    # to the same line when present.
    if version_info["issue"]:
        version_line = f'> **Version:** Issue {version_info["issue"]}'
        if version_info["revision"]:
            version_line += f' - Revision {version_info["revision"]}'
        header += version_line + "\n"

    if version_info["documentDate"]:
        header += f'> **Date:** {version_info["documentDate"]}\n'

    header += (
        "> **Source:** [Cospas-Sarsat Official Documents]"
        "(https://www.cospas-sarsat.int/en/documents-pro/system-documents)\n"
        "---\n"
    )
    return header
def copy_images(src_dir: Path, dest_dir: Path, doc_id: str) -> int:
    """Copy a document's extracted PNGs into the public images folder.

    Filenames containing a "page_N_img_M" marker are normalized to
    "<doc_id>_page_N_img_M.png"; other PNGs keep their original name.

    Returns:
        Number of files copied; 0 when src_dir has no "images" subfolder.
    """
    source = src_dir / "images"
    if not source.exists():
        return 0

    dest_dir.mkdir(parents=True, exist_ok=True)

    copied = 0
    for image in source.glob("*.png"):
        marker = re.search(r"page_(\d+)_img_(\d+)", image.name)
        if marker:
            target_name = f"{doc_id}_page_{marker.group(1)}_img_{marker.group(2)}.png"
        else:
            target_name = image.name
        shutil.copy2(image, dest_dir / target_name)
        copied += 1
    return copied
def process_document(doc_dir: Path, series: str, output_dir: Path, images_output: Path) -> dict:
    """Transform one extracted document into a Starlight markdown page.

    Returns a small summary dict for logging, or None when the expected
    "<doc_id>.md" source file is missing.
    """
    doc_id = doc_dir.name
    source_md = doc_dir / f"{doc_id}.md"
    if not source_md.exists():
        return None

    raw = source_md.read_text(encoding="utf-8")

    # Metadata must be parsed from the raw text, before cleaning strips it.
    version_info = parse_version_info(raw, doc_id)
    title = extract_document_title(raw, doc_id)

    # Clean, then rewrite image URIs to their published locations.
    body = fix_image_paths(clean_content(raw, doc_id), doc_id, series)

    page = (
        create_frontmatter(doc_id, series, version_info, title)
        + create_document_header(doc_id, series, version_info)
        + body
    )

    # Use .md (not .mdx): ASCII art in the specs trips the MDX parser.
    (output_dir / f"{doc_id.lower()}.md").write_text(page, encoding="utf-8")

    img_count = copy_images(doc_dir, images_output / doc_id, doc_id)

    short_title = title if len(title) <= 50 else title[:50] + "..."
    return {
        "doc_id": doc_id,
        "title": short_title,
        "version": f"Issue {version_info['issue']}" if version_info["issue"] else "Unknown",
        "images": img_count,
    }
def process_series(series_letter: str) -> dict:
    """Process every extracted document directory for one series letter.

    Returns:
        {"docs": <count>, "images": <count>} totals for the series;
        zeros when the series directory does not exist.
    """
    series_dir = EXTRACTED_ROOT / "cospas-sarsat" / f"{series_letter}-series"
    if not series_dir.exists():
        return {"docs": 0, "images": 0}

    output_dir = CONTENT_DIR / f"{series_letter.lower()}-series"
    output_dir.mkdir(parents=True, exist_ok=True)
    images_output = PUBLIC_DIR / f"{series_letter}-series"

    stats = {"docs": 0, "images": 0}
    for doc_dir in sorted(series_dir.iterdir()):
        if not doc_dir.is_dir():
            continue
        result = process_document(doc_dir, series_letter, output_dir, images_output)
        if not result:
            # Missing "<doc_id>.md" — skip silently, as before.
            continue
        stats["docs"] += 1
        stats["images"] += result["images"]
        print(f" {result['doc_id']}: {result['title']} ({result['version']}, {result['images']} images)")
    return stats
def create_series_index(series_letter: str) -> None:
    """Create an MDX index page listing every document in a series.

    Fix over the previous version: the ESM ``import`` statement and the
    surrounding markdown are now separated by blank lines. MDX requires a
    blank line to terminate the import block — without it the generated
    index page fails to compile.
    """
    info = SERIES_INFO.get(series_letter, {"name": series_letter, "description": ""})
    output_dir = CONTENT_DIR / f"{series_letter.lower()}-series"
    output_dir.mkdir(parents=True, exist_ok=True)

    # Collect per-document metadata from the raw extracted markdown.
    docs = []
    series_dir = EXTRACTED_ROOT / "cospas-sarsat" / f"{series_letter}-series"
    if series_dir.exists():
        for doc_dir in sorted(series_dir.iterdir()):
            if not doc_dir.is_dir():
                continue
            md_file = doc_dir / f"{doc_dir.name}.md"
            if not md_file.exists():
                continue
            content = md_file.read_text(encoding="utf-8")
            title = extract_document_title(content, doc_dir.name)
            version_info = parse_version_info(content, doc_dir.name)
            docs.append({
                "id": doc_dir.name,
                "title": title[:80],
                "version": f"Issue {version_info['issue']}" if version_info["issue"] else "",
                "date": version_info["documentDate"] or "",
            })

    # Generate index content (blank lines around the import are required).
    index_content = f'''---
title: "{series_letter}-Series: {info["name"]}"
description: "{info["description"]}"
---

import {{ LinkCard, CardGrid, Badge }} from '@astrojs/starlight/components';

{info["description"]}

## Documents ({len(docs)} total)

| Document | Description | Version | Date |
|----------|-------------|---------|------|
'''
    for doc in docs:
        # Escape table delimiters / quotes so titles cannot break the row.
        title_escaped = doc["title"].replace("|", "\\|").replace('"', "'")
        index_content += (
            f'| [{doc["id"]}](/cospas-sarsat/{series_letter.lower()}-series/{doc["id"].lower()}/) '
            f'| {title_escaped} | {doc["version"]} | {doc["date"]} |\n'
        )
    index_content += "\n"

    (output_dir / "index.mdx").write_text(index_content, encoding="utf-8")
def main():
    """Run the full integration: all series, then print summary totals."""
    print("=== Integrating Cospas-Sarsat documents (v2) ===\n")

    # Ensure output roots exist before any per-series work.
    CONTENT_DIR.mkdir(parents=True, exist_ok=True)
    PUBLIC_DIR.mkdir(parents=True, exist_ok=True)

    totals = {"docs": 0, "images": 0}
    for series in ("T", "S", "R", "P", "G", "A", "D"):
        print(f"\n{series}-Series ({SERIES_INFO[series]['name']}):")
        stats = process_series(series)
        create_series_index(series)
        totals["docs"] += stats["docs"]
        totals["images"] += stats["images"]
        print(f"{stats['docs']} documents, {stats['images']} images")

    print("\n=== Complete ===")
    print(f"Total: {totals['docs']} documents, {totals['images']} images")
    print(f"Content: {CONTENT_DIR}")
    print(f"Images: {PUBLIC_DIR}")


if __name__ == "__main__":
    main()