Diátaxis-structured documentation for 406 MHz SARSAT beacon reception: - Tutorials: signal chain walkthrough - Guides: antenna setup, message decoding - Reference: block API, signal format - Explanation: Cospas-Sarsat system overview Includes extracted images from official Cospas-Sarsat specifications (LFS).
220 lines
7.4 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Integrate extracted PDF content into Starlight documentation site.
|
|
|
|
Transforms extracted markdown files:
|
|
1. Adds Starlight-compatible frontmatter
|
|
2. Fixes image paths from pdf-image:// to relative paths
|
|
3. Copies images to appropriate locations
|
|
4. Creates index pages for each series
|
|
"""
|
|
|
|
import re
|
|
import shutil
|
|
from pathlib import Path
|
|
|
|
# Paths
# Root of the markdown/images tree produced by the PDF-extraction step.
EXTRACTED_ROOT = Path("/home/rpm/claude/sdr/gr-sarsat-modern/docs/extracted")
# Root of the Astro/Starlight documentation site.
DOCS_SITE_ROOT = Path("/home/rpm/claude/sdr/gr-sarsat-modern/docs-site")
# Destination for the transformed markdown content pages.
CONTENT_DIR = DOCS_SITE_ROOT / "src/content/docs/cospas-sarsat"
# Destination for copied images (under public/, presumably served at
# /images/cospas-sarsat/ by the site — verify against the site config).
PUBLIC_DIR = DOCS_SITE_ROOT / "public/images/cospas-sarsat"

# Series metadata
# Maps a Cospas-Sarsat document-series letter to the display name and
# description used in frontmatter badges and the generated index pages.
SERIES_INFO = {
    "T": {"name": "Technical", "description": "Beacon specifications, LUT standards, MEOLUT requirements"},
    "S": {"name": "Secretariat", "description": "Country registration guides and administrative procedures"},
    "R": {"name": "Reports", "description": "System status reports and performance analyses"},
    "P": {"name": "Programme", "description": "International agreements and programme documentation"},
    "G": {"name": "General", "description": "System overview documents in multiple languages"},
    "A": {"name": "Operational", "description": "Alert distribution and SPOC protocols"},
    "D": {"name": "IBRD", "description": "International Beacon Registration Database standards"},
}
|
|
|
|
|
|
def extract_title_from_metadata(content: str) -> str:
    """Pull a document title out of extracted markdown.

    Prefers a ``**Title:** ...`` metadata line anywhere in the text;
    falls back to the first ATX heading, then to a generic placeholder.
    """
    candidates = (
        (r"\*\*Title:\*\*\s*(.+)", 0),
        (r"^#\s+(.+)$", re.MULTILINE),  # fallback: first markdown heading
    )
    for pattern, flags in candidates:
        found = re.search(pattern, content, flags)
        if found:
            return found.group(1).strip()
    return "Untitled Document"
|
|
|
|
|
|
def fix_image_paths(content: str, doc_id: str, series: str) -> str:
    """Replace ``pdf-image://`` URIs with paths into the site's public dir.

    Rewrites ``![alt](pdf-image://...page_N_img_M...)`` links to point at
    the renamed copies produced by copy_images(), i.e.
    ``/images/cospas-sarsat/{series}-series/{doc_id}/{doc_id}_page_N_img_M.png``.
    References without page/img numbers are left untouched.

    Bug fixed: the replacement previously returned an empty string, which
    deleted every matched image (and discarded its alt text) instead of
    rewriting the link.
    """
    def replace_image(match: "re.Match[str]") -> str:
        alt_text = match.group(1)
        image_ref = match.group(2)
        # Extract page and image number from the extracted filename.
        page_match = re.search(r"page_(\d+)_img_(\d+)", image_ref)
        if page_match:
            page_num = page_match.group(1)
            img_num = page_match.group(2)
            # Absolute path under public/; mirrors the rename in copy_images().
            return (
                f"![{alt_text}](/images/cospas-sarsat/{series}-series/"
                f"{doc_id}/{doc_id}_page_{page_num}_img_{img_num}.png)"
            )
        return match.group(0)

    return re.sub(r"!\[([^\]]*)\]\(pdf-image://([^)]+)\)", replace_image, content)
|
|
|
|
|
|
def add_frontmatter(content: str, title: str, doc_id: str, series: str) -> str:
    """Prepend Starlight YAML frontmatter to *content*.

    Strips any leading "# Document Metadata" block (up to its closing
    ``---``) left over from extraction, escapes double quotes in the
    title, and returns frontmatter + content.
    """
    # Drop the extraction-time metadata block, if present.
    content = re.sub(
        r"^# Document Metadata\n(?:.*\n)*?---\n+", "", content, flags=re.MULTILINE
    )

    escaped_title = title.replace('"', '\\"')

    header = "\n".join(
        [
            "---",
            f'title: "{escaped_title}"',
            f'description: "Cospas-Sarsat {series}-series document {doc_id}"',
            "sidebar:",
            "  badge:",
            f'    text: "{series}"',
            '    variant: "note"',
            "---",
            "",
            "",
        ]
    )
    return header + content
|
|
|
|
|
|
def copy_images(src_dir: Path, dest_dir: Path, doc_id: str) -> int:
    """Copy every PNG under ``src_dir/images`` into *dest_dir*.

    Files whose names contain ``page_N_img_M`` are renamed to
    ``{doc_id}_page_N_img_M.png``; all others keep their original name.
    Returns the number of images copied (0 when there is no images dir).
    """
    source = src_dir / "images"
    if not source.exists():
        return 0

    dest_dir.mkdir(parents=True, exist_ok=True)

    copied = 0
    for png in source.glob("*.png"):
        # Rename consistently when the page/img numbers can be recovered.
        numbered = re.search(r"page_(\d+)_img_(\d+)", png.name)
        if numbered:
            target_name = f"{doc_id}_page_{numbered.group(1)}_img_{numbered.group(2)}.png"
        else:
            target_name = png.name
        shutil.copy2(png, dest_dir / target_name)
        copied += 1

    return copied
|
|
|
|
|
|
def process_series(series_letter: str) -> dict:
    """Transform every extracted document in one document series.

    For each ``EXTRACTED_ROOT/cospas-sarsat/{X}-series/<DOC>/<DOC>.md``:
    rewrite its image links, prepend frontmatter, write the result under
    CONTENT_DIR, and copy the document's images under PUBLIC_DIR.

    Returns ``{"docs": n, "images": m}`` counts for the series.
    """
    stats = {"docs": 0, "images": 0}

    source_root = EXTRACTED_ROOT / "cospas-sarsat" / f"{series_letter}-series"
    if not source_root.exists():
        return stats

    content_out = CONTENT_DIR / f"{series_letter.lower()}-series"
    content_out.mkdir(parents=True, exist_ok=True)
    images_root = PUBLIC_DIR / f"{series_letter}-series"

    for doc_dir in sorted(source_root.iterdir()):
        if not doc_dir.is_dir():
            continue

        doc_id = doc_dir.name
        source_md = doc_dir / f"{doc_id}.md"
        if not source_md.exists():
            print(f" Warning: No markdown file in {doc_dir}")
            continue

        raw = source_md.read_text(encoding="utf-8")
        title = extract_title_from_metadata(raw)

        transformed = fix_image_paths(raw, doc_id, series_letter)
        transformed = add_frontmatter(transformed, title, doc_id, series_letter)

        # Plain .md on purpose: raw PDF text contains characters that
        # would break MDX/JSX parsing.
        (content_out / f"{doc_id.lower()}.md").write_text(transformed, encoding="utf-8")
        stats["docs"] += 1

        img_count = copy_images(doc_dir, images_root / doc_id, doc_id)
        stats["images"] += img_count

        ellipsis = "..." if len(title) > 50 else ""
        print(f" {doc_id}: {title[:50]}{ellipsis} ({img_count} images)")

    return stats
|
|
|
|
|
|
def create_series_index(series_letter: str) -> None:
    """Write an ``index.mdx`` listing every document in a series as LinkCards."""
    meta = SERIES_INFO.get(series_letter, {"name": series_letter, "description": ""})
    out_dir = CONTENT_DIR / f"{series_letter.lower()}-series"
    out_dir.mkdir(parents=True, exist_ok=True)

    parts = [
        "---\n"
        f'title: "{series_letter}-Series: {meta["name"]}"\n'
        f'description: "{meta["description"]}"\n'
        "---\n"
        "\n"
        "import { LinkCard, CardGrid } from '@astrojs/starlight/components';\n"
        "\n"
        f'{meta["description"]}\n'
        "\n"
        "## Documents\n"
        "\n"
        "<CardGrid>\n"
    ]

    # One LinkCard per document that actually has extracted markdown.
    series_dir = EXTRACTED_ROOT / "cospas-sarsat" / f"{series_letter}-series"
    if series_dir.exists():
        for doc_dir in sorted(series_dir.iterdir()):
            if not doc_dir.is_dir():
                continue
            doc_id = doc_dir.name
            md_file = doc_dir / f"{doc_id}.md"
            if not md_file.exists():
                continue
            title = extract_title_from_metadata(md_file.read_text(encoding="utf-8"))
            escaped = title.replace('"', '\\"')
            parts.append(
                f' <LinkCard title="{doc_id}" description="{escaped[:80]}"'
                f' href="/cospas-sarsat/{series_letter.lower()}-series/{doc_id.lower()}/" />\n'
            )

    parts.append("</CardGrid>\n")
    (out_dir / "index.mdx").write_text("".join(parts), encoding="utf-8")
|
|
|
|
|
|
def main() -> None:
    """Process every document series and print a summary."""
    print("=== Integrating extracted Cospas-Sarsat documents ===\n")

    # Create output directories up front.
    for directory in (CONTENT_DIR, PUBLIC_DIR):
        directory.mkdir(parents=True, exist_ok=True)

    total_docs = 0
    total_images = 0

    for series in "TSRPGAD":
        print(f"\n{series}-Series ({SERIES_INFO[series]['name']}):")
        stats = process_series(series)
        create_series_index(series)
        total_docs += stats["docs"]
        total_images += stats["images"]
        print(f" → {stats['docs']} documents, {stats['images']} images")

    print("\n=== Complete ===")
    print(f"Total: {total_docs} documents, {total_images} images")
    print(f"Content: {CONTENT_DIR}")
    print(f"Images: {PUBLIC_DIR}")
|
|
|
|
|
|
# Script entry point.
if __name__ == "__main__":
    main()
|