Diátaxis-structured documentation for 406 MHz SARSAT beacon reception: - Tutorials: signal chain walkthrough - Guides: antenna setup, message decoding - Reference: block API, signal format - Explanation: Cospas-Sarsat system overview Includes extracted images from official Cospas-Sarsat specifications (LFS).
446 lines
15 KiB
Python
Executable File
446 lines
15 KiB
Python
Executable File
#!/usr/bin/env python3
"""
Integrate extracted PDF content into Starlight documentation site.

Version 2: Produces cleaner content with proper structured metadata.

Transforms extracted markdown files:
1. Parses document metadata (Issue, Revision, Date)
2. Adds extended Starlight frontmatter with Cospas-Sarsat schema
3. Cleans up page-by-page structure
4. Fixes image paths
5. Creates summary headers
"""
|
||
import re
|
||
import shutil
|
||
from pathlib import Path
|
||
from datetime import datetime
|
||
|
||
# Filesystem locations (absolute paths for this machine's checkout).
EXTRACTED_ROOT = Path("/home/rpm/claude/sdr/gr-sarsat-modern/docs/extracted")
DOCS_SITE_ROOT = Path("/home/rpm/claude/sdr/gr-sarsat-modern/docs-site")
CONTENT_DIR = DOCS_SITE_ROOT / "src/content/docs/cospas-sarsat"
PUBLIC_DIR = DOCS_SITE_ROOT / "public/images/cospas-sarsat"

# Per-series metadata keyed by the single-letter Cospas-Sarsat series code.
# Each entry carries a display name, a short description used on index
# pages, and a documentType tag emitted into the generated frontmatter.
SERIES_INFO = {
    "T": {
        "name": "Technical",
        "description": "Beacon specifications, LUT standards, MEOLUT requirements",
        "documentType": "specification",
    },
    "S": {
        "name": "Secretariat",
        "description": "Country registration guides and administrative procedures",
        "documentType": "procedure",
    },
    "R": {
        "name": "Reports",
        "description": "System status reports and performance analyses",
        "documentType": "report",
    },
    "P": {
        "name": "Programme",
        "description": "International agreements and programme documentation",
        "documentType": "programme",
    },
    "G": {
        "name": "General",
        "description": "System overview documents in multiple languages",
        "documentType": "overview",
    },
    "A": {
        "name": "Operational",
        "description": "Alert distribution and SPOC protocols",
        "documentType": "operational",
    },
    "D": {
        "name": "IBRD",
        "description": "International Beacon Registration Database standards",
        "documentType": "database",
    },
}
|
||
|
||
|
||
def parse_version_info(content: str, filename: str) -> dict:
    """
    Pull issue/revision/date/title metadata out of extracted document text.

    Returns a dict with keys "issue", "revision", "documentDate" and
    "originalTitle"; any field that cannot be found stays None.  The
    *filename* argument is currently unused but kept for interface
    stability.
    """
    result = {key: None for key in ("issue", "revision", "documentDate", "originalTitle")}

    # Version markers look like "Issue 4 - Rev. 13" or "Issue 8 – Revision 1".
    version = re.search(
        r"Issue\s+(\d+)\s*[-–]\s*(?:Rev(?:ision)?\.?\s*)?(\d+)?",
        content,
        re.IGNORECASE
    )
    if version:
        result["issue"] = int(version.group(1))
        if version.group(2):
            result["revision"] = int(version.group(2))

    # First "Month YYYY" occurrence anywhere in the text is taken as the date.
    when = re.search(
        r"(January|February|March|April|May|June|July|August|September|October|November|December)\s+(\d{4})",
        content
    )
    if when:
        result["documentDate"] = f"{when.group(1)} {when.group(2)}"

    # Prefer an explicit "**Title:**" metadata line ...
    titled = re.search(r"\*\*Title:\*\*\s*(.+)", content)
    if titled:
        result["originalTitle"] = titled.group(1).strip()
    else:
        # ... otherwise fall back to the first all-caps line near the top,
        # skipping headings, bold metadata lines and blank lines.
        for line in content.split("\n")[:30]:
            stripped = line.strip()
            if line.startswith(("#", "**")) or not stripped:
                continue
            if re.match(r"^[A-Z][A-Z\s\-]+$", stripped):
                result["originalTitle"] = stripped.title()
                break

    return result
|
||
|
||
|
||
def extract_document_title(content: str, doc_id: str) -> str:
    """Derive a clean, human-readable title for the document.

    Tries the "**Title:**" metadata line first (stripping "C/S X.nnn" and
    "Issue N - Rev. M" boilerplate), then scans the first 2000 characters
    for an uppercase descriptive heading, and finally falls back to
    "Document <doc_id>".  The result is collapsed onto a single line so it
    can be embedded in YAML frontmatter safely.
    """
    title = None

    meta = re.search(r"\*\*Title:\*\*\s*(.+)", content)
    if meta:
        title = meta.group(1).strip()
        # Strip "C/S T.001 - Issue 4 - Rev. 13" style boilerplate.
        stripped = re.sub(r"C/S\s+[A-Z]\.\d+\s*[-–]?\s*", "", title)
        stripped = re.sub(r"Issue\s+\d+\s*[-–]?\s*(?:Rev(?:ision)?\.?\s*\d+)?", "", stripped)
        # Keep the boilerplate-only title if nothing else remains.
        if stripped.strip():
            title = stripped.strip()

    if not title:
        # Scan the document head for an uppercase heading that is long
        # enough to be descriptive and is not just the document ID.
        for found in re.finditer(r"(?:SPECIFICATION FOR|INTRODUCTION TO|GUIDE FOR|PLAN FOR)?\s*([A-Z][A-Z\s\-,]+)", content[:2000]):
            candidate = found.group(0).strip()
            if len(candidate) > 10 and doc_id.upper() not in candidate:
                title = candidate.title()
                break

    if not title:
        title = f"Document {doc_id}"

    # Collapse newlines and runs of whitespace so the title is valid
    # inside a quoted YAML scalar.
    title = re.sub(r"[\r\n]+", " ", title)
    title = re.sub(r"\s+", " ", title)

    return title.strip()
|
||
|
||
|
||
def clean_content(content: str, doc_id: str) -> str:
    """
    Strip PDF-extraction artifacts from raw markdown:
    - the leading "# Document Metadata" block
    - "## Page X" markers
    - repeated per-page document-ID footers
    - standalone page numbers and date-only footer lines
    - runs of excess blank lines and leading whitespace
    """
    # Drop the "# Document Metadata" block up to its closing "---" rule.
    text = re.sub(
        r"^# Document Metadata\n(?:.*\n)*?---\n+",
        "",
        content,
        flags=re.MULTILINE
    )

    # Page markers carry no information once pages are merged.
    text = re.sub(r"^## Page \d+\n+", "", text, flags=re.MULTILINE)

    # Per-page footers such as "C/S T.001 – Issue 4 – Draft Rev. 13",
    # optionally preceded by a roman/arabic page number.
    footer_pattern = (
        rf"(?:^|\n)[-–\s]*(?:i+v?|v?i*|x+|[0-9]+)?\s*"
        rf"C/S\s+{doc_id[0]}\.\d+\s*[-–]\s*Issue\s+\d+.*?(?:\n|$)"
    )
    text = re.sub(footer_pattern, "\n", text, flags=re.IGNORECASE)

    # Standalone page numbers (roman or arabic), optionally dash-wrapped.
    text = re.sub(r"^\s*[-–]?\s*(?:[ivx]+|\d+)\s*[-–]?\s*$", "", text, flags=re.MULTILINE)

    # Date-only footer lines like "October 1999".
    text = re.sub(
        r"^(January|February|March|April|May|June|July|August|September|October|November|December)\s+\d{4}\s*$",
        "",
        text,
        flags=re.MULTILINE
    )

    # Collapse runs of 4+ newlines, then drop whitespace at the very start.
    text = re.sub(r"\n{4,}", "\n\n\n", text)
    text = re.sub(r"^\s+", "", text)

    return text.strip()
|
||
|
||
|
||
def fix_image_paths(content: str, doc_id: str, series: str) -> str:
    """Replace pdf-image:// URIs with the public image paths created by copy_images().

    Extracted markdown references images as
    ``![alt](pdf-image://..._page_P_img_N...)``.  The copied files live at
    ``/images/cospas-sarsat/<series>-series/<doc_id>/<doc_id>_page_P_img_N.png``
    (PUBLIC_DIR maps to ``public/images/cospas-sarsat`` in the site build).

    BUG FIX: the previous implementation returned an empty f-string for every
    matched reference, silently deleting all images from the output; it now
    emits the markdown image tag pointing at the copied file.  References
    whose page/image numbers cannot be parsed are left unchanged.
    """
    def replace_image(match):
        alt_text = match.group(1)
        image_ref = match.group(2)
        # Extract page and image number from the URI.
        page_match = re.search(r"page_(\d+)_img_(\d+)", image_ref)
        if page_match:
            page_num = page_match.group(1)
            img_num = page_match.group(2)
            # Path mirrors the renaming scheme used by copy_images().
            return (
                f"![{alt_text}]"
                f"(/images/cospas-sarsat/{series}-series/{doc_id}/"
                f"{doc_id}_page_{page_num}_img_{img_num}.png)"
            )
        # Unparseable reference: keep the original markdown untouched.
        return match.group(0)

    return re.sub(r"!\[([^\]]*)\]\(pdf-image://([^)]+)\)", replace_image, content)
|
||
|
||
|
||
def create_frontmatter(doc_id: str, series: str, version_info: dict, title: str) -> str:
    """Build the extended Starlight frontmatter block for one document.

    Emits YAML with the display title, a sidebar badge for the series, and
    the Cospas-Sarsat metadata fields (documentId, series, seriesName,
    documentType, isLatest), plus optional issue/revision/documentDate/
    originalTitle fields when parse_version_info found them.

    Cleanup: the previous version built a version string and branched on it,
    but both branches produced the identical display title, so the dead
    branch and the unused version string are gone.  Version details are
    emitted as the separate issue/revision fields instead.
    """
    # Unknown series letters degrade gracefully to a generic entry.
    series_info = SERIES_INFO.get(series, {"name": series, "documentType": "specification"})

    # Escape quotes and replace colons so the title is safe inside a
    # double-quoted YAML scalar.
    clean_title = title.replace('"', '\\"').replace(":", " -")
    display_title = f"{doc_id}: {clean_title}"

    frontmatter = f'''---
title: "{display_title}"
description: "Official Cospas-Sarsat {series}-series document {doc_id}"
sidebar:
  badge:
    text: "{series}"
    variant: "note"
# Extended Cospas-Sarsat metadata
documentId: "{doc_id}"
series: "{series}"
seriesName: "{series_info['name']}"
documentType: "{series_info['documentType']}"
isLatest: true
'''

    # Optional version fields, only when present in the parsed metadata.
    if version_info["issue"]:
        frontmatter += f'issue: {version_info["issue"]}\n'
    if version_info["revision"]:
        frontmatter += f'revision: {version_info["revision"]}\n'
    if version_info["documentDate"]:
        frontmatter += f'documentDate: "{version_info["documentDate"]}"\n'
    if version_info["originalTitle"]:
        orig_title = version_info["originalTitle"].replace('"', '\\"')
        frontmatter += f'originalTitle: "{orig_title}"\n'

    frontmatter += "---\n\n"
    return frontmatter
|
||
|
||
|
||
def create_document_header(doc_id: str, series: str, version_info: dict) -> str:
    """Render a document-information box as a plain-markdown blockquote.

    A blockquote is used instead of Starlight components so the header
    works in plain .md files.  Version and date lines are included only
    when present in *version_info*.
    """
    series_info = SERIES_INFO.get(series, {"name": series, "description": ""})

    lines = [
        "> **📋 Document Information**",
        ">",
        f"> **Series:** {series}-Series ({series_info['name']})",
    ]

    if version_info["issue"]:
        version_line = f'> **Version:** Issue {version_info["issue"]}'
        if version_info["revision"]:
            version_line += f' - Revision {version_info["revision"]}'
        lines.append(version_line)

    if version_info["documentDate"]:
        lines.append(f'> **Date:** {version_info["documentDate"]}')

    lines.append(
        "> **Source:** [Cospas-Sarsat Official Documents]"
        "(https://www.cospas-sarsat.int/en/documents-pro/system-documents)"
    )

    # Trailing blank line, horizontal rule, blank line — matches the
    # spacing expected before the document body.
    return "\n".join(lines) + "\n\n---\n\n"
|
||
|
||
|
||
def copy_images(src_dir: Path, dest_dir: Path, doc_id: str) -> int:
    """Copy extracted PNG images from ``src_dir/images`` into *dest_dir*.

    Files whose names contain ``page_P_img_N`` are renamed to
    ``<doc_id>_page_P_img_N.png``; any other PNG keeps its name.  Returns
    the number of images copied, or 0 when there is no images folder.
    """
    source = src_dir / "images"
    if not source.exists():
        return 0

    dest_dir.mkdir(parents=True, exist_ok=True)

    copied = 0
    for image_path in source.glob("*.png"):
        parts = re.search(r"page_(\d+)_img_(\d+)", image_path.name)
        # Normalize to a doc-prefixed name when the page/img pattern matches.
        if parts:
            target_name = f"{doc_id}_page_{parts.group(1)}_img_{parts.group(2)}.png"
        else:
            target_name = image_path.name
        # copy2 preserves file metadata (timestamps) alongside contents.
        shutil.copy2(image_path, dest_dir / target_name)
        copied += 1

    return copied
|
||
|
||
|
||
def process_document(doc_dir: Path, series: str, output_dir: Path, images_output: Path) -> dict:
    """Convert one extracted document folder into a Starlight page.

    Reads ``<doc_id>/<doc_id>.md``, parses its metadata, cleans the body,
    prepends frontmatter and a header, writes the result to *output_dir*,
    and copies the document's images.  Returns a small stats dict, or None
    when the markdown file is missing.
    """
    doc_id = doc_dir.name
    source_md = doc_dir / f"{doc_id}.md"
    if not source_md.exists():
        return None

    raw = source_md.read_text(encoding="utf-8")

    # Metadata must come from the raw text, before cleanup strips the
    # metadata block and footers it is parsed from.
    version_info = parse_version_info(raw, doc_id)
    title = extract_document_title(raw, doc_id)

    body = fix_image_paths(clean_content(raw, doc_id), doc_id, series)

    document = (
        create_frontmatter(doc_id, series, version_info, title)
        + create_document_header(doc_id, series, version_info)
        + body
    )

    # .md rather than .mdx: ASCII art in the specs trips the MDX parser.
    (output_dir / f"{doc_id.lower()}.md").write_text(document, encoding="utf-8")

    image_count = copy_images(doc_dir, images_output / doc_id, doc_id)

    shortened = title[:50] + "..." if len(title) > 50 else title
    return {
        "doc_id": doc_id,
        "title": shortened,
        "version": f"Issue {version_info['issue']}" if version_info["issue"] else "Unknown",
        "images": image_count,
    }
|
||
|
||
|
||
def process_series(series_letter: str) -> dict:
    """Process every document folder in one series.

    Returns {"docs": n, "images": m} totals; both zero when the series
    directory does not exist.
    """
    source_root = EXTRACTED_ROOT / "cospas-sarsat" / f"{series_letter}-series"
    totals = {"docs": 0, "images": 0}
    if not source_root.exists():
        return totals

    output_dir = CONTENT_DIR / f"{series_letter.lower()}-series"
    output_dir.mkdir(parents=True, exist_ok=True)

    images_output = PUBLIC_DIR / f"{series_letter}-series"

    # Sorted iteration keeps console output and processing order stable.
    for doc_dir in sorted(source_root.iterdir()):
        if not doc_dir.is_dir():
            continue

        result = process_document(doc_dir, series_letter, output_dir, images_output)
        if result:
            totals["docs"] += 1
            totals["images"] += result["images"]
            print(f" {result['doc_id']}: {result['title']} ({result['version']}, {result['images']} images)")

    return totals
|
||
|
||
|
||
def create_series_index(series_letter: str) -> None:
    """Write an index.mdx page listing every document in a series.

    Re-parses each extracted markdown file for its title/version/date and
    renders a markdown table linking to the generated per-document pages.
    """
    info = SERIES_INFO.get(series_letter, {"name": series_letter, "description": ""})
    output_dir = CONTENT_DIR / f"{series_letter.lower()}-series"
    output_dir.mkdir(parents=True, exist_ok=True)

    # Gather id/title/version/date for each extracted document folder.
    docs = []
    series_dir = EXTRACTED_ROOT / "cospas-sarsat" / f"{series_letter}-series"
    if series_dir.exists():
        for doc_dir in sorted(series_dir.iterdir()):
            if not doc_dir.is_dir():
                continue
            md_file = doc_dir / f"{doc_dir.name}.md"
            if not md_file.exists():
                continue
            text = md_file.read_text(encoding="utf-8")
            version_info = parse_version_info(text, doc_dir.name)
            docs.append({
                "id": doc_dir.name,
                "title": extract_document_title(text, doc_dir.name)[:80],
                "version": f"Issue {version_info['issue']}" if version_info["issue"] else "",
                "date": version_info["documentDate"] or "",
            })

    # Frontmatter, component import and table header for the index page.
    header = f'''---
title: "{series_letter}-Series: {info["name"]}"
description: "{info["description"]}"
---

import {{ LinkCard, CardGrid, Badge }} from '@astrojs/starlight/components';

{info["description"]}

## Documents ({len(docs)} total)

| Document | Description | Version | Date |
|----------|-------------|---------|------|
'''

    rows = []
    for doc in docs:
        # Escape pipes for the table and swap double quotes for single.
        safe_title = doc["title"].replace("|", "\\|").replace('"', "'")
        rows.append(
            f'| [{doc["id"]}](/cospas-sarsat/{series_letter.lower()}-series/{doc["id"].lower()}/)'
            f' | {safe_title} | {doc["version"]} | {doc["date"]} |\n'
        )

    (output_dir / "index.mdx").write_text(header + "".join(rows) + "\n", encoding="utf-8")
|
||
|
||
|
||
def main():
    """Run the full integration: process every series and build its index."""
    print("=== Integrating Cospas-Sarsat documents (v2) ===\n")

    # Ensure output trees exist before any series is processed.
    CONTENT_DIR.mkdir(parents=True, exist_ok=True)
    PUBLIC_DIR.mkdir(parents=True, exist_ok=True)

    total_docs = 0
    total_images = 0

    for series in ["T", "S", "R", "P", "G", "A", "D"]:
        print(f"\n{series}-Series ({SERIES_INFO[series]['name']}):")
        stats = process_series(series)
        create_series_index(series)
        total_docs += stats["docs"]
        total_images += stats["images"]
        print(f" → {stats['docs']} documents, {stats['images']} images")

    print(f"\n=== Complete ===")
    print(f"Total: {total_docs} documents, {total_images} images")
    print(f"Content: {CONTENT_DIR}")
    print(f"Images: {PUBLIC_DIR}")
|
||
|
||
|
||
if __name__ == "__main__":
    # Script entry point; importing the module has no side effects.
    main()
|