pg_orrery/bench/load_bench.sh
Ryan Malloy 89ea0246b6 Rewrite load_bench.sh to use pg-orrery-catalog with curl fallback
Three-tier discovery: pg-orrery-catalog in PATH, sibling dev
checkout, or original build_catalog.py + curl. Indexes use
IF NOT EXISTS for idempotent re-runs.
2026-02-18 10:26:00 -07:00

146 lines
5.6 KiB
Bash
Executable File

#!/bin/bash
# Load pg_orrery benchmark catalog into PostgreSQL.
#
# Uses pg-orrery-catalog if available, falls back to build_catalog.py + curl.
#
# Usage:
# ./bench/load_bench.sh # Load the cached load_mega_catalog.sql
# ./bench/load_bench.sh --rebuild # Re-merge from individual source files
# ./bench/load_bench.sh --download # Re-download sources + rebuild + load
#
# Environment:
# PGPORT PostgreSQL port (default: 5499)
# PGDATABASE Target database (default: contrib_regression)
# SOCKS_PROXY SOCKS5 proxy for CelesTrak (default: none)
#
set -euo pipefail

# Directory that holds this script.
BENCH_DIR="$(cd "$(dirname "$0")" && pwd)"
# Connection defaults; values already present in the environment win.
PGPORT="${PGPORT:-5499}"
PGDATABASE="${PGDATABASE:-contrib_regression}"
TABLE="bench_catalog"
REBUILD=false
DOWNLOAD=false

#######################################
# Parse command-line flags.
# Globals:   REBUILD, DOWNLOAD (written)
# Arguments: the script's positional parameters
#   --rebuild    set REBUILD=true
#   --download   set DOWNLOAD=true and REBUILD=true
#   --help | -h  print the header comment of this script and exit 0
# Exits:     2 on an unrecognised argument, so a typo such as
#            "--rebuid" is reported instead of being silently ignored.
#######################################
parse_args() {
  local arg
  for arg in "$@"; do
    case "$arg" in
      --rebuild)
        REBUILD=true
        ;;
      --download)
        DOWNLOAD=true
        REBUILD=true
        ;;
      --help | -h)
        # Show lines 2-14 of this file with the leading "# " removed.
        # \{0,1\} is the POSIX spelling of the GNU-only \? quantifier.
        head -14 "$0" | tail -13 | sed 's/^# \{0,1\}//'
        exit 0
        ;;
      *)
        echo "ERROR: unknown option: $arg" >&2
        echo "       Run with --help for usage" >&2
        exit 2
        ;;
    esac
  done
}

parse_args "$@"
# ── Check for pg-orrery-catalog ──────────────────────────────
# HAS_CATALOG ends up true when the pg-orrery-catalog command can be used:
# either it is already on PATH, or a virtualenv copy exists in a development
# checkout, in which case that venv's bin directory is prepended to PATH.
# NOTE(review): the checkout path is resolved relative to the parent of
# BENCH_DIR — confirm that is where the development checkout is expected.
HAS_CATALOG=false
catalog_venv_bin="$BENCH_DIR/../pg-orrery-catalog/.venv/bin"
if command -v pg-orrery-catalog > /dev/null 2>&1; then
  HAS_CATALOG=true
elif [[ -f "$catalog_venv_bin/pg-orrery-catalog" ]]; then
  PATH="$catalog_venv_bin:$PATH"
  export PATH
  HAS_CATALOG=true
fi
# ── Download sources ─────────────────────────────────────────

#######################################
# Download one URL into a file without ever leaving a bad file behind.
# The transfer goes to "<dest>.part" and is renamed onto <dest> only when
# the download command succeeds; on failure the partial file is removed and
# any existing <dest> is left as it was.
# Globals:   CURL_CT (read) - array holding the curl command and its options
# Arguments: $1 - URL to fetch
#            $2 - destination file
# Returns:   0 on success, non-zero when the transfer or rename fails
#######################################
fetch_tle() {
  local url=$1 dest=$2
  # curl's own diagnostics are discarded; callers decide what to report.
  if "${CURL_CT[@]}" "$url" -o "$dest.part" 2> /dev/null; then
    mv -f -- "$dest.part" "$dest"
  else
    rm -f -- "$dest.part"
    return 1
  fi
}

if $DOWNLOAD; then
  if $HAS_CATALOG; then
    echo "==> Downloading TLE sources via pg-orrery-catalog..."
    pg-orrery-catalog download --force
  else
    echo "==> pg-orrery-catalog not found, downloading via curl..."
    # The command line is an array so every option stays a separate word.
    # --fail turns HTTP 4xx/5xx responses into a curl error instead of
    # saving the server's error page as if it were TLE data.
    CURL_CT=(/usr/bin/curl -s --fail --connect-timeout 15 --max-time 120)
    if [ -n "${SOCKS_PROXY:-}" ]; then
      CURL_CT+=(--socks5-hostname "$SOCKS_PROXY")
    fi

    # CelesTrak active (no auth needed)
    echo " CelesTrak active..."
    fetch_tle "https://celestrak.org/NORAD/elements/gp.php?GROUP=active&FORMAT=3le" \
      "$BENCH_DIR/celestrak_active.tle" || echo " FAILED"

    # CelesTrak supplemental GP; a failed group is skipped silently
    # (best effort, as before).
    for group in starlink oneweb planet orbcomm; do
      echo " CelesTrak SupGP ${group}..."
      fetch_tle "https://celestrak.org/NORAD/elements/supplemental/sup-gp.php?FILE=${group}&FORMAT=3le" \
        "$BENCH_DIR/supgp_${group}.tle" || true
    done
    REBUILD=true
  fi
fi
# ── Build SQL ────────────────────────────────────────────────

#######################################
# Run a generator command and store its stdout in a file, replacing the
# file only when the command succeeds. Output is written to a temporary
# file next to the target and renamed into place, so a failing generator
# cannot truncate or corrupt a previously generated file.
# Arguments: $1  - output file
#            $2+ - generator command and its arguments
# Outputs:   an error message on stderr when the generator fails
# Returns:   0 on success, otherwise the generator's exit status
#######################################
generate_sql() {
  local out=$1 tmp rc
  shift
  tmp="$out.tmp.$$"
  if "$@" > "$tmp"; then
    mv -f -- "$tmp" "$out"
  else
    rc=$?
    rm -f -- "$tmp"
    echo "ERROR: catalog build failed; ${out##*/} left unchanged" >&2
    return "$rc"
  fi
}

if $REBUILD; then
  SQL_OUT="$BENCH_DIR/load_mega_catalog.sql"
  SOURCES=()
  # A failing generate_sql call below aborts the script through the
  # "set -e" enabled at the top of the file.
  if $HAS_CATALOG; then
    echo "==> Building catalog via pg-orrery-catalog..."
    # Pass every *.tle file found next to this script. With none present,
    # "build" runs without source arguments (presumably it then uses its own
    # downloaded data — confirm against pg-orrery-catalog).
    for f in "$BENCH_DIR"/*.tle; do
      # An unmatched glob stays literal, so keep only real files.
      if [ -f "$f" ]; then
        SOURCES+=("$f")
      fi
    done
    if [ ${#SOURCES[@]} -gt 0 ]; then
      generate_sql "$SQL_OUT" \
        pg-orrery-catalog build "${SOURCES[@]}" --table "$TABLE"
    else
      generate_sql "$SQL_OUT" \
        pg-orrery-catalog build --table "$TABLE"
    fi
    echo " Generated load_mega_catalog.sql"
  else
    echo "==> Building catalog via build_catalog.py..."
    # Only this fixed list of source files is considered, in this order.
    for f in spacetrack_everything.tle celestrak_active.tle satnogs_full.tle \
      supgp_starlink.tle supgp_oneweb.tle supgp_planet.tle supgp_orbcomm.tle; do
      if [ -f "$BENCH_DIR/$f" ]; then
        SOURCES+=("$BENCH_DIR/$f")
      fi
    done
    if [ ${#SOURCES[@]} -eq 0 ]; then
      echo "ERROR: No source TLE files found in $BENCH_DIR" >&2
      exit 1
    fi
    generate_sql "$SQL_OUT" \
      python3 "$BENCH_DIR/build_catalog.py" "${SOURCES[@]}"
    echo " Generated load_mega_catalog.sql"
  fi
fi
# ── Load into PostgreSQL ─────────────────────────────────────
# The generated SQL file must already exist at this point; otherwise stop
# and tell the user how to produce it.
catalog_sql="$BENCH_DIR/load_mega_catalog.sql"
[[ -f "$catalog_sql" ]] || {
  printf '%s\n' "ERROR: $catalog_sql not found" >&2
  printf '%s\n' " Run with --rebuild or --download first" >&2
  exit 1
}

printf '%s\n' "==> Loading catalog into $PGDATABASE (port $PGPORT)..."
# stdout and stderr of psql are merged and only the last three lines shown.
# NOTE(review): psql is run without ON_ERROR_STOP, so SQL errors inside the
# file do not change its exit status — confirm that is intended.
PGPORT="$PGPORT" psql --dbname="$PGDATABASE" --file="$catalog_sql" --quiet 2>&1 \
  | tail -n 3
# ── Create indexes ───────────────────────────────────────────
printf '%s\n' "==> Creating indexes..."
# Both statements use IF NOT EXISTS, so an index that is already present is
# skipped on a re-run. \timing makes psql print the elapsed time of each
# statement. The quoted heredoc delimiter keeps the shell from expanding
# anything inside the SQL.
PGPORT="$PGPORT" psql --dbname="$PGDATABASE" --quiet << 'SQL'
\timing on
CREATE INDEX IF NOT EXISTS bench_spgist_idx ON bench_catalog USING spgist (tle tle_spgist_ops);
CREATE INDEX IF NOT EXISTS bench_gist_idx ON bench_catalog USING gist (tle);
\timing off
SQL
# ── Summary ──────────────────────────────────────────────────
# Report the total row count, then a per-regime breakdown. Rows are bucketed
# by tle_mean_motion(tle) against the thresholds 11.25, 1.8 and 0.9, and the
# buckets are listed largest first.
PGPORT="$PGPORT" psql --dbname="$PGDATABASE" --quiet << 'SQL'
SELECT count(*) || ' objects loaded' AS status FROM bench_catalog;
SELECT
CASE
WHEN tle_mean_motion(tle) > 11.25 THEN 'LEO'
WHEN tle_mean_motion(tle) > 1.8 THEN 'MEO'
WHEN tle_mean_motion(tle) > 0.9 THEN 'GEO'
ELSE 'HEO'
END AS regime,
count(*) AS count
FROM bench_catalog
GROUP BY 1
ORDER BY 2 DESC;
SQL

# Closing hint: the command that runs the benchmark against the same
# port and database.
printf '%s\n' "==> Done. Run benchmarks with:"
printf '%s\n' " PGPORT=$PGPORT psql -d $PGDATABASE -f bench/benchmark.sql"