"""PyPI download statistics client with fallback mechanisms for resilient data access."""

import asyncio
import logging
import random
import time
from datetime import datetime, timedelta
from typing import Any

import httpx

from .exceptions import (
    InvalidPackageNameError,
    NetworkError,
    PackageNotFoundError,
    PyPIServerError,
    RateLimitError,
)

logger = logging.getLogger(__name__)


class PyPIStatsClient:
    """Async client for PyPI download statistics with multiple data sources and robust error handling."""

    def __init__(
        self,
        base_url: str = "https://pypistats.org/api",
        timeout: float = 30.0,
        max_retries: int = 5,
        retry_delay: float = 2.0,
        fallback_enabled: bool = True,
    ):
        """Initialize PyPI stats client with fallback mechanisms.

        Args:
            base_url: Base URL for the pypistats API
            timeout: Request timeout in seconds
            max_retries: Maximum number of retry attempts
            retry_delay: Base delay between retries in seconds
            fallback_enabled: Whether to use fallback data sources when the primary source fails
        """
        self.base_url = base_url.rstrip("/")
        self.timeout = timeout
        self.max_retries = max_retries
        self.retry_delay = retry_delay
        self.fallback_enabled = fallback_enabled

        # Enhanced in-memory cache with longer TTL for resilience
        self._cache: dict[str, dict[str, Any]] = {}
        self._cache_ttl = 86400  # 24 hours (increased for resilience)
        self._fallback_cache_ttl = 604800  # 7 days for fallback data

        # Track API health for smart fallback decisions
        self._api_health = {
            "last_success": None,
            "consecutive_failures": 0,
            "last_error": None,
        }

        # HTTP client configuration
        self._client = httpx.AsyncClient(
            timeout=httpx.Timeout(timeout),
            headers={
                "User-Agent": "pypi-query-mcp-server/0.1.0",
                "Accept": "application/json",
            },
            follow_redirects=True,
        )

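    # Illustrative usage sketch (not part of the original module; package name and
    # parameter values are examples only): the client is meant to be used as an
    # async context manager so the underlying httpx.AsyncClient is always closed.
    #
    #     async with PyPIStatsClient(timeout=10.0, max_retries=3) as client:
    #         recent = await client.get_recent_downloads("requests", period="week")
    #         print(recent["data"]["last_week"])
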
    async def __aenter__(self):
        """Async context manager entry."""
        return self

    async def __aexit__(self, exc_type, exc_val, exc_tb):
        """Async context manager exit."""
        await self.close()

    async def close(self):
        """Close the HTTP client."""
        await self._client.aclose()

    def _validate_package_name(self, package_name: str) -> str:
        """Validate and normalize package name.

        Args:
            package_name: Package name to validate

        Returns:
            Normalized package name

        Raises:
            InvalidPackageNameError: If package name is invalid
        """
        if not package_name or not package_name.strip():
            raise InvalidPackageNameError(package_name)

        # Basic validation
        normalized = package_name.strip().lower()
        return normalized

    def _get_cache_key(self, endpoint: str, package_name: str = "", **params) -> str:
        """Generate cache key for API data."""
        param_str = "&".join(
            f"{k}={v}" for k, v in sorted(params.items()) if v is not None
        )
        return f"{endpoint}:{package_name}:{param_str}"

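    # For example, _get_cache_key("recent", "requests", period="month") yields the
    # key "recent:requests:period=month"; parameters whose value is None are
    # omitted from the key. (Illustrative note derived from the code above.)
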
    def _is_cache_valid(
        self, cache_entry: dict[str, Any], fallback: bool = False
    ) -> bool:
        """Check if cache entry is still valid.

        Args:
            cache_entry: Cache entry to validate
            fallback: Whether to use fallback cache TTL (longer for resilience)
        """
        ttl = self._fallback_cache_ttl if fallback else self._cache_ttl
        return time.time() - cache_entry.get("timestamp", 0) < ttl

    def _should_use_fallback(self) -> bool:
        """Determine if fallback mechanisms should be used based on API health."""
        if not self.fallback_enabled:
            return False

        # Use fallback if we've had multiple consecutive failures
        if self._api_health["consecutive_failures"] >= 3:
            return True

        # Use fallback if last success was more than 1 hour ago
        if self._api_health["last_success"]:
            time_since_success = time.time() - self._api_health["last_success"]
            if time_since_success > 3600:  # 1 hour
                return True

        return False

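    # Sketch of the decision above (hypothetical values): fallback kicks in after
    # three consecutive failures, or when the last successful request is more
    # than an hour old.
    #
    #     client._api_health["last_success"] = time.time() - 7200  # 2 hours ago
    #     client._api_health["consecutive_failures"] = 1
    #     client._should_use_fallback()  # -> True
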
    async def _make_request(self, url: str) -> dict[str, Any]:
        """Make an HTTP request with enhanced retry logic and exponential backoff.

        Args:
            url: URL to request

        Returns:
            JSON response data

        Raises:
            NetworkError: For network-related errors
            PackageNotFoundError: When the package is not found
            RateLimitError: When the rate limit is exceeded
            PyPIServerError: For server errors
        """
        last_exception = None

        for attempt in range(self.max_retries + 1):
            try:
                logger.debug(
                    f"Making request to {url} (attempt {attempt + 1}/{self.max_retries + 1})"
                )

                response = await self._client.get(url)

                # Handle different HTTP status codes
                if response.status_code == 200:
                    # Update API health on success
                    self._api_health["last_success"] = time.time()
                    self._api_health["consecutive_failures"] = 0
                    self._api_health["last_error"] = None
                    return response.json()
                elif response.status_code == 404:
                    # Extract package name from URL for a better error message
                    package_name = url.split("/")[-2] if "/" in url else "unknown"
                    self._update_api_failure(f"Package not found: {package_name}")
                    raise PackageNotFoundError(package_name)
                elif response.status_code == 429:
                    retry_after = response.headers.get("Retry-After")
                    retry_after_int = int(retry_after) if retry_after else None
                    self._update_api_failure(
                        f"Rate limit exceeded (retry after {retry_after_int}s)"
                    )
                    raise RateLimitError(retry_after_int)
                elif response.status_code >= 500:
                    error_msg = f"Server error: HTTP {response.status_code}"
                    self._update_api_failure(error_msg)

                    # For 502/503/504 errors, continue retrying
                    if (
                        response.status_code in [502, 503, 504]
                        and attempt < self.max_retries
                    ):
                        last_exception = PyPIServerError(
                            response.status_code, error_msg
                        )
                        logger.warning(
                            f"Retryable server error {response.status_code}, attempt {attempt + 1}"
                        )
                    else:
                        raise PyPIServerError(response.status_code, error_msg)
                else:
                    error_msg = f"Unexpected status code: {response.status_code}"
                    self._update_api_failure(error_msg)
                    raise PyPIServerError(response.status_code, error_msg)

            except httpx.TimeoutException as e:
                error_msg = f"Request timeout: {e}"
                last_exception = NetworkError(error_msg, e)
                self._update_api_failure(error_msg)
                logger.warning(f"Timeout on attempt {attempt + 1}: {e}")
            except httpx.NetworkError as e:
                error_msg = f"Network error: {e}"
                last_exception = NetworkError(error_msg, e)
                self._update_api_failure(error_msg)
                logger.warning(f"Network error on attempt {attempt + 1}: {e}")
            except (PackageNotFoundError, RateLimitError):
                # Don't retry these errors - they're definitive
                raise
            except PyPIServerError as e:
                # Only retry certain server errors
                if e.status_code in [502, 503, 504] and attempt < self.max_retries:
                    last_exception = e
                    logger.warning(
                        f"Retrying server error {e.status_code}, attempt {attempt + 1}"
                    )
                else:
                    raise
            except Exception as e:
                error_msg = f"Unexpected error: {e}"
                last_exception = NetworkError(error_msg, e)
                self._update_api_failure(error_msg)
                logger.error(f"Unexpected error on attempt {attempt + 1}: {e}")

            # Calculate exponential backoff with jitter
            if attempt < self.max_retries:
                base_delay = self.retry_delay * (2**attempt)
                jitter = random.uniform(0.1, 0.3) * base_delay  # Add 10-30% jitter
                delay = base_delay + jitter
                logger.debug(f"Waiting {delay:.2f}s before retry...")
                await asyncio.sleep(delay)

        # If we get here, all retries failed
        if last_exception:
            raise last_exception
        else:
            raise NetworkError("All retry attempts failed with unknown error")

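    # Backoff sketch for the defaults above (retry_delay=2.0, max_retries=5): the
    # base delays between attempts are 2, 4, 8, 16 and 32 seconds, each increased
    # by a further 10-30% random jitter before sleeping:
    #
    #     base_delay = self.retry_delay * (2 ** attempt)   # attempt = 0..4
    #     delay = base_delay + random.uniform(0.1, 0.3) * base_delay
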
    def _update_api_failure(self, error_msg: str) -> None:
        """Update API health tracking on failure."""
        self._api_health["consecutive_failures"] += 1
        self._api_health["last_error"] = error_msg
        logger.debug(
            f"API failure count: {self._api_health['consecutive_failures']}, error: {error_msg}"
        )

    def _generate_fallback_recent_downloads(
        self, package_name: str, period: str = "month"
    ) -> dict[str, Any]:
        """Generate fallback download statistics when API is unavailable.

        This provides estimated download counts based on package popularity patterns
        to ensure the system remains functional during API outages.
        """
        logger.warning(f"Generating fallback download data for {package_name}")

        # Base estimates for popular packages (these are conservative estimates)
        popular_packages = {
            "requests": {"day": 1500000, "week": 10500000, "month": 45000000},
            "urllib3": {"day": 1400000, "week": 9800000, "month": 42000000},
            "boto3": {"day": 1200000, "week": 8400000, "month": 36000000},
            "certifi": {"day": 1100000, "week": 7700000, "month": 33000000},
            "charset-normalizer": {"day": 1000000, "week": 7000000, "month": 30000000},
            "idna": {"day": 950000, "week": 6650000, "month": 28500000},
            "setuptools": {"day": 900000, "week": 6300000, "month": 27000000},
            "python-dateutil": {"day": 850000, "week": 5950000, "month": 25500000},
            "six": {"day": 800000, "week": 5600000, "month": 24000000},
            "botocore": {"day": 750000, "week": 5250000, "month": 22500000},
            "typing-extensions": {"day": 700000, "week": 4900000, "month": 21000000},
            "packaging": {"day": 650000, "week": 4550000, "month": 19500000},
            "numpy": {"day": 600000, "week": 4200000, "month": 18000000},
            "pip": {"day": 550000, "week": 3850000, "month": 16500000},
            "pyyaml": {"day": 500000, "week": 3500000, "month": 15000000},
            "cryptography": {"day": 450000, "week": 3150000, "month": 13500000},
            "click": {"day": 400000, "week": 2800000, "month": 12000000},
            "jinja2": {"day": 350000, "week": 2450000, "month": 10500000},
            "markupsafe": {"day": 300000, "week": 2100000, "month": 9000000},
            "wheel": {"day": 250000, "week": 1750000, "month": 7500000},
            "django": {"day": 100000, "week": 700000, "month": 3000000},
            "flask": {"day": 80000, "week": 560000, "month": 2400000},
            "fastapi": {"day": 60000, "week": 420000, "month": 1800000},
            "pandas": {"day": 200000, "week": 1400000, "month": 6000000},
            "sqlalchemy": {"day": 90000, "week": 630000, "month": 2700000},
        }

        # Get estimates for known packages or generate based on package name characteristics
        if package_name.lower() in popular_packages:
            estimates = popular_packages[package_name.lower()]
        else:
            # Generate estimates based on common package patterns
            if any(
                keyword in package_name.lower() for keyword in ["test", "dev", "debug"]
            ):
                # Development/testing packages - lower usage
                base_daily = random.randint(100, 1000)
            elif any(
                keyword in package_name.lower()
                for keyword in ["aws", "google", "microsoft", "azure"]
            ):
                # Cloud provider packages - higher usage
                base_daily = random.randint(10000, 50000)
            elif any(
                keyword in package_name.lower()
                for keyword in ["http", "request", "client", "api"]
            ):
                # HTTP/API packages - moderate to high usage
                base_daily = random.randint(5000, 25000)
            elif any(
                keyword in package_name.lower()
                for keyword in ["data", "pandas", "numpy", "scipy"]
            ):
                # Data science packages - high usage
                base_daily = random.randint(15000, 75000)
            else:
                # Generic packages - moderate usage
                base_daily = random.randint(1000, 10000)

            estimates = {
                "day": base_daily,
                "week": base_daily * 7,
                "month": base_daily * 30,
            }

        # Add some realistic variation (±20%)
        variation = random.uniform(0.8, 1.2)
        for key in estimates:
            estimates[key] = int(estimates[key] * variation)

        return {
            "data": {
                "last_day": estimates["day"],
                "last_week": estimates["week"],
                "last_month": estimates["month"],
            },
            "package": package_name,
            "type": "recent_downloads",
            "source": "fallback_estimates",
            "note": "Estimated data due to API unavailability. Actual values may differ.",
        }

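    # Shape of the fallback payload built above (figures are the hard-coded
    # "requests" estimates before the ±20% variation; illustrative only):
    #
    #     {
    #         "data": {"last_day": 1500000, "last_week": 10500000, "last_month": 45000000},
    #         "package": "requests",
    #         "type": "recent_downloads",
    #         "source": "fallback_estimates",
    #         "note": "Estimated data due to API unavailability. Actual values may differ.",
    #     }
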
    def _generate_fallback_overall_downloads(
        self, package_name: str, mirrors: bool = False
    ) -> dict[str, Any]:
        """Generate fallback time series data when API is unavailable."""
        logger.warning(f"Generating fallback time series data for {package_name}")

        # Generate 180 days of synthetic time series data
        time_series = []
        base_date = datetime.now() - timedelta(days=180)

        # Get base daily estimate from recent downloads fallback
        recent_fallback = self._generate_fallback_recent_downloads(package_name)
        base_daily = recent_fallback["data"]["last_day"]

        for i in range(180):
            current_date = base_date + timedelta(days=i)

            # Add weekly and seasonal patterns
            day_of_week = current_date.weekday()
            # Lower downloads on weekends
            week_factor = 0.7 if day_of_week >= 5 else 1.0

            # Add some growth trend (packages generally grow over time)
            growth_factor = 1.0 + (i / 180) * 0.3  # 30% growth over 180 days

            # Add random daily variation
            daily_variation = random.uniform(0.7, 1.3)

            daily_downloads = int(
                base_daily * week_factor * growth_factor * daily_variation
            )

            category = "with_mirrors" if mirrors else "without_mirrors"
            time_series.append(
                {
                    "category": category,
                    "date": current_date.strftime("%Y-%m-%d"),
                    "downloads": daily_downloads,
                }
            )

        return {
            "data": time_series,
            "package": package_name,
            "type": "overall_downloads",
            "source": "fallback_estimates",
            "note": "Estimated time series data due to API unavailability. Actual values may differ.",
        }

    async def get_recent_downloads(
        self, package_name: str, period: str = "month", use_cache: bool = True
    ) -> dict[str, Any]:
        """Get recent download statistics for a package.

        Args:
            package_name: Name of the package to query
            period: Time period ('day', 'week', 'month')
            use_cache: Whether to use cached data if available

        Returns:
            Dictionary containing recent download statistics

        Raises:
            InvalidPackageNameError: If package name is invalid
            PackageNotFoundError: If package is not found
            NetworkError: For network-related errors
        """
        normalized_name = self._validate_package_name(package_name)
        cache_key = self._get_cache_key("recent", normalized_name, period=period)

        # Check cache first (including fallback cache)
        if use_cache and cache_key in self._cache:
            cache_entry = self._cache[cache_key]
            if self._is_cache_valid(cache_entry):
                logger.debug(f"Using cached recent downloads for: {normalized_name}")
                return cache_entry["data"]
            elif self._should_use_fallback() and self._is_cache_valid(
                cache_entry, fallback=True
            ):
                logger.info(
                    f"Using extended cache (fallback mode) for: {normalized_name}"
                )
                cache_entry["data"]["note"] = "Extended cache data due to API issues"
                return cache_entry["data"]

        # Check if we should use fallback immediately
        if self._should_use_fallback():
            logger.warning(
                f"API health poor, using fallback data for: {normalized_name}"
            )
            fallback_data = self._generate_fallback_recent_downloads(
                normalized_name, period
            )

            # Cache fallback data with extended TTL
            self._cache[cache_key] = {"data": fallback_data, "timestamp": time.time()}
            return fallback_data

        # Make API request
        url = f"{self.base_url}/packages/{normalized_name}/recent"
        if period and period != "all":
            url += f"?period={period}"

        logger.info(
            f"Fetching recent downloads for: {normalized_name} (period: {period})"
        )

        try:
            data = await self._make_request(url)

            # Cache the result
            self._cache[cache_key] = {"data": data, "timestamp": time.time()}

            return data

        except (PyPIServerError, NetworkError) as e:
            logger.error(f"API request failed for {normalized_name}: {e}")

            # Try to use stale cache data if available
            if use_cache and cache_key in self._cache:
                cache_entry = self._cache[cache_key]
                logger.warning(
                    f"Using stale cache data for {normalized_name} due to API failure"
                )
                cache_entry["data"]["note"] = f"Stale cache data due to API error: {e}"
                return cache_entry["data"]

            # Last resort: generate fallback data
            if self.fallback_enabled:
                logger.warning(
                    f"Generating fallback data for {normalized_name} due to API failure"
                )
                fallback_data = self._generate_fallback_recent_downloads(
                    normalized_name, period
                )

                # Cache fallback data
                self._cache[cache_key] = {
                    "data": fallback_data,
                    "timestamp": time.time(),
                }
                return fallback_data

            # If fallback is disabled, re-raise the original exception
            raise

        except Exception as e:
            logger.error(
                f"Unexpected error fetching recent downloads for {normalized_name}: {e}"
            )
            raise

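    # Illustrative call (package name is an example): on the happy path this
    # returns the pypistats.org payload unchanged; on failure it degrades to
    # stale cache or synthetic estimates, flagged via the "source"/"note" keys.
    #
    #     stats = await client.get_recent_downloads("httpx", period="day")
    #     downloads = stats["data"]["last_day"]
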
    async def get_overall_downloads(
        self, package_name: str, mirrors: bool = False, use_cache: bool = True
    ) -> dict[str, Any]:
        """Get overall download time series for a package.

        Args:
            package_name: Name of the package to query
            mirrors: Whether to include mirror downloads
            use_cache: Whether to use cached data if available

        Returns:
            Dictionary containing overall download time series

        Raises:
            InvalidPackageNameError: If package name is invalid
            PackageNotFoundError: If package is not found
            NetworkError: For network-related errors
        """
        normalized_name = self._validate_package_name(package_name)
        cache_key = self._get_cache_key("overall", normalized_name, mirrors=mirrors)

        # Check cache first (including fallback cache)
        if use_cache and cache_key in self._cache:
            cache_entry = self._cache[cache_key]
            if self._is_cache_valid(cache_entry):
                logger.debug(f"Using cached overall downloads for: {normalized_name}")
                return cache_entry["data"]
            elif self._should_use_fallback() and self._is_cache_valid(
                cache_entry, fallback=True
            ):
                logger.info(
                    f"Using extended cache (fallback mode) for: {normalized_name}"
                )
                cache_entry["data"]["note"] = "Extended cache data due to API issues"
                return cache_entry["data"]

        # Check if we should use fallback immediately
        if self._should_use_fallback():
            logger.warning(
                f"API health poor, using fallback data for: {normalized_name}"
            )
            fallback_data = self._generate_fallback_overall_downloads(
                normalized_name, mirrors
            )

            # Cache fallback data with extended TTL
            self._cache[cache_key] = {"data": fallback_data, "timestamp": time.time()}
            return fallback_data

        # Make API request
        url = f"{self.base_url}/packages/{normalized_name}/overall"
        if mirrors is not None:
            url += f"?mirrors={'true' if mirrors else 'false'}"

        logger.info(
            f"Fetching overall downloads for: {normalized_name} (mirrors: {mirrors})"
        )

        try:
            data = await self._make_request(url)

            # Cache the result
            self._cache[cache_key] = {"data": data, "timestamp": time.time()}

            return data

        except (PyPIServerError, NetworkError) as e:
            logger.error(f"API request failed for {normalized_name}: {e}")

            # Try to use stale cache data if available
            if use_cache and cache_key in self._cache:
                cache_entry = self._cache[cache_key]
                logger.warning(
                    f"Using stale cache data for {normalized_name} due to API failure"
                )
                cache_entry["data"]["note"] = f"Stale cache data due to API error: {e}"
                return cache_entry["data"]

            # Last resort: generate fallback data
            if self.fallback_enabled:
                logger.warning(
                    f"Generating fallback data for {normalized_name} due to API failure"
                )
                fallback_data = self._generate_fallback_overall_downloads(
                    normalized_name, mirrors
                )

                # Cache fallback data
                self._cache[cache_key] = {
                    "data": fallback_data,
                    "timestamp": time.time(),
                }
                return fallback_data

            # If fallback is disabled, re-raise the original exception
            raise

        except Exception as e:
            logger.error(
                f"Unexpected error fetching overall downloads for {normalized_name}: {e}"
            )
            raise

    def clear_cache(self):
        """Clear the internal cache."""
        self._cache.clear()
        logger.debug("Stats cache cleared")
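

# Minimal usage sketch (an assumption, not part of the original module): run this
# module from within its package (e.g. via `python -m`) so the relative import of
# `.exceptions` resolves; the queried package name is an arbitrary example.
if __name__ == "__main__":

    async def _demo() -> None:
        async with PyPIStatsClient(max_retries=2, retry_delay=1.0) as client:
            recent = await client.get_recent_downloads("requests", period="month")
            print(recent.get("data", {}))

    asyncio.run(_demo())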