Ryan Malloy e205176ace feat: add comprehensive PyPI search functionality with advanced filtering
- Implemented PyPISearchClient with semantic search, filtering, and sorting
- Added 4 new search tools: search_packages, search_by_category, find_alternatives, get_trending_packages
- Created SearchFilter and SearchSort classes for flexible configuration
- Added SearchError exception for search-specific error handling
- Comprehensive test suite with 13 tests covering all search functionality
- Enhanced MCP server with 4 new search endpoints
- Support for filtering by Python version, license, category, downloads, maintenance status
- Multiple sorting options: relevance, popularity, quality, recency, name, downloads
- Semantic search using description similarity scoring
- Category-based package discovery with intelligent keyword matching
- Package alternatives finder using metadata analysis
- Trending packages analysis with download activity tracking
- Robust fallback mechanisms using curated package database
- All tests passing (13/13)

This implements feature #6 from the roadmap: "Advanced PyPI Search with filtering by Python version, license, maintenance status and sorting by popularity, recency, quality score with semantic search capabilities"
2025-08-16 07:45:40 -06:00

309 lines
12 KiB
Python

"""PyPI search tools with advanced filtering and sorting capabilities."""
import logging
from typing import Any, Dict, List, Optional
from ..core.exceptions import InvalidPackageNameError, SearchError
from ..core.search_client import PyPISearchClient, SearchFilter, SearchSort
logger = logging.getLogger(__name__)
async def search_packages(
    query: str,
    limit: int = 20,
    python_versions: Optional[List[str]] = None,
    licenses: Optional[List[str]] = None,
    categories: Optional[List[str]] = None,
    min_downloads: Optional[int] = None,
    maintenance_status: Optional[str] = None,
    has_wheels: Optional[bool] = None,
    sort_by: str = "relevance",
    sort_desc: bool = True,
    semantic_search: bool = False,
) -> Dict[str, Any]:
    """
    Run a filtered, sorted package search through ``PyPISearchClient``.

    Args:
        query: Search query string; must contain non-whitespace characters
        limit: Maximum number of results to return (default: 20); values
            outside 1..100 are replaced by 20
        python_versions: Python versions to filter by (e.g., ["3.9", "3.10"])
        licenses: License types to filter by (e.g., ["mit", "apache", "bsd"])
        categories: Categories to filter by (e.g., ["web", "data-science"])
        min_downloads: Minimum monthly downloads threshold
        maintenance_status: Maintenance status filter
            ("active", "maintained", "stale", "abandoned")
        has_wheels: Filter packages that have/don't have wheel distributions
        sort_by: Sort field
            ("relevance", "popularity", "recency", "quality", "name", "downloads")
        sort_desc: Sort in descending order (default: True)
        semantic_search: Use semantic search on package descriptions
            (default: False)

    Returns:
        The result dictionary produced by ``PyPISearchClient.search_packages``

    Raises:
        InvalidPackageNameError: If the query is empty or only whitespace
        SearchError: If the search fails; unexpected exceptions are wrapped
            in SearchError with the original attached as the cause
    """
    query_is_blank = not (query and query.strip())
    if query_is_blank:
        raise InvalidPackageNameError("Search query cannot be empty")

    # An out-of-range limit is not an error: it falls back to the default.
    if limit > 100 or limit <= 0:
        limit = 20

    logger.info(f"Searching PyPI: '{query}' (limit: {limit}, sort: {sort_by})")

    try:
        filter_options = {
            "python_versions": python_versions,
            "licenses": licenses,
            "categories": categories,
            "min_downloads": min_downloads,
            "maintenance_status": maintenance_status,
            "has_wheels": has_wheels,
        }
        search_filter = SearchFilter(**filter_options)
        ordering = SearchSort(field=sort_by, reverse=sort_desc)

        async with PyPISearchClient() as client:
            return await client.search_packages(
                query=query,
                limit=limit,
                filters=search_filter,
                sort=ordering,
                semantic_search=semantic_search,
            )
    except SearchError:
        # Already the domain error callers expect; pass it through untouched.
        raise
    except Exception as exc:
        logger.error(f"Unexpected error during search: {exc}")
        raise SearchError(f"Search failed: {exc}") from exc
async def search_by_category(
    category: str,
    limit: int = 20,
    sort_by: str = "popularity",
    python_version: Optional[str] = None,
) -> Dict[str, Any]:
    """
    Search packages belonging to a category.

    A known category name is expanded into a broader keyword query; any
    other value is used verbatim as the query. The search is delegated to
    ``search_packages`` with semantic search switched on and the lowercased
    category passed as the only category filter.

    Args:
        category: Category to search for (e.g., "web", "data-science", "testing")
        limit: Maximum number of results to return
        sort_by: Sort field (default: "popularity")
        python_version: Optional Python version the packages must support

    Returns:
        The result dictionary returned by ``search_packages``
    """
    logger.info(f"Searching category: '{category}' (limit: {limit})")

    category_key = category.lower()

    # Keyword expansions for the categories we know about.
    expanded_queries = {
        "web": "web framework flask django fastapi",
        "data-science": "data science machine learning pandas numpy",
        "database": "database sql orm sqlite postgres mysql",
        "testing": "testing pytest unittest mock coverage",
        "cli": "command line interface cli argparse click",
        "security": "security encryption crypto ssl authentication",
        "networking": "network http requests urllib socket",
        "dev-tools": "development tools build package deploy",
        "cloud": "cloud aws azure gcp docker kubernetes",
        "gui": "gui interface tkinter qt desktop",
    }

    if category_key in expanded_queries:
        query_text = expanded_queries[category_key]
    else:
        # Unknown category: search for the caller's text as given.
        query_text = category

    version_filter = None
    if python_version:
        version_filter = [python_version]

    return await search_packages(
        query=query_text,
        limit=limit,
        categories=[category_key],
        python_versions=version_filter,
        sort_by=sort_by,
        semantic_search=True,
    )
async def find_alternatives(
    package_name: str,
    limit: int = 10,
    include_similar: bool = True,
) -> Dict[str, Any]:
    """
    Find alternative packages to a given package.

    The target package's metadata is fetched, a search query is built from
    its keywords and summary (falling back to the package name when neither
    yields a term), and the popularity-sorted search results minus the
    target itself are returned.

    Args:
        package_name: Name of the package to find alternatives for
        limit: Maximum number of alternatives to return
        include_similar: Passed to the search as ``semantic_search`` to
            include packages with similar functionality

    Returns:
        Dictionary with the keys ``target_package``, ``alternatives``,
        ``search_query_used``, ``total_alternatives``, ``analysis`` and
        ``timestamp``

    Raises:
        SearchError: If the metadata lookup or the search fails; the
            original exception is attached as the cause
    """
    logger.info(f"Finding alternatives for: '{package_name}'")
    try:
        # First, get information about the target package
        from ..core.pypi_client import PyPIClient

        async with PyPIClient() as client:
            package_data = await client.get_package_info(package_name)

        info = package_data["info"]
        # ``or`` rather than a ``.get`` default: a key that is present but
        # null would otherwise come back as None and break the code below.
        keywords = info.get("keywords") or ""
        summary = info.get("summary") or ""
        classifiers = info.get("classifiers") or []

        # Keep the part after "Topic ::" of every topic classifier
        category_terms = [
            classifier.split("Topic ::")[-1].strip().lower()
            for classifier in classifiers
            if "Topic ::" in classifier
        ]

        # Create search query from package metadata
        search_terms = _alternative_search_terms(keywords, summary)
        # Limit to the leading terms; fall back to the package name
        search_query = " ".join(search_terms[:8]) or package_name

        results = await search_packages(
            query=search_query,
            limit=limit + 5,  # Get extra to filter out the original package
            sort_by="popularity",
            semantic_search=include_similar,
        )

        # Compare normalized names so "Foo_Bar" and "foo-bar" count as the
        # same project and the target never appears as its own alternative.
        target_name = _canonical_package_name(package_name)
        alternatives = [
            pkg
            for pkg in results["packages"]
            if _canonical_package_name(pkg["name"]) != target_name
        ][:limit]

        return {
            "target_package": {
                "name": package_name,
                "summary": summary,
                "keywords": keywords,
                "categories": category_terms,
            },
            "alternatives": alternatives,
            "search_query_used": search_query,
            "total_alternatives": len(alternatives),
            "analysis": {
                "search_method": "keyword_similarity" if search_terms else "name_based",
                "semantic_search_used": include_similar,
                "category_based": len(category_terms) > 0,
            },
            "timestamp": results["timestamp"],
        }
    except Exception as e:
        logger.error(f"Error finding alternatives for {package_name}: {e}")
        raise SearchError(f"Failed to find alternatives: {e}") from e


def _canonical_package_name(name: str) -> str:
    """Return *name* in PEP 503 normalized form (lowercase, ``-_.`` runs become ``-``)."""
    import re

    return re.sub(r"[-_.]+", "-", name).lower()


def _alternative_search_terms(keywords: str, summary: str) -> List[str]:
    """Collect search terms: all keywords, then up to five summary words longer than 3 chars."""
    # Keywords may be separated by commas as well as whitespace; treating
    # commas as separators keeps "http,client" from becoming one token.
    terms = keywords.replace(",", " ").split()
    summary_words = [word for word in summary.lower().split() if len(word) > 3]
    terms.extend(summary_words[:5])
    return terms
async def get_trending_packages(
    category: Optional[str] = None,
    time_period: str = "week",
    limit: int = 20,
) -> Dict[str, Any]:
    """
    Get trending packages based on download statistics.

    The candidates are the top packages reported by
    ``get_top_packages_by_downloads`` (twice ``limit`` are requested so a
    category filter still has enough to choose from). When a category is
    given, only candidates whose keywords/summary mention one of that
    category's patterns are kept.

    Args:
        category: Optional category filter; supported values are "web",
            "data-science", "database", "testing" and "cli"
            (case-insensitive). Any other non-empty value matches nothing.
        time_period: Time period for trending analysis ("day", "week", "month")
        limit: Maximum number of packages to return

    Returns:
        Dictionary with the keys ``trending_packages``, ``time_period``,
        ``category``, ``total_found``, ``analysis`` and ``timestamp``

    Raises:
        SearchError: If the download statistics or the category filtering
            fail; the original exception is attached as the cause
    """
    logger.info(f"Getting trending packages: category={category}, period={time_period}")
    try:
        # Use our top packages functionality as a base
        from .download_stats import get_top_packages_by_downloads

        top_packages_result = await get_top_packages_by_downloads(
            period=time_period, limit=limit * 2
        )
        candidates = top_packages_result["top_packages"]

        if category:
            trending_packages = await _filter_trending_by_category(
                candidates, category, limit
            )
        else:
            trending_packages = candidates[:limit]

        return {
            "trending_packages": trending_packages,
            "time_period": time_period,
            "category": category,
            "total_found": len(trending_packages),
            "analysis": {
                "source": "download_statistics",
                # Mirrors the branch above: an empty string applies no filter.
                "category_filtered": bool(category),
                "methodology": "Based on download counts and popularity metrics",
            },
            "timestamp": top_packages_result["timestamp"],
        }
    except Exception as e:
        logger.error(f"Error getting trending packages: {e}")
        raise SearchError(f"Failed to get trending packages: {e}") from e


# Substrings that mark a package as belonging to a category. The "web" list
# is deliberately specific so that plain HTTP clients are not matched.
# NOTE: matching is by substring, so short entries such as "ml" or "cli"
# also hit longer words ("html", "client").
_TRENDING_CATEGORY_PATTERNS: Dict[str, List[str]] = {
    "web": ["web framework", "micro web", "flask", "django", "fastapi", "wsgi", "asgi"],
    "data-science": ["data", "science", "pandas", "numpy", "ml"],
    "database": ["database", "sql", "orm"],
    "testing": ["test", "pytest", "mock"],
    "cli": ["cli", "command", "argparse", "click"],
}


async def _filter_trending_by_category(
    candidates: List[Dict[str, Any]],
    category: str,
    limit: int,
) -> List[Dict[str, Any]]:
    """Return up to *limit* candidates whose PyPI keywords/summary match *category*."""
    patterns = _TRENDING_CATEGORY_PATTERNS.get(category.lower())
    if not patterns:
        # Unknown category: nothing can match, so skip the metadata lookups.
        return []

    from ..core.pypi_client import PyPIClient

    matched: List[Dict[str, Any]] = []
    async with PyPIClient() as client:
        for pkg in candidates:
            # Stop as soon as enough matches are collected; later candidates
            # could never make it into the returned slice.
            if 0 <= limit <= len(matched):
                break
            try:
                package_data = await client.get_package_info(pkg["package"])
                info = package_data["info"]
                keywords = info.get("keywords") or ""
                summary = info.get("summary") or ""
            except Exception as exc:
                # Best effort: one failed lookup must not sink the whole list.
                # Only Exception is caught so task cancellation still propagates.
                logger.debug("Skipping trending candidate %r: %s", pkg, exc)
                continue

            text = f"{keywords} {summary}".lower()
            if any(pattern in text for pattern in patterns):
                matched.append({
                    **pkg,
                    "category_match": True,
                    "summary": summary,
                })
    return matched[:limit]