Initial mcarchive-org MCP server

FastMCP server wrapping archive.org's public read APIs: - search_items / scrape_items: advanced search + bulk cursor pagination - get_item_metadata / list_files: progressive disclosure with filtering - get_file_url / download_file: canonical URLs and streaming downloads with HTTP Range resume + optional MD5 verification Smoke-tested end-to-end via claude -p headless MCP and pytest against live archive.org endpoints.
2026-04-21 09:41:20 -06:00 · 2026-04-21 09:41:20 -06:00 · 5265a6440b
commit 5265a6440b
10 changed files with 2295 additions and 0 deletions
--- a/.gitignore
+++ b/.gitignore
@ -0,0 +1,14 @@
 __pycache__/
 *.py[cod]
 *.egg-info/
 .venv/
 .ruff_cache/
 .pytest_cache/
 dist/
 build/
 .mypy_cache/
 *.log
 # downloads from test runs
 downloads/
 tmp/
--- a/README.md
+++ b/README.md
@ -0,0 +1,73 @@
 # mcarchive-org
 An MCP (Model Context Protocol) server that lets an LLM search, inspect, and download content from the [Internet Archive](https://archive.org).
 Built on [FastMCP](https://gofastmcp.com) + [httpx](https://www.python-httpx.org/). No API key required — archive.org's read endpoints are public.
 ## Tools
 | Tool | Purpose |
 |------|---------|
 | `search_items` | Small Solr-style search via `advancedsearch.php` (1–200 rows, paginated) |
 | `scrape_items` | Bulk cursor-paginated search via Scrape API (count ≥ 100) |
 | `get_item_metadata` | Metadata for one item; skips the (possibly huge) files list by default |
 | `list_files` | Files array with optional format / glob filtering — includes `download_url` per file |
 | `get_file_url` | Build a canonical download URL without hitting the network |
 | `download_file` | Stream a file to disk with resume support and optional MD5 verification |
 Also exposes an MCP resource template: `archive://item/{identifier}`.
 ## Install & run
 ```bash
 # From a checkout:
 uv sync
 uv run mcarchive-org
 # Or from PyPI (once published):
 uvx mcarchive-org
 ```
 Register with Claude Code:
 ```bash
 claude mcp add archive-org -- uvx mcarchive-org
 # or, from a local checkout:
 claude mcp add archive-org -- uv run --directory /path/to/mcarchive-org mcarchive-org
 ```
 ## Environment
 | Variable | Default | Purpose |
 |----------|---------|---------|
 | `MCARCHIVE_DOWNLOAD_ROOT` | `./downloads` | Base directory for `download_file` |
 ## Example flow
 ```
 search_items(query='mediatype:audio AND creator:"Grateful Dead"', sort=['downloads desc'])
  → identifier 'gd77-05-08.sbd.hicks.4982.sbeok.shnf' (among others)
 list_files(identifier='gd77-05-08.sbd.hicks.4982.sbeok.shnf', formats=['VBR MP3'])
  → [{ name: 'gd1977-05-08d1t01.mp3', size: 6342912, md5: '…', download_url: '…' }, …]
 download_file(identifier='gd77-…', filename='gd1977-05-08d1t01.mp3', verify_md5='…')
  → { path: './downloads/gd77-…/gd1977-…mp3', bytes: 6342912, md5_ok: True }
 ```
 ## Query syntax notes
 archive.org uses a Solr/Lucene dialect:
 - `mediatype:(audio OR movies)` — restrict to media types
 - `collection:etree` — items in a specific collection
 - `date:[1977-01-01 TO 1977-12-31]` — date ranges
 - `creator:"Grateful Dead"` — phrase match
 - `-subject:bootleg` — exclusion
 - Sort by `downloads desc`, `date asc`, `addeddate desc`, etc.
 See [archive.org's search docs](https://archive.org/advancedsearch.php) for the full grammar.
 ## License
 MIT
--- a/pyproject.toml
+++ b/pyproject.toml
@ -0,0 +1,54 @@
 [project]
 name = "mcarchive-org"
 version = "2026.04.21"
 description = "MCP server for searching and downloading files from the Internet Archive (archive.org)"
 readme = "README.md"
 requires-python = ">=3.10"
 license = { text = "MIT" }
 authors = [
    { name = "Ryan Malloy", email = "ryan@supported.systems" },
 ]
 keywords = ["mcp", "archive.org", "internet-archive", "fastmcp", "llm"]
 classifiers = [
    "Development Status :: 4 - Beta",
    "Intended Audience :: Developers",
    "License :: OSI Approved :: MIT License",
    "Programming Language :: Python :: 3",
    "Programming Language :: Python :: 3.10",
    "Programming Language :: Python :: 3.11",
    "Programming Language :: Python :: 3.12",
    "Programming Language :: Python :: 3.13",
    "Topic :: Internet :: WWW/HTTP",
 ]
 dependencies = [
    "fastmcp>=3.2.4",
    "httpx>=0.28.1",
 ]
 [project.scripts]
 mcarchive-org = "mcarchive_org.server:main"
 [project.urls]
 Homepage = "https://archive.org/developers/"
 [build-system]
 requires = ["hatchling"]
 build-backend = "hatchling.build"
 [tool.hatch.build.targets.wheel]
 packages = ["src/mcarchive_org"]
 [tool.ruff]
 line-length = 100
 target-version = "py310"
 [tool.ruff.lint]
 select = ["E", "F", "W", "I", "UP", "B", "SIM", "RUF"]
 ignore = ["E501"]
 [dependency-groups]
 dev = [
    "pytest>=8.0",
    "pytest-asyncio>=0.23",
    "ruff>=0.5",
 ]
--- a/src/mcarchive_org/__init__.py
+++ b/src/mcarchive_org/__init__.py
@ -0,0 +1,8 @@
 """MCP server for the Internet Archive (archive.org)."""
 from importlib.metadata import PackageNotFoundError, version
 # Resolve the installed distribution's version; when the package metadata is
 # not available (PackageNotFoundError), fall back to a placeholder.
 try:
    __version__ = version("mcarchive-org")
 except PackageNotFoundError:
    __version__ = "0.0.0"
--- a/src/mcarchive_org/__main__.py
+++ b/src/mcarchive_org/__main__.py
@ -0,0 +1,4 @@
 from mcarchive_org.server import main
 if __name__ == "__main__":
    main()
--- a/src/mcarchive_org/client.py
+++ b/src/mcarchive_org/client.py
@ -0,0 +1,196 @@
 """Low-level archive.org HTTP client (pure httpx, no MCP dependencies)."""
 from __future__ import annotations
 import hashlib
 from collections.abc import AsyncIterator
 from pathlib import Path
 from typing import Any
 import httpx
 # Default site root; ArchiveClient appends endpoint paths to it.
 ARCHIVE_BASE = "https://archive.org"
 # Default User-Agent header value (the version string is hard-coded here).
 DEFAULT_UA = "mcarchive-org/2026.04.21 (+https://archive.org/developers/)"
 # Default httpx timeout: 30 s in general, with the read timeout raised to 60 s.
 DEFAULT_TIMEOUT = httpx.Timeout(30.0, read=60.0)
 class ArchiveError(RuntimeError):
    """Error payload from, or unexpected response by, archive.org."""
 class ArchiveClient:
    """Async client for the four archive.org endpoints we care about.
    - advancedsearch.php : small Solr-style queries (<= ~10,000 rows paginated)
    - services/search/v1/scrape : bulk cursor-based iteration (count >= 100)
    - metadata/{id} : full item manifest including files[]
    - download/{id}/{file} : byte stream with Range support
    Use it as an async context manager (or call ``aclose()``) so the
    underlying ``httpx.AsyncClient`` gets closed.
    """
    def __init__(
        self,
        base_url: str = ARCHIVE_BASE,
        user_agent: str = DEFAULT_UA,
        timeout: httpx.Timeout | float = DEFAULT_TIMEOUT,
    ) -> None:
        """Create the underlying HTTP client.
        Args:
            base_url: Site root; any trailing slash is stripped.
            user_agent: Value sent in the ``User-Agent`` header.
            timeout: httpx timeout applied to the client.
        """
        self._base = base_url.rstrip("/")
        self._client = httpx.AsyncClient(
            headers={"User-Agent": user_agent, "Accept": "application/json"},
            timeout=timeout,
            follow_redirects=True,
        )
    async def aclose(self) -> None:
        """Close the underlying HTTP client."""
        await self._client.aclose()
    async def __aenter__(self) -> ArchiveClient:
        return self
    async def __aexit__(self, *exc: object) -> None:
        await self.aclose()
    # ---------- search ----------
    async def search(
        self,
        query: str,
        fields: list[str] | None = None,
        sort: list[str] | None = None,
        rows: int = 25,
        page: int = 1,
    ) -> dict[str, Any]:
        """Advanced search — best for small result sets (<=10k total).
        Args:
            query: Lucene/Solr query string.
            fields: Fields to return per doc; defaults to identifier, title,
                mediatype, creator and date.
            sort: Sort expressions such as ``"downloads desc"``.
            rows: Results per page.
            page: 1-indexed page number.
        Returns:
            Dict with ``num_found``, ``start``, ``page``, ``rows`` and ``docs``.
        Raises:
            httpx.HTTPStatusError: On a non-2xx response.
        """
        # A list of pairs (not a dict) because fl[] and sort[] repeat.
        params: list[tuple[str, str]] = [
            ("q", query),
            ("output", "json"),
            ("rows", str(rows)),
            ("page", str(page)),
        ]
        for f in fields or ["identifier", "title", "mediatype", "creator", "date"]:
            params.append(("fl[]", f))
        for s in sort or []:
            params.append(("sort[]", s))
        r = await self._client.get(f"{self._base}/advancedsearch.php", params=params)
        r.raise_for_status()
        data = r.json()
        resp = data.get("response", {})
        return {
            "num_found": resp.get("numFound", 0),
            "start": resp.get("start", 0),
            "page": page,
            "rows": rows,
            "docs": resp.get("docs", []),
        }
    async def scrape(
        self,
        query: str,
        fields: list[str] | None = None,
        sorts: list[str] | None = None,
        count: int = 100,
        cursor: str | None = None,
    ) -> dict[str, Any]:
        """Scrape API — cursor-paginated; count must be >= 100.
        Returns:
            The decoded JSON payload (keys: items, count, total, and cursor
            when more pages exist).
        Raises:
            ValueError: If ``count`` is below 100 (checked before any request).
            ArchiveError: If the payload carries an ``error`` key.
            httpx.HTTPStatusError: On a non-2xx response.
        """
        if count < 100:
            raise ValueError("scrape count must be >= 100; use search() for smaller queries")
        params: dict[str, str] = {"q": query, "count": str(count)}
        if fields:
            params["fields"] = ",".join(fields)
        if sorts:
            params["sorts"] = ",".join(sorts)
        if cursor:
            params["cursor"] = cursor
        r = await self._client.get(f"{self._base}/services/search/v1/scrape", params=params)
        r.raise_for_status()
        data = r.json()
        if "error" in data:
            raise ArchiveError(f"{data.get('errorType', 'ScrapeError')}: {data['error']}")
        return data  # keys: items, count, total, cursor (if more pages)
    # ---------- metadata ----------
    async def metadata(self, identifier: str) -> dict[str, Any]:
        """Full metadata blob for an item.
        Raises:
            ArchiveError: If the response body is empty (treated as "not found").
            httpx.HTTPStatusError: On a non-2xx response.
        """
        r = await self._client.get(f"{self._base}/metadata/{identifier}")
        r.raise_for_status()
        data = r.json()
        if not data:
            raise ArchiveError(f"item not found: {identifier}")
        return data
    async def files(self, identifier: str) -> list[dict[str, Any]]:
        """Just the files[] slice — smaller payload when that's all you want.
        Accepts either a ``{"result": [...]}`` envelope or a bare list.
        Raises:
            ArchiveError: If the payload has neither shape.
            httpx.HTTPStatusError: On a non-2xx response.
        """
        r = await self._client.get(f"{self._base}/metadata/{identifier}/files")
        r.raise_for_status()
        data = r.json()
        if isinstance(data, dict) and "result" in data:
            return data["result"]
        if isinstance(data, list):
            return data
        raise ArchiveError(f"unexpected files response for {identifier}")
    # ---------- download ----------
    def download_url(self, identifier: str, filename: str) -> str:
        """Build the download URL for ``filename`` inside ``identifier``.
        Both parts are percent-encoded so names containing spaces, ``#``, ``?``
        or ``%`` address the intended file; ``/`` in the filename is kept so
        files in sub-directories still resolve.
        """
        from urllib.parse import quote  # function-scope import keeps this fix self-contained
        item = quote(identifier, safe="")
        path = quote(filename, safe="/")
        return f"{self._base}/download/{item}/{path}"
    async def stream_file(
        self,
        identifier: str,
        filename: str,
        resume_from: int = 0,
    ) -> AsyncIterator[bytes]:
        """Async byte iterator — caller is responsible for writing to disk.
        Args:
            identifier: Item identifier.
            filename: File name within the item.
            resume_from: Byte offset to start from; sent as a ``Range`` header
                when greater than zero.
        Raises:
            ArchiveError: If a resume was requested but the server did not
                answer 206, i.e. the body would not start at ``resume_from``.
            httpx.HTTPStatusError: On a non-2xx response.
        """
        headers = {}
        if resume_from > 0:
            headers["Range"] = f"bytes={resume_from}-"
        url = self.download_url(identifier, filename)
        async with self._client.stream("GET", url, headers=headers) as r:
            r.raise_for_status()
            if resume_from > 0 and r.status_code != 206:
                # A 200 here means the Range header was ignored and the body
                # starts at byte 0; yielding it would hand back the wrong bytes.
                raise ArchiveError(
                    f"server ignored Range request for {filename} (status {r.status_code})"
                )
            async for chunk in r.aiter_bytes(chunk_size=1 << 16):
                yield chunk
    async def download_to_file(
        self,
        identifier: str,
        filename: str,
        dest: Path,
        verify_md5: str | None = None,
        chunk_cb=None,
    ) -> dict[str, Any]:
        """Download with resume support. Returns stats + md5 verification result.
        If ``dest`` already exists its size is used as the resume offset.
        Args:
            identifier: Item identifier.
            filename: File name within the item.
            dest: Destination path; parent directories are created.
            verify_md5: Expected MD5 hex digest; when given, the digest of the
                complete file is computed and compared case-insensitively.
            chunk_cb: Optional callable invoked with the running byte total
                after each chunk is written.
        Returns:
            Dict with ``path``, ``bytes`` and ``resumed_from`` (0 when the
            download was restarted), plus ``md5_actual``, ``md5_expected`` and
            ``md5_ok`` when ``verify_md5`` was supplied.
        Raises:
            ArchiveError: If the server reports a total size that differs from
                an existing local file it refuses to extend (HTTP 416).
            httpx.HTTPStatusError: On any other non-2xx response.
        """
        dest.parent.mkdir(parents=True, exist_ok=True)
        resume_from = dest.stat().st_size if dest.exists() else 0
        headers = {"Range": f"bytes={resume_from}-"} if resume_from else {}
        url = self.download_url(identifier, filename)
        hasher = hashlib.md5() if verify_md5 else None
        async with self._client.stream("GET", url, headers=headers) as r:
            # 416 on a resume means the offset is at/after EOF: nothing left to fetch.
            already_complete = bool(resume_from) and r.status_code == 416
            if already_complete:
                # "Content-Range: bytes */<total>" lets us confirm the sizes agree.
                total = r.headers.get("Content-Range", "").rpartition("/")[2]
                if total.isdigit() and int(total) != resume_from:
                    raise ArchiveError(
                        f"local file {dest} has {resume_from} bytes but the remote file "
                        f"has {total}; delete it and retry"
                    )
            else:
                r.raise_for_status()
                if resume_from and r.status_code != 206:
                    # Range was ignored and the full body is coming: start over
                    # instead of appending a whole copy to the partial file.
                    resume_from = 0
            if hasher and resume_from:
                # re-hash existing bytes so the final digest is correct
                with dest.open("rb") as f:
                    while chunk := f.read(1 << 16):
                        hasher.update(chunk)
            bytes_written = resume_from
            if not already_complete:
                mode = "ab" if resume_from else "wb"
                with dest.open(mode) as f:
                    async for chunk in r.aiter_bytes(chunk_size=1 << 16):
                        f.write(chunk)
                        bytes_written += len(chunk)
                        if hasher:
                            hasher.update(chunk)
                        if chunk_cb:
                            chunk_cb(bytes_written)
        result = {
            "path": str(dest),
            "bytes": bytes_written,
            "resumed_from": resume_from,
        }
        if verify_md5 and hasher:
            actual = hasher.hexdigest()
            result["md5_actual"] = actual
            result["md5_expected"] = verify_md5
            result["md5_ok"] = actual.lower() == verify_md5.lower()
        return result
--- a/src/mcarchive_org/server.py
+++ b/src/mcarchive_org/server.py
@ -0,0 +1,258 @@
 """FastMCP server exposing archive.org search, metadata, and download."""
 from __future__ import annotations
 import fnmatch
 import os
 from pathlib import Path
 from typing import Annotated, Any
 from fastmcp import FastMCP
 from pydantic import Field
 from mcarchive_org import __version__
 from mcarchive_org.client import ArchiveClient
 # Base directory for downloads: $MCARCHIVE_DOWNLOAD_ROOT when set, otherwise
 # "<cwd>/downloads". Evaluated once at import time, with "~" expanded.
 DEFAULT_DOWNLOAD_ROOT = Path(
    os.environ.get("MCARCHIVE_DOWNLOAD_ROOT", Path.cwd() / "downloads")
 ).expanduser()
 # The FastMCP server instance that the tools and resource below register on;
 # `instructions` is free-text guidance describing the intended tool flow.
 mcp = FastMCP(
    name="mcarchive-org",
    instructions=(
        "Search and download files from the Internet Archive (archive.org). "
        "Typical flow: search_items -> get_item_metadata -> list_files -> download_file. "
        "Use scrape_items (count>=100) only for bulk cursor-paginated iteration."
    ),
 )
 # ---------- helpers (not exposed as tools) ----------
 def _human_size(n: int | str | None) -> str:
    try:
        x = float(n)  # type: ignore[arg-type]
    except (TypeError, ValueError):
        return "?"
    for unit in ("B", "KB", "MB", "GB", "TB"):
        if x < 1024:
            return f"{x:.1f} {unit}" if unit != "B" else f"{int(x)} B"
        x /= 1024
    return f"{x:.1f} PB"
 def _enrich_file(identifier: str, f: dict[str, Any]) -> dict[str, Any]:
    """Project one raw files[] entry onto the fields the tools expose.
    Args:
        identifier: Item the file belongs to (used for the download URL).
        f: Raw file record from the metadata response.
    Returns:
        Dict with name, format, size (int or None), size_human, md5, sha1,
        mtime, source and a percent-encoded ``download_url``.
    """
    from urllib.parse import quote  # function-scope import keeps this fix self-contained
    name = f.get("name", "")
    raw_size = f.get("size")
    # Only the digit check decides; a truthiness test would turn an integer 0 into None.
    size = int(raw_size) if raw_size is not None and str(raw_size).isdigit() else None
    # Encode so names with spaces, '#', '?' or '%' give a usable URL; '/' is
    # kept because file names may include sub-directories.
    item_part = quote(identifier, safe="")
    file_part = quote(str(name), safe="/")
    return {
        "name": name,
        "format": f.get("format"),
        "size": size,
        "size_human": _human_size(f.get("size")),
        "md5": f.get("md5"),
        "sha1": f.get("sha1"),
        "mtime": f.get("mtime"),
        "source": f.get("source"),
        "download_url": f"https://archive.org/download/{item_part}/{file_part}",
    }
 def _matches(name: str, format_: str | None, name_glob: str | None, formats: list[str] | None) -> bool:
    if name_glob and not fnmatch.fnmatchcase(name, name_glob):
        return False
    return not (formats and (format_ or "").lower() not in {f.lower() for f in formats})
 # ---------- tools ----------
@mcp.tool
 async def search_items(
    query: Annotated[str, Field(description="Lucene/Solr query, e.g. 'mediatype:audio AND creator:\"Grateful Dead\"'")],
    fields: Annotated[
        list[str] | None,
        Field(description="Which metadata fields to return per doc. Defaults to identifier,title,mediatype,creator,date."),
    ] = None,
    sort: Annotated[
        list[str] | None,
        Field(description="Sort expressions like 'downloads desc' or 'date asc'."),
    ] = None,
    rows: Annotated[int, Field(ge=1, le=200, description="Results per page (1-200).")] = 25,
    page: Annotated[int, Field(ge=1, description="1-indexed page number.")] = 1,
 ) -> dict[str, Any]:
    """Search archive.org items. Good for small/interactive queries.
    Returns up to `rows` matching items plus `num_found` (total hits) and `has_more`.
    Use scrape_items for bulk iteration over large result sets.
    """
    async with ArchiveClient() as client:
        found = await client.search(query=query, fields=fields, sort=sort, rows=rows, page=page)
    docs = found["docs"]
    num_found = found["num_found"]
    # Items on earlier pages plus this page's docs; more remain if below the total.
    consumed = (page - 1) * rows + len(docs)
    payload: dict[str, Any] = {"query": query, "num_found": num_found}
    payload["page"] = page
    payload["rows"] = rows
    payload["has_more"] = consumed < num_found
    payload["docs"] = docs
    return payload
@mcp.tool
 async def scrape_items(
    query: Annotated[str, Field(description="Lucene/Solr query.")],
    fields: Annotated[list[str] | None, Field(description="Metadata fields per item.")] = None,
    sorts: Annotated[list[str] | None, Field(description="Sort expressions, e.g. ['date asc'].")] = None,
    count: Annotated[int, Field(ge=100, le=10000, description="Items per page (>=100 required by API).")] = 500,
    cursor: Annotated[str | None, Field(description="Pass the `next_cursor` from a prior response to fetch next page.")] = None,
 ) -> dict[str, Any]:
    """Scrape API — high-throughput cursor-paginated search. count >= 100.
    Response includes `next_cursor` when more results exist (pass it back as `cursor`); it is null when done.
    """
    async with ArchiveClient() as c:
        data = await c.scrape(query=query, fields=fields, sorts=sorts, count=count, cursor=cursor)
    return {
        "items": data.get("items", []),
        "count": data.get("count"),
        "total": data.get("total"),
        # The API's `cursor` key is re-exposed as `next_cursor`; .get() yields
        # None when the API omitted it, so the key is always present here.
        "next_cursor": data.get("cursor"),
    }
@mcp.tool
 async def get_item_metadata(
    identifier: Annotated[str, Field(description="Archive.org item identifier, e.g. 'nasa'.")],
    include_files: Annotated[
        bool, Field(description="If true, include the full files[] array. Can be large.")
    ] = False,
 ) -> dict[str, Any]:
    """Get metadata for a single item.
    By default omits the (potentially huge) files[] array — call list_files for that.
    """
    async with ArchiveClient() as client:
        blob = await client.metadata(identifier)
    meta = blob.get("metadata", {})
    summary: dict[str, Any] = {"identifier": meta.get("identifier", identifier)}
    # Descriptive fields copied straight from the item's "metadata" section.
    for key in (
        "title",
        "mediatype",
        "collection",
        "creator",
        "date",
        "description",
        "publicdate",
        "uploader",
        "subject",
        "licenseurl",
    ):
        summary[key] = meta.get(key)
    # Storage details live at the top level of the response, not under "metadata".
    item_size = blob.get("item_size")
    summary["item_size_bytes"] = item_size
    summary["item_size_human"] = _human_size(item_size)
    summary["files_count"] = blob.get("files_count")
    summary["server"] = blob.get("server")
    summary["dir"] = blob.get("dir")
    summary["item_url"] = f"https://archive.org/details/{identifier}"
    if include_files:
        summary["files"] = [_enrich_file(identifier, entry) for entry in blob.get("files", [])]
    return summary
@mcp.tool
 async def list_files(
    identifier: Annotated[str, Field(description="Archive.org item identifier.")],
    formats: Annotated[
        list[str] | None,
        Field(description="Filter by format, e.g. ['MP3','VBR MP3','JPEG']. Case-insensitive."),
    ] = None,
    name_glob: Annotated[
        str | None,
        Field(description="fnmatch-style glob on filename, e.g. '*.mp3' or 'cover.*'."),
    ] = None,
    limit: Annotated[int, Field(ge=1, le=1000, description="Max files to return.")] = 100,
 ) -> dict[str, Any]:
    """List files in an item, with optional format/glob filtering.
    Each entry includes a ready-to-use `download_url`.
    """
    async with ArchiveClient() as client:
        raw_files = await client.files(identifier)
    selected: list[dict[str, Any]] = []
    for entry in raw_files:
        if not _matches(entry.get("name", ""), entry.get("format"), name_glob, formats):
            continue
        selected.append(_enrich_file(identifier, entry))
    # total_matching counts every hit; only the first `limit` are returned.
    total = len(selected)
    return {
        "identifier": identifier,
        "total_matching": total,
        "returned": min(total, limit),
        "files": selected[:limit],
    }
@mcp.tool
 def get_file_url(
    identifier: Annotated[str, Field(description="Item identifier.")],
    filename: Annotated[str, Field(description="Exact filename as shown in list_files.")],
 ) -> dict[str, str]:
    """Build the canonical download URL for a file without fetching anything."""
    from urllib.parse import quote  # function-scope import keeps this fix self-contained
    # Percent-encode both parts so names with spaces, '#', '?' or '%' produce a
    # valid URL; '/' stays literal because file names may include sub-directories.
    item_part = quote(identifier, safe="")
    file_part = quote(filename, safe="/")
    return {
        "url": f"https://archive.org/download/{item_part}/{file_part}",
        "item_url": f"https://archive.org/details/{item_part}",
    }
@mcp.tool
 async def download_file(
    identifier: Annotated[str, Field(description="Item identifier.")],
    filename: Annotated[str, Field(description="Exact filename from list_files.")],
    dest_dir: Annotated[
        str | None,
        Field(description="Directory to save into. Defaults to $MCARCHIVE_DOWNLOAD_ROOT/{identifier}."),
    ] = None,
    verify_md5: Annotated[
        str | None,
        Field(description="Expected MD5 hex digest (from list_files). If provided, checksum is verified."),
    ] = None,
    overwrite: Annotated[
        bool,
        Field(description="If false and file exists, resume the download (Range request)."),
    ] = False,
 ) -> dict[str, Any]:
    """Download a file to disk. Supports resume via HTTP Range when overwrite=false."""
    # Containment root: the caller's dest_dir when given, else the download root
    # (the per-item sub-directory is added below, so a hostile identifier is
    # covered by the same check as a hostile filename).
    root = Path(dest_dir).expanduser() if dest_dir else DEFAULT_DOWNLOAD_ROOT
    target_dir = root if dest_dir else (root / identifier)
    dest = target_dir / filename
    # identifier/filename arrive from the tool caller; an absolute path or ".."
    # segment must not let the write escape the chosen directory.
    resolved_root = root.resolve()
    resolved_dest = dest.resolve()
    if resolved_dest == resolved_root or not resolved_dest.is_relative_to(resolved_root):
        raise ValueError(
            f"refusing to write outside {root}: identifier={identifier!r}, filename={filename!r}"
        )
    if overwrite and dest.exists():
        dest.unlink()
    async with ArchiveClient() as c:
        result = await c.download_to_file(identifier, filename, dest, verify_md5=verify_md5)
    result["identifier"] = identifier
    result["filename"] = filename
    result["size_human"] = _human_size(result.get("bytes"))
    return result
 # ---------- resources ----------
@mcp.resource("archive://item/{identifier}")
 async def item_resource(identifier: str) -> dict[str, Any]:
    """Expose item metadata as a readable MCP resource."""
    # `.fn` exists only when the tool decorator hands back a tool object; when
    # it returns the plain coroutine function there is no such attribute, so
    # fall back to the function itself.
    # NOTE(review): confirm which form the pinned FastMCP version produces.
    impl = getattr(get_item_metadata, "fn", get_item_metadata)
    return await impl(identifier=identifier, include_files=False)
 # ---------- entry point ----------
 def main() -> None:
    """Console-script entry point: print a banner, then run the MCP server."""
    import sys  # function-scope import: nothing else in this module needs it
    # The banner goes to stderr: with the stdio transport, stdout carries the
    # protocol messages and stray text there can confuse the client.
    print(f"mcarchive-org v{__version__} — Internet Archive MCP server", file=sys.stderr)
    mcp.run()
 if __name__ == "__main__":
    main()
--- a/tests/conftest.py
+++ b/tests/conftest.py
@ -0,0 +1,14 @@
 import pytest
 def pytest_collection_modifyitems(config, items):
    """Collection hook; currently leaves the collected items untouched.
    Args:
        config: The pytest config object (unused).
        items: The collected test items (not modified).
    """
    # The earlier loop here only ever executed `continue`, so it had no effect
    # and was removed. TODO: add marker handling here if it is actually needed.
 pytest_plugins = ["pytest_asyncio"]
 def pytest_configure(config: pytest.Config) -> None:
    """Register the custom ``network`` marker with pytest."""
    marker_spec = "network: test hits live archive.org"
    config.addinivalue_line("markers", marker_spec)
--- a/tests/test_client.py
+++ b/tests/test_client.py
@ -0,0 +1,52 @@
 """End-to-end smoke tests against live archive.org (network required).
 Run with:  uv run pytest -v
 Skip with: uv run pytest -v -m 'not network'
 """
 from __future__ import annotations
 from pathlib import Path
 import pytest
 from mcarchive_org.client import ArchiveClient
 pytestmark = [pytest.mark.asyncio, pytest.mark.network]
 async def test_search_nasa_item():
    """Searching for identifier:nasa returns the 'nasa' item."""
    async with ArchiveClient() as client:
        found = await client.search(query="identifier:nasa", rows=5)
    hit_count = found["num_found"]
    assert hit_count >= 1
    assert any(doc["identifier"] == "nasa" for doc in found["docs"])
 async def test_metadata_nasa():
    """The metadata blob for 'nasa' carries its identifier and a non-empty files list."""
    async with ArchiveClient() as client:
        blob = await client.metadata("nasa")
    assert blob["metadata"]["identifier"] == "nasa"
    file_list = blob["files"]
    assert isinstance(file_list, list)
    assert file_list
 async def test_download_small_file(tmp_path: Path):
    """Download the smallest non-empty file of 'nasa' and verify its MD5 when listed."""
    async with ArchiveClient() as c:
        files = await c.files("nasa")
        # Skip zero-byte entries: downloading one would make the `bytes > 0`
        # assertion below fail even though the download itself worked.
        candidates = [
            f for f in files if str(f.get("size", "")).isdigit() and int(f["size"]) > 0
        ]
        assert candidates, "expected at least one non-empty file in the 'nasa' item"
        # pick the smallest file to keep the test fast
        small = min(candidates, key=lambda f: int(f["size"]))
        dest = tmp_path / small["name"]
        result = await c.download_to_file(
            "nasa", small["name"], dest, verify_md5=small.get("md5")
        )
    assert result["bytes"] > 0
    if small.get("md5"):
        assert result["md5_ok"] is True
 async def test_scrape_requires_min_count():
    """scrape() rejects a count below 100 with ValueError."""
    async with ArchiveClient() as client:
        with pytest.raises(ValueError):
            await client.scrape(query="identifier:nasa", count=10)
--- a/uv.lock
+++ b/uv.lock