mcarchive-org/tests/test_client.py

"""End-to-end smoke tests against live archive.org (network required).

Run with:  uv run pytest -v
Skip with: uv run pytest -v -m 'not network'
"""

from __future__ import annotations

from pathlib import Path

import pytest

from mcarchive_org.client import ArchiveClient

# Apply the ``network`` marker to every test in this module so the whole file
# can be deselected with ``-m 'not network'`` (see the module docstring).
pytestmark = pytest.mark.network


async def test_search_nasa_item():
    """Search for ``identifier:nasa`` and expect the ``nasa`` doc in the results.

    The assertions run after the client context has exited; they only inspect
    the dictionary returned by ``search``.
    """
    async with ArchiveClient() as client:
        response = await client.search(query="identifier:nasa", rows=5)

    assert result_count_ok(response) if False else response["num_found"] >= 1

    # Stop at the first matching document, like a short-circuiting ``any``.
    nasa_seen = False
    for doc in response["docs"]:
        if doc["identifier"] == "nasa":
            nasa_seen = True
            break
    assert nasa_seen


async def test_metadata_nasa():
    """Fetch metadata for ``nasa`` and check its identifier and file listing.

    Expects ``metadata.identifier`` to be ``"nasa"`` and ``files`` to be a
    non-empty list.
    """
    async with ArchiveClient() as client:
        item = await client.metadata("nasa")

    assert item["metadata"]["identifier"] == "nasa"

    file_entries = item["files"]
    assert isinstance(file_entries, list)
    assert file_entries


async def test_download_small_file(tmp_path: Path):
    """Download the smallest non-empty file of the ``nasa`` item.

    Lists the item's files, picks the one with the smallest positive numeric
    ``size`` to keep the test fast, downloads it into ``tmp_path`` and checks
    that some bytes were written. When the listing provides an ``md5`` for the
    file, it is passed as ``verify_md5`` and the result must report
    ``md5_ok is True``.

    Args:
        tmp_path: pytest-provided temporary directory used as the download
            destination.
    """
    async with ArchiveClient() as c:
        files = await c.files("nasa")
        # Only keep entries whose size parses as a positive integer. A
        # zero-byte file (size "0") would otherwise win as "smallest" and then
        # fail the ``bytes_written > 0`` assertion below.
        candidates = [
            f
            for f in files
            if str(f.get("size", "")).isdigit() and int(f["size"]) > 0
        ]
        # Fail with a readable message instead of min()'s ValueError on an
        # empty sequence.
        assert candidates, "no file with a positive numeric size in item 'nasa'"
        small = min(candidates, key=lambda f: int(f["size"]))
        # A name containing "/" would point into a directory that does not
        # exist under the fresh tmp_path, so keep only the final component for
        # the local destination. The remote name is passed through unchanged.
        dest = tmp_path / Path(small["name"]).name
        result = await c.download_to_file(
            "nasa", small["name"], dest, verify_md5=small.get("md5")
        )
    assert result["bytes_written"] > 0
    if small.get("md5"):
        assert result["md5_ok"] is True


async def test_scrape_requires_min_count():
    """Expect ``scrape`` to raise ``ValueError`` when called with ``count=10``.

    NOTE(review): presumably 10 is below a minimum count enforced by the
    client; confirm the exact threshold against ``ArchiveClient.scrape``.
    """
    async with ArchiveClient() as client:
        # Only the scrape call itself is expected to raise, so the
        # ``raises`` context wraps nothing else.
        with pytest.raises(ValueError):
            await client.scrape(
                query="identifier:nasa",
                count=10,
            )