Critical fixes:
- Validate identifier (^[A-Za-z0-9._-]+$) and filename (no '..', absolute
paths, NUL bytes, drive letters) at the client boundary
- Confine download destinations under MCARCHIVE_DOWNLOAD_ROOT via
Path.resolve() + is_relative_to() check; reject symlinked dirs
- Use O_NOFOLLOW on the destination open() to refuse symlink substitution
- Detect Range-ignored responses: if resume requested but server returns 200
(or 206 with wrong Content-Range start), raise ArchiveError BEFORE writing
any bytes — closes the silent file-corruption hole
Usability:
- Wrap raise_for_status everywhere with ArchiveError that includes the
response body preview — 4xx Solr errors now tell you what's wrong
- URL-encode filenames in download URLs (handles spaces and special chars)
- Map archive.org's {"error": ...} payloads on /metadata/{id}/files to
ArchiveError with the server's message
- Lazy-resolve download root so env-var changes after import are honored
- Refactor item_resource to a shared async helper (drops .fn type-ignore)
- Rename result key 'bytes' -> 'bytes_written' (avoids shadowing builtin)
Tests:
- New tests/test_client_mocked.py: 29 regression tests using
httpx.MockTransport covering every Hamilton finding above (path traversal,
symlink refusal, Range-ignored, Content-Range mismatch, error body
surfacing, malformed JSON, dark items, etc.)
- Set asyncio_mode = "auto" in pyproject for cleaner test markers
33/33 tests pass (4 live + 29 mocked), ruff clean.
53 lines
1.5 KiB
Python
53 lines
1.5 KiB
Python
"""End-to-end smoke tests against live archive.org (network required).
|
|
|
|
Run with: uv run pytest -v
|
|
Skip with: uv run pytest -v -m 'not network'
|
|
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
from pathlib import Path
|
|
|
|
import pytest
|
|
|
|
from mcarchive_org.client import ArchiveClient
|
|
|
|
# Apply the ``network`` marker to every test in this module so the whole
# file can be deselected with ``-m 'not network'``.
pytestmark = pytest.mark.network
|
|
|
|
|
|
async def test_search_nasa_item():
    """Search for ``identifier:nasa`` and expect the ``nasa`` item in the hits."""
    async with ArchiveClient() as client:
        response = await client.search(query="identifier:nasa", rows=5)
        # At least one hit must be reported ...
        assert response["num_found"] >= 1
        # ... and one of the returned docs must be the item itself.
        assert "nasa" in (doc["identifier"] for doc in response["docs"])
|
|
|
|
|
|
async def test_metadata_nasa():
    """Fetch metadata for ``nasa`` and check the identifier and file list."""
    async with ArchiveClient() as client:
        item = await client.metadata("nasa")
        assert item["metadata"]["identifier"] == "nasa"
        # The file listing must be a list and must not be empty.
        assert isinstance(item["files"], list)
        assert item["files"]
|
|
|
|
|
|
async def test_download_small_file(tmp_path: Path):
    """Download the smallest file of the ``nasa`` item and check the result.

    The file listing is fetched from the live service, the entry with the
    smallest numeric ``size`` is downloaded into ``tmp_path``, and the
    returned ``bytes_written`` (plus ``md5_ok`` when the listing carries an
    ``md5`` value) is asserted.
    """
    async with ArchiveClient() as c:
        files = await c.files("nasa")
        # Only entries with a non-empty, purely numeric size can be ranked.
        sized = [f for f in files if f.get("size") and str(f["size"]).isdigit()]
        # Fail with a readable message instead of min()'s bare
        # "empty sequence" ValueError when no entry has a usable size.
        assert sized, "no files with a numeric size in the 'nasa' listing"
        # pick the smallest file to keep the test fast
        small = min(sized, key=lambda f: int(f["size"]))
        dest = tmp_path / small["name"]
        result = await c.download_to_file(
            "nasa", small["name"], dest, verify_md5=small.get("md5")
        )
        assert result["bytes_written"] > 0
        # md5_ok is only asserted when the listing supplied a checksum.
        if small.get("md5"):
            assert result["md5_ok"] is True
|
|
|
|
|
|
async def test_scrape_requires_min_count():
    """Calling ``scrape`` with ``count=10`` must raise ``ValueError``.

    Per the test name, 10 is presumably below the minimum count that
    ``scrape`` accepts.
    """
    async with ArchiveClient() as client:
        with pytest.raises(ValueError):
            await client.scrape(query="identifier:nasa", count=10)
|