mcarchive-org/tests/test_client_mocked.py
Ryan Malloy 6198defeca Resilience: address Hamilton tier-2 findings
H7 — Process-wide shared httpx.AsyncClient via get_shared_client().
Each tool call no longer pays a TCP+TLS handshake; connection pool is
reused across the server's lifetime. Tests inject mock transports
directly via ArchiveClient(transport=...) so the singleton stays clean.

M1 — Retry/backoff on 429/502/503/504 with Retry-After honored
(both delta-seconds and HTTP-date forms). Exponential backoff with
jitter, capped at 30s, max 3 attempts. Applied to both _fetch_json
and stream_file (retry happens BEFORE any bytes are yielded so it
can't corrupt a partial write).

M2 — Per-(identifier, filename) asyncio.Lock in download_file
serializes concurrent downloads of the same file inside one process.
Different files still download in parallel.

M5 — collection field normalized to list[str] in all output paths
(search docs, scrape items, item metadata). LLMs can write
`if 'foo' in doc['collection']` without checking the type first.

M7 — `is_collection: bool` derived from mediatype on every doc /
metadata response, so LLMs can route collection containers vs.
real media items without re-querying.

H1 — Stream-abort errors (httpx.ReadError, RemoteProtocolError,
ConnectError, ReadTimeout) caught and re-raised as ArchiveError
with bytes-written context so the caller knows where the partial
download ended. Bytes already on disk remain valid for resume.

19 new regression tests (52 total, all green, ruff clean):
- 4 tests covering retry/backoff, exhaustion, HTTP-date Retry-After
- 1 test for stream-abort byte-count surfacing
- 6 tests for collection normalization shapes
- 4 tests for is_collection in real tool flow + shared client lifecycle
- 2 tests verifying download lock: same-file serialized, different files parallel
2026-04-21 20:24:21 -06:00

364 lines
12 KiB
Python

"""Failure-mode regression tests using httpx.MockTransport (no network).
Each test pins down one of the Hamilton review findings (C1/C2/C3/H4 etc.) so
future refactors can't silently regress safety.
"""
from __future__ import annotations
import hashlib
import httpx
import pytest
from mcarchive_org.client import (
ArchiveClient,
ArchiveError,
validate_filename,
validate_identifier,
)
from mcarchive_org.server import _confine_dest
def _client_with(handler) -> ArchiveClient:
    """Return an ArchiveClient whose requests are answered by ``handler``.

    The handler is wrapped in an ``httpx.MockTransport`` and passed to the
    client as its transport.
    """
    mock_transport = httpx.MockTransport(handler)
    return ArchiveClient(transport=mock_transport)
# ---------- C1: identifier + filename validation ----------
@pytest.mark.parametrize("bad", ["", "../etc", "foo/bar", "has space", "a" * 200])
def test_invalid_identifier_rejected(bad):
    """validate_identifier must raise ValueError for each malformed identifier."""
    expected_message = r"invalid archive\.org identifier"
    with pytest.raises(ValueError, match=expected_message):
        validate_identifier(bad)
@pytest.mark.parametrize(
    "bad",
    [
        "../escape.txt",  # leading parent-directory component
        "/etc/passwd",  # absolute POSIX path
        "C:\\windows.txt",  # drive-letter prefix with a backslash
        "with\x00null.bin",  # embedded NUL byte
        "foo/../bar.mp3",  # '..' in the middle, forward slashes
        "foo\\..\\bar.mp3",  # '..' in the middle, backslashes
        "",  # empty name
    ],
)
def test_invalid_filename_rejected(bad):
    """validate_filename must raise ValueError for every unsafe or empty name."""
    with pytest.raises(ValueError):
        validate_filename(bad)
@pytest.mark.parametrize(
    "ok",
    ["song.mp3", "cover/back.jpg", "subdir/file with space.txt", "a.b.c.d"],
)
def test_legitimate_filenames_accepted(ok):
    """validate_filename must hand back an acceptable filename unchanged."""
    validated = validate_filename(ok)
    assert validated == ok
def test_confine_dest_blocks_traversal(tmp_path, monkeypatch):
    """_confine_dest must raise ValueError for a filename that contains '..'."""
    download_root = str(tmp_path)
    monkeypatch.setenv("MCARCHIVE_DOWNLOAD_ROOT", download_root)

    # The '..' is caught by validate_filename before _confine_dest reaches its
    # own path-resolution check, so the ValueError comes from the validator;
    # both layers agree on the outcome.
    with pytest.raises(ValueError):
        _confine_dest("nasa", "../escape.txt", dest_dir=None)
def test_confine_dest_legit_filename_lands_in_root(tmp_path, monkeypatch):
    """A plain filename must resolve to a path inside the configured download root."""
    download_root = str(tmp_path)
    monkeypatch.setenv("MCARCHIVE_DOWNLOAD_ROOT", download_root)

    resolved = _confine_dest("nasa", "globe.jpg", dest_dir=None)

    assert resolved.is_relative_to(tmp_path)
    assert resolved.name == "globe.jpg"
# ---------- C2: symlink refusal ----------
async def test_download_refuses_symlink_at_dest(tmp_path):
    """download_to_file must reject a symlink destination and leave its target intact."""
    original = b"original-content"
    real_file = tmp_path / "real.bin"
    real_file.write_bytes(original)
    symlink = tmp_path / "evil.bin"
    symlink.symlink_to(real_file)

    def respond(request: httpx.Request) -> httpx.Response:
        return httpx.Response(200, content=b"new-content-that-should-not-overwrite")

    async with _client_with(respond) as client:
        with pytest.raises(ArchiveError, match="symlink"):
            await client.download_to_file("nasa", "evil.bin", symlink)

    # The file behind the symlink must still hold its original bytes.
    assert real_file.read_bytes() == original
# ---------- C3: Range-ignored detection ----------
async def test_resume_with_200_response_raises_before_writing(tmp_path):
    """A 200 reply to a Range request must abort the resume without touching the file.

    Appending a full-body response to an existing partial file would corrupt
    the data silently, so the client has to raise instead.
    """
    partial = b"X" * 100  # stands in for an earlier, incomplete download
    dest = tmp_path / "partial.bin"
    dest.write_bytes(partial)

    def respond(request: httpx.Request) -> httpx.Response:
        # The server disregards the Range header and sends the full body as 200.
        assert request.headers.get("Range") == "bytes=100-"
        return httpx.Response(200, content=b"FULL_FILE_BODY")

    async with _client_with(respond) as client:
        with pytest.raises(ArchiveError, match="ignored Range"):
            await client.download_to_file("nasa", "partial.bin", dest)

    # Nothing was appended, so no corruption occurred.
    assert dest.read_bytes() == partial
async def test_resume_with_correct_206_succeeds(tmp_path):
    """A well-formed 206 reply must complete the file and pass MD5 verification."""
    full_body = b"0123456789ABCDEF" * 16  # 256 bytes in total
    dest = tmp_path / "resume.bin"
    dest.write_bytes(full_body[:64])  # the first 64 bytes are already on disk

    def respond(request: httpx.Request) -> httpx.Response:
        assert request.headers.get("Range") == "bytes=64-"
        content_range = f"bytes 64-{len(full_body)-1}/{len(full_body)}"
        return httpx.Response(
            206,
            content=full_body[64:],
            headers={"Content-Range": content_range},
        )

    expected_md5 = hashlib.md5(full_body).hexdigest()

    async with _client_with(respond) as client:
        outcome = await client.download_to_file(
            "nasa", "resume.bin", dest, verify_md5=expected_md5
        )

    assert outcome["bytes_written"] == len(full_body)
    assert outcome["resumed_from"] == 64
    assert outcome["md5_ok"] is True
    assert dest.read_bytes() == full_body
async def test_resume_with_wrong_content_range_start_raises(tmp_path):
    """A 206 whose Content-Range starts at the wrong offset must be rejected."""
    partial = b"X" * 100
    dest = tmp_path / "off.bin"
    dest.write_bytes(partial)

    def respond(request: httpx.Request) -> httpx.Response:
        # The status is 206, but the range starts at 50 while the file holds 100 bytes.
        return httpx.Response(
            206,
            content=b"junk",
            headers={"Content-Range": "bytes 50-99/100"},
        )

    async with _client_with(respond) as client:
        with pytest.raises(ArchiveError, match="Content-Range start"):
            await client.download_to_file("nasa", "off.bin", dest)

    assert dest.read_bytes() == partial  # file left untouched
# ---------- H4: error body surfacing ----------
async def test_search_400_includes_response_body():
    """The ArchiveError raised for a 400 from search must carry the response body text."""

    def respond(request: httpx.Request) -> httpx.Response:
        return httpx.Response(400, text='{"error":"bad query syntax"}')

    async with _client_with(respond) as client:
        with pytest.raises(ArchiveError, match="bad query syntax"):
            await client.search(query="INVALID:::")
async def test_metadata_404_includes_status():
    """The ArchiveError raised for a 404 from metadata must mention the HTTP status."""

    def respond(request: httpx.Request) -> httpx.Response:
        return httpx.Response(404, text="not found")

    async with _client_with(respond) as client:
        with pytest.raises(ArchiveError, match="HTTP 404"):
            await client.metadata("nasa")
async def test_metadata_empty_dict_means_not_found():
    """A 200 response with an empty JSON object must be reported as a missing item."""

    def respond(request: httpx.Request) -> httpx.Response:
        return httpx.Response(200, json={})

    async with _client_with(respond) as client:
        with pytest.raises(ArchiveError, match="not found or unavailable"):
            await client.metadata("nasa")
async def test_files_returns_error_payload_as_archive_error():
    """An ``error`` key in a 200 response to files() must surface as ArchiveError."""

    def respond(request: httpx.Request) -> httpx.Response:
        return httpx.Response(200, json={"error": "item is dark"})

    async with _client_with(respond) as client:
        with pytest.raises(ArchiveError, match="item is dark"):
            await client.files("nasa")
async def test_scrape_error_payload_surfaced():
    """scrape() must raise ArchiveError naming both the errorType and the error text."""
    error_payload = {"error": "count too small", "errorType": "RangeException"}

    def respond(request: httpx.Request) -> httpx.Response:
        return httpx.Response(200, json=error_payload)

    async with _client_with(respond) as client:
        with pytest.raises(ArchiveError, match=r"RangeException.*count too small"):
            await client.scrape(query="identifier:nasa", count=100)
async def test_invalid_json_response_surfaced():
    """A 200 response whose body is not JSON must raise ArchiveError, not a parse error."""

    def respond(request: httpx.Request) -> httpx.Response:
        return httpx.Response(200, text="<html>not json</html>")

    async with _client_with(respond) as client:
        with pytest.raises(ArchiveError, match="invalid JSON"):
            await client.metadata("nasa")
# ---------- M1: retry/backoff with Retry-After ----------
async def test_retry_on_429_then_success(monkeypatch):
    """A 429 carrying ``Retry-After: 0`` is retried once and the second reply is used."""
    recorded_delays: list[float] = []

    async def record_sleep(delay: float) -> None:
        recorded_delays.append(delay)

    monkeypatch.setattr("mcarchive_org.client.asyncio.sleep", record_sleep)

    attempts = 0

    def respond(request: httpx.Request) -> httpx.Response:
        nonlocal attempts
        attempts += 1
        if attempts > 1:
            return httpx.Response(200, json={"response": {"numFound": 0, "docs": []}})
        return httpx.Response(429, headers={"Retry-After": "0"}, json={"error": "slow down"})

    async with _client_with(respond) as client:
        outcome = await client.search(query="x", rows=1)

    assert outcome["num_found"] == 0
    assert attempts == 2
    assert recorded_delays == [0.0]  # the zero-second Retry-After was honored
async def test_retry_exhaustion_raises_with_body(monkeypatch):
    """If 429 persists through every retry, the final error body is surfaced.

    The handler answers 429 forever, so the client has to give up eventually.
    The request counter proves that it retried before giving up; without it
    the test would pass even if the client raised on the very first 429.
    """
    # Backoff timing is irrelevant here; replace the sleep with an instant no-op.
    monkeypatch.setattr("mcarchive_org.client.asyncio.sleep", lambda d: _noop_sleep())
    calls = {"n": 0}

    def handler(req: httpx.Request) -> httpx.Response:
        calls["n"] += 1
        return httpx.Response(429, json={"error": "rate limit exhausted"})

    async with _client_with(handler) as c:
        with pytest.raises(ArchiveError, match="rate limit exhausted"):
            await c.search(query="x")

    # More than one request means the retry path ran before the error was raised.
    # The exact attempt limit is deliberately not pinned here.
    assert calls["n"] > 1
async def _noop_sleep(*_args: object, **_kwargs: object) -> None:
    """Used in place of asyncio.sleep when we don't care about backoff timing.

    Any positional or keyword arguments are accepted and ignored, so the
    coroutine function can be called with no arguments (as existing callers
    do) or with a delay argument the way asyncio.sleep is called. Awaiting it
    returns None immediately.
    """
async def test_retry_on_503_for_stream(monkeypatch, tmp_path):
    """A download that first gets a 503 is retried and then writes the 200 body."""
    monkeypatch.setattr("mcarchive_org.client.asyncio.sleep", lambda d: _noop_sleep())
    payload = b"actual file body"
    attempts = 0

    def respond(request: httpx.Request) -> httpx.Response:
        nonlocal attempts
        attempts += 1
        if attempts > 1:
            return httpx.Response(200, content=payload)
        return httpx.Response(503, text="overloaded")

    dest = tmp_path / "f.bin"
    async with _client_with(respond) as client:
        outcome = await client.download_to_file("nasa", "f.bin", dest)

    assert outcome["bytes_written"] == len(payload)
    assert attempts == 2
    assert dest.read_bytes() == payload
async def test_retry_after_http_date_form(monkeypatch):
    """Retry-After given as an HTTP-date must be converted to a delay in seconds."""
    recorded_delays: list[float] = []

    async def record_sleep(delay: float) -> None:
        recorded_delays.append(delay)

    monkeypatch.setattr("mcarchive_org.client.asyncio.sleep", record_sleep)

    attempts = 0

    def respond(request: httpx.Request) -> httpx.Response:
        nonlocal attempts
        attempts += 1
        if attempts > 1:
            return httpx.Response(200, json={"response": {"numFound": 0, "docs": []}})
        # The date lies in the past, so the computed wait is zero or negative
        # and is expected to be clamped to 0.
        return httpx.Response(429, headers={"Retry-After": "Wed, 21 Oct 2015 07:28:00 GMT"})

    async with _client_with(respond) as client:
        await client.search(query="x")

    assert recorded_delays == [0.0]
# ---------- H1: stream-abort error context ----------
async def test_stream_abort_raises_archive_error_with_byte_count(tmp_path):
    """A mid-stream httpx failure must surface as ArchiveError describing the interruption.

    The message has to say the transfer was interrupted and name the
    underlying error, and the bytes delivered before the failure must remain
    on disk.
    """
    # 128KB: larger than httpx's internal chunk buffer (64KB), so at least one
    # chunk reaches the writer before the error fires.
    chunk_payload = b"X" * (128 * 1024)

    async def failing_body():
        yield chunk_payload
        raise httpx.ReadError("simulated network drop")

    def respond(request: httpx.Request) -> httpx.Response:
        return httpx.Response(200, content=failing_body())

    dest = tmp_path / "interrupted.bin"
    async with _client_with(respond) as client:
        with pytest.raises(ArchiveError) as exc_info:
            await client.download_to_file("nasa", "interrupted.bin", dest)

    message = str(exc_info.value)
    assert "interrupted after" in message
    assert "ReadError" in message

    # Whatever was written must be a non-empty prefix of the payload.
    written = dest.read_bytes()
    assert len(written) > 0
    assert written == chunk_payload[: len(written)]
# ---------- happy path ----------
async def test_fresh_download_writes_full_body(tmp_path):
    """With no existing file, the download sends no Range header and writes the whole body."""
    payload = b"hello world" * 100
    dest = tmp_path / "new.bin"
    expected_md5 = hashlib.md5(payload).hexdigest()

    def respond(request: httpx.Request) -> httpx.Response:
        assert "Range" not in request.headers
        return httpx.Response(200, content=payload)

    async with _client_with(respond) as client:
        outcome = await client.download_to_file(
            "nasa", "new.bin", dest, verify_md5=expected_md5
        )

    assert outcome["bytes_written"] == len(payload)
    assert outcome["resumed_from"] == 0
    assert outcome["md5_ok"] is True
    assert dest.read_bytes() == payload