"""Failure-mode regression tests using httpx.MockTransport (no network).

Each test pins down one of the Hamilton review findings (C1/C2/C3/H4 etc.)
so future refactors can't silently regress safety.

NOTE(review): the async tests carry no explicit marker, so the project
presumably runs an asyncio pytest plugin in auto mode — confirm in the
pytest configuration.
"""

from __future__ import annotations

import hashlib

import httpx
import pytest

from mcarchive_org.client import (
    ArchiveClient,
    ArchiveError,
    validate_filename,
    validate_identifier,
)
from mcarchive_org.server import _confine_dest

# Dotted path handed to ``monkeypatch.setattr`` to replace the sleep call the
# client is expected to use for retry backoff.
# NOTE(review): if ``mcarchive_org.client`` does a plain ``import asyncio``,
# this resolves to the shared asyncio module, so the patch is process-wide for
# the duration of the test — confirm that is intended.
_SLEEP_TARGET = "mcarchive_org.client.asyncio.sleep"


# ---------- helpers ----------


def _client_with(handler) -> ArchiveClient:
    """Build an ArchiveClient backed by a MockTransport handler."""
    return ArchiveClient(transport=httpx.MockTransport(handler))


async def _noop_sleep(_delay: float = 0.0) -> None:
    """Used in place of asyncio.sleep when we don't care about backoff timing.

    Accepts (and ignores) the delay so it can be patched in directly; it can
    still be called with no arguments.
    """


def _record_sleeps(monkeypatch) -> list[float]:
    """Patch the client's sleep with a recorder and return the recording list.

    Every delay passed to the patched sleep is appended to the returned list,
    in call order, without actually waiting.
    """
    sleeps: list[float] = []

    async def fake_sleep(d: float) -> None:
        sleeps.append(d)

    monkeypatch.setattr(_SLEEP_TARGET, fake_sleep)
    return sleeps


# ---------- C1: identifier + filename validation ----------


@pytest.mark.parametrize("bad", ["", "../etc", "foo/bar", "has space", "a" * 200])
def test_invalid_identifier_rejected(bad):
    """validate_identifier raises ValueError for each malformed identifier."""
    with pytest.raises(ValueError, match=r"invalid archive\.org identifier"):
        validate_identifier(bad)


@pytest.mark.parametrize(
    "bad",
    [
        "../escape.txt",
        "/etc/passwd",
        "C:\\windows.txt",
        "with\x00null.bin",
        "foo/../bar.mp3",
        "foo\\..\\bar.mp3",
        "",
    ],
)
def test_invalid_filename_rejected(bad):
    """validate_filename raises ValueError for traversal/absolute/NUL/empty names."""
    with pytest.raises(ValueError):
        validate_filename(bad)


@pytest.mark.parametrize(
    "ok",
    ["song.mp3", "cover/back.jpg", "subdir/file with space.txt", "a.b.c.d"],
)
def test_legitimate_filenames_accepted(ok):
    """validate_filename returns ordinary (possibly nested) names unchanged."""
    assert validate_filename(ok) == ok


def test_confine_dest_blocks_traversal(tmp_path, monkeypatch):
    """_confine_dest rejects a filename that tries to climb out of the root."""
    monkeypatch.setenv("MCARCHIVE_DOWNLOAD_ROOT", str(tmp_path))
    # validate_filename catches '..' before _confine_dest's path-resolution check,
    # so this raises ValueError from the validator — both layers in agreement.
    with pytest.raises(ValueError):
        _confine_dest("nasa", "../escape.txt", dest_dir=None)


def test_confine_dest_legit_filename_lands_in_root(tmp_path, monkeypatch):
    """A plain filename resolves to a path under MCARCHIVE_DOWNLOAD_ROOT."""
    monkeypatch.setenv("MCARCHIVE_DOWNLOAD_ROOT", str(tmp_path))
    dest = _confine_dest("nasa", "globe.jpg", dest_dir=None)
    assert dest.is_relative_to(tmp_path)
    assert dest.name == "globe.jpg"


# ---------- C2: symlink refusal ----------


async def test_download_refuses_symlink_at_dest(tmp_path):
    """Downloading onto a symlink raises and leaves the link target untouched."""
    target = tmp_path / "real.bin"
    target.write_bytes(b"original-content")
    link = tmp_path / "evil.bin"
    link.symlink_to(target)

    def handler(req: httpx.Request) -> httpx.Response:
        return httpx.Response(200, content=b"new-content-that-should-not-overwrite")

    async with _client_with(handler) as c:
        with pytest.raises(ArchiveError, match="symlink"):
            await c.download_to_file("nasa", "evil.bin", link)

    # Symlink target must be unchanged.
    assert target.read_bytes() == b"original-content"


# ---------- C3: Range-ignored detection ----------


async def test_resume_with_200_response_raises_before_writing(tmp_path):
    """If the server returns 200 instead of 206 on a Range request, we must
    not append to the existing file — that path corrupts data silently."""
    dest = tmp_path / "partial.bin"
    dest.write_bytes(b"X" * 100)  # pretend we have a partial download

    def handler(req: httpx.Request) -> httpx.Response:
        # Server ignores Range header and returns the full body with 200
        assert req.headers.get("Range") == "bytes=100-"
        return httpx.Response(200, content=b"FULL_FILE_BODY")

    async with _client_with(handler) as c:
        with pytest.raises(ArchiveError, match="ignored Range"):
            await c.download_to_file("nasa", "partial.bin", dest)

    # File must be unchanged — corruption avoided.
    assert dest.read_bytes() == b"X" * 100


async def test_resume_with_correct_206_succeeds(tmp_path):
    """A well-formed 206 is appended to the partial file and the MD5 verifies."""
    full_body = b"0123456789ABCDEF" * 16  # 256 bytes
    dest = tmp_path / "resume.bin"
    dest.write_bytes(full_body[:64])  # we already have first 64 bytes

    def handler(req: httpx.Request) -> httpx.Response:
        assert req.headers.get("Range") == "bytes=64-"
        return httpx.Response(
            206,
            content=full_body[64:],
            headers={"Content-Range": f"bytes 64-{len(full_body)-1}/{len(full_body)}"},
        )

    expected_md5 = hashlib.md5(full_body).hexdigest()
    async with _client_with(handler) as c:
        result = await c.download_to_file(
            "nasa", "resume.bin", dest, verify_md5=expected_md5
        )

    assert result["bytes_written"] == len(full_body)
    assert result["resumed_from"] == 64
    assert result["md5_ok"] is True
    assert dest.read_bytes() == full_body


async def test_resume_with_wrong_content_range_start_raises(tmp_path):
    """A 206 whose Content-Range start differs from our offset is rejected."""
    dest = tmp_path / "off.bin"
    dest.write_bytes(b"X" * 100)

    def handler(req: httpx.Request) -> httpx.Response:
        # Server returns 206 but with WRONG starting offset
        return httpx.Response(
            206,
            content=b"junk",
            headers={"Content-Range": "bytes 50-99/100"},
        )

    async with _client_with(handler) as c:
        with pytest.raises(ArchiveError, match="Content-Range start"):
            await c.download_to_file("nasa", "off.bin", dest)

    assert dest.read_bytes() == b"X" * 100  # unchanged


# ---------- H4: error body surfacing ----------


async def test_search_400_includes_response_body():
    """A 400 from search surfaces the server's error body in the exception."""

    def handler(req: httpx.Request) -> httpx.Response:
        return httpx.Response(400, text='{"error":"bad query syntax"}')

    async with _client_with(handler) as c:
        with pytest.raises(ArchiveError, match="bad query syntax"):
            await c.search(query="INVALID:::")


async def test_metadata_404_includes_status():
    """A 404 from metadata mentions the HTTP status in the exception."""

    def handler(req: httpx.Request) -> httpx.Response:
        return httpx.Response(404, text="not found")

    async with _client_with(handler) as c:
        with pytest.raises(ArchiveError, match="HTTP 404"):
            await c.metadata("nasa")


async def test_metadata_empty_dict_means_not_found():
    """A 200 with an empty JSON object is reported as not found/unavailable."""

    def handler(req: httpx.Request) -> httpx.Response:
        return httpx.Response(200, json={})

    async with _client_with(handler) as c:
        with pytest.raises(ArchiveError, match="not found or unavailable"):
            await c.metadata("nasa")


async def test_files_returns_error_payload_as_archive_error():
    """A 200 carrying an ``error`` key is raised as ArchiveError by files()."""

    def handler(req: httpx.Request) -> httpx.Response:
        return httpx.Response(200, json={"error": "item is dark"})

    async with _client_with(handler) as c:
        with pytest.raises(ArchiveError, match="item is dark"):
            await c.files("nasa")


async def test_scrape_error_payload_surfaced():
    """scrape() surfaces both ``errorType`` and ``error`` from a 200 payload."""

    def handler(req: httpx.Request) -> httpx.Response:
        return httpx.Response(
            200, json={"error": "count too small", "errorType": "RangeException"}
        )

    async with _client_with(handler) as c:
        with pytest.raises(ArchiveError, match=r"RangeException.*count too small"):
            await c.scrape(query="identifier:nasa", count=100)


async def test_invalid_json_response_surfaced():
    """A 200 whose body is not JSON raises ArchiveError mentioning invalid JSON."""

    def handler(req: httpx.Request) -> httpx.Response:
        return httpx.Response(200, text="not json")

    async with _client_with(handler) as c:
        with pytest.raises(ArchiveError, match="invalid JSON"):
            await c.metadata("nasa")


# ---------- M1: retry/backoff with Retry-After ----------


async def test_retry_on_429_then_success(monkeypatch):
    """First call gets 429 with Retry-After: 0, second call succeeds."""
    sleeps = _record_sleeps(monkeypatch)
    calls = {"n": 0}

    def handler(req: httpx.Request) -> httpx.Response:
        calls["n"] += 1
        if calls["n"] == 1:
            return httpx.Response(
                429, headers={"Retry-After": "0"}, json={"error": "slow down"}
            )
        return httpx.Response(200, json={"response": {"numFound": 0, "docs": []}})

    async with _client_with(handler) as c:
        result = await c.search(query="x", rows=1)

    assert result["num_found"] == 0
    assert calls["n"] == 2
    assert sleeps == [0.0]  # honored Retry-After: 0


async def test_retry_exhaustion_raises_with_body(monkeypatch):
    """If 429 persists past max_attempts, the final error body is surfaced."""
    monkeypatch.setattr(_SLEEP_TARGET, _noop_sleep)

    def handler(req: httpx.Request) -> httpx.Response:
        return httpx.Response(429, json={"error": "rate limit exhausted"})

    async with _client_with(handler) as c:
        with pytest.raises(ArchiveError, match="rate limit exhausted"):
            await c.search(query="x")


async def test_retry_on_503_for_stream(monkeypatch, tmp_path):
    """Stream-level retry: 503 once, then 200 with body."""
    monkeypatch.setattr(_SLEEP_TARGET, _noop_sleep)
    body = b"actual file body"
    calls = {"n": 0}

    def handler(req: httpx.Request) -> httpx.Response:
        calls["n"] += 1
        if calls["n"] == 1:
            return httpx.Response(503, text="overloaded")
        return httpx.Response(200, content=body)

    dest = tmp_path / "f.bin"
    async with _client_with(handler) as c:
        result = await c.download_to_file("nasa", "f.bin", dest)

    assert result["bytes_written"] == len(body)
    assert calls["n"] == 2
    assert dest.read_bytes() == body


async def test_retry_after_http_date_form(monkeypatch):
    """Retry-After can be an HTTP-date; we must parse it to a delta seconds."""
    sleeps = _record_sleeps(monkeypatch)
    calls = {"n": 0}

    def handler(req: httpx.Request) -> httpx.Response:
        calls["n"] += 1
        if calls["n"] == 1:
            # An HTTP-date in the past should produce a 0-or-negative wait, clamped to 0.
            return httpx.Response(
                429, headers={"Retry-After": "Wed, 21 Oct 2015 07:28:00 GMT"}
            )
        return httpx.Response(200, json={"response": {"numFound": 0, "docs": []}})

    async with _client_with(handler) as c:
        await c.search(query="x")

    assert sleeps == [0.0]


# ---------- H1: stream-abort error context ----------


async def test_stream_abort_raises_archive_error_with_byte_count(tmp_path):
    """If httpx raises mid-stream, we wrap it in ArchiveError with byte count
    so the caller knows where the partial download ended."""
    # Yield enough bytes to flush past httpx's internal chunk buffer (64KB) so
    # at least one chunk reaches our writer before the error fires.
    chunk_payload = b"X" * (1 << 17)  # 128KB — multiple buffer fills

    async def evil_body():
        yield chunk_payload
        raise httpx.ReadError("simulated network drop")

    def handler(req: httpx.Request) -> httpx.Response:
        return httpx.Response(200, content=evil_body())

    dest = tmp_path / "interrupted.bin"
    async with _client_with(handler) as c:
        with pytest.raises(ArchiveError) as exc_info:
            await c.download_to_file("nasa", "interrupted.bin", dest)

    msg = str(exc_info.value)
    assert "interrupted after" in msg
    assert "ReadError" in msg
    # Partial bytes ARE on disk — at least the first delivered chunk.
    on_disk = dest.read_bytes()
    assert len(on_disk) > 0
    assert on_disk == chunk_payload[: len(on_disk)]


# ---------- happy path ----------


async def test_fresh_download_writes_full_body(tmp_path):
    """A fresh download sends no Range header and writes the verified body."""
    body = b"hello world" * 100
    dest = tmp_path / "new.bin"

    def handler(req: httpx.Request) -> httpx.Response:
        assert "Range" not in req.headers
        return httpx.Response(200, content=body)

    async with _client_with(handler) as c:
        result = await c.download_to_file(
            "nasa", "new.bin", dest, verify_md5=hashlib.md5(body).hexdigest()
        )

    assert result["bytes_written"] == len(body)
    assert result["resumed_from"] == 0
    assert result["md5_ok"] is True
    assert dest.read_bytes() == body