Hardening: address Hamilton review ship-blockers

Critical fixes: - Validate identifier (^[A-Za-z0-9._-]+$) and filename (no '..', absolute paths, NUL bytes, drive letters) at the client boundary - Confine download destinations under MCARCHIVE_DOWNLOAD_ROOT via Path.resolve() + is_relative_to() check; reject symlinked dirs - Use O_NOFOLLOW on the destination open() to refuse symlink substitution - Detect Range-ignored responses: if resume requested but server returns 200 (or 206 with wrong Content-Range start), raise ArchiveError BEFORE writing any bytes — closes the silent file-corruption hole Usability: - Wrap raise_for_status everywhere with ArchiveError that includes the response body preview — 4xx Solr errors now tell you what's wrong - URL-encode filenames in download URLs (handles spaces and special chars) - Map archive.org's {"error": ...} payloads on /metadata/{id}/files to ArchiveError with the server's message - Lazy-resolve download root so env-var changes after import are honored - Refactor item_resource to a shared async helper (drops .fn type-ignore) - Rename result key 'bytes' -> 'bytes_written' (avoids shadowing builtin) Tests: - New tests/test_client_mocked.py: 29 regression tests using httpx.MockTransport covering every Hamilton finding above (path traversal, symlink refusal, Range-ignored, Content-Range mismatch, error body surfacing, malformed JSON, dark items, etc.) - Set asyncio_mode = "auto" in pyproject for cleaner test markers 33/33 tests pass (4 live + 29 mocked), ruff clean.
2026-04-21 15:34:30 -06:00 · 2026-04-21 15:34:30 -06:00 · 4a03af1675
commit 4a03af1675
parent 5265a6440b
5 changed files with 513 additions and 56 deletions
--- a/pyproject.toml
+++ b/pyproject.toml
@ -38,6 +38,9 @@ build-backend = "hatchling.build"
 [tool.hatch.build.targets.wheel]
 packages = ["src/mcarchive_org"]

+[tool.pytest.ini_options]
+asyncio_mode = "auto"
+
 [tool.ruff]
 line-length = 100
 target-version = "py310"
--- a/src/mcarchive_org/client.py
+++ b/src/mcarchive_org/client.py
@ -2,29 +2,104 @@

 from __future__ import annotations

+import errno
 import hashlib
+import os
+import re
 from collections.abc import AsyncIterator
 from pathlib import Path
 from typing import Any
+from urllib.parse import quote

 import httpx

 ARCHIVE_BASE = "https://archive.org"
 DEFAULT_UA = "mcarchive-org/2026.04.21 (+https://archive.org/developers/)"
+# Per-chunk read timeout (60s) means a stalled stream is caught between chunks.
+# Don't relax this without thinking about hung TCP connections.
 DEFAULT_TIMEOUT = httpx.Timeout(30.0, read=60.0)

+# Archive.org documents identifiers as [A-Za-z0-9._-], starting with alnum, max 100 chars.
+_IDENTIFIER_RE = re.compile(r"^[A-Za-z0-9][A-Za-z0-9._-]{0,99}$")
+_MAX_FILENAME = 512
+

 class ArchiveError(RuntimeError):
    """Raised when archive.org returns an error payload or unexpected status."""


-class ArchiveClient:
-    """Async client for the three archive.org endpoints we care about.
+# ---------- input validators (defense-in-depth) ----------

-    - advancedsearch.php : small Solr-style queries (<= ~10,000 rows paginated)
-    - services/search/v1/scrape : bulk cursor-based iteration (count >= 100)
+
+def validate_identifier(identifier: str) -> str:
+    """Reject identifiers that don't match archive.org's documented grammar.
+
+    Both archive.org and the local filesystem trust this string — '/' or '..'
+    in an identifier is a path-traversal vector.
+    """
+    if not isinstance(identifier, str) or not _IDENTIFIER_RE.match(identifier):
+        raise ValueError(
+            f"invalid archive.org identifier: {identifier!r} "
+            f"(must match {_IDENTIFIER_RE.pattern})"
+        )
+    return identifier
+
+
+def validate_filename(filename: str) -> str:
+    """Reject filenames that could escape a download root or attack the FS.
+
+    archive.org files[].name CAN contain forward slashes for subdirectory files
+    (e.g. 'cover/back.jpg') — that's allowed. What's rejected: '..' components,
+    absolute paths, NUL bytes, Windows drive letters, and excessive length.
+    """
+    if not isinstance(filename, str) or not filename:
+        raise ValueError(f"filename must be a non-empty string, got {filename!r}")
+    if len(filename) > _MAX_FILENAME:
+        raise ValueError(f"filename exceeds {_MAX_FILENAME} chars")
+    if "\x00" in filename:
+        raise ValueError(f"filename contains NUL byte: {filename!r}")
+    if filename.startswith(("/", "\\")):
+        raise ValueError(f"filename must not be absolute: {filename!r}")
+    if len(filename) >= 2 and filename[1] == ":":
+        raise ValueError(f"filename must not be a Windows drive path: {filename!r}")
+    if any(part == ".." for part in filename.replace("\\", "/").split("/")):
+        raise ValueError(f"filename must not contain '..' components: {filename!r}")
+    return filename
+
+
+# ---------- safe filesystem open ----------
+
+
+def _safe_open_for_write(dest: Path, append: bool):
+    """Open dest for writing, refusing to follow a symlink at the leaf.
+
+    Defense against the symlink-substitution race: even if our path-confinement
+    check passes, a symlink at `dest` could redirect the write. O_NOFOLLOW tells
+    the kernel to fail the open instead.
+    """
+    flags = os.O_WRONLY | os.O_CREAT
+    flags |= os.O_APPEND if append else os.O_TRUNC
+    nofollow = getattr(os, "O_NOFOLLOW", 0)  # not present on Windows
+    flags |= nofollow
+    try:
+        fd = os.open(dest, flags, 0o644)
+    except OSError as e:
+        if nofollow and e.errno == errno.ELOOP:
+            raise ArchiveError(f"refusing to write through symlink at {dest}") from e
+        raise
+    return os.fdopen(fd, "ab" if append else "wb")
+
+
+# ---------- client ----------
+
+
+class ArchiveClient:
+    """Async client for the archive.org endpoints we wrap.
+
+    - advancedsearch.php : Solr-style queries (<= ~10,000 rows paginated)
+    - services/search/v1/scrape : bulk cursor pagination (count >= 100)
    - metadata/{id} : full item manifest including files[]
-    - download/{id}/{file} : byte stream with Range support
+    - download/{id}/{file} : byte stream with HTTP Range support
    """

    def __init__(
@ -32,13 +107,17 @@ class ArchiveClient:
        base_url: str = ARCHIVE_BASE,
        user_agent: str = DEFAULT_UA,
        timeout: httpx.Timeout | float = DEFAULT_TIMEOUT,
+        transport: httpx.AsyncBaseTransport | None = None,
    ) -> None:
        self._base = base_url.rstrip("/")
-        self._client = httpx.AsyncClient(
-            headers={"User-Agent": user_agent, "Accept": "application/json"},
-            timeout=timeout,
-            follow_redirects=True,
-        )
+        kwargs: dict[str, Any] = {
+            "headers": {"User-Agent": user_agent, "Accept": "application/json"},
+            "timeout": timeout,
+            "follow_redirects": True,
+        }
+        if transport is not None:
+            kwargs["transport"] = transport
+        self._client = httpx.AsyncClient(**kwargs)

    async def aclose(self) -> None:
        await self._client.aclose()
@ -49,6 +128,25 @@ class ArchiveClient:
    async def __aexit__(self, *exc: object) -> None:
        await self.aclose()

+    # ---------- internal: error-surfacing fetch ----------
+
+    async def _fetch_json(self, url: str, params: Any = None) -> Any:
+        """GET + JSON decode with archive.org-friendly error messages.
+
+        Wraps raise_for_status so that 4xx/5xx responses include a body preview
+        in the exception — invaluable for an LLM trying to fix a bad query.
+        """
+        r = await self._client.get(url, params=params)
+        if r.is_error:
+            body = r.text[:500] if r.content else ""
+            raise ArchiveError(f"HTTP {r.status_code} from {r.url}: {body}".strip())
+        try:
+            return r.json()
+        except ValueError as e:
+            raise ArchiveError(
+                f"invalid JSON from {r.url}: {r.text[:200]!r}"
+            ) from e
+
    # ---------- search ----------

    async def search(
@ -71,9 +169,7 @@ class ArchiveClient:
        for s in sort or []:
            params.append(("sort[]", s))

-        r = await self._client.get(f"{self._base}/advancedsearch.php", params=params)
-        r.raise_for_status()
-        data = r.json()
+        data = await self._fetch_json(f"{self._base}/advancedsearch.php", params=params)
        resp = data.get("response", {})
        return {
            "num_found": resp.get("numFound", 0),
@ -103,39 +199,41 @@ class ArchiveClient:
        if cursor:
            params["cursor"] = cursor

-        r = await self._client.get(f"{self._base}/services/search/v1/scrape", params=params)
-        r.raise_for_status()
-        data = r.json()
-        if "error" in data:
+        data = await self._fetch_json(f"{self._base}/services/search/v1/scrape", params=params)
+        if isinstance(data, dict) and "error" in data:
            raise ArchiveError(f"{data.get('errorType', 'ScrapeError')}: {data['error']}")
        return data  # keys: items, count, total, cursor (if more pages)

    # ---------- metadata ----------

    async def metadata(self, identifier: str) -> dict[str, Any]:
-        """Full metadata blob for an item."""
-        r = await self._client.get(f"{self._base}/metadata/{identifier}")
-        r.raise_for_status()
-        data = r.json()
+        """Full metadata blob for an item. Empty {} from archive.org → not found."""
+        validate_identifier(identifier)
+        data = await self._fetch_json(f"{self._base}/metadata/{identifier}")
        if not data:
-            raise ArchiveError(f"item not found: {identifier}")
+            raise ArchiveError(f"item not found or unavailable: {identifier}")
        return data

    async def files(self, identifier: str) -> list[dict[str, Any]]:
        """Just the files[] slice — smaller payload when that's all you want."""
-        r = await self._client.get(f"{self._base}/metadata/{identifier}/files")
-        r.raise_for_status()
-        data = r.json()
-        if isinstance(data, dict) and "result" in data:
+        validate_identifier(identifier)
+        data = await self._fetch_json(f"{self._base}/metadata/{identifier}/files")
+        if isinstance(data, dict):
+            if "error" in data:
+                raise ArchiveError(f"archive.org error for {identifier}: {data['error']}")
+            if "result" in data:
                return data["result"]
        if isinstance(data, list):
            return data
-        raise ArchiveError(f"unexpected files response for {identifier}")
+        raise ArchiveError(f"unexpected files response shape for {identifier}: {type(data).__name__}")

    # ---------- download ----------

    def download_url(self, identifier: str, filename: str) -> str:
-        return f"{self._base}/download/{identifier}/{filename}"
+        """Build the canonical download URL. Filename is URL-encoded but '/' preserved."""
+        validate_identifier(identifier)
+        validate_filename(filename)
+        return f"{self._base}/download/{identifier}/{quote(filename, safe='/')}"

    async def stream_file(
        self,
@ -143,13 +241,40 @@ class ArchiveClient:
        filename: str,
        resume_from: int = 0,
    ) -> AsyncIterator[bytes]:
-        """Async byte iterator — caller is responsible for writing to disk."""
+        """Async byte iterator. If resume_from > 0, requires a 206 response.
+
+        Raises ArchiveError BEFORE yielding any bytes if:
+          - the server returns a 4xx/5xx
+          - resume was requested but the server returned 200 (Range ignored)
+          - the Content-Range start byte doesn't match resume_from
+        """
+        validate_identifier(identifier)
+        validate_filename(filename)
+
        headers = {}
        if resume_from > 0:
            headers["Range"] = f"bytes={resume_from}-"
        url = self.download_url(identifier, filename)
+
        async with self._client.stream("GET", url, headers=headers) as r:
-            r.raise_for_status()
+            if r.is_error:
+                body = (await r.aread())[:500].decode("utf-8", errors="replace")
+                raise ArchiveError(f"HTTP {r.status_code} from {r.url}: {body}".strip())
+            if resume_from > 0:
+                if r.status_code != 206:
+                    raise ArchiveError(
+                        f"server ignored Range request (got HTTP {r.status_code}); "
+                        f"local file may be stale — retry download_file with overwrite=True"
+                    )
+                # Verify the byte range starts where we expect. archive.org's CDN
+                # is normally well-behaved here, but trust-but-verify.
+                cr = r.headers.get("Content-Range", "")
+                m = re.match(r"bytes\s+(\d+)-", cr)
+                if m and int(m.group(1)) != resume_from:
+                    raise ArchiveError(
+                        f"Content-Range start {m.group(1)} != resume_from {resume_from}; "
+                        f"refusing to corrupt {filename}"
+                    )
            async for chunk in r.aiter_bytes(chunk_size=1 << 16):
                yield chunk

@ -161,20 +286,34 @@ class ArchiveClient:
        verify_md5: str | None = None,
        chunk_cb=None,
    ) -> dict[str, Any]:
-        """Download with resume support. Returns stats + md5 verification result."""
+        """Download with resume support. Returns stats + md5 verification result.
+
+        Caller is responsible for ensuring `dest` is confined to a safe directory
+        (use mcarchive_org.server's path validation, or do your own). This method
+        adds defense-in-depth via O_NOFOLLOW but does not validate path confinement.
+        """
+        validate_identifier(identifier)
+        validate_filename(filename)
+
        dest.parent.mkdir(parents=True, exist_ok=True)
-        resume_from = dest.stat().st_size if dest.exists() else 0
+        # If parent is a symlink to elsewhere, refuse — our caller's confinement
+        # check should already have caught this, but redundant defense is cheap.
+        if dest.parent.is_symlink():
+            raise ArchiveError(f"refusing to write into symlinked directory: {dest.parent}")
+
+        resume_from = dest.stat().st_size if dest.exists() and not dest.is_symlink() else 0
+        # is_symlink() detection: if dest is a symlink, treat as fresh (open with
+        # O_NOFOLLOW will refuse anyway, so we won't actually corrupt the target).

        hasher = hashlib.md5() if verify_md5 else None
        if hasher and resume_from:
-            # re-hash existing bytes so the final digest is correct
            with dest.open("rb") as f:
                while chunk := f.read(1 << 16):
                    hasher.update(chunk)

        bytes_written = resume_from
-        mode = "ab" if resume_from else "wb"
-        with dest.open(mode) as f:
+        f = _safe_open_for_write(dest, append=resume_from > 0)
+        try:
            async for chunk in self.stream_file(identifier, filename, resume_from=resume_from):
                f.write(chunk)
                bytes_written += len(chunk)
@ -182,10 +321,12 @@ class ArchiveClient:
                    hasher.update(chunk)
                if chunk_cb:
                    chunk_cb(bytes_written)
+        finally:
+            f.close()

-        result = {
+        result: dict[str, Any] = {
            "path": str(dest),
-            "bytes": bytes_written,
+            "bytes_written": bytes_written,
            "resumed_from": resume_from,
        }
        if verify_md5 and hasher:
--- a/src/mcarchive_org/server.py
+++ b/src/mcarchive_org/server.py
@ -6,15 +6,24 @@ import fnmatch
 import os
 from pathlib import Path
 from typing import Annotated, Any
+from urllib.parse import quote

 from fastmcp import FastMCP
 from pydantic import Field

 from mcarchive_org import __version__
-from mcarchive_org.client import ArchiveClient
+from mcarchive_org.client import (
+    ArchiveClient,
+    ArchiveError,
+    validate_filename,
+    validate_identifier,
+)

-DEFAULT_DOWNLOAD_ROOT = Path(
-    os.environ.get("MCARCHIVE_DOWNLOAD_ROOT", Path.cwd() / "downloads")
+
+def _resolve_download_root() -> Path:
+    """Resolve the download root lazily so env-var changes after import are honored."""
+    return Path(
+        os.environ.get("MCARCHIVE_DOWNLOAD_ROOT", str(Path.cwd() / "downloads"))
    ).expanduser()

 mcp = FastMCP(
@ -42,18 +51,29 @@ def _human_size(n: int | str | None) -> str:
    return f"{x:.1f} PB"


+def _parse_size(raw: Any) -> int | None:
+    """Best-effort int parse of archive.org size fields. None if unparseable."""
+    if raw is None:
+        return None
+    try:
+        return int(str(raw).strip())
+    except (TypeError, ValueError):
+        return None
+
+
 def _enrich_file(identifier: str, f: dict[str, Any]) -> dict[str, Any]:
    name = f.get("name", "")
+    size = _parse_size(f.get("size"))
    return {
        "name": name,
        "format": f.get("format"),
-        "size": int(f["size"]) if f.get("size") and str(f["size"]).isdigit() else None,
-        "size_human": _human_size(f.get("size")),
+        "size": size,
+        "size_human": _human_size(size if size is not None else f.get("size")),
        "md5": f.get("md5"),
        "sha1": f.get("sha1"),
        "mtime": f.get("mtime"),
        "source": f.get("source"),
-        "download_url": f"https://archive.org/download/{identifier}/{name}",
+        "download_url": f"https://archive.org/download/{identifier}/{quote(name, safe='/')}",
    }


@ -63,6 +83,36 @@ def _matches(name: str, format_: str | None, name_glob: str | None, formats: lis
    return not (formats and (format_ or "").lower() not in {f.lower() for f in formats})


+def _confine_dest(identifier: str, filename: str, dest_dir: str | None) -> Path:
+    """Construct + verify the download destination path.
+
+    Validates inputs, resolves the path, and asserts it lives inside the allowed
+    download root. Raises ValueError on any escape attempt — never returns an
+    unsafe path.
+    """
+    validate_identifier(identifier)
+    validate_filename(filename)
+
+    download_root = _resolve_download_root().resolve()
+    if dest_dir:
+        target_dir = Path(dest_dir).expanduser().resolve()
+    else:
+        target_dir = (download_root / identifier).resolve()
+
+    target_dir.mkdir(parents=True, exist_ok=True)
+    if target_dir.is_symlink():
+        raise ValueError(f"download directory must not be a symlink: {target_dir}")
+
+    # Build dest, then resolve and confirm containment. Path.resolve() collapses
+    # '..' even on non-existent leaves, so escape attempts surface here.
+    dest = (target_dir / filename).resolve()
+    if not dest.is_relative_to(target_dir):
+        raise ValueError(
+            f"refusing destination outside target dir: {dest} not under {target_dir}"
+        )
+    return dest
+
+
 # ---------- tools ----------


@ -132,6 +182,13 @@ async def get_item_metadata(

    By default omits the (potentially huge) files[] array — call list_files for that.
    """
+    return await _fetch_item_metadata(identifier, include_files=include_files)
+
+
+async def _fetch_item_metadata(identifier: str, include_files: bool) -> dict[str, Any]:
+    """Shared metadata-fetching logic. Used by both the tool and the MCP resource
+    so neither has to depend on the other's `.fn` attribute."""
+    validate_identifier(identifier)
    async with ArchiveClient() as c:
        data = await c.metadata(identifier)

@ -177,6 +234,7 @@ async def list_files(

    Each entry includes a ready-to-use `download_url`.
    """
+    validate_identifier(identifier)
    async with ArchiveClient() as c:
        files = await c.files(identifier)

@ -199,8 +257,10 @@ def get_file_url(
    filename: Annotated[str, Field(description="Exact filename as shown in list_files.")],
 ) -> dict[str, str]:
    """Build the canonical download URL for a file without fetching anything."""
+    validate_identifier(identifier)
+    validate_filename(filename)
    return {
-        "url": f"https://archive.org/download/{identifier}/{filename}",
+        "url": f"https://archive.org/download/{identifier}/{quote(filename, safe='/')}",
        "item_url": f"https://archive.org/details/{identifier}",
    }

@ -222,18 +282,29 @@ async def download_file(
        Field(description="If false and file exists, resume the download (Range request)."),
    ] = False,
 ) -> dict[str, Any]:
-    """Download a file to disk. Supports resume via HTTP Range when overwrite=false."""
-    target_dir = Path(dest_dir).expanduser() if dest_dir else (DEFAULT_DOWNLOAD_ROOT / identifier)
-    dest = target_dir / filename
-    if overwrite and dest.exists():
+    """Download a file to disk. Supports resume via HTTP Range when overwrite=false.
+
+    The destination is path-confined to either `dest_dir` (when given) or
+    $MCARCHIVE_DOWNLOAD_ROOT/{identifier}. Filenames containing '..', absolute
+    paths, or NUL bytes are rejected before any FS or network I/O.
+    """
+    dest = _confine_dest(identifier, filename, dest_dir)
+    if overwrite and dest.exists() and not dest.is_symlink():
+        dest.unlink()
+    elif overwrite and dest.is_symlink():
+        # Don't follow the symlink to delete the target; remove the link itself.
        dest.unlink()

+    try:
        async with ArchiveClient() as c:
            result = await c.download_to_file(identifier, filename, dest, verify_md5=verify_md5)
+    except ArchiveError as e:
+        # Re-raise with the destination context so the caller can act on it.
+        raise ArchiveError(f"{e} (dest={dest})") from e

    result["identifier"] = identifier
    result["filename"] = filename
-    result["size_human"] = _human_size(result.get("bytes"))
+    result["size_human"] = _human_size(result.get("bytes_written"))
    return result


@ -243,7 +314,7 @@ async def download_file(
@mcp.resource("archive://item/{identifier}")
 async def item_resource(identifier: str) -> dict[str, Any]:
    """Expose item metadata as a readable MCP resource."""
-    return await get_item_metadata.fn(identifier=identifier, include_files=False)  # type: ignore[attr-defined]
+    return await _fetch_item_metadata(identifier, include_files=False)


 # ---------- entry point ----------
--- a/tests/test_client.py
+++ b/tests/test_client.py
@ -12,7 +12,7 @@ import pytest

 from mcarchive_org.client import ArchiveClient

-pytestmark = [pytest.mark.asyncio, pytest.mark.network]
+pytestmark = pytest.mark.network


 async def test_search_nasa_item():
@ -41,7 +41,7 @@ async def test_download_small_file(tmp_path: Path):
        result = await c.download_to_file(
            "nasa", small["name"], dest, verify_md5=small.get("md5")
        )
-    assert result["bytes"] > 0
+    assert result["bytes_written"] > 0
    if small.get("md5"):
        assert result["md5_ok"] is True

--- a/tests/test_client_mocked.py
+++ b/tests/test_client_mocked.py
@ -0,0 +1,242 @@
+"""Failure-mode regression tests using httpx.MockTransport (no network).
+
+Each test pins down one of the Hamilton review findings (C1/C2/C3/H4 etc.) so
+future refactors can't silently regress safety.
+"""
+
+from __future__ import annotations
+
+import hashlib
+
+import httpx
+import pytest
+
+from mcarchive_org.client import (
+    ArchiveClient,
+    ArchiveError,
+    validate_filename,
+    validate_identifier,
+)
+from mcarchive_org.server import _confine_dest
+
+
+def _client_with(handler) -> ArchiveClient:
+    """Build an ArchiveClient backed by a MockTransport handler."""
+    return ArchiveClient(transport=httpx.MockTransport(handler))
+
+
+# ---------- C1: identifier + filename validation ----------
+
+
+@pytest.mark.parametrize("bad", ["", "../etc", "foo/bar", "has space", "a" * 200])
+def test_invalid_identifier_rejected(bad):
+    with pytest.raises(ValueError, match=r"invalid archive\.org identifier"):
+        validate_identifier(bad)
+
+
+@pytest.mark.parametrize(
+    "bad",
+    [
+        "../escape.txt",
+        "/etc/passwd",
+        "C:\\windows.txt",
+        "with\x00null.bin",
+        "foo/../bar.mp3",
+        "foo\\..\\bar.mp3",
+        "",
+    ],
+)
+def test_invalid_filename_rejected(bad):
+    with pytest.raises(ValueError):
+        validate_filename(bad)
+
+
+@pytest.mark.parametrize(
+    "ok",
+    ["song.mp3", "cover/back.jpg", "subdir/file with space.txt", "a.b.c.d"],
+)
+def test_legitimate_filenames_accepted(ok):
+    assert validate_filename(ok) == ok
+
+
+def test_confine_dest_blocks_traversal(tmp_path, monkeypatch):
+    monkeypatch.setenv("MCARCHIVE_DOWNLOAD_ROOT", str(tmp_path))
+    # validate_filename catches '..' before _confine_dest's path-resolution check,
+    # so this raises ValueError from the validator — both layers in agreement.
+    with pytest.raises(ValueError):
+        _confine_dest("nasa", "../escape.txt", dest_dir=None)
+
+
+def test_confine_dest_legit_filename_lands_in_root(tmp_path, monkeypatch):
+    monkeypatch.setenv("MCARCHIVE_DOWNLOAD_ROOT", str(tmp_path))
+    dest = _confine_dest("nasa", "globe.jpg", dest_dir=None)
+    assert dest.is_relative_to(tmp_path)
+    assert dest.name == "globe.jpg"
+
+
+# ---------- C2: symlink refusal ----------
+
+
+async def test_download_refuses_symlink_at_dest(tmp_path):
+    target = tmp_path / "real.bin"
+    target.write_bytes(b"original-content")
+
+    link = tmp_path / "evil.bin"
+    link.symlink_to(target)
+
+    def handler(req: httpx.Request) -> httpx.Response:
+        return httpx.Response(200, content=b"new-content-that-should-not-overwrite")
+
+    async with _client_with(handler) as c:
+        with pytest.raises(ArchiveError, match="symlink"):
+            await c.download_to_file("nasa", "evil.bin", link)
+
+    # Symlink target must be unchanged.
+    assert target.read_bytes() == b"original-content"
+
+
+# ---------- C3: Range-ignored detection ----------
+
+
+async def test_resume_with_200_response_raises_before_writing(tmp_path):
+    """If the server returns 200 instead of 206 on a Range request, we must not
+    append to the existing file — that path corrupts data silently."""
+    dest = tmp_path / "partial.bin"
+    dest.write_bytes(b"X" * 100)  # pretend we have a partial download
+
+    def handler(req: httpx.Request) -> httpx.Response:
+        # Server ignores Range header and returns the full body with 200
+        assert req.headers.get("Range") == "bytes=100-"
+        return httpx.Response(200, content=b"FULL_FILE_BODY")
+
+    async with _client_with(handler) as c:
+        with pytest.raises(ArchiveError, match="ignored Range"):
+            await c.download_to_file("nasa", "partial.bin", dest)
+
+    # File must be unchanged — corruption avoided.
+    assert dest.read_bytes() == b"X" * 100
+
+
+async def test_resume_with_correct_206_succeeds(tmp_path):
+    full_body = b"0123456789ABCDEF" * 16  # 256 bytes
+    dest = tmp_path / "resume.bin"
+    dest.write_bytes(full_body[:64])  # we already have first 64 bytes
+
+    def handler(req: httpx.Request) -> httpx.Response:
+        assert req.headers.get("Range") == "bytes=64-"
+        return httpx.Response(
+            206,
+            content=full_body[64:],
+            headers={"Content-Range": f"bytes 64-{len(full_body)-1}/{len(full_body)}"},
+        )
+
+    expected_md5 = hashlib.md5(full_body).hexdigest()
+    async with _client_with(handler) as c:
+        result = await c.download_to_file(
+            "nasa", "resume.bin", dest, verify_md5=expected_md5
+        )
+
+    assert result["bytes_written"] == len(full_body)
+    assert result["resumed_from"] == 64
+    assert result["md5_ok"] is True
+    assert dest.read_bytes() == full_body
+
+
+async def test_resume_with_wrong_content_range_start_raises(tmp_path):
+    dest = tmp_path / "off.bin"
+    dest.write_bytes(b"X" * 100)
+
+    def handler(req: httpx.Request) -> httpx.Response:
+        # Server returns 206 but with WRONG starting offset
+        return httpx.Response(
+            206,
+            content=b"junk",
+            headers={"Content-Range": "bytes 50-99/100"},
+        )
+
+    async with _client_with(handler) as c:
+        with pytest.raises(ArchiveError, match="Content-Range start"):
+            await c.download_to_file("nasa", "off.bin", dest)
+
+    assert dest.read_bytes() == b"X" * 100  # unchanged
+
+
+# ---------- H4: error body surfacing ----------
+
+
+async def test_search_400_includes_response_body():
+    def handler(req: httpx.Request) -> httpx.Response:
+        return httpx.Response(400, text='{"error":"bad query syntax"}')
+
+    async with _client_with(handler) as c:
+        with pytest.raises(ArchiveError, match="bad query syntax"):
+            await c.search(query="INVALID:::")
+
+
+async def test_metadata_404_includes_status():
+    def handler(req: httpx.Request) -> httpx.Response:
+        return httpx.Response(404, text="not found")
+
+    async with _client_with(handler) as c:
+        with pytest.raises(ArchiveError, match="HTTP 404"):
+            await c.metadata("nasa")
+
+
+async def test_metadata_empty_dict_means_not_found():
+    def handler(req: httpx.Request) -> httpx.Response:
+        return httpx.Response(200, json={})
+
+    async with _client_with(handler) as c:
+        with pytest.raises(ArchiveError, match="not found or unavailable"):
+            await c.metadata("nasa")
+
+
+async def test_files_returns_error_payload_as_archive_error():
+    def handler(req: httpx.Request) -> httpx.Response:
+        return httpx.Response(200, json={"error": "item is dark"})
+
+    async with _client_with(handler) as c:
+        with pytest.raises(ArchiveError, match="item is dark"):
+            await c.files("nasa")
+
+
+async def test_scrape_error_payload_surfaced():
+    def handler(req: httpx.Request) -> httpx.Response:
+        return httpx.Response(
+            200, json={"error": "count too small", "errorType": "RangeException"}
+        )
+
+    async with _client_with(handler) as c:
+        with pytest.raises(ArchiveError, match=r"RangeException.*count too small"):
+            await c.scrape(query="identifier:nasa", count=100)
+
+
+async def test_invalid_json_response_surfaced():
+    def handler(req: httpx.Request) -> httpx.Response:
+        return httpx.Response(200, text="<html>not json</html>")
+
+    async with _client_with(handler) as c:
+        with pytest.raises(ArchiveError, match="invalid JSON"):
+            await c.metadata("nasa")
+
+
+# ---------- happy path ----------
+
+
+async def test_fresh_download_writes_full_body(tmp_path):
+    body = b"hello world" * 100
+    dest = tmp_path / "new.bin"
+
+    def handler(req: httpx.Request) -> httpx.Response:
+        assert "Range" not in req.headers
+        return httpx.Response(200, content=body)
+
+    async with _client_with(handler) as c:
+        result = await c.download_to_file(
+            "nasa", "new.bin", dest, verify_md5=hashlib.md5(body).hexdigest()
+        )
+
+    assert result["bytes_written"] == len(body)
+    assert result["resumed_from"] == 0
+    assert result["md5_ok"] is True
+    assert dest.read_bytes() == body