# mcarchive-org/tests/test_client_mocked.py

"""Failure-mode regression tests using httpx.MockTransport (no network).

Each test pins down one of the Hamilton review findings (C1/C2/C3/H4 etc.) so
future refactors can't silently regress safety.
"""

from __future__ import annotations

import hashlib

import httpx
import pytest

from mcarchive_org.client import (
    ArchiveClient,
    ArchiveError,
    validate_filename,
    validate_identifier,
)
from mcarchive_org.server import _confine_dest


def _client_with(handler) -> ArchiveClient:
    """Return an ArchiveClient whose requests are answered by ``handler``.

    The handler is wrapped in an ``httpx.MockTransport`` and handed to the
    client as its transport.
    """
    mock_transport = httpx.MockTransport(handler)
    return ArchiveClient(transport=mock_transport)


# ---------- C1: identifier + filename validation ----------


@pytest.mark.parametrize(
    "bad",
    [
        "",
        "../etc",
        "foo/bar",
        "has space",
        "a" * 200,
    ],
)
def test_invalid_identifier_rejected(bad):
    """Each listed identifier must make validate_identifier raise ValueError."""
    expected_message = r"invalid archive\.org identifier"
    with pytest.raises(ValueError, match=expected_message):
        validate_identifier(bad)


@pytest.mark.parametrize(
    "bad",
    [
        "../escape.txt",  # leading parent-directory component
        "/etc/passwd",  # absolute POSIX path
        "C:\\windows.txt",  # drive letter followed by a backslash
        "with\x00null.bin",  # embedded NUL character
        "foo/../bar.mp3",  # '..' in the middle, forward slashes
        "foo\\..\\bar.mp3",  # '..' in the middle, backslashes
        "",  # empty string
    ],
)
def test_invalid_filename_rejected(bad):
    """Each listed filename must make validate_filename raise ValueError."""
    rejection = pytest.raises(ValueError)
    with rejection:
        validate_filename(bad)


@pytest.mark.parametrize(
    "ok",
    [
        "song.mp3",
        "cover/back.jpg",
        "subdir/file with space.txt",
        "a.b.c.d",
    ],
)
def test_legitimate_filenames_accepted(ok):
    """validate_filename must hand an acceptable name back unchanged."""
    validated = validate_filename(ok)
    assert validated == ok


def test_confine_dest_blocks_traversal(tmp_path, monkeypatch):
    """A '..' filename must be refused even with a download root configured."""
    download_root = str(tmp_path)
    monkeypatch.setenv("MCARCHIVE_DOWNLOAD_ROOT", download_root)

    # The ValueError is expected to originate in validate_filename, which
    # should reject '..' before _confine_dest reaches its own path-resolution
    # check (that ordering lives in the server module, not in this file).
    with pytest.raises(ValueError):
        _confine_dest("nasa", "../escape.txt", dest_dir=None)


def test_confine_dest_legit_filename_lands_in_root(tmp_path, monkeypatch):
    """A plain filename must map to a path underneath the configured root."""
    monkeypatch.setenv("MCARCHIVE_DOWNLOAD_ROOT", str(tmp_path))

    confined = _confine_dest("nasa", "globe.jpg", dest_dir=None)

    assert confined.is_relative_to(tmp_path)
    assert confined.name == "globe.jpg"


# ---------- C2: symlink refusal ----------


async def test_download_refuses_symlink_at_dest(tmp_path):
    """download_to_file must raise instead of writing through a symlink."""
    original_bytes = b"original-content"
    real_file = tmp_path / "real.bin"
    real_file.write_bytes(original_bytes)

    # The download destination is a symlink pointing at the real file.
    symlink_path = tmp_path / "evil.bin"
    symlink_path.symlink_to(real_file)

    def respond(request: httpx.Request) -> httpx.Response:
        return httpx.Response(200, content=b"new-content-that-should-not-overwrite")

    async with _client_with(respond) as client:
        with pytest.raises(ArchiveError, match="symlink"):
            await client.download_to_file("nasa", "evil.bin", symlink_path)

    # The file behind the link must still hold its original bytes.
    assert real_file.read_bytes() == original_bytes


# ---------- C3: Range-ignored detection ----------


async def test_resume_with_200_response_raises_before_writing(tmp_path):
    """A 200 reply to a Range request must abort the resume.

    Appending a full body to the partial file would corrupt it silently, so
    the client has to raise and leave the file untouched.
    """
    partial_bytes = b"X" * 100  # stand-in for an interrupted download
    dest = tmp_path / "partial.bin"
    dest.write_bytes(partial_bytes)

    def respond(request: httpx.Request) -> httpx.Response:
        # The client is expected to ask for everything from byte 100 onward...
        assert request.headers.get("Range") == "bytes=100-"
        # ...but this server disregards Range and sends the full body as 200.
        return httpx.Response(200, content=b"FULL_FILE_BODY")

    async with _client_with(respond) as client:
        with pytest.raises(ArchiveError, match="ignored Range"):
            await client.download_to_file("nasa", "partial.bin", dest)

    # Nothing may have been appended or overwritten.
    assert dest.read_bytes() == partial_bytes


async def test_resume_with_correct_206_succeeds(tmp_path):
    """A 206 whose Content-Range starts at the local size completes the file."""
    full_body = b"0123456789ABCDEF" * 16  # 256 bytes in total
    already_have = full_body[:64]  # bytes present on disk before resuming
    remainder = full_body[64:]  # bytes the mocked server will send
    content_range = f"bytes 64-{len(full_body)-1}/{len(full_body)}"

    dest = tmp_path / "resume.bin"
    dest.write_bytes(already_have)

    def respond(request: httpx.Request) -> httpx.Response:
        assert request.headers.get("Range") == "bytes=64-"
        return httpx.Response(
            206,
            content=remainder,
            headers={"Content-Range": content_range},
        )

    # The checksum covers the whole file, not just the resumed tail.
    whole_file_md5 = hashlib.md5(full_body).hexdigest()
    async with _client_with(respond) as client:
        result = await client.download_to_file(
            "nasa", "resume.bin", dest, verify_md5=whole_file_md5
        )

    assert result["bytes_written"] == len(full_body)
    assert result["resumed_from"] == 64
    assert result["md5_ok"] is True
    assert dest.read_bytes() == full_body


async def test_resume_with_wrong_content_range_start_raises(tmp_path):
    """A 206 whose Content-Range starts at the wrong offset must be rejected."""
    partial_bytes = b"X" * 100
    dest = tmp_path / "off.bin"
    dest.write_bytes(partial_bytes)

    def respond(request: httpx.Request) -> httpx.Response:
        # 206 is the right status, but the range begins at byte 50 although
        # 100 bytes are already on disk.
        mismatched_headers = {"Content-Range": "bytes 50-99/100"}
        return httpx.Response(206, content=b"junk", headers=mismatched_headers)

    async with _client_with(respond) as client:
        with pytest.raises(ArchiveError, match="Content-Range start"):
            await client.download_to_file("nasa", "off.bin", dest)

    # The partial file must be left exactly as it was.
    assert dest.read_bytes() == partial_bytes


# ---------- H4: error body surfacing ----------


async def test_search_400_includes_response_body():
    """Text from a 400 search response must show up in the ArchiveError."""
    error_body = '{"error":"bad query syntax"}'

    def respond(_request: httpx.Request) -> httpx.Response:
        return httpx.Response(400, text=error_body)

    async with _client_with(respond) as client:
        with pytest.raises(ArchiveError, match="bad query syntax"):
            await client.search(query="INVALID:::")


async def test_metadata_404_includes_status():
    """A 404 from the metadata call must raise an error naming the status."""

    def respond(_request: httpx.Request) -> httpx.Response:
        return httpx.Response(404, text="not found")

    async with _client_with(respond) as client:
        with pytest.raises(ArchiveError, match="HTTP 404"):
            await client.metadata("nasa")


async def test_metadata_empty_dict_means_not_found():
    """A 200 response carrying ``{}`` must be reported as a missing item."""
    empty_payload = {}

    def respond(_request: httpx.Request) -> httpx.Response:
        return httpx.Response(200, json=empty_payload)

    async with _client_with(respond) as client:
        with pytest.raises(ArchiveError, match="not found or unavailable"):
            await client.metadata("nasa")


async def test_files_returns_error_payload_as_archive_error():
    """An ``error`` field in a 200 response must become an ArchiveError."""

    def respond(_request: httpx.Request) -> httpx.Response:
        error_payload = {"error": "item is dark"}
        return httpx.Response(200, json=error_payload)

    async with _client_with(respond) as client:
        with pytest.raises(ArchiveError, match="item is dark"):
            await client.files("nasa")


async def test_scrape_error_payload_surfaced():
    """Both ``errorType`` and ``error`` must appear in the raised error."""

    def respond(_request: httpx.Request) -> httpx.Response:
        error_payload = {"error": "count too small", "errorType": "RangeException"}
        return httpx.Response(200, json=error_payload)

    # The pattern requires the error type to precede the error text.
    expected_message = r"RangeException.*count too small"
    async with _client_with(respond) as client:
        with pytest.raises(ArchiveError, match=expected_message):
            await client.scrape(query="identifier:nasa", count=100)


async def test_invalid_json_response_surfaced():
    """A 200 response with a non-JSON body must raise an ArchiveError."""
    html_body = "<html>not json</html>"

    def respond(_request: httpx.Request) -> httpx.Response:
        return httpx.Response(200, text=html_body)

    async with _client_with(respond) as client:
        with pytest.raises(ArchiveError, match="invalid JSON"):
            await client.metadata("nasa")


# ---------- happy path ----------


async def test_fresh_download_writes_full_body(tmp_path):
    """With no existing file, the whole body is fetched, written and verified."""
    body = b"hello world" * 100
    body_md5 = hashlib.md5(body).hexdigest()
    dest = tmp_path / "new.bin"  # deliberately not created beforehand

    def respond(request: httpx.Request) -> httpx.Response:
        # A fresh download must not send a Range header.
        assert "Range" not in request.headers
        return httpx.Response(200, content=body)

    async with _client_with(respond) as client:
        result = await client.download_to_file(
            "nasa", "new.bin", dest, verify_md5=body_md5
        )

    assert result["bytes_written"] == len(body)
    assert result["resumed_from"] == 0
    assert result["md5_ok"] is True
    assert dest.read_bytes() == body