mcarchive-org/tests/test_server_mocked.py
Ryan Malloy 25a34cd24d Add atomic .part staging + runtime download root tools
Atomic write pattern (tier-3 polish from headless test finding):
- download_to_file now writes to <dest>.part and renames to <dest> only on
  successful stream completion (os.replace is POSIX-atomic). Failed
  downloads leave only the .part file — no misleading 0-byte dest files
  in the user's downloads directory.
- Resume logic reads from <dest>.part instead of <dest>; the user's
  directory only ever contains complete files or clearly-marked .part files.
- New `already_complete` short-circuit: if dest exists and no .part, skip
  the network entirely (still re-verify MD5 if requested). The headless
  Claude test confirmed this avoids redundant CDN load.
- Symlink rejection re-added at the new code path: even though os.replace
  would only replace (not follow) a symlink at dest, predictable refusal
  beats silent symlink removal.

Runtime download root tools (for stdio MCP mode):
- get_download_root(): reports current root, source (env var vs default),
  existence, writability.
- set_download_root(path): change MCARCHIVE_DOWNLOAD_ROOT mid-session.
  Expands ~, creates the dir, refuses system paths
  (/, /etc, /usr, /bin, /sbin, /var, /sys, /proc, /dev, /boot, /root).
  The lazy-resolved root means the change takes effect on the next
  download_file call without restarting the server.

14 new tests (66 total, all green, ruff clean):
- 4 staging tests: failed download leaves no dest, success leaves no .part,
  already_complete short-circuit, MD5 verification on existing files
- 6 root-tools tests: env reporting, default reporting, ~ expansion,
  system-dir refusal (parametrized), set→download takes effect immediately
- 4 existing tests rewritten to use .part as the resume staging file

Headless Claude smoke test verified end-to-end: get_download_root →
set_download_root → search → list → download → second download
short-circuits with already_complete=true and zero network bytes.
2026-04-21 21:11:56 -06:00

266 lines
8.8 KiB
Python

"""Server-layer regression tests using a swapped-in shared client.
These exercise the MCP tool functions directly and verify:
- Collection normalization (M5)
- `is_collection` derived flag (M7)
- Shared client lifecycle (H7)
- Concurrent-download serialization (M2)
"""
from __future__ import annotations
import asyncio
import os
from contextlib import asynccontextmanager
import httpx
import pytest
from mcarchive_org import client as client_mod
from mcarchive_org.client import ArchiveClient
from mcarchive_org.server import (
_enrich_doc,
_normalize_collection,
download_file,
get_download_root,
get_item_metadata,
search_items,
set_download_root,
)
@asynccontextmanager
async def swap_shared_client(handler):
"""Temporarily replace the process-wide shared client with a mock-backed one.
Tests that exercise server.py tools need this because those tools call
get_shared_client() under the hood, and we can't pass a transport in.
"""
saved = client_mod._shared_client
mock = ArchiveClient(transport=httpx.MockTransport(handler))
client_mod._shared_client = mock
try:
yield mock
finally:
client_mod._shared_client = saved
await mock.aclose()
# ---------- M5: collection normalization ----------
@pytest.mark.parametrize(
"raw,expected",
[
(None, []),
("", []),
("nasa", ["nasa"]),
(["nasa", "opensource"], ["nasa", "opensource"]),
([], []),
([None, "nasa", ""], ["nasa"]), # falsy items dropped
],
)
def test_normalize_collection_shapes(raw, expected):
assert _normalize_collection(raw) == expected
def test_enrich_doc_marks_is_collection():
assert _enrich_doc({"mediatype": "collection", "identifier": "nasa"})["is_collection"] is True
assert _enrich_doc({"mediatype": "audio", "identifier": "x"})["is_collection"] is False
assert _enrich_doc({"identifier": "x"})["is_collection"] is False
def test_enrich_doc_normalizes_collection_field():
out = _enrich_doc({"identifier": "x", "collection": "single"})
assert out["collection"] == ["single"]
# ---------- M7: is_collection in real tool flow ----------
async def test_search_items_decorates_docs_with_is_collection():
def handler(req: httpx.Request) -> httpx.Response:
return httpx.Response(
200,
json={
"response": {
"numFound": 2,
"docs": [
{"identifier": "nasa", "mediatype": "collection", "collection": "nasa"},
{"identifier": "song1", "mediatype": "audio", "collection": ["etree", "GratefulDead"]},
],
}
},
)
async with swap_shared_client(handler):
result = await search_items(query="x", rows=2)
assert len(result["docs"]) == 2
nasa, song = result["docs"]
assert nasa["is_collection"] is True
assert nasa["collection"] == ["nasa"]
assert song["is_collection"] is False
assert song["collection"] == ["etree", "GratefulDead"]
async def test_get_item_metadata_normalizes_collection():
def handler(req: httpx.Request) -> httpx.Response:
return httpx.Response(
200,
json={
"metadata": {
"identifier": "nasa",
"title": "NASA Images",
"mediatype": "collection",
"collection": "internetarchive",
},
"files_count": 0,
"item_size": 0,
},
)
async with swap_shared_client(handler):
result = await get_item_metadata(identifier="nasa")
assert result["is_collection"] is True
assert result["collection"] == ["internetarchive"]
# ---------- H7: shared client lifecycle ----------
async def test_get_shared_client_returns_same_instance():
await client_mod.close_shared_client()
a = await client_mod.get_shared_client()
b = await client_mod.get_shared_client()
assert a is b
await client_mod.close_shared_client()
async def test_close_shared_client_clears_singleton():
a = await client_mod.get_shared_client()
await client_mod.close_shared_client()
b = await client_mod.get_shared_client()
assert a is not b
await client_mod.close_shared_client()
# ---------- M2: concurrent-download serialization ----------
async def test_concurrent_downloads_same_file_are_serialized(tmp_path, monkeypatch):
"""Two parallel download_file calls for the same (id, filename) must not
interleave — otherwise they'd race on the destination file."""
monkeypatch.setenv("MCARCHIVE_DOWNLOAD_ROOT", str(tmp_path))
state = {"active": 0, "max_active": 0}
async def handler(req: httpx.Request) -> httpx.Response:
state["active"] += 1
state["max_active"] = max(state["max_active"], state["active"])
await asyncio.sleep(0.05) # hold the request long enough to overlap
state["active"] -= 1
return httpx.Response(200, content=b"file-content")
async with swap_shared_client(handler):
await asyncio.gather(
download_file(identifier="nasa", filename="shared.bin", overwrite=True),
download_file(identifier="nasa", filename="shared.bin", overwrite=True),
)
# The lock should have prevented any overlap.
assert state["max_active"] == 1
# ---------- runtime download root management ----------
def test_get_download_root_reports_env_value(tmp_path, monkeypatch):
monkeypatch.setenv("MCARCHIVE_DOWNLOAD_ROOT", str(tmp_path))
info = get_download_root()
assert info["download_root"] == str(tmp_path.resolve())
assert info["source"] == "MCARCHIVE_DOWNLOAD_ROOT env var"
assert info["raw_env_value"] == str(tmp_path)
def test_get_download_root_reports_default_when_no_env(monkeypatch):
monkeypatch.delenv("MCARCHIVE_DOWNLOAD_ROOT", raising=False)
info = get_download_root()
assert info["source"] == "default (./downloads under server CWD)"
assert info["raw_env_value"] is None
def test_set_download_root_changes_env_and_creates_dir(tmp_path, monkeypatch):
monkeypatch.delenv("MCARCHIVE_DOWNLOAD_ROOT", raising=False)
target = tmp_path / "new" / "spot"
assert not target.exists()
info = set_download_root(path=str(target))
assert info["download_root"] == str(target.resolve())
assert info["changed"] is True
assert target.exists() and target.is_dir()
assert os.environ["MCARCHIVE_DOWNLOAD_ROOT"] == str(target.resolve())
def test_set_download_root_expands_tilde(tmp_path, monkeypatch):
monkeypatch.delenv("MCARCHIVE_DOWNLOAD_ROOT", raising=False)
monkeypatch.setenv("HOME", str(tmp_path))
info = set_download_root(path="~/dl")
assert info["download_root"] == str((tmp_path / "dl").resolve())
assert (tmp_path / "dl").exists()
@pytest.mark.parametrize("forbidden", ["/etc", "/usr/local", "/var/log", "/", "/sys"])
def test_set_download_root_refuses_system_dirs(forbidden):
with pytest.raises(ValueError, match="system directory"):
set_download_root(path=forbidden)
async def test_set_download_root_takes_effect_for_next_download(tmp_path, monkeypatch):
"""The lazy-resolved root means a runtime change is honored by download_file
on the very next call without restarting."""
monkeypatch.delenv("MCARCHIVE_DOWNLOAD_ROOT", raising=False)
set_download_root(path=str(tmp_path / "first"))
def handler(req):
return httpx.Response(200, content=b"data")
async with swap_shared_client(handler):
await download_file(identifier="nasa", filename="a.bin", overwrite=True)
# Now move the root to a different directory mid-session.
set_download_root(path=str(tmp_path / "second"))
await download_file(identifier="nasa", filename="b.bin", overwrite=True)
assert (tmp_path / "first" / "nasa" / "a.bin").exists()
assert (tmp_path / "second" / "nasa" / "b.bin").exists()
# ---------- M2 (continued): cross-file parallelism ----------
async def test_concurrent_downloads_different_files_run_in_parallel(tmp_path, monkeypatch):
"""Different filenames get different locks — they should run concurrently."""
monkeypatch.setenv("MCARCHIVE_DOWNLOAD_ROOT", str(tmp_path))
state = {"active": 0, "max_active": 0}
async def handler(req: httpx.Request) -> httpx.Response:
state["active"] += 1
state["max_active"] = max(state["max_active"], state["active"])
await asyncio.sleep(0.05)
state["active"] -= 1
return httpx.Response(200, content=b"data")
async with swap_shared_client(handler):
await asyncio.gather(
download_file(identifier="nasa", filename="a.bin", overwrite=True),
download_file(identifier="nasa", filename="b.bin", overwrite=True),
)
# Different files — should overlap.
assert state["max_active"] == 2