"""Server-layer regression tests using a swapped-in shared client. These exercise the MCP tool functions directly and verify: - Collection normalization (M5) - `is_collection` derived flag (M7) - Shared client lifecycle (H7) - Concurrent-download serialization (M2) """ from __future__ import annotations import asyncio import os from contextlib import asynccontextmanager import httpx import pytest from mcarchive_org import client as client_mod from mcarchive_org.client import ArchiveClient from mcarchive_org.server import ( _enrich_doc, _normalize_collection, download_file, get_download_root, get_item_metadata, search_items, set_download_root, ) @asynccontextmanager async def swap_shared_client(handler): """Temporarily replace the process-wide shared client with a mock-backed one. Tests that exercise server.py tools need this because those tools call get_shared_client() under the hood, and we can't pass a transport in. """ saved = client_mod._shared_client mock = ArchiveClient(transport=httpx.MockTransport(handler)) client_mod._shared_client = mock try: yield mock finally: client_mod._shared_client = saved await mock.aclose() # ---------- M5: collection normalization ---------- @pytest.mark.parametrize( "raw,expected", [ (None, []), ("", []), ("nasa", ["nasa"]), (["nasa", "opensource"], ["nasa", "opensource"]), ([], []), ([None, "nasa", ""], ["nasa"]), # falsy items dropped ], ) def test_normalize_collection_shapes(raw, expected): assert _normalize_collection(raw) == expected def test_enrich_doc_marks_is_collection(): assert _enrich_doc({"mediatype": "collection", "identifier": "nasa"})["is_collection"] is True assert _enrich_doc({"mediatype": "audio", "identifier": "x"})["is_collection"] is False assert _enrich_doc({"identifier": "x"})["is_collection"] is False def test_enrich_doc_normalizes_collection_field(): out = _enrich_doc({"identifier": "x", "collection": "single"}) assert out["collection"] == ["single"] # ---------- M7: is_collection in real tool flow ---------- async def test_search_items_decorates_docs_with_is_collection(): def handler(req: httpx.Request) -> httpx.Response: return httpx.Response( 200, json={ "response": { "numFound": 2, "docs": [ {"identifier": "nasa", "mediatype": "collection", "collection": "nasa"}, {"identifier": "song1", "mediatype": "audio", "collection": ["etree", "GratefulDead"]}, ], } }, ) async with swap_shared_client(handler): result = await search_items(query="x", rows=2) assert len(result["docs"]) == 2 nasa, song = result["docs"] assert nasa["is_collection"] is True assert nasa["collection"] == ["nasa"] assert song["is_collection"] is False assert song["collection"] == ["etree", "GratefulDead"] async def test_get_item_metadata_normalizes_collection(): def handler(req: httpx.Request) -> httpx.Response: return httpx.Response( 200, json={ "metadata": { "identifier": "nasa", "title": "NASA Images", "mediatype": "collection", "collection": "internetarchive", }, "files_count": 0, "item_size": 0, }, ) async with swap_shared_client(handler): result = await get_item_metadata(identifier="nasa") assert result["is_collection"] is True assert result["collection"] == ["internetarchive"] # ---------- H7: shared client lifecycle ---------- async def test_get_shared_client_returns_same_instance(): await client_mod.close_shared_client() a = await client_mod.get_shared_client() b = await client_mod.get_shared_client() assert a is b await client_mod.close_shared_client() async def test_close_shared_client_clears_singleton(): a = await client_mod.get_shared_client() await client_mod.close_shared_client() b = await client_mod.get_shared_client() assert a is not b await client_mod.close_shared_client() # ---------- M2: concurrent-download serialization ---------- async def test_concurrent_downloads_same_file_are_serialized(tmp_path, monkeypatch): """Two parallel download_file calls for the same (id, filename) must not interleave — otherwise they'd race on the destination file.""" monkeypatch.setenv("MCARCHIVE_DOWNLOAD_ROOT", str(tmp_path)) state = {"active": 0, "max_active": 0} async def handler(req: httpx.Request) -> httpx.Response: state["active"] += 1 state["max_active"] = max(state["max_active"], state["active"]) await asyncio.sleep(0.05) # hold the request long enough to overlap state["active"] -= 1 return httpx.Response(200, content=b"file-content") async with swap_shared_client(handler): await asyncio.gather( download_file(identifier="nasa", filename="shared.bin", overwrite=True), download_file(identifier="nasa", filename="shared.bin", overwrite=True), ) # The lock should have prevented any overlap. assert state["max_active"] == 1 # ---------- runtime download root management ---------- def test_get_download_root_reports_env_value(tmp_path, monkeypatch): monkeypatch.setenv("MCARCHIVE_DOWNLOAD_ROOT", str(tmp_path)) info = get_download_root() assert info["download_root"] == str(tmp_path.resolve()) assert info["source"] == "MCARCHIVE_DOWNLOAD_ROOT env var" assert info["raw_env_value"] == str(tmp_path) def test_get_download_root_reports_default_when_no_env(monkeypatch): monkeypatch.delenv("MCARCHIVE_DOWNLOAD_ROOT", raising=False) info = get_download_root() assert info["source"] == "default (./downloads under server CWD)" assert info["raw_env_value"] is None def test_set_download_root_changes_env_and_creates_dir(tmp_path, monkeypatch): monkeypatch.delenv("MCARCHIVE_DOWNLOAD_ROOT", raising=False) target = tmp_path / "new" / "spot" assert not target.exists() info = set_download_root(path=str(target)) assert info["download_root"] == str(target.resolve()) assert info["changed"] is True assert target.exists() and target.is_dir() assert os.environ["MCARCHIVE_DOWNLOAD_ROOT"] == str(target.resolve()) def test_set_download_root_expands_tilde(tmp_path, monkeypatch): monkeypatch.delenv("MCARCHIVE_DOWNLOAD_ROOT", raising=False) monkeypatch.setenv("HOME", str(tmp_path)) info = set_download_root(path="~/dl") assert info["download_root"] == str((tmp_path / "dl").resolve()) assert (tmp_path / "dl").exists() @pytest.mark.parametrize("forbidden", ["/etc", "/usr/local", "/var/log", "/", "/sys"]) def test_set_download_root_refuses_system_dirs(forbidden): with pytest.raises(ValueError, match="system directory"): set_download_root(path=forbidden) async def test_set_download_root_takes_effect_for_next_download(tmp_path, monkeypatch): """The lazy-resolved root means a runtime change is honored by download_file on the very next call without restarting.""" monkeypatch.delenv("MCARCHIVE_DOWNLOAD_ROOT", raising=False) set_download_root(path=str(tmp_path / "first")) def handler(req): return httpx.Response(200, content=b"data") async with swap_shared_client(handler): await download_file(identifier="nasa", filename="a.bin", overwrite=True) # Now move the root to a different directory mid-session. set_download_root(path=str(tmp_path / "second")) await download_file(identifier="nasa", filename="b.bin", overwrite=True) assert (tmp_path / "first" / "nasa" / "a.bin").exists() assert (tmp_path / "second" / "nasa" / "b.bin").exists() # ---------- M2 (continued): cross-file parallelism ---------- async def test_concurrent_downloads_different_files_run_in_parallel(tmp_path, monkeypatch): """Different filenames get different locks — they should run concurrently.""" monkeypatch.setenv("MCARCHIVE_DOWNLOAD_ROOT", str(tmp_path)) state = {"active": 0, "max_active": 0} async def handler(req: httpx.Request) -> httpx.Response: state["active"] += 1 state["max_active"] = max(state["max_active"], state["active"]) await asyncio.sleep(0.05) state["active"] -= 1 return httpx.Response(200, content=b"data") async with swap_shared_client(handler): await asyncio.gather( download_file(identifier="nasa", filename="a.bin", overwrite=True), download_file(identifier="nasa", filename="b.bin", overwrite=True), ) # Different files — should overlap. assert state["max_active"] == 2