"""Server-layer regression tests using a swapped-in shared client. These exercise the MCP tool functions directly and verify: - Collection normalization (M5) - `is_collection` derived flag (M7) - Shared client lifecycle (H7) - Concurrent-download serialization (M2) """ from __future__ import annotations import asyncio from contextlib import asynccontextmanager import httpx import pytest from mcarchive_org import client as client_mod from mcarchive_org.client import ArchiveClient from mcarchive_org.server import ( _enrich_doc, _normalize_collection, download_file, get_item_metadata, search_items, ) @asynccontextmanager async def swap_shared_client(handler): """Temporarily replace the process-wide shared client with a mock-backed one. Tests that exercise server.py tools need this because those tools call get_shared_client() under the hood, and we can't pass a transport in. """ saved = client_mod._shared_client mock = ArchiveClient(transport=httpx.MockTransport(handler)) client_mod._shared_client = mock try: yield mock finally: client_mod._shared_client = saved await mock.aclose() # ---------- M5: collection normalization ---------- @pytest.mark.parametrize( "raw,expected", [ (None, []), ("", []), ("nasa", ["nasa"]), (["nasa", "opensource"], ["nasa", "opensource"]), ([], []), ([None, "nasa", ""], ["nasa"]), # falsy items dropped ], ) def test_normalize_collection_shapes(raw, expected): assert _normalize_collection(raw) == expected def test_enrich_doc_marks_is_collection(): assert _enrich_doc({"mediatype": "collection", "identifier": "nasa"})["is_collection"] is True assert _enrich_doc({"mediatype": "audio", "identifier": "x"})["is_collection"] is False assert _enrich_doc({"identifier": "x"})["is_collection"] is False def test_enrich_doc_normalizes_collection_field(): out = _enrich_doc({"identifier": "x", "collection": "single"}) assert out["collection"] == ["single"] # ---------- M7: is_collection in real tool flow ---------- async def test_search_items_decorates_docs_with_is_collection(): def handler(req: httpx.Request) -> httpx.Response: return httpx.Response( 200, json={ "response": { "numFound": 2, "docs": [ {"identifier": "nasa", "mediatype": "collection", "collection": "nasa"}, {"identifier": "song1", "mediatype": "audio", "collection": ["etree", "GratefulDead"]}, ], } }, ) async with swap_shared_client(handler): result = await search_items(query="x", rows=2) assert len(result["docs"]) == 2 nasa, song = result["docs"] assert nasa["is_collection"] is True assert nasa["collection"] == ["nasa"] assert song["is_collection"] is False assert song["collection"] == ["etree", "GratefulDead"] async def test_get_item_metadata_normalizes_collection(): def handler(req: httpx.Request) -> httpx.Response: return httpx.Response( 200, json={ "metadata": { "identifier": "nasa", "title": "NASA Images", "mediatype": "collection", "collection": "internetarchive", }, "files_count": 0, "item_size": 0, }, ) async with swap_shared_client(handler): result = await get_item_metadata(identifier="nasa") assert result["is_collection"] is True assert result["collection"] == ["internetarchive"] # ---------- H7: shared client lifecycle ---------- async def test_get_shared_client_returns_same_instance(): await client_mod.close_shared_client() a = await client_mod.get_shared_client() b = await client_mod.get_shared_client() assert a is b await client_mod.close_shared_client() async def test_close_shared_client_clears_singleton(): a = await client_mod.get_shared_client() await client_mod.close_shared_client() b = await client_mod.get_shared_client() assert a is not b await client_mod.close_shared_client() # ---------- M2: concurrent-download serialization ---------- async def test_concurrent_downloads_same_file_are_serialized(tmp_path, monkeypatch): """Two parallel download_file calls for the same (id, filename) must not interleave — otherwise they'd race on the destination file.""" monkeypatch.setenv("MCARCHIVE_DOWNLOAD_ROOT", str(tmp_path)) state = {"active": 0, "max_active": 0} async def handler(req: httpx.Request) -> httpx.Response: state["active"] += 1 state["max_active"] = max(state["max_active"], state["active"]) await asyncio.sleep(0.05) # hold the request long enough to overlap state["active"] -= 1 return httpx.Response(200, content=b"file-content") async with swap_shared_client(handler): await asyncio.gather( download_file(identifier="nasa", filename="shared.bin", overwrite=True), download_file(identifier="nasa", filename="shared.bin", overwrite=True), ) # The lock should have prevented any overlap. assert state["max_active"] == 1 async def test_concurrent_downloads_different_files_run_in_parallel(tmp_path, monkeypatch): """Different filenames get different locks — they should run concurrently.""" monkeypatch.setenv("MCARCHIVE_DOWNLOAD_ROOT", str(tmp_path)) state = {"active": 0, "max_active": 0} async def handler(req: httpx.Request) -> httpx.Response: state["active"] += 1 state["max_active"] = max(state["max_active"], state["active"]) await asyncio.sleep(0.05) state["active"] -= 1 return httpx.Response(200, content=b"data") async with swap_shared_client(handler): await asyncio.gather( download_file(identifier="nasa", filename="a.bin", overwrite=True), download_file(identifier="nasa", filename="b.bin", overwrite=True), ) # Different files — should overlap. assert state["max_active"] == 2