mcaxl/tests/test_client_recovery.py
Ryan Malloy 39d4b29392 Add RisPort70 for real-time registration state + rate-limit backoff
Two ideas borrowed from cisco-cucm-mcp (calltelemetry/cisco-cucm-mcp,
MIT licensed): real-time device registration via RisPort70, and
exponential-backoff retry on transient HTTP 5xx errors. Both are
purpose-built for the audit use case rather than general-purpose
ports — RisPort tools exist to inform audit findings, not as a
standalone "look at my devices" interface.

Rate limit / 503 backoff (~30 lines + 3 tests):
  AxlClient now mounts an HTTPAdapter with a urllib3 Retry policy
  (3 retries, exponential backoff, status_forcelist=[502,503,504]).
  Configurable via AXL_RATE_LIMIT_RETRIES (default 3, 0 disables).
  Surfaces in connection_status() so operators can see the policy.
  Closes a real reliability gap: CUCM SOAP rate-limits under load
  during change windows or with multiple concurrent admins; pre-fix
  any 503 was a hard failure.

RisPort70 (new src/risport.py + 2 tools + prompt update):
  Hand-coded SOAP client for /realtimeservice2/services/RISService70
  (avoids dragging in another zeep instance for one operation).
  Reuses AXL_URL/USER/PASS env vars — RisPort lives on the same host.

  New tools:
    device_registration_status(device_class, status, name_filter, page_size)
    device_registration_summary()  — cluster-wide breakdown by class

  Live-cluster verification (cucm-pub.binghammemorial.org):
    Phone:    803  registered=679  unregistered=123  rejected=1
    Gateway:   85  registered=41   rejected=44   ← real audit finding
    SIPTrunk:  22  registered=18   unregistered=4
    HuntList:  28  registered=28
    H323/CTI:  0   (cluster doesn't use these)

  Discovered while live-verifying: CUCM 15 wraps the RisPort response
  in an extra <SelectCmDeviceResult> element inside <selectCmDeviceReturn>.
  Older CUCM versions exposed the fields directly. The parser falls
  back to either shape; tests cover both (test_legacy_response_shape_still_parses
  asserts the older shape still works).

phone_inventory_report prompt updated:
  New Step 3 — "Cross-reference with real-time registration" — recommends
  device_registration_summary() + device_registration_status(status="UnRegistered")
  to surface configured-but-never-registered phones (strongest orphan signal),
  PartiallyRegistered phones (firewall/cert/version mismatch indicator),
  and registration-state vs config-state mismatches.

Tooling delta worth noting:
  AXL device count:    1,377 phones
  RisPort device count:   803 phones
  Delta (~574)         likely templates, hidden phones, or stale config —
                       itself an audit finding the new tool will surface
                       to anyone running phone_inventory_report.

README updated:
  - Added health(), device_registration_status, device_registration_summary
  - Added "Scope and complement" section recommending @calltelemetry/cisco-cucm-mcp
    alongside for operational debugging (logs, perfmon, packet capture,
    service control). The two servers answer different questions; the LLM
    with both can compose audit findings with operational state.
  - Listed all 10 prompts (was 4 outdated entries).

Tests: 134 → 155 (+21).
2026-04-26 10:28:04 -06:00

156 lines
6.2 KiB
Python

"""Hamilton review MAJOR #5: connection recovery and config-vs-operational errors.
Pre-fix: any connection failure set `_connection_error` and pinned it forever.
A transient network blip required restarting the MCP server. Fix: distinguish
*configuration* errors (missing env, bad WSDL) which are pinned, from
*operational* errors (network, TLS, session timeout) which can be retried
on the next call.
"""
from pathlib import Path
import pytest
from mcp_cucm_axl.cache import AxlCache
from mcp_cucm_axl.client import AxlClient
@pytest.fixture
def cache(tmp_path: Path) -> AxlCache:
    """Build an AxlCache stored at ``test.sqlite`` inside pytest's tmp dir.

    Each test receives its own ``tmp_path``, so the cache file is never
    shared between tests.
    """
    db_file = tmp_path / "test.sqlite"
    return AxlCache(db_file, default_ttl=60, cluster_id="test")
def test_config_error_is_pinned(cache: AxlCache, monkeypatch):
    """A missing AXL_URL is a configuration error and must stay pinned.

    Retrying cannot fix absent configuration, so every connection attempt
    is expected to raise the same RuntimeError mentioning ``AXL_URL``.
    """
    # Strip the connection settings so the client has nothing to work with.
    for var in ("AXL_URL", "AXL_USER", "AXL_PASS"):
        monkeypatch.delenv(var, raising=False)

    axl = AxlClient(cache)

    # First attempt surfaces the config error; the second proves it is
    # still reported identically (i.e. pinned) on a later call.
    for _attempt in range(2):
        with pytest.raises(RuntimeError, match="AXL_URL"):
            axl._ensure_connected()
def test_operational_error_is_not_pinned(cache: AxlCache, monkeypatch):
    """An operational failure must not be recorded as a pinned config error.

    The zeep ``Client`` constructor is replaced with a stub that raises
    ``ConnectionError``. The resulting failure must leave ``_config_error``
    unset and be recorded only in ``_last_error``.
    """
    env = {
        "AXL_URL": "https://test.invalid:8443/axl",
        "AXL_USER": "test",
        "AXL_PASS": "test",
        "AXL_VERIFY_TLS": "false",
    }
    for name, value in env.items():
        monkeypatch.setenv(name, value)

    # Swap out the Client symbol that mcp_cucm_axl.client resolves, so that
    # constructing it inside _ensure_connected blows up the way a WSDL fetch
    # failure or TLS handshake error would.
    from mcp_cucm_axl import client as axl_client_module

    def _raise_transient(*_args, **_kwargs):
        raise ConnectionError("simulated transient network failure")

    monkeypatch.setattr(axl_client_module, "Client", _raise_transient)

    axl = AxlClient(cache)
    with pytest.raises(RuntimeError, match="simulated transient"):
        axl._ensure_connected()

    # Hamilton review MAJOR #5: _config_error is the permanent pin and is
    # reserved for configuration mistakes; a failed connection attempt is
    # operational and must leave it untouched.
    assert axl._config_error is None, (
        "operational errors must not set _config_error (the pin); "
        "only configuration errors (missing env vars, bad WSDL) should pin"
    )

    # The failure is still remembered, purely for diagnostics.
    recorded = axl._last_error
    assert recorded is not None, (
        "_last_error should record the operational failure for diagnostics"
    )
    assert "simulated transient" in recorded
def test_health_diagnostic_includes_connection_state(cache: AxlCache):
    """``connection_status()`` reports state before any connection attempt.

    A freshly constructed client must expose a ``connected`` flag (False,
    since nothing has been attempted yet) and a ``last_error`` entry, so an
    operator can inspect the client without reading stderr.
    """
    status = AxlClient(cache).connection_status()

    assert "connected" in status
    # No connection attempt has been made yet.
    assert status["connected"] is False
    assert "last_error" in status
# ---- Rate limit / 503 retry --------------------------------------------------
# Inspired by cisco-cucm-mcp's exponential-backoff approach. CUCM's SOAP
# layer returns 503 under load (concurrent AXL admins, change window). Without
# retries, we'd fail loudly; with them, transient rate limiting becomes
# invisible to the caller.
def test_retry_config_default_three_retries(cache: AxlCache, monkeypatch):
    """By default, the session is configured for 3 retries with backoff.

    The reported retry policy must show ``max_retries == 3`` and include
    502, 503 and 504 in ``status_forcelist``.
    """
    monkeypatch.setenv("AXL_URL", "https://example.invalid:8443/axl")
    monkeypatch.setenv("AXL_USER", "test")
    monkeypatch.setenv("AXL_PASS", "test")
    monkeypatch.setenv("AXL_VERIFY_TLS", "false")
    # This test pins the *default*: make sure an AXL_RATE_LIMIT_RETRIES value
    # leaking in from the developer's shell or CI environment cannot override
    # it and cause a spurious failure.
    monkeypatch.delenv("AXL_RATE_LIMIT_RETRIES", raising=False)

    # Stub Client construction so we exercise only the session/retry setup.
    from mcp_cucm_axl import client as client_mod

    def stub_client(*args, **kwargs):
        # Raise to short-circuit before service creation.
        raise ConnectionError("stub: don't actually connect")

    monkeypatch.setattr(client_mod, "Client", stub_client)

    client = AxlClient(cache)
    with pytest.raises(RuntimeError):
        client._ensure_connected()

    info = client.connection_status()
    assert info["retry_config"] is not None
    assert info["retry_config"]["max_retries"] == 3
    assert 503 in info["retry_config"]["status_forcelist"]
    assert 502 in info["retry_config"]["status_forcelist"]
    assert 504 in info["retry_config"]["status_forcelist"]
def test_retry_config_overridable_via_env(cache: AxlCache, monkeypatch):
    """``AXL_RATE_LIMIT_RETRIES`` overrides the retry count.

    With the variable set to ``"7"``, the reported retry policy must show
    ``max_retries == 7``.
    """
    settings = {
        "AXL_URL": "https://example.invalid:8443/axl",
        "AXL_USER": "test",
        "AXL_PASS": "test",
        "AXL_RATE_LIMIT_RETRIES": "7",
    }
    for key, val in settings.items():
        monkeypatch.setenv(key, val)

    from mcp_cucm_axl import client as axl_client_module

    def _refuse_to_connect(*_args, **_kwargs):
        # Fail on construction so no real connection is attempted.
        raise ConnectionError("stub")

    monkeypatch.setattr(axl_client_module, "Client", _refuse_to_connect)

    axl = AxlClient(cache)
    with pytest.raises(RuntimeError):
        axl._ensure_connected()

    retry_policy = axl.connection_status()["retry_config"]
    assert retry_policy["max_retries"] == 7
def test_retry_config_zero_disables(cache: AxlCache, monkeypatch):
    """``AXL_RATE_LIMIT_RETRIES=0`` is reported as zero retries.

    Intended for test environments or operators who want raw failures
    rather than automatic retries.
    """
    settings = {
        "AXL_URL": "https://example.invalid:8443/axl",
        "AXL_USER": "test",
        "AXL_PASS": "test",
        "AXL_RATE_LIMIT_RETRIES": "0",
    }
    for key, val in settings.items():
        monkeypatch.setenv(key, val)

    from mcp_cucm_axl import client as axl_client_module

    def _refuse_to_connect(*_args, **_kwargs):
        # Fail on construction so no real connection is attempted.
        raise ConnectionError("stub")

    monkeypatch.setattr(axl_client_module, "Client", _refuse_to_connect)

    axl = AxlClient(cache)
    with pytest.raises(RuntimeError):
        axl._ensure_connected()

    retry_policy = axl.connection_status()["retry_config"]
    assert retry_policy["max_retries"] == 0