diff --git a/src/mcaxl/prompts/__init__.py b/src/mcaxl/prompts/__init__.py
index 837eec1..e5ffbe0 100644
--- a/src/mcaxl/prompts/__init__.py
+++ b/src/mcaxl/prompts/__init__.py
@@ -22,6 +22,7 @@ from . import (
     cucm_sql_help,
     dead_dn_finder,
     did_block_overlap,
+    failed_fax_investigation,
     hunt_pilot_audit,
     inbound_did_audit,
     investigate_pattern,
@@ -38,6 +39,7 @@ __all__ = [
     "cucm_sql_help",
     "dead_dn_finder",
     "did_block_overlap",
+    "failed_fax_investigation",
     "hunt_pilot_audit",
     "inbound_did_audit",
     "investigate_pattern",
diff --git a/src/mcaxl/prompts/failed_fax_investigation.py b/src/mcaxl/prompts/failed_fax_investigation.py
new file mode 100644
index 0000000..f4db1f3
--- /dev/null
+++ b/src/mcaxl/prompts/failed_fax_investigation.py
@@ -0,0 +1,311 @@
+"""Cross-server prompt: triage a failed fax call across all the layers.
+
+A fax call traverses **multiple layers**, any of which can fail
+silently from the perspective of the others:
+
+  - **CUCM dial plan** — does the DID even match a route pattern? Does
+    that pattern point to a working destination?
+  - **CUBE / SIP-trunk negotiation** — codec preference, T.38 fallback,
+    SDP exchange — these are between CUCM and the upstream provider's
+    SBC and only one side's CDR sees the failure
+  - **Far-end SBC** — the carrier may reject the call before it ever
+    reaches the destination, returning a Q.850 cause code that's
+    ambiguous from CUCM's perspective
+  - **RightFax / fax server** — call connects but the actual fax
+    handshake (T.30 / T.38) fails between the two fax endpoints
+
+Today, diagnosing this requires manually correlating CUCM RTMT, CUBE
+debug-output, RightFax delivery logs, and provider CDR. This prompt
+composes mcaxl + mcsiphon + mcdewey to produce a single triaged
+verdict naming the layer that failed, the evidence, and the
+recommended next step.
+
+Lives in mcaxl because the dial plan is the first layer to check —
+"did the call make it past CUCM at all?" determines the rest of the
+investigation.
+"""
+
+from __future__ import annotations
+
+from typing import TYPE_CHECKING
+
+from ._common import render_schema_block
+
+if TYPE_CHECKING:
+    from ..docs_loader import DocsIndex
+
+
+_KEYWORDS = [
+    "Q.850 cause code", "T.38 fax", "SIP trunk negotiation",
+    "CDR cause value", "release direction", "codec mismatch",
+]
+
+# Rendered wherever the failure time would appear when the operator
+# did not supply one, so the prompt never shows an empty value.
+_UNKNOWN_FAILURE_TIME = "unknown (not provided)"
+
+
+def render(
+    docs: "DocsIndex | None",
+    dialed_did: str,
+    failure_time: str | None = None,
+    window_minutes: int = 30,
+) -> str:
+    """Triage a failed fax call by composing CUCM dial plan + CDR + docs.
+
+    Args:
+        docs: loaded docs index used to render the reference block at
+            the end of the prompt; may be None.
+        dialed_did: the DID the operator dialed when the fax failed.
+            Required — the whole audit pivots on this number.
+        failure_time: ISO-8601 datetime of the failure (operator's best
+            guess; cluster-local time per the `_iso_local` convention).
+            If None or blank, the prompt shows the time as unknown and
+            instructs the LLM to widen to the most recent calls to
+            that DID.
+        window_minutes: search radius around `failure_time`, in minutes.
+            Default 30 — generous enough to catch operator-time-guess
+            drift, narrow enough to keep the result set focused.
+
+    Returns:
+        The rendered prompt text (markdown).
+
+    Raises:
+        ValueError: if `window_minutes` is negative.
+
+    Required MCP servers:
+      - mcaxl (always — primary lens for "did the call make it past
+        CUCM at all?")
+
+    Strongly recommended:
+      - mcsiphon — CDR records for the call, including cause codes,
+        codecs negotiated, and disconnect direction. Without it the
+        prompt can identify the dial-plan layer's behavior but cannot
+        diagnose post-CUCM failures.
+
+    Optional:
+      - mcdewey — Cisco's published troubleshooting guidance for the
+        Q.850 cause code surfaced by mcsiphon. Without it the verdict
+        names the cause but doesn't cite the authoritative remediation
+        steps.
+    """
+    if window_minutes < 0:
+        raise ValueError(
+            f"window_minutes must be non-negative, got {window_minutes}"
+        )
+
+    schema_block = render_schema_block(
+        docs, _KEYWORDS, max_chunks=4, max_chars_per_chunk=900
+    )
+
+    # A blank timestamp is treated like None so the header and verdict
+    # template never render an empty value.
+    reported_time = (failure_time or "").strip()
+    if reported_time:
+        when = reported_time
+        time_guidance = ""
+    else:
+        when = _UNKNOWN_FAILURE_TIME
+        # Spliced in after the intro paragraph; this is the "widen to the
+        # most recent calls" instruction the docstring promises.
+        time_guidance = f"""
+**No failure time was provided.** In Step 2, do not limit the CDR
+search to a ±{window_minutes}-minute window; widen it to the most
+recent calls to `{dialed_did}` and confirm with the operator which
+record is the failed fax before interpreting its cause code.
+"""
+
+    return f"""# Failed-Fax Investigation: `{dialed_did}` @ `{when}`
+
+Triage a failed fax call by walking each potential failure layer in
+order: CUCM dial plan → CDR record(s) → cause-code interpretation →
+authoritative guidance. Produce a verdict naming the layer that
+failed, the evidence, and the recommended next step.
+{time_guidance}
+## MCP availability — declare up-front in the verdict
+
+Before walking the layers, **state which sibling MCP servers are
+connected** in your output. This determines how much of the
+investigation can complete:
+
+- **mcaxl + mcsiphon + mcdewey all connected** → full triage
+  possible: layer named, cause code interpreted, authoritative
+  remediation cited
+- **mcaxl + mcsiphon, no mcdewey** → layer + cause named; remediation
+  comes from the cause-code mapping in mcsiphon's parser only (no
+  Cisco-doc citation)
+- **mcaxl + mcdewey, no mcsiphon** → CUCM-side diagnosis only ("did
+  the call match a route pattern?"); post-CUCM layers cannot be
+  diagnosed, recommend re-running with mcsiphon connected
+- **mcaxl only** → only the CUCM dial-plan layer is auditable;
+  weakest verdict
+
+If a sibling MCP is **connected but errors mid-call** (different from
+not-connected), include the error message in the verdict's confidence
+notes — connected-but-broken is its own state.
+
+## Step 1 — Did the call make it past CUCM? (mcaxl, required)
+
+Verify the dialed DID has a route pattern at all:
+
+1. Call `route_patterns_targeting(device_name=...)` for each fax-related
+   route list — typically `RightFax-RL`, `ZetaFax-RL`, or similar (use
+   `device_grep('FAX', classes=['Route List'])` to enumerate). Find
+   the route list whose pattern set includes `{dialed_did}` (or a
+   wildcard pattern that matches it).
+2. If no route list claims `{dialed_did}`, the call was rejected at
+   the CUCM dial plan layer. **Verdict: `cucm_dial_plan` — DID has no
+   matching route pattern.** Recommended action: confirm the DID with
+   the carrier; either it's mis-typed by the operator or the dial plan
+   is missing the entry.
+3. If a route pattern does match, call `route_inspect_pattern(pattern,
+   partition)` to see the full destination chain (translation patterns,
+   route list, route group, gateway/trunk). The egress trunk is the
+   key handoff to the post-CUCM layers.
+
+Also check if `{dialed_did}` lives inside a wildcard pattern that
+might be carved out by a more-specific entry — the
+`did_block_overlap` prompt is the dedicated tool for that pattern.
+Worth cross-referencing if the call routed somewhere unexpected.
+
+## Step 2 — CDR record(s) for the failed call (mcsiphon, recommended)
+
+If `mcsiphon` is connected:
+
+1. `cdr_query_calls(start=<{window_minutes} min before failure>,
+   end=<{window_minutes} min after failure>, called_number={dialed_did})`
+   to fetch the candidate records.
+2. If multiple records match, sort by `dateTimeOrigination_iso_local`
+   and pick the one closest to the operator's reported failure time.
+   Note the localization convention: timestamps are
+   cluster-local-time, NOT UTC — do not apply `astimezone()`.
+3. Extract the **high-signal fields**:
+   - `origCause_value` / `destCause_value` — Q.850 cause codes
+     (decoded names available via mcsiphon's `CAUSE_CODE_NAMES`)
+   - `duration` — very short (<5s) suggests setup failure;
+     longer suggests mid-call drop or far-end disconnect
+   - `dateTimeConnect` — null/0 means call never connected
+     (rejected at signaling layer); non-zero means audio path
+     was up before the disconnect
+   - `origDeviceName` / `destDeviceName` — which CUCM devices
+     were involved; cross-reference with Step 1's route trace
+   - `origIpv4v6Addr` / `destIpv4v6Addr` — useful for confirming
+     which trunk endpoint the call left on
+
+If `mcsiphon` is **not connected**, note explicitly: *"CDR
+investigation unavailable (mcsiphon not connected); diagnosis is
+limited to dial-plan-layer findings only."*
+
+## Step 3 — Map cause to failure layer
+
+Cross-reference the cause code from Step 2 against likely failure
+layer:
+
+| Cause code | Cause name | Likely layer | Common cause |
+|---|---|---|---|
+| 1 | unallocated_number | CUCM dial plan / far-end | DID not provisioned, or carrier hasn't routed it |
+| 16 | normal_clearing | (not a failure) | Both sides cleared cleanly — call may not actually have failed |
+| 17 | user_busy | far-end fax server | Fax server line was busy |
+| 19 | no_answer | far-end fax server | RightFax didn't pick up |
+| 27 | destination_out_of_order | far-end / network | RightFax server down, or upstream SBC out of service |
+| 31 | normal_unspecified | ambiguous | Generic clear; correlate with `duration` to narrow |
+| 38 | network_out_of_order | CUBE / SIP trunk | Trunk negotiation failed; check CUBE logs |
+| 47 | resource_unavailable_unspecified | CUBE / DSP / fax resources | T.38 resources exhausted, codec mismatch |
+| 65 | bearer_capability_not_authorized | CUCM region / CSS | Codec disallowed by region settings |
+| 79 | service_or_option_not_implemented | CUBE / far-end | Feature mismatch (often T.38 not negotiated) |
+| 127 | interworking | CUBE / interop | Protocol-translation failure between SIP/H.323 sides |
+
+Also consider `releaseDirection`:
+- Originator-released → CUCM-side or operator hung up
+- Destination-released → far-end (carrier or RightFax) rejected
+- Network-released → upstream component rejected
+
+## Step 4 — Cite authoritative guidance (mcdewey, optional)
+
+If `mcdewey` is connected:
+
+1. `search_docs(query="<cause name> cause code CUCM SIP trunk troubleshooting")`
+   for the cause code surfaced in Step 2/3
+2. Pull 2-3 chunks; cite the most-relevant ones in the verdict
+3. If the cause is T.38-specific (47, 79), also search for "T.38
+   negotiation troubleshooting" — there's an SRND chapter that
+   typically covers the codec/fallback decision tree
+
+If `mcdewey` is **not connected**, note: *"Authoritative remediation
+guidance unavailable (mcdewey not connected); recommendations below
+come from cause-code-to-layer heuristics only."*
+
+## Step 5 — Verdict
+
+Produce a structured triage entry:
+
+```
+DID: {dialed_did}
+Failure time: {when}
+Layer diagnosed: cube_sip_trunk_negotiation
+Evidence:
+  - mcaxl: {dialed_did} matches route pattern <pattern> in
+    Internal-PT, routes to RightFax-RL → RightFax-SIP-TRK
+  - mcsiphon: CDR origCause_value=47 (resource_unavailable_unspecified),
+    destCause_value=47, duration=2s, dateTimeConnect=0 (never
+    connected), releaseDirection=destination
+  - mcsiphon: origDeviceName=RightFax-SIP-TRK, destIpv4v6Addr=<carrier SBC IP>
+  - mcdewey: SRND chapter 12 — "T.38 fallback negotiation"
+Confidence: high (all three sibling MCPs reported)
+Recommended action:
+  Check CUBE configuration for `voice class codec` preference + T.38
+  fallback. Cause 47 with destination-release at the carrier SBC
+  typically means the SBC rejected the codec offer; confirm CUBE is
+  preferring G.711 µ-law for fax-shaped destinations.
+```
+
+For partial-coverage cases, the `Evidence` block lists what was
+checked AND what was skipped, and `Confidence` adjusts down.
+
+## Verdict layer names (use these literally)
+
+- `cucm_dial_plan` — DID didn't match a route pattern, or matched
+  but routed to wrong destination
+- `cucm_region_or_css` — call matched but blocked by region / CSS
+  settings (cause 65 typical)
+- `cube_sip_trunk_negotiation` — codec mismatch, T.38 negotiation
+  failure, SDP exchange issue
+- `far_end_sbc` — provider's SBC rejected the call
+- `far_end_fax_server` — RightFax (or equivalent) rejected, was
+  busy, or didn't answer
+- `t38_negotiation_failure` — specific subset of trunk-layer
+  failure where T.38 fallback didn't happen
+- `inconclusive` — evidence doesn't point at a single layer; report
+  what was observed and what next investigation step would
+  disambiguate
+
+## Common patterns to surface
+
+- **Setup failures with `dateTimeConnect=0`** — call never
+  established media path. Almost always a signaling-layer issue
+  (cucm_dial_plan, cube_sip_trunk_negotiation, far_end_sbc).
+- **Mid-call drops with non-zero `duration`** — call connected
+  but disconnected mid-stream. T.38 renegotiation failures often
+  show this shape (call connects on G.711, fails when fax tones
+  trigger T.38 switchover).
+- **Cause-code mismatch between origCause and destCause** —
+  typically means a CUBE-side translation. The "real" reason is
+  whichever side is closer to the actual failure point.
+- **Receive-and-abandon at the route point** — operator dialed
+  the right DID but the route point's CSS doesn't reach the
+  destination partition. Step 1's `route_inspect_pattern` should
+  surface this (CSS members listed; if the destination's
+  partition isn't in the list, the call hits a dead end).
+
+## Reference: Q.850 cause codes + CUCM CDR cause-value semantics
+
+""" + schema_block + """
+
+Produce a structured triage report. Lead with the MCP-availability
+declaration so the operator immediately sees the confidence level.
+Name the layer using one of the literal verdict-layer names listed
+above (downstream tooling may pattern-match on them). Recommend a
+specific next investigation step — not "check CUBE logs" generically,
+but "check `voice class codec 1` preference order in CUBE for the
+RightFax-SIP-TRK fax-traffic dial-peer."
+"""
diff --git a/src/mcaxl/server.py b/src/mcaxl/server.py
index bad94cd..a08b37c 100644
--- a/src/mcaxl/server.py
+++ b/src/mcaxl/server.py
@@ -560,6 +560,36 @@ def did_block_overlap(block_pattern: str) -> str:
     return _prompts.did_block_overlap.render(_docs, block_pattern)
 
 
+@mcp.prompt
+def failed_fax_investigation(
+    dialed_did: str,
+    failure_time: str | None = None,
+    window_minutes: int = 30,
+) -> str:
+    """Cross-server triage for a failed fax call. Composes mcaxl
+    (route lookup for the DID), mcsiphon (CDR cause codes, duration,
+    connect time, release direction), and mcdewey (Cisco's published
+    troubleshooting guidance for the cause code surfaced). Verdict names
+    the failure layer using a literal layer-name vocabulary so
+    downstream tooling can pattern-match: cucm_dial_plan,
+    cube_sip_trunk_negotiation, far_end_sbc, far_end_fax_server,
+    t38_negotiation_failure, cucm_region_or_css, or inconclusive.
+
+    Args:
+        dialed_did: the DID the operator dialed when the fax failed.
+            Required — the whole audit pivots on this number.
+        failure_time: ISO-8601 datetime of the failure (operator's
+            best guess; cluster-local time per the _iso_local
+            convention). If None or blank, the prompt instructs the
+            LLM to widen to the most recent calls to that DID.
+        window_minutes: search radius around `failure_time`, in
+            minutes (default 30). Must be non-negative.
+    """
+    return _prompts.failed_fax_investigation.render(
+        _docs, dialed_did, failure_time, window_minutes
+    )
+
+
 @mcp.prompt
 def dead_dn_finder(days_inactive: int = 30) -> str:
     """Find DNs that are definitively dead — exist in numplan but have
diff --git a/tests/test_prompts_package.py b/tests/test_prompts_package.py
index c3551e2..9e895f1 100644
--- a/tests/test_prompts_package.py
+++ b/tests/test_prompts_package.py
@@ -172,4 +172,5 @@ def test_all_prompts_registered_in_server():
         "did_block_overlap",
         "partition_summary",
         "dead_dn_finder",
+        "failed_fax_investigation",
     }, f"unexpected prompt set: {names}"