"""Read the sibling cisco-docs index and surface chunks for prompt enrichment. We deliberately do NOT load sentence-transformers here (would add ~500MB to the dep tree). Prompt parameters are well-bounded (topic strings, audit-type enums, table names), so substring-and-keyword matching against chunk text and heading_path gets us most of the value. For free-text semantic queries, the prompt instructs the LLM to invoke the sibling cisco-docs MCP server's `search_docs` tool — composition over duplication. Doc-name weighting: the cisco-docs index for CUCM is dominated by CLI reference chunks (~475 of 511) where most chunks are command syntax with no conceptual content. We bias toward conceptual docs (system-config, feature-config, admin) and penalize cli-reference for topical questions. The bias only matters for ranking — every doc still gets matched. """ from __future__ import annotations import json import os import sys from pathlib import Path # Default to the sibling docs index in this monorepo. Override with env var # if mcp-cucm-axl gets used outside this layout. _DEFAULT_INDEX_DIR = Path("/home/rpm/bingham/docs/src/assets/.cisco-docs-index") # Doc-name multipliers — higher = preferred for conceptual prompts. # Keys match the `doc` field in indexed chunks. _DOC_WEIGHTS: dict[str, float] = { "system-config-guide": 3.0, "feature-config-guide": 2.5, "admin-guide": 2.0, "interop-sip-trunking-guide": 1.5, "security-guide": 1.2, "recording-use-cases": 1.0, "rtmt-guide": 0.8, "cli-reference": 0.3, # mostly command syntax, low conceptual signal "release-notes": 0.5, "hardware-compat": 0.2, "server-os-compat": 0.2, } class DocsIndex: """In-memory chunk store with keyword filtering. Light, fast, no torch.""" def __init__(self, chunks: list[dict], meta: dict): self.chunks = chunks self.meta = meta @classmethod def load(cls, index_dir: Path | None = None) -> "DocsIndex | None": index_dir = index_dir or Path( os.environ.get("CISCO_DOCS_INDEX_PATH", _DEFAULT_INDEX_DIR) ) chunks_path = index_dir / "chunks.jsonl" meta_path = index_dir / "index_meta.json" if not chunks_path.exists() or not meta_path.exists(): print( f"[mcp-cucm-axl] cisco-docs index not found at {index_dir}; " f"prompts will run without schema enrichment.", file=sys.stderr, flush=True, ) return None meta = json.loads(meta_path.read_text()) chunks = [ json.loads(line) for line in chunks_path.read_text(encoding="utf-8").splitlines() if line.strip() ] print( f"[mcp-cucm-axl] loaded {len(chunks)} doc chunks from {index_dir}", file=sys.stderr, flush=True, ) return cls(chunks, meta) def cucm_chunks(self) -> list[dict]: return [c for c in self.chunks if c.get("product") == "cucm"] def find( self, keywords: list[str], product: str = "cucm", max_chunks: int = 6, max_chars_per_chunk: int = 800, ) -> list[dict]: """Score chunks by keyword hits in heading_path + text. Lowercase-insensitive. Heading hits weight 3x text hits — heading paths are a much better topical signal than incidental text mentions. """ if not keywords: return [] kws = [k.lower() for k in keywords if k] scored: list[tuple[float, dict]] = [] for chunk in self.chunks: if product and chunk.get("product") != product: continue heading = " ".join(chunk.get("heading_path") or []).lower() text = (chunk.get("text") or "").lower() doc = chunk.get("doc") or "" doc_lower = doc.lower() raw = 0 for k in kws: raw += heading.count(k) * 3 raw += doc_lower.count(k) * 2 raw += text.count(k) if raw > 0: weight = _DOC_WEIGHTS.get(doc, 1.0) scored.append((raw * weight, chunk)) scored.sort(key=lambda t: t[0], reverse=True) out = [] for score, chunk in scored[:max_chunks]: text = chunk.get("text", "") if len(text) > max_chars_per_chunk: text = text[:max_chars_per_chunk] + "…" out.append({ "score": round(score, 1), "heading_path": chunk.get("heading_path"), "doc": chunk.get("doc"), "version": chunk.get("version"), "source_path": chunk.get("source_path"), "text": text, "chunk_id": chunk.get("id"), }) return out def format_chunks_for_prompt(self, chunks: list[dict]) -> str: """Render chunks as a markdown reference block for embedding in prompt seeds.""" if not chunks: return "_No matching schema documentation found in the local index._" lines = [] for c in chunks: heading = " > ".join(c.get("heading_path") or []) or "(no heading)" doc = c.get("doc", "") version = c.get("version", "") lines.append(f"### {heading} \n_source: {doc} ({version}) — score {c['score']}_") lines.append("") lines.append(c["text"]) lines.append("") return "\n".join(lines)