# mcaxl/src/mcp_cucm_axl/docs_loader.py

"""Read the sibling cisco-docs index and surface chunks for prompt enrichment.

We deliberately do NOT load sentence-transformers here (would add ~500MB to
the dep tree). Prompt parameters are well-bounded (topic strings, audit-type
enums, table names), so substring-and-keyword matching against chunk text
and heading_path gets us most of the value.

For free-text semantic queries, the prompt instructs the LLM to invoke the
sibling cisco-docs MCP server's `search_docs` tool — composition over
duplication.

Doc-name weighting: the cisco-docs index for CUCM is dominated by CLI
reference chunks (~475 of 511) where most chunks are command syntax with
no conceptual content. We bias toward conceptual docs (system-config,
feature-config, admin) and penalize cli-reference for topical questions.
The bias only matters for ranking — every doc still gets matched.
"""

from __future__ import annotations

import json
import os
import sys
from pathlib import Path


# Default to the sibling docs index in this monorepo. Override with the
# CISCO_DOCS_INDEX_PATH env var if mcp-cucm-axl gets used outside this layout.
# NOTE(review): this is an absolute, machine-specific path rather than one
# derived from this file's location — confirm that is intended, since other
# checkouts will presumably need the env var (or an explicit index_dir).
_DEFAULT_INDEX_DIR = Path("/home/rpm/bingham/docs/src/assets/.cisco-docs-index")


# Doc-name multipliers — higher = preferred for conceptual prompts.
# Keys match the `doc` field in indexed chunks (exact, case-sensitive lookup).
# DocsIndex.find() multiplies a chunk's raw keyword score by this value; a doc
# that is not listed here gets a neutral multiplier of 1.0.
_DOC_WEIGHTS: dict[str, float] = {
    # Conceptual guides: boosted.
    "system-config-guide": 3.0,
    "feature-config-guide": 2.5,
    "admin-guide": 2.0,
    "interop-sip-trunking-guide": 1.5,
    "security-guide": 1.2,
    # Same as the unlisted-doc default; listed explicitly.
    "recording-use-cases": 1.0,
    # Below 1.0: demoted in ranking, but still matched.
    "rtmt-guide": 0.8,
    "cli-reference": 0.3,  # mostly command syntax, low conceptual signal
    "release-notes": 0.5,
    "hardware-compat": 0.2,
    "server-os-compat": 0.2,
}


class DocsIndex:
    """In-memory chunk store with keyword filtering. Light, fast, no torch."""

    def __init__(self, chunks: list[dict], meta: dict):
        self.chunks = chunks
        self.meta = meta

    @classmethod
    def load(cls, index_dir: Path | None = None) -> "DocsIndex | None":
        index_dir = index_dir or Path(
            os.environ.get("CISCO_DOCS_INDEX_PATH", _DEFAULT_INDEX_DIR)
        )
        chunks_path = index_dir / "chunks.jsonl"
        meta_path = index_dir / "index_meta.json"

        if not chunks_path.exists() or not meta_path.exists():
            print(
                f"[mcp-cucm-axl] cisco-docs index not found at {index_dir}; "
                f"prompts will run without schema enrichment.",
                file=sys.stderr,
                flush=True,
            )
            return None

        meta = json.loads(meta_path.read_text())
        chunks = [
            json.loads(line)
            for line in chunks_path.read_text(encoding="utf-8").splitlines()
            if line.strip()
        ]
        print(
            f"[mcp-cucm-axl] loaded {len(chunks)} doc chunks from {index_dir}",
            file=sys.stderr,
            flush=True,
        )
        return cls(chunks, meta)

    def cucm_chunks(self) -> list[dict]:
        return [c for c in self.chunks if c.get("product") == "cucm"]

    def find(
        self,
        keywords: list[str],
        product: str = "cucm",
        max_chunks: int = 6,
        max_chars_per_chunk: int = 800,
    ) -> list[dict]:
        """Score chunks by keyword hits in heading_path + text. Lowercase-insensitive.

        Heading hits weight 3x text hits — heading paths are a much better
        topical signal than incidental text mentions.
        """
        if not keywords:
            return []
        kws = [k.lower() for k in keywords if k]

        scored: list[tuple[float, dict]] = []
        for chunk in self.chunks:
            if product and chunk.get("product") != product:
                continue
            heading = " ".join(chunk.get("heading_path") or []).lower()
            text = (chunk.get("text") or "").lower()
            doc = chunk.get("doc") or ""
            doc_lower = doc.lower()
            raw = 0
            for k in kws:
                raw += heading.count(k) * 3
                raw += doc_lower.count(k) * 2
                raw += text.count(k)
            if raw > 0:
                weight = _DOC_WEIGHTS.get(doc, 1.0)
                scored.append((raw * weight, chunk))

        scored.sort(key=lambda t: t[0], reverse=True)
        out = []
        for score, chunk in scored[:max_chunks]:
            text = chunk.get("text", "")
            if len(text) > max_chars_per_chunk:
                text = text[:max_chars_per_chunk] + "…"
            out.append({
                "score": round(score, 1),
                "heading_path": chunk.get("heading_path"),
                "doc": chunk.get("doc"),
                "version": chunk.get("version"),
                "source_path": chunk.get("source_path"),
                "text": text,
                "chunk_id": chunk.get("id"),
            })
        return out

    def format_chunks_for_prompt(self, chunks: list[dict]) -> str:
        """Render chunks as a markdown reference block for embedding in prompt seeds."""
        if not chunks:
            return "_No matching schema documentation found in the local index._"
        lines = []
        for c in chunks:
            heading = " > ".join(c.get("heading_path") or []) or "(no heading)"
            doc = c.get("doc", "")
            version = c.get("version", "")
            lines.append(f"### {heading}  \n_source: {doc} ({version}) — score {c['score']}_")
            lines.append("")
            lines.append(c["text"])
            lines.append("")
        return "\n".join(lines)