New tools for Word document analysis:
- extract_entities: Pattern-based extraction of people, places, organizations
- get_chapter_summaries: Chapter previews with opening sentences and word counts
- save_reading_progress: Bookmark reading position to JSON file
- get_reading_progress: Resume reading from saved position

New MCP prompts (basic to advanced workflows):
- explore-document: Get started with a new document
- find-character: Track character mentions
- chapter-preview: Quick chapter overviews
- resume-reading: Continue where you left off
- document-analysis: Comprehensive multi-tool analysis
- character-journey: Track character arc through narrative
- document-comparison: Compare entities between chapters
- full-reading-session: Guided reading with bookmarking
- manuscript-review: Complete editorial workflow

Updated test counts for 19 total tools (6 universal + 10 word + 3 excel)
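A minimal client-side sketch of the new reading-progress workflow (illustrative only, not part of the module below; it assumes a FastMCP 2.x Client and a server reachable at http://localhost:8000/mcp, and uses the tool names and parameters defined in this file):

import asyncio
from fastmcp import Client

async def demo():
    # Assumed server URL; point this at wherever the office-tools server is running.
    async with Client("http://localhost:8000/mcp") as client:
        # Bookmark the current reading position in a manuscript
        await client.call_tool("save_reading_progress", {
            "file_path": "manuscript.docx",
            "chapter_number": 3,
            "paragraph_index": 120,
            "notes": "Stopped mid-scene",
        })
        # Later, resume from the saved position
        progress = await client.call_tool("get_reading_progress", {"file_path": "manuscript.docx"})
        print(progress)

asyncio.run(demo())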
1436 lines
62 KiB
Python
"""Word Document Tools Mixin - Specialized tools for Word document processing."""
|
|
|
|
import os
|
|
import time
|
|
from typing import Any, Optional
|
|
|
|
from fastmcp.contrib.mcp_mixin import MCPMixin, mcp_tool
|
|
from pydantic import Field
|
|
|
|
from ..utils import (
|
|
OfficeFileError,
|
|
resolve_office_file_path,
|
|
validate_office_file,
|
|
detect_format,
|
|
resolve_field_defaults,
|
|
handle_office_errors
|
|
)
|
|
from ..pagination import paginate_document_conversion, PaginationParams
|
|
|
|
|
|
class WordMixin(MCPMixin):
    """Mixin containing Word-specific tools for advanced document processing."""

    @mcp_tool(
        name="convert_to_markdown",
        description="Convert Office documents to Markdown format with intelligent processing and automatic pagination for large documents. ⚠️ LARGE DOCUMENT HANDLING: Documents exceeding 25k tokens are automatically paginated into manageable sections. Use cursor_id to continue through pages. For massive documents (200+ pages), pagination prevents token limit errors while preserving document structure and context."
    )
    @handle_office_errors("Markdown conversion")
    @resolve_field_defaults(
        include_images=True,
        image_mode="base64",
        max_image_size=1024*1024,
        preserve_structure=True,
        page_range="",
        bookmark_name="",
        chapter_name="",
        summary_only=False,
        output_dir="",
        limit=50,
        cursor_id=None,
        session_id=None,
        return_all=False
    )
    async def convert_to_markdown(
        self,
        file_path: str = Field(description="Path to Office document or URL"),
        include_images: bool = Field(default=True, description="Include images in markdown output. When True, images are extracted to files and linked in the markdown."),
        image_mode: str = Field(default="files", description="Image handling mode: 'files' (default, saves to disk and links), 'base64' (embeds inline - WARNING: can create massive responses), or 'references' (metadata only, no content)"),
        max_image_size: int = Field(default=1024*1024, description="Maximum image size in bytes for base64 encoding (only used when image_mode='base64')"),
        preserve_structure: bool = Field(default=True, description="Preserve document structure (headings, lists, tables)"),
        page_range: str = Field(default="", description="Page range to convert (e.g., '1-5', '3', '1,3,5-10'). RECOMMENDED for large documents. Empty = all pages"),
        bookmark_name: str = Field(default="", description="Extract content for a specific bookmark/chapter (e.g., 'Chapter1_Start'). More reliable than page ranges."),
        chapter_name: str = Field(default="", description="Extract content for a chapter by heading text (e.g., 'Chapter 1', 'Introduction'). Works when bookmarks aren't available."),
        summary_only: bool = Field(default=False, description="Return only metadata and truncated summary. STRONGLY RECOMMENDED for large docs (>10 pages)"),
        output_dir: str = Field(default="", description="Output directory for extracted image files. If empty, uses a temp directory based on document name."),
        # Pagination parameters
        limit: int = Field(default=50, description="Maximum number of document sections to return per page"),
        cursor_id: Optional[str] = Field(default=None, description="Cursor ID for pagination continuation"),
        session_id: Optional[str] = Field(default=None, description="Session ID for pagination isolation"),
        return_all: bool = Field(default=False, description="Return entire document bypassing pagination (WARNING: may exceed token limits)")
    ) -> dict[str, Any]:
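        """Convert a Word document to Markdown, honoring page ranges, bookmark/chapter extraction, summary mode, and automatic pagination for large files."""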
        start_time = time.time()

        # Resolve file path
        local_path = await resolve_office_file_path(file_path)

        # Validate file
        validation = await validate_office_file(local_path)
        if not validation["is_valid"]:
            raise OfficeFileError(f"Invalid file: {', '.join(validation['errors'])}")

        # Get format info
        format_info = await detect_format(local_path)
        category = format_info["category"]
        extension = format_info["extension"]

        # Currently focused on Word documents for markdown conversion
        if category != "word":
            raise OfficeFileError(f"Markdown conversion currently only supports Word documents, got: {category}")

        # Analyze document size and provide intelligent recommendations
        doc_analysis = await self._analyze_document_size(local_path, extension)
        processing_recommendation = self._get_processing_recommendation(
            doc_analysis, page_range, summary_only
        )

        # Parse page range if provided
        page_numbers = self._parse_page_range(page_range) if page_range else None

        # Prioritize bookmark/chapter extraction over page ranges
        if bookmark_name or chapter_name:
            page_numbers = None  # Ignore page ranges when bookmark or chapter is specified

        # Convert to markdown based on format
        if extension == ".docx":
            markdown_result = await self._convert_docx_to_markdown(
                local_path, include_images, image_mode, max_image_size,
                preserve_structure, page_numbers, summary_only, output_dir, bookmark_name, chapter_name
            )
        else:  # .doc
            # For legacy .doc files, use mammoth if available
            markdown_result = await self._convert_doc_to_markdown(
                local_path, include_images, image_mode, max_image_size,
                preserve_structure, page_numbers, summary_only, output_dir
            )

        # Check if pagination is needed
        markdown_content = markdown_result["content"]
        estimated_tokens = len(markdown_content) // 4  # Rough token estimation

        # Generate session ID if not provided
        if not session_id:
            session_id = f"word-{int(time.time())}-{os.getpid()}"

        # Create pagination parameters
        pagination_params = PaginationParams(
            limit=limit,
            cursor_id=cursor_id,
            session_id=session_id,
            return_all=return_all
        )

        # Apply pagination if content is large or pagination is explicitly requested
        # Skip pagination only if return_all=True AND no cursor_id AND content is manageable
        should_paginate = (cursor_id or estimated_tokens > 25000 or (not return_all and estimated_tokens > 8000))

        if should_paginate:
            paginated_result = paginate_document_conversion(
                tool_name="convert_to_markdown",
                document_path=local_path,
                markdown_content=markdown_content,
                params=pagination_params,
                session_id=session_id,
                total_estimated_tokens=estimated_tokens
            )

            # If pagination was applied, return the paginated result
            if "pagination" in paginated_result:
                # Add metadata to the paginated result
                paginated_result["metadata"] = {
                    "original_file": os.path.basename(local_path),
                    "format": format_info["format_name"],
                    "conversion_method": markdown_result["method_used"],
                    "conversion_time": round(time.time() - start_time, 3),
                    "summary_only": summary_only,
                    "document_analysis": doc_analysis,
                    "processing_recommendation": processing_recommendation,
                    "session_id": session_id
                }

                # Add additional metadata from original result
                if "images" in markdown_result:
                    paginated_result["metadata"]["images_found"] = len(markdown_result["images"])
                if "structure" in markdown_result:
                    paginated_result["metadata"]["structure_preserved"] = bool(markdown_result["structure"])

                return paginated_result

        # Build result based on mode (non-paginated or bypass pagination)
        result = {
            "metadata": {
                "original_file": os.path.basename(local_path),
                "format": format_info["format_name"],
                "conversion_method": markdown_result["method_used"],
                "conversion_time": round(time.time() - start_time, 3),
                "summary_only": summary_only,
                "document_analysis": doc_analysis,
                "processing_recommendation": processing_recommendation,
                "session_id": session_id,
                "estimated_tokens": estimated_tokens
            }
        }

        # Add page range info if used
        if page_range:
            result["metadata"]["page_range"] = page_range
            result["metadata"]["pages_processed"] = len(page_numbers) if page_numbers else 0

        # Add content based on mode
        if summary_only:
            # VERY restrictive summary mode to prevent massive responses
            result["metadata"]["character_count"] = len(markdown_result["content"])
            result["metadata"]["word_count"] = len(markdown_result["content"].split())

            # Ultra-short summary (only 500 chars max)
            result["summary"] = markdown_result["content"][:500] + "..." if len(markdown_result["content"]) > 500 else markdown_result["content"]

            # Severely limit table of contents to prevent 1M+ token responses
            if "table_of_contents" in markdown_result:
                toc = markdown_result["table_of_contents"]
                if isinstance(toc, dict):
                    # Keep only essential TOC info, severely truncated
                    result["table_of_contents"] = {
                        "note": toc.get("note", ""),
                        "basic_info": toc.get("basic_info", "")[:200],  # Limit to 200 chars
                    }
                    # Add bookmark/heading info if available (limit to first 5 items)
                    if "bookmarks" in toc:
                        result["table_of_contents"]["bookmarks"] = toc["bookmarks"][:5]
                        result["table_of_contents"]["bookmark_count"] = toc.get("bookmark_count", 0)
                    if "available_headings" in toc:
                        result["table_of_contents"]["available_headings"] = toc["available_headings"][:5]
                        result["table_of_contents"]["heading_count"] = toc.get("heading_count", 0)
                else:
                    result["table_of_contents"] = {"note": "Summary mode - use full processing for detailed TOC"}
        else:
            # Full content mode
            result["markdown"] = markdown_result["content"]
            result["content_truncated"] = len(markdown_result["content"]) >= 200000  # Warn if near limit

            # Add images info
            if "images" in markdown_result:
                result["images"] = markdown_result["images"]

            # Add structure info
            if "structure" in markdown_result:
                result["structure"] = markdown_result["structure"]

            # Add table of contents if available
            if "table_of_contents" in markdown_result:
                result["table_of_contents"] = markdown_result["table_of_contents"]

        return result

    # Helper methods - import from monolithic server
    async def _analyze_document_size(self, file_path: str, extension: str) -> dict[str, Any]:
        """Analyze document size for processing recommendations."""
        from ..utils import _analyze_document_size
        return await _analyze_document_size(file_path, extension)

    def _get_processing_recommendation(self, doc_analysis: dict[str, Any], page_range: str, summary_only: bool) -> dict[str, Any]:
        """Get processing recommendations based on document analysis."""
        from ..utils import _get_processing_recommendation
        return _get_processing_recommendation(doc_analysis, page_range, summary_only)

    def _parse_page_range(self, page_range: str) -> list[int]:
        """Parse page range string into list of page numbers."""
        from ..utils import _parse_page_range
        return _parse_page_range(page_range)

    async def _convert_docx_to_markdown(
        self, file_path: str, include_images: bool, image_mode: str, max_image_size: int,
        preserve_structure: bool, page_numbers: list[int], summary_only: bool, output_dir: str,
        bookmark_name: str = "", chapter_name: str = ""
    ) -> dict[str, Any]:
        """Convert .docx to markdown."""
        from ..utils import _convert_docx_to_markdown
        return await _convert_docx_to_markdown(
            file_path, include_images, image_mode, max_image_size,
            preserve_structure, page_numbers, summary_only, output_dir, bookmark_name, chapter_name
        )

    async def _convert_doc_to_markdown(
        self, file_path: str, include_images: bool, image_mode: str, max_image_size: int,
        preserve_structure: bool, page_numbers: list[int], summary_only: bool, output_dir: str
    ) -> dict[str, Any]:
        """Convert legacy .doc to markdown."""
        from ..utils import _convert_doc_to_markdown
        return await _convert_doc_to_markdown(
            file_path, include_images, image_mode, max_image_size,
            preserve_structure, page_numbers, summary_only, output_dir
        )

    @mcp_tool(
        name="extract_word_tables",
        description="Extract all tables from Word documents with structure, styling, and data conversion options. Returns tables as structured data with CSV/JSON export capability."
    )
    @handle_office_errors("Table extraction")
    @resolve_field_defaults(
        include_styling=True,
        output_format="structured",
        preserve_merged_cells=True,
        include_headers=True
    )
    async def extract_word_tables(
        self,
        file_path: str = Field(description="Path to Word document or URL"),
        include_styling: bool = Field(default=True, description="Include table styling information (borders, alignment, etc.)"),
        output_format: str = Field(default="structured", description="Output format: structured, csv, json, markdown"),
        preserve_merged_cells: bool = Field(default=True, description="Handle merged cells appropriately"),
        include_headers: bool = Field(default=True, description="Identify and mark header rows/columns")
    ) -> dict[str, Any]:
        """Extract tables from Word documents with comprehensive structure analysis."""
        start_time = time.time()
        import csv
        import json
        import io

        # Resolve and validate file
        resolved_path = await resolve_office_file_path(file_path)
        validation = await validate_office_file(resolved_path)

        if validation["category"] != "word":
            raise OfficeFileError(f"Table extraction requires Word document, got: {validation['format_name']}")

        # Import required libraries
        import docx

        # Load document
        doc = docx.Document(resolved_path)

        tables_data = []
        table_index = 0

        for table in doc.tables:
            table_info = {
                "table_index": table_index,
                "dimensions": {
                    "rows": len(table.rows),
                    "columns": len(table.columns) if table.rows else 0
                },
                "data": [],
                "metadata": {}
            }

            # Extract table styling if requested
            if include_styling:
                table_info["styling"] = {
                    "table_style": table.style.name if table.style else None,
                    "alignment": str(table.alignment) if hasattr(table, 'alignment') else None
                }

            # Extract table data
            for row_idx, row in enumerate(table.rows):
                row_data = []
                row_styling = [] if include_styling else None

                for col_idx, cell in enumerate(row.cells):
                    cell_text = cell.text.strip()
                    cell_info = {"text": cell_text}

                    if include_styling:
                        cell_style = {
                            "bold": False,
                            "italic": False,
                            "alignment": None
                        }

                        # Check text formatting in paragraphs
                        for paragraph in cell.paragraphs:
                            for run in paragraph.runs:
                                if run.bold:
                                    cell_style["bold"] = True
                                if run.italic:
                                    cell_style["italic"] = True

                            if paragraph.alignment is not None:
                                cell_style["alignment"] = str(paragraph.alignment)

                        cell_info["styling"] = cell_style
                        row_styling.append(cell_style)

                    # Handle merged cells
                    if preserve_merged_cells:
                        # Basic merged cell detection (simplified)
                        cell_info["is_merged"] = len(cell.text.strip()) == 0 and col_idx > 0

                    row_data.append(cell_info)

                table_info["data"].append({
                    "row_index": row_idx,
                    "cells": row_data,
                    "styling": row_styling if include_styling else None
                })

            # Identify headers if requested
            if include_headers and table_info["data"]:
                # Simple header detection: first row with all non-empty cells
                first_row_cells = table_info["data"][0]["cells"]
                if all(cell["text"] for cell in first_row_cells):
                    table_info["metadata"]["has_header_row"] = True
                    table_info["metadata"]["headers"] = [cell["text"] for cell in first_row_cells]
                else:
                    table_info["metadata"]["has_header_row"] = False

            # Convert to requested output format
            if output_format in ["csv", "json", "markdown"]:
                converted_data = self._convert_table_format(table_info, output_format)
                table_info["converted_output"] = converted_data

            tables_data.append(table_info)
            table_index += 1

        # Generate summary
        total_tables = len(tables_data)
        total_cells = sum(table["dimensions"]["rows"] * table["dimensions"]["columns"] for table in tables_data)

        return {
            "tables": tables_data,
            "summary": {
                "total_tables": total_tables,
                "total_cells": total_cells,
                "extraction_time": time.time() - start_time,
                "output_format": output_format,
                "file_info": validation
            }
        }

    def _convert_table_format(self, table_info: dict, format_type: str) -> str:
        """Convert table data to specified format."""
        # Local imports: csv/io/json are imported inside extract_word_tables, which does not
        # make them visible in this method's scope, so they are imported here as well.
        import csv
        import io
        import json

        rows_data = []

        # Extract plain text data
        for row in table_info["data"]:
            row_texts = [cell["text"] for cell in row["cells"]]
            rows_data.append(row_texts)

        if format_type == "csv":
            output = io.StringIO()
            writer = csv.writer(output)
            writer.writerows(rows_data)
            return output.getvalue()

        elif format_type == "json":
            if table_info["metadata"].get("has_header_row", False):
                headers = rows_data[0]
                data_rows = rows_data[1:]
                json_data = [dict(zip(headers, row)) for row in data_rows]
            else:
                json_data = [{"col_" + str(i): cell for i, cell in enumerate(row)} for row in rows_data]
            return json.dumps(json_data, indent=2)

        elif format_type == "markdown":
            if not rows_data:
                return ""

            markdown = ""
            for i, row in enumerate(rows_data):
                # Escape pipe characters in cell content
                escaped_row = [cell.replace("|", "\\|") for cell in row]
                markdown += "| " + " | ".join(escaped_row) + " |\n"

                # Add separator after header row
                if i == 0 and table_info["metadata"].get("has_header_row", False):
                    markdown += "| " + " | ".join(["---"] * len(row)) + " |\n"

            return markdown

        return ""

    @mcp_tool(
        name="analyze_word_structure",
        description="Analyze Word document structure including headings, sections, page layout, and document hierarchy. Provides navigation map and content organization insights."
    )
    @handle_office_errors("Structure analysis")
    @resolve_field_defaults(
        include_page_info=True,
        extract_outline=True,
        analyze_styles=True
    )
    async def analyze_word_structure(
        self,
        file_path: str = Field(description="Path to Word document or URL"),
        include_page_info: bool = Field(default=True, description="Include page layout and section information"),
        extract_outline: bool = Field(default=True, description="Extract document outline and heading hierarchy"),
        analyze_styles: bool = Field(default=True, description="Analyze custom styles and formatting patterns")
    ) -> dict[str, Any]:
        """Analyze Word document structure and organization."""
        start_time = time.time()

        # Resolve and validate file
        resolved_path = await resolve_office_file_path(file_path)
        validation = await validate_office_file(resolved_path)

        if validation["category"] != "word":
            raise OfficeFileError(f"Structure analysis requires Word document, got: {validation['format_name']}")

        # Import required libraries
        import docx
        from docx.enum.style import WD_STYLE_TYPE

        # Load document
        doc = docx.Document(resolved_path)

        structure_info = {
            "document_info": {
                "total_paragraphs": len(doc.paragraphs),
                "total_tables": len(doc.tables),
                "total_sections": len(doc.sections)
            }
        }

        # Extract outline and headings
        if extract_outline:
            headings = []
            heading_styles = ['Heading 1', 'Heading 2', 'Heading 3', 'Heading 4', 'Heading 5', 'Heading 6']

            for para_idx, paragraph in enumerate(doc.paragraphs):
                if paragraph.style.name in heading_styles:
                    level = int(paragraph.style.name.split()[-1])
                    headings.append({
                        "text": paragraph.text.strip(),
                        "level": level,
                        "style": paragraph.style.name,
                        "paragraph_index": para_idx
                    })

            structure_info["outline"] = {
                "headings": headings,
                "heading_count": len(headings),
                "max_depth": max([h["level"] for h in headings]) if headings else 0
            }

            # Create navigation tree
            structure_info["navigation_tree"] = self._build_navigation_tree(headings)

        # Analyze page layout and sections
        if include_page_info:
            sections_info = []

            for section_idx, section in enumerate(doc.sections):
                section_info = {
                    "section_index": section_idx,
                    "page_dimensions": {},
                    "margins": {}
                }

                # Safely extract page dimensions
                try:
                    if section.page_width:
                        section_info["page_dimensions"]["width"] = float(section.page_width.inches)
                    if section.page_height:
                        section_info["page_dimensions"]["height"] = float(section.page_height.inches)
                except (ValueError, AttributeError, TypeError):
                    section_info["page_dimensions"] = {"width": None, "height": None}

                # Safely extract margins
                try:
                    if section.left_margin:
                        section_info["margins"]["left"] = float(section.left_margin.inches)
                    if section.right_margin:
                        section_info["margins"]["right"] = float(section.right_margin.inches)
                    if section.top_margin:
                        section_info["margins"]["top"] = float(section.top_margin.inches)
                    if section.bottom_margin:
                        section_info["margins"]["bottom"] = float(section.bottom_margin.inches)
                except (ValueError, AttributeError, TypeError):
                    section_info["margins"] = {"left": None, "right": None, "top": None, "bottom": None}

                # Safely extract orientation
                try:
                    if hasattr(section, 'orientation') and section.orientation is not None:
                        # orientation is an enum, get its name
                        section_info["orientation"] = section.orientation.name if hasattr(section.orientation, 'name') else str(section.orientation)
                    else:
                        section_info["orientation"] = None
                except (ValueError, AttributeError, TypeError):
                    section_info["orientation"] = None

                # Header and footer information
                try:
                    if section.header:
                        section_info["has_header"] = True
                        section_info["header_text"] = " ".join([p.text for p in section.header.paragraphs]).strip()
                except (ValueError, AttributeError, TypeError):
                    section_info["has_header"] = False

                try:
                    if section.footer:
                        section_info["has_footer"] = True
                        section_info["footer_text"] = " ".join([p.text for p in section.footer.paragraphs]).strip()
                except (ValueError, AttributeError, TypeError):
                    section_info["has_footer"] = False

                sections_info.append(section_info)

            structure_info["page_layout"] = sections_info

        # Analyze styles
        if analyze_styles:
            styles_info = {
                "paragraph_styles": [],
                "character_styles": [],
                "table_styles": [],
                "style_usage": {}
            }

            # Collect style information
            for style in doc.styles:
                style_info = {
                    "name": style.name,
                    "type": str(style.type),
                    "builtin": style.builtin
                }

                if style.type == WD_STYLE_TYPE.PARAGRAPH:
                    styles_info["paragraph_styles"].append(style_info)
                elif style.type == WD_STYLE_TYPE.CHARACTER:
                    styles_info["character_styles"].append(style_info)
                elif style.type == WD_STYLE_TYPE.TABLE:
                    styles_info["table_styles"].append(style_info)

            # Analyze style usage
            style_usage = {}
            for paragraph in doc.paragraphs:
                style_name = paragraph.style.name
                style_usage[style_name] = style_usage.get(style_name, 0) + 1

            styles_info["style_usage"] = style_usage
            structure_info["styles"] = styles_info

        return {
            "structure": structure_info,
            "analysis_time": time.time() - start_time,
            "file_info": validation
        }

    def _build_navigation_tree(self, headings: list) -> list:
        """Build hierarchical navigation tree from headings."""
        if not headings:
            return []

        tree = []
        stack = []  # Stack to keep track of parent nodes

        for heading in headings:
            node = {
                "text": heading["text"],
                "level": heading["level"],
                "paragraph_index": heading["paragraph_index"],
                "children": []
            }

            # Find the correct parent level
            while stack and stack[-1]["level"] >= heading["level"]:
                stack.pop()

            if stack:
                # Add as child to the parent
                stack[-1]["children"].append(node)
            else:
                # Add as root level
                tree.append(node)

            stack.append(node)

        return tree

    # ==================== New Document Navigation Tools ====================

    @mcp_tool(
        name="get_document_outline",
        description="Get a clean, structured outline of a Word document showing all headings, sections, and chapters with their locations. Perfect for understanding document structure before reading."
    )
    @handle_office_errors("Document outline")
    async def get_document_outline(
        self,
        file_path: str = Field(description="Path to Word document or URL"),
        include_word_counts: bool = Field(default=True, description="Include estimated word count per section"),
        detect_chapters: bool = Field(default=True, description="Detect and flag chapter headings specifically")
    ) -> dict[str, Any]:
        """Extract structured document outline with chapter detection."""
        from docx import Document
        from docx.oxml.ns import qn

        start_time = time.time()
        local_path = await resolve_office_file_path(file_path)

        validation = await validate_office_file(local_path)
        if not validation["is_valid"]:
            raise OfficeFileError(f"Invalid file: {', '.join(validation['errors'])}")

        doc = Document(local_path)

        outline = []
        current_section = None
        section_word_count = 0
        total_words = 0
        chapter_pattern = ["chapter", "section", "part", "introduction", "conclusion", "appendix", "preface", "epilogue"]

        for para_idx, para in enumerate(doc.paragraphs):
            text = para.text.strip()
            word_count = len(text.split()) if text else 0
            total_words += word_count

            # Check if this is a heading
            style_name = para.style.name.lower() if para.style else ""
            is_heading = "heading" in style_name or "title" in style_name

            # Determine heading level
            level = 0
            if is_heading:
                if "title" in style_name:
                    level = 0
                elif "heading 1" in style_name or style_name == "heading1":
                    level = 1
                elif "heading 2" in style_name or style_name == "heading2":
                    level = 2
                elif "heading 3" in style_name or style_name == "heading3":
                    level = 3
                elif "heading" in style_name:
                    # Try to extract number from style name
                    import re
                    match = re.search(r'heading\s*(\d+)', style_name)
                    level = int(match.group(1)) if match else 4

            if is_heading and text:
                # Save previous section's word count
                if current_section is not None and include_word_counts:
                    current_section["word_count"] = section_word_count

                # Detect if this is a chapter
                is_chapter = False
                chapter_number = None
                if detect_chapters:
                    text_lower = text.lower()
                    for pattern in chapter_pattern:
                        if pattern in text_lower:
                            is_chapter = True
                            # Try to extract chapter number
                            import re
                            match = re.search(r'(?:chapter|section|part)\s*(\d+)', text_lower)
                            if match:
                                chapter_number = int(match.group(1))
                            break

                current_section = {
                    "text": text[:150] + ("..." if len(text) > 150 else ""),
                    "level": level,
                    "style": para.style.name if para.style else "Unknown",
                    "paragraph_index": para_idx,
                    "is_chapter": is_chapter
                }

                if chapter_number is not None:
                    current_section["chapter_number"] = chapter_number

                outline.append(current_section)
                section_word_count = 0
            else:
                section_word_count += word_count

        # Don't forget last section
        if current_section is not None and include_word_counts:
            current_section["word_count"] = section_word_count

        # Build summary statistics
        chapters = [item for item in outline if item.get("is_chapter")]
        chapter_numbers = [c.get("chapter_number") for c in chapters if c.get("chapter_number")]

        # Detect missing chapters
        missing_chapters = []
        if chapter_numbers:
            expected = set(range(1, max(chapter_numbers) + 1))
            found = set(chapter_numbers)
            missing_chapters = sorted(expected - found)

        return {
            "outline": outline,
            "summary": {
                "total_headings": len(outline),
                "chapters_found": len(chapters),
                "chapter_numbers": chapter_numbers,
                "missing_chapters": missing_chapters,
                "total_words": total_words,
                "total_paragraphs": len(doc.paragraphs)
            },
            "extraction_time": round(time.time() - start_time, 3)
        }

    @mcp_tool(
        name="check_style_consistency",
        description="Analyze a Word document for style inconsistencies, formatting issues, and potential problems like mismatched heading styles or missing chapters."
    )
    @handle_office_errors("Style consistency check")
    async def check_style_consistency(
        self,
        file_path: str = Field(description="Path to Word document or URL")
    ) -> dict[str, Any]:
        """Check document for style and formatting consistency issues."""
        from docx import Document

        start_time = time.time()
        local_path = await resolve_office_file_path(file_path)

        validation = await validate_office_file(local_path)
        if not validation["is_valid"]:
            raise OfficeFileError(f"Invalid file: {', '.join(validation['errors'])}")

        doc = Document(local_path)

        issues = []
        warnings = []

        # Track heading styles and chapter detection
        heading_styles = {}
        chapters_by_style = {"heading": [], "other": []}
        chapter_numbers_found = []

        import re
        chapter_pattern = re.compile(r'^chapter\s*(\d+)', re.IGNORECASE)

        for para_idx, para in enumerate(doc.paragraphs):
            text = para.text.strip()
            style_name = para.style.name if para.style else "None"
            style_lower = style_name.lower()

            # Track style usage
            heading_styles[style_name] = heading_styles.get(style_name, 0) + 1

            # Check for chapter-like text
            chapter_match = chapter_pattern.match(text)
            if chapter_match:
                chapter_num = int(chapter_match.group(1))
                chapter_numbers_found.append(chapter_num)

                is_heading_style = "heading" in style_lower

                if is_heading_style:
                    chapters_by_style["heading"].append({
                        "chapter": chapter_num,
                        "text": text[:80],
                        "style": style_name,
                        "paragraph": para_idx
                    })
                else:
                    chapters_by_style["other"].append({
                        "chapter": chapter_num,
                        "text": text[:80],
                        "style": style_name,
                        "paragraph": para_idx
                    })
                    issues.append({
                        "type": "inconsistent_chapter_style",
                        "severity": "warning",
                        "message": f"Chapter {chapter_num} uses '{style_name}' instead of a Heading style",
                        "paragraph": para_idx,
                        "text": text[:80]
                    })

            # Check for potential headings that aren't styled as headings
            if text and len(text) < 100 and not text.endswith('.'):
                is_heading_style = "heading" in style_lower or "title" in style_lower
                looks_like_heading = any(word in text.lower() for word in
                                         ["chapter", "section", "part", "introduction", "conclusion", "appendix"])

                if looks_like_heading and not is_heading_style:
                    warnings.append({
                        "type": "potential_heading_not_styled",
                        "message": f"Text looks like a heading but uses '{style_name}' style",
                        "paragraph": para_idx,
                        "text": text[:80]
                    })

        # Check for missing chapters in sequence
        missing_chapters = []
        if chapter_numbers_found:
            chapter_numbers_found.sort()
            expected = set(range(1, max(chapter_numbers_found) + 1))
            found = set(chapter_numbers_found)
            missing_chapters = sorted(expected - found)

        for missing in missing_chapters:
            issues.append({
                "type": "missing_chapter",
                "severity": "error",
                "message": f"Chapter {missing} appears to be missing from sequence",
                "expected_between": f"Chapter {missing-1} and Chapter {missing+1}" if missing > 1 else f"Before Chapter {missing+1}"
            })

        # Check for duplicate chapter numbers
        from collections import Counter
        chapter_counts = Counter(chapter_numbers_found)
        duplicates = {num: count for num, count in chapter_counts.items() if count > 1}
        for chapter_num, count in duplicates.items():
            issues.append({
                "type": "duplicate_chapter",
                "severity": "warning",
                "message": f"Chapter {chapter_num} appears {count} times"
            })

        # Summary of heading style usage
        heading_summary = {k: v for k, v in heading_styles.items()
                           if "heading" in k.lower() or "title" in k.lower()}

        return {
            "issues": issues,
            "warnings": warnings,
            "chapter_analysis": {
                "total_chapters": len(chapter_numbers_found),
                "chapters_with_heading_style": len(chapters_by_style["heading"]),
                "chapters_without_heading_style": len(chapters_by_style["other"]),
                "missing_chapters": missing_chapters,
                "duplicate_chapters": list(duplicates.keys()),
                "chapter_details": chapters_by_style
            },
            "style_usage": heading_summary,
            "health_score": self._calculate_doc_health_score(issues, warnings),
            "analysis_time": round(time.time() - start_time, 3)
        }

    def _calculate_doc_health_score(self, issues: list, warnings: list) -> dict:
        """Calculate document health score based on issues found."""
        score = 100

        for issue in issues:
            if issue.get("severity") == "error":
                score -= 10
            elif issue.get("severity") == "warning":
                score -= 5

        for _ in warnings:
            score -= 2

        score = max(0, min(100, score))

        if score >= 90:
            rating = "excellent"
        elif score >= 70:
            rating = "good"
        elif score >= 50:
            rating = "fair"
        else:
            rating = "needs attention"

        return {"score": score, "rating": rating}

    @mcp_tool(
        name="search_document",
        description="Search for text within a Word document and return matches with surrounding context and location information."
    )
    @handle_office_errors("Document search")
    async def search_document(
        self,
        file_path: str = Field(description="Path to Word document or URL"),
        query: str = Field(description="Text to search for (case-insensitive)"),
        context_chars: int = Field(default=100, description="Number of characters of context before and after match"),
        max_results: int = Field(default=20, description="Maximum number of results to return")
    ) -> dict[str, Any]:
        """Search document for text with context."""
        from docx import Document

        start_time = time.time()
        local_path = await resolve_office_file_path(file_path)

        validation = await validate_office_file(local_path)
        if not validation["is_valid"]:
            raise OfficeFileError(f"Invalid file: {', '.join(validation['errors'])}")

        doc = Document(local_path)
        query_lower = query.lower()

        results = []
        current_chapter = None
        current_section = None

        for para_idx, para in enumerate(doc.paragraphs):
            text = para.text
            style_name = para.style.name if para.style else ""
            style_lower = style_name.lower()

            # Track current chapter/section for context
            if "heading" in style_lower or "title" in style_lower:
                if "1" in style_name or "title" in style_lower:
                    current_chapter = text.strip()[:80]
                    current_section = None
                else:
                    current_section = text.strip()[:80]

            # Search for matches
            text_lower = text.lower()
            search_start = 0

            while True:
                pos = text_lower.find(query_lower, search_start)
                if pos == -1:
                    break

                if len(results) >= max_results:
                    break

                # Extract context
                context_start = max(0, pos - context_chars)
                context_end = min(len(text), pos + len(query) + context_chars)

                context = text[context_start:context_end]
                if context_start > 0:
                    context = "..." + context
                if context_end < len(text):
                    context = context + "..."

                results.append({
                    "paragraph_index": para_idx,
                    "position": pos,
                    "context": context,
                    "chapter": current_chapter,
                    "section": current_section,
                    "style": style_name
                })

                search_start = pos + 1

            if len(results) >= max_results:
                break

        return {
            "query": query,
            "total_matches": len(results),
            "results": results,
            "search_time": round(time.time() - start_time, 3),
            "truncated": len(results) >= max_results
        }

    @mcp_tool(
        name="extract_entities",
        description="Extract named entities (people, places, organizations) from a Word document using pattern-based recognition. Great for identifying key characters, locations, and institutions mentioned in the text."
    )
    @handle_office_errors("Entity extraction")
    async def extract_entities(
        self,
        file_path: str = Field(description="Path to Word document or URL"),
        entity_types: str = Field(default="all", description="Entity types to extract: 'all', 'people', 'places', 'organizations', or comma-separated combination"),
        min_occurrences: int = Field(default=1, description="Minimum occurrences for an entity to be included"),
        include_context: bool = Field(default=True, description="Include sample context for each entity")
    ) -> dict[str, Any]:
        """Extract named entities from document using pattern-based recognition."""
        from docx import Document
        from collections import defaultdict
        import re

        start_time = time.time()
        local_path = await resolve_office_file_path(file_path)

        validation = await validate_office_file(local_path)
        if not validation["is_valid"]:
            raise OfficeFileError(f"Invalid file: {', '.join(validation['errors'])}")

        doc = Document(local_path)

        # Parse entity types to extract
        if entity_types == "all":
            extract_types = {"people", "places", "organizations"}
        else:
            extract_types = set(t.strip().lower() for t in entity_types.split(","))

        # Entity containers with context tracking
        entities = {
            "people": defaultdict(lambda: {"count": 0, "contexts": []}),
            "places": defaultdict(lambda: {"count": 0, "contexts": []}),
            "organizations": defaultdict(lambda: {"count": 0, "contexts": []})
        }

        # Patterns for entity detection
        # Titles indicating people
        title_pattern = re.compile(
            r'\b(Dr\.?|Mr\.?|Mrs\.?|Ms\.?|Miss|Professor|Prof\.?|Sister|Father|Rev\.?|'
            r'President|Director|Nurse|RN|LPN|MD)\s+([A-Z][a-z]+(?:\s+[A-Z][a-z]+)?)',
            re.IGNORECASE
        )

        # Organization patterns
        org_suffixes = re.compile(
            r'\b([A-Z][a-zA-Z\s\'\-]+(?:Hospital|Medical Center|Center|Clinic|University|'
            r'College|School|Association|Institute|Foundation|Department|Administration|'
            r'Committee|Board|Agency|Service|Company|Inc|Corp|LLC|VA|ANA))\b'
        )

        # Place patterns (cities, states, geographic locations)
        place_patterns = re.compile(
            r'\b([A-Z][a-z]+(?:\s+[A-Z][a-z]+)*),\s*((?:[A-Z]{2}|[A-Z][a-z]+(?:\s+[A-Z][a-z]+)*))\b|'
            r'\b((?:North|South|East|West)\s+[A-Z][a-z]+)\b|'
            r'\b([A-Z][a-z]+(?:\s+[A-Z][a-z]+)*)\s+(?:City|County|State|Valley|Mountain|River|Lake|Island)\b'
        )

        # Known US states for validation
        us_states = {
            'Alabama', 'Alaska', 'Arizona', 'Arkansas', 'California', 'Colorado',
            'Connecticut', 'Delaware', 'Florida', 'Georgia', 'Hawaii', 'Idaho',
            'Illinois', 'Indiana', 'Iowa', 'Kansas', 'Kentucky', 'Louisiana',
            'Maine', 'Maryland', 'Massachusetts', 'Michigan', 'Minnesota',
            'Mississippi', 'Missouri', 'Montana', 'Nebraska', 'Nevada',
            'New Hampshire', 'New Jersey', 'New Mexico', 'New York',
            'North Carolina', 'North Dakota', 'Ohio', 'Oklahoma', 'Oregon',
            'Pennsylvania', 'Rhode Island', 'South Carolina', 'South Dakota',
            'Tennessee', 'Texas', 'Utah', 'Vermont', 'Virginia', 'Washington',
            'West Virginia', 'Wisconsin', 'Wyoming', 'DC', 'ID', 'WA', 'NY',
            'CA', 'ND', 'MN', 'IA', 'MT', 'OR', 'NV', 'AZ', 'NM', 'CO', 'WY'
        }

        # Common first names for better people detection
        common_titles = {'dr', 'mr', 'mrs', 'ms', 'miss', 'professor', 'prof',
                         'sister', 'father', 'rev', 'president', 'director', 'nurse'}

        current_chapter = "Document Start"

        for para_idx, para in enumerate(doc.paragraphs):
            text = para.text
            style_name = para.style.name if para.style else ""

            # Track chapters for context
            if "heading" in style_name.lower() and "1" in style_name:
                current_chapter = text.strip()[:60]

            # Skip very short paragraphs
            if len(text) < 10:
                continue

            # Extract people
            if "people" in extract_types:
                for match in title_pattern.finditer(text):
                    title = match.group(1)
                    name = match.group(2).strip()
                    full_name = f"{title} {name}".strip()

                    # Clean up the name
                    if len(name) >= 2:
                        entities["people"][full_name]["count"] += 1
                        if include_context and len(entities["people"][full_name]["contexts"]) < 3:
                            # Get surrounding context
                            start = max(0, match.start() - 30)
                            end = min(len(text), match.end() + 50)
                            context = text[start:end].strip()
                            entities["people"][full_name]["contexts"].append({
                                "text": f"...{context}...",
                                "chapter": current_chapter,
                                "paragraph": para_idx
                            })

                # Also look for standalone capitalized names after verbs
                name_after_verb = re.finditer(
                    r'\b(?:said|told|asked|replied|answered|explained|noted|added|mentioned)\s+'
                    r'([A-Z][a-z]+(?:\s+[A-Z][a-z]+)?)\b',
                    text
                )
                for match in name_after_verb:
                    name = match.group(1).strip()
                    if len(name) >= 3 and name not in us_states:
                        entities["people"][name]["count"] += 1
                        if include_context and len(entities["people"][name]["contexts"]) < 3:
                            start = max(0, match.start() - 20)
                            end = min(len(text), match.end() + 40)
                            context = text[start:end].strip()
                            entities["people"][name]["contexts"].append({
                                "text": f"...{context}...",
                                "chapter": current_chapter,
                                "paragraph": para_idx
                            })

            # Extract organizations
            if "organizations" in extract_types:
                for match in org_suffixes.finditer(text):
                    org_name = match.group(1).strip()
                    if len(org_name) >= 5:
                        entities["organizations"][org_name]["count"] += 1
                        if include_context and len(entities["organizations"][org_name]["contexts"]) < 3:
                            start = max(0, match.start() - 20)
                            end = min(len(text), match.end() + 40)
                            context = text[start:end].strip()
                            entities["organizations"][org_name]["contexts"].append({
                                "text": f"...{context}...",
                                "chapter": current_chapter,
                                "paragraph": para_idx
                            })

            # Extract places
            if "places" in extract_types:
                for match in place_patterns.finditer(text):
                    # Try different capture groups
                    place = None
                    if match.group(1) and match.group(2):  # City, State pattern
                        city = match.group(1).strip()
                        state = match.group(2).strip()
                        if state in us_states or len(state) == 2:
                            place = f"{city}, {state}"
                    elif match.group(3):  # Directional places
                        place = match.group(3).strip()
                    elif match.group(4):  # Geographic features
                        place = match.group(4).strip()

                    if place and len(place) >= 3:
                        entities["places"][place]["count"] += 1
                        if include_context and len(entities["places"][place]["contexts"]) < 3:
                            start = max(0, match.start() - 20)
                            end = min(len(text), match.end() + 40)
                            context = text[start:end].strip()
                            entities["places"][place]["contexts"].append({
                                "text": f"...{context}...",
                                "chapter": current_chapter,
                                "paragraph": para_idx
                            })

        # Filter by minimum occurrences and prepare output
        def filter_and_sort(entity_dict, min_count):
            filtered = []
            for name, data in entity_dict.items():
                if data["count"] >= min_count:
                    entry = {
                        "name": name,
                        "occurrences": data["count"]
                    }
                    if include_context and data["contexts"]:
                        entry["sample_contexts"] = data["contexts"]
                    filtered.append(entry)
            return sorted(filtered, key=lambda x: x["occurrences"], reverse=True)

        result = {
            "entities": {},
            "summary": {
                "total_entities": 0,
                "by_type": {}
            },
            "extraction_time": round(time.time() - start_time, 3)
        }

        for entity_type in extract_types:
            if entity_type in entities:
                filtered = filter_and_sort(entities[entity_type], min_occurrences)
                result["entities"][entity_type] = filtered
                result["summary"]["by_type"][entity_type] = len(filtered)
                result["summary"]["total_entities"] += len(filtered)

        return result

    @mcp_tool(
        name="get_chapter_summaries",
        description="Get brief summaries/previews of each chapter in a Word document. Extracts the opening sentences of each chapter to give a quick overview of content."
    )
    @handle_office_errors("Chapter summaries")
    async def get_chapter_summaries(
        self,
        file_path: str = Field(description="Path to Word document or URL"),
        sentences_per_chapter: int = Field(default=3, description="Number of opening sentences to include per chapter"),
        include_word_counts: bool = Field(default=True, description="Include word count for each chapter")
    ) -> dict[str, Any]:
        """Extract chapter summaries/previews from document."""
        from docx import Document
        import re

        start_time = time.time()
        local_path = await resolve_office_file_path(file_path)

        validation = await validate_office_file(local_path)
        if not validation["is_valid"]:
            raise OfficeFileError(f"Invalid file: {', '.join(validation['errors'])}")

        doc = Document(local_path)

        chapters = []
        current_chapter = None
        chapter_text = []
        chapter_word_count = 0
        chapter_pattern = re.compile(r'^chapter\s*(\d+)', re.IGNORECASE)

        def extract_preview(text_paragraphs, num_sentences):
            """Extract first N sentences from collected paragraphs."""
            full_text = " ".join(text_paragraphs)
            # Simple sentence splitting
            sentences = re.split(r'(?<=[.!?])\s+', full_text)
            preview_sentences = sentences[:num_sentences]
            return " ".join(preview_sentences).strip()

        def save_current_chapter():
            """Save the current chapter's data."""
            nonlocal current_chapter, chapter_text, chapter_word_count
            if current_chapter:
                preview = extract_preview(chapter_text, sentences_per_chapter)
                chapter_data = {
                    "chapter_number": current_chapter["number"],
                    "title": current_chapter["title"],
                    "paragraph_index": current_chapter["paragraph_index"],
                    "preview": preview if preview else "(No text content found)",
                }
                if include_word_counts:
                    chapter_data["word_count"] = chapter_word_count
                chapters.append(chapter_data)

        for para_idx, para in enumerate(doc.paragraphs):
            text = para.text.strip()
            style_name = para.style.name if para.style else ""

            # Check if this is a chapter heading
            chapter_match = chapter_pattern.match(text)
            if chapter_match:
                # Save previous chapter first
                save_current_chapter()

                # Start new chapter
                current_chapter = {
                    "number": int(chapter_match.group(1)),
                    "title": text[:100],
                    "paragraph_index": para_idx
                }
                chapter_text = []
                chapter_word_count = 0
            elif current_chapter:
                # Accumulate text for current chapter
                if text:
                    word_count = len(text.split())
                    chapter_word_count += word_count
                    # Only collect first portion of text for preview
                    if len(" ".join(chapter_text)) < 1000:
                        chapter_text.append(text)

        # Don't forget the last chapter
        save_current_chapter()

        # Calculate statistics
        total_words = sum(c.get("word_count", 0) for c in chapters)
        avg_words = total_words // len(chapters) if chapters else 0

        return {
            "chapters": chapters,
            "summary": {
                "total_chapters": len(chapters),
                "total_words": total_words,
                "average_words_per_chapter": avg_words,
                "shortest_chapter": min((c for c in chapters), key=lambda x: x.get("word_count", 0), default=None),
                "longest_chapter": max((c for c in chapters), key=lambda x: x.get("word_count", 0), default=None)
            },
            "extraction_time": round(time.time() - start_time, 3)
        }

    @mcp_tool(
        name="save_reading_progress",
        description="Save your reading progress in a Word document. Creates a bookmark file to track which chapter/paragraph you're on, so you can resume reading later."
    )
    @handle_office_errors("Save reading progress")
    async def save_reading_progress(
        self,
        file_path: str = Field(description="Path to Word document"),
        chapter_number: int = Field(default=1, description="Current chapter number"),
        paragraph_index: int = Field(default=0, description="Current paragraph index"),
        notes: str = Field(default="", description="Optional notes about where you left off")
    ) -> dict[str, Any]:
        """Save reading progress to a bookmark file."""
        import json
        from datetime import datetime

        local_path = await resolve_office_file_path(file_path)

        validation = await validate_office_file(local_path)
        if not validation["is_valid"]:
            raise OfficeFileError(f"Invalid file: {', '.join(validation['errors'])}")

        # Create bookmark file path (same location as document)
        doc_dir = os.path.dirname(local_path)
        doc_name = os.path.splitext(os.path.basename(local_path))[0]
        bookmark_path = os.path.join(doc_dir, f".{doc_name}.reading_progress.json")

        # Load existing bookmarks or create new
        bookmarks = {"history": []}
        if os.path.exists(bookmark_path):
            try:
                with open(bookmark_path, 'r') as f:
                    bookmarks = json.load(f)
            except (json.JSONDecodeError, IOError):
                bookmarks = {"history": []}

        # Create new bookmark entry
        bookmark = {
            "timestamp": datetime.now().isoformat(),
            "chapter": chapter_number,
            "paragraph_index": paragraph_index,
            "notes": notes
        }

        # Update current position and add to history
        bookmarks["current"] = bookmark
        bookmarks["document"] = os.path.basename(local_path)
        bookmarks["history"].append(bookmark)

        # Keep only last 50 history entries
        if len(bookmarks["history"]) > 50:
            bookmarks["history"] = bookmarks["history"][-50:]

        # Save bookmark file
        with open(bookmark_path, 'w') as f:
            json.dump(bookmarks, f, indent=2)

        return {
            "saved": True,
            "bookmark_file": bookmark_path,
            "position": {
                "chapter": chapter_number,
                "paragraph_index": paragraph_index
            },
            "notes": notes,
            "timestamp": bookmark["timestamp"],
            "history_entries": len(bookmarks["history"])
        }

    @mcp_tool(
        name="get_reading_progress",
        description="Retrieve your saved reading progress for a Word document. Shows where you left off and your reading history."
    )
    @handle_office_errors("Get reading progress")
    async def get_reading_progress(
        self,
        file_path: str = Field(description="Path to Word document")
    ) -> dict[str, Any]:
        """Retrieve saved reading progress from bookmark file."""
        import json

        local_path = await resolve_office_file_path(file_path)

        validation = await validate_office_file(local_path)
        if not validation["is_valid"]:
            raise OfficeFileError(f"Invalid file: {', '.join(validation['errors'])}")

        # Find bookmark file
        doc_dir = os.path.dirname(local_path)
        doc_name = os.path.splitext(os.path.basename(local_path))[0]
        bookmark_path = os.path.join(doc_dir, f".{doc_name}.reading_progress.json")

        if not os.path.exists(bookmark_path):
            return {
                "has_progress": False,
                "message": "No reading progress saved for this document. Use save_reading_progress to save your position."
            }

        # Load bookmarks
        try:
            with open(bookmark_path, 'r') as f:
                bookmarks = json.load(f)
        except (json.JSONDecodeError, IOError) as e:
            return {
                "has_progress": False,
                "error": f"Could not read bookmark file: {str(e)}"
            }

        current = bookmarks.get("current", {})
        history = bookmarks.get("history", [])

        return {
            "has_progress": True,
            "document": bookmarks.get("document", os.path.basename(local_path)),
            "current_position": {
                "chapter": current.get("chapter"),
                "paragraph_index": current.get("paragraph_index"),
                "notes": current.get("notes", ""),
                "last_read": current.get("timestamp")
            },
            "reading_sessions": len(history),
            "recent_history": history[-5:] if history else [],
            "bookmark_file": bookmark_path
        }