mcp-office-tools/src/mcp_office_tools/utils/processing.py

"""Universal processing helper functions for Office documents.

This module contains helper functions used across different document processing
operations including metadata extraction, health scoring, content truncation,
and page range parsing.
"""

import os
import tempfile
from typing import Any

# Configuration
TEMP_DIR = os.environ.get("OFFICE_TEMP_DIR", tempfile.gettempdir())
DEBUG = os.environ.get("DEBUG", "false").lower() == "true"


async def _extract_basic_metadata(file_path: str, extension: str, category: str) -> dict[str, Any]:
    """Extract basic metadata from Office documents."""
    metadata = {"category": category, "extension": extension}

    try:
        if extension in [".docx", ".xlsx", ".pptx"] and category in ["word", "excel", "powerpoint"]:
            import zipfile

            with zipfile.ZipFile(file_path, 'r') as zip_file:
                # Core properties
                if 'docProps/core.xml' in zip_file.namelist():
                    zip_file.read('docProps/core.xml').decode('utf-8')
                    metadata["has_core_properties"] = True

                # App properties
                if 'docProps/app.xml' in zip_file.namelist():
                    zip_file.read('docProps/app.xml').decode('utf-8')
                    metadata["has_app_properties"] = True

    except Exception:
        pass

    return metadata


def _calculate_health_score(validation: dict[str, Any], format_info: dict[str, Any]) -> int:
    """Calculate document health score (1-10)."""
    score = 10

    # Deduct for validation errors
    if not validation["is_valid"]:
        score -= 5

    if validation["errors"]:
        score -= len(validation["errors"]) * 2

    if validation["warnings"]:
        score -= len(validation["warnings"])

    # Deduct for problematic characteristics
    if validation.get("password_protected"):
        score -= 1

    if format_info.get("is_legacy"):
        score -= 1

    structure = format_info.get("structure", {})
    if structure.get("estimated_complexity") == "complex":
        score -= 1

    return max(1, min(10, score))


def _get_health_recommendations(validation: dict[str, Any], format_info: dict[str, Any]) -> list[str]:
    """Get health improvement recommendations."""
    recommendations = []

    if validation["errors"]:
        recommendations.append("Fix validation errors before processing")

    if validation.get("password_protected"):
        recommendations.append("Remove password protection if possible")

    if format_info.get("is_legacy"):
        recommendations.append("Consider converting to modern format (.docx, .xlsx, .pptx)")

    structure = format_info.get("structure", {})
    if structure.get("estimated_complexity") == "complex":
        recommendations.append("Complex document may require specialized processing")

    if not recommendations:
        recommendations.append("Document appears healthy and ready for processing")

    return recommendations


def _smart_truncate_content(content: str, max_chars: int) -> str:
    """Intelligently truncate content while preserving structure and readability."""
    if len(content) <= max_chars:
        return content

    lines = content.split('\n')
    truncated_lines = []
    current_length = 0

    # Try to preserve structure by stopping at a natural break point
    for line in lines:
        line_length = len(line) + 1  # +1 for newline

        # If adding this line would exceed limit
        if current_length + line_length > max_chars:
            # Try to find a good stopping point
            if truncated_lines:
                # Check if we're in the middle of a section
                last_lines = '\n'.join(truncated_lines[-3:]) if len(truncated_lines) >= 3 else '\n'.join(truncated_lines)

                # If we stopped mid-paragraph, remove incomplete paragraph
                if not (line.strip() == '' or line.startswith('#') or line.startswith('|')):
                    # Remove lines until we hit a natural break
                    while truncated_lines and not (
                        truncated_lines[-1].strip() == '' or
                        truncated_lines[-1].startswith('#') or
                        truncated_lines[-1].startswith('|') or
                        truncated_lines[-1].startswith('-') or
                        truncated_lines[-1].startswith('*')
                    ):
                        truncated_lines.pop()
            break

        truncated_lines.append(line)
        current_length += line_length

    # Add truncation notice
    result = '\n'.join(truncated_lines)
    result += f"\n\n---\n**[CONTENT TRUNCATED]**\nShowing {len(result):,} of {len(content):,} characters.\nUse smaller page ranges (e.g., 3-5 pages) for full content without truncation.\n---"

    return result


def _parse_page_range(page_range: str) -> list[int]:
    """Parse page range string into list of page numbers.

    Examples:
        "1-5" -> [1, 2, 3, 4, 5]
        "1,3,5" -> [1, 3, 5]
        "1-3,5,7-9" -> [1, 2, 3, 5, 7, 8, 9]
    """
    pages = set()

    for part in page_range.split(','):
        part = part.strip()
        if '-' in part:
            # Handle range like "1-5"
            start, end = part.split('-', 1)
            try:
                start_num = int(start.strip())
                end_num = int(end.strip())
                pages.update(range(start_num, end_num + 1))
            except ValueError:
                continue
        else:
            # Handle single page like "3"
            try:
                pages.add(int(part))
            except ValueError:
                continue

    return sorted(list(pages))


def _get_processing_recommendation(
    doc_analysis: dict[str, Any],
    page_range: str,
    summary_only: bool
) -> dict[str, Any]:
    """Generate intelligent processing recommendations based on document analysis."""

    estimated_pages = doc_analysis["estimated_pages"]
    content_size = doc_analysis["estimated_content_size"]

    recommendation = {
        "status": "optimal",
        "message": "",
        "suggested_workflow": [],
        "warnings": []
    }

    # Large document recommendations
    if content_size in ["large", "very_large"] and not page_range and not summary_only:
        recommendation["status"] = "suboptimal"
        recommendation["message"] = (
            f"⚠️  Large document detected ({estimated_pages} estimated pages). "
            "Consider using recommended workflow for better performance."
        )
        recommendation["suggested_workflow"] = [
            "1. First: Call with summary_only=true to get document overview and TOC",
            "2. Then: Use page_range to process specific sections (e.g., '1-5', '6-10', '15-20')",
            "3. Recommended: Use 3-8 page chunks to stay under 25k token MCP limit",
            "4. The tool auto-truncates if content is too large, but smaller ranges work better"
        ]
        recommendation["warnings"] = [
            "Page ranges >8 pages may hit 25k token response limit and get truncated",
            "Use smaller page ranges (3-5 pages) for dense content documents",
            "Auto-truncation preserves structure but loses content completeness"
        ]

    # Medium document recommendations
    elif content_size == "medium" and not page_range and not summary_only:
        recommendation["status"] = "caution"
        recommendation["message"] = (
            f"Medium document detected ({estimated_pages} estimated pages). "
            "Consider summary_only=true first if you encounter response size issues."
        )
        recommendation["suggested_workflow"] = [
            "Option 1: Try full processing (current approach)",
            "Option 2: Use summary_only=true first, then page_range if needed"
        ]

    # Optimal usage patterns
    elif summary_only:
        recommendation["message"] = "✅ Excellent! Using summary mode for initial document analysis."
        recommendation["suggested_workflow"] = [
            "After reviewing summary, use page_range to extract specific sections of interest"
        ]

    elif page_range and content_size in ["large", "very_large"]:
        recommendation["message"] = "✅ Perfect! Using page-range processing for efficient extraction."

    elif content_size == "small":
        recommendation["message"] = "✅ Small document - full processing is optimal."

    return recommendation