Refactor: Extract processing logic into utility modules

Complete architecture cleanup - eliminated duplicate server files:
- Deleted server_monolithic.py (2249 lines)
- Deleted server_legacy.py (2209 lines)

New utility modules created:
- utils/word_processing.py - Word extraction/conversion (preserves page range fixes)
- utils/excel_processing.py - Excel extraction
- utils/powerpoint_processing.py - PowerPoint extraction
- utils/processing.py - Universal helpers (parse_page_range, health checks, etc.)

Updated the mixins to import from utils instead of server_monolithic.
The entry point remains server.py (48 lines), built on the mixin architecture.
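
For orientation, a minimal sketch of what the slim server.py entry point might look like; the mixin class names come from the diffs below, while the package name, server name, and registration call are assumptions:

# Hypothetical sketch of the 48-line server.py entry point (not the actual file).
# UniversalMixin/WordMixin come from the diffs below; everything else is assumed.
from fastmcp import FastMCP

from office_server.mixins import UniversalMixin, WordMixin  # package path assumed; Excel/PowerPoint mixins omitted

mcp = FastMCP("office-document-server")

# Each mixin registers its tools on the shared server instance; register_all is
# the fastmcp MCPMixin helper (exact signature assumed here).
for mixin in (UniversalMixin(), WordMixin()):
    mixin.register_all(mcp_server=mcp)

if __name__ == "__main__":
    mcp.run()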

All 53 tests pass. Coverage improved from 11% to 22% by removing duplicate code.
Ryan Malloy 2026-01-11 05:08:18 -07:00
parent 8249afb763
commit af6aadf559
8 changed files with 1008 additions and 3615 deletions


@@ -293,7 +293,7 @@ class UniversalMixin(MCPMixin):
     async def _extract_text_by_category(self, file_path: str, extension: str, category: str, preserve_formatting: bool, method: str) -> dict[str, Any]:
         """Extract text based on document category."""
         # Import the appropriate extraction function
-        from ..server_monolithic import _extract_word_text, _extract_excel_text, _extract_powerpoint_text
+        from ..utils import _extract_word_text, _extract_excel_text, _extract_powerpoint_text

         if category == "word":
             return await _extract_word_text(file_path, extension, preserve_formatting, method)
@@ -306,7 +306,7 @@ class UniversalMixin(MCPMixin):
     async def _extract_images_by_category(self, file_path: str, extension: str, category: str, output_format: str, min_width: int, min_height: int) -> list[dict[str, Any]]:
         """Extract images based on document category."""
-        from ..server_monolithic import _extract_word_images, _extract_excel_images, _extract_powerpoint_images
+        from ..utils import _extract_word_images, _extract_excel_images, _extract_powerpoint_images

         if category == "word":
             return await _extract_word_images(file_path, extension, output_format, min_width, min_height)
@@ -319,7 +319,7 @@ class UniversalMixin(MCPMixin):
     async def _extract_metadata_by_category(self, file_path: str, extension: str, category: str) -> dict[str, Any]:
         """Extract metadata based on document category."""
-        from ..server_monolithic import _extract_word_metadata, _extract_excel_metadata, _extract_powerpoint_metadata, _extract_basic_metadata
+        from ..utils import _extract_word_metadata, _extract_excel_metadata, _extract_powerpoint_metadata, _extract_basic_metadata

         # Get basic metadata first
         metadata = await _extract_basic_metadata(file_path, extension, category)
@@ -339,5 +339,5 @@ class UniversalMixin(MCPMixin):
     async def _extract_basic_metadata(self, file_path: str, extension: str, category: str) -> dict[str, Any]:
         """Extract basic metadata common to all documents."""
-        from ..server_monolithic import _extract_basic_metadata
+        from ..utils import _extract_basic_metadata
         return await _extract_basic_metadata(file_path, extension, category)


@@ -225,17 +225,17 @@ class WordMixin(MCPMixin):
     # Helper methods - import from monolithic server
     async def _analyze_document_size(self, file_path: str, extension: str) -> dict[str, Any]:
         """Analyze document size for processing recommendations."""
-        from ..server_monolithic import _analyze_document_size
+        from ..utils import _analyze_document_size
         return await _analyze_document_size(file_path, extension)

     def _get_processing_recommendation(self, doc_analysis: dict[str, Any], page_range: str, summary_only: bool) -> dict[str, Any]:
         """Get processing recommendations based on document analysis."""
-        from ..server_monolithic import _get_processing_recommendation
+        from ..utils import _get_processing_recommendation
         return _get_processing_recommendation(doc_analysis, page_range, summary_only)

     def _parse_page_range(self, page_range: str) -> list[int]:
         """Parse page range string into list of page numbers."""
-        from ..server_monolithic import _parse_page_range
+        from ..utils import _parse_page_range
         return _parse_page_range(page_range)

     async def _convert_docx_to_markdown(
@@ -244,7 +244,7 @@ class WordMixin(MCPMixin):
         bookmark_name: str = "", chapter_name: str = ""
     ) -> dict[str, Any]:
         """Convert .docx to markdown."""
-        from ..server_monolithic import _convert_docx_to_markdown
+        from ..utils import _convert_docx_to_markdown
         return await _convert_docx_to_markdown(
             file_path, include_images, image_mode, max_image_size,
             preserve_structure, page_numbers, summary_only, output_dir, bookmark_name, chapter_name
@@ -255,7 +255,7 @@ class WordMixin(MCPMixin):
         preserve_structure: bool, page_numbers: list[int], summary_only: bool, output_dir: str
     ) -> dict[str, Any]:
         """Convert legacy .doc to markdown."""
-        from ..server_monolithic import _convert_doc_to_markdown
+        from ..utils import _convert_doc_to_markdown
         return await _convert_doc_to_markdown(
             file_path, include_images, image_mode, max_image_size,
             preserve_structure, page_numbers, summary_only, output_dir

File diff suppressed because it is too large.


@@ -27,6 +27,48 @@ from .decorators import (
     handle_office_errors
 )

+from .processing import (
+    TEMP_DIR,
+    DEBUG,
+    _extract_basic_metadata,
+    _calculate_health_score,
+    _get_health_recommendations,
+    _smart_truncate_content,
+    _parse_page_range,
+    _get_processing_recommendation,
+)
+from .word_processing import (
+    _extract_word_text,
+    _extract_word_images,
+    _extract_word_metadata,
+    _convert_docx_to_markdown,
+    _convert_docx_with_python_docx,
+    _convert_doc_to_markdown,
+    _get_ultra_fast_summary,
+    _find_bookmark_content_range,
+    _find_chapter_content_range,
+    _get_available_headings,
+    _has_page_break,
+    _analyze_document_size,
+    _paragraph_to_markdown,
+    _table_to_markdown,
+    _html_to_markdown,
+    _extract_markdown_structure,
+)
+from .excel_processing import (
+    _extract_excel_text,
+    _extract_excel_images,
+    _extract_excel_metadata,
+)
+from .powerpoint_processing import (
+    _extract_powerpoint_text,
+    _extract_powerpoint_images,
+    _extract_powerpoint_metadata,
+)

 __all__ = [
     # Validation
     "OfficeFileError",


@@ -0,0 +1,203 @@
"""Excel document processing utilities.

This module provides helper functions for extracting text, images, and metadata
from Excel documents (.xlsx, .xls, .xlsm, .csv) with intelligent method selection
and fallback support.
"""

from typing import Any

from . import OfficeFileError


async def _extract_excel_text(file_path: str, extension: str, preserve_formatting: bool, method: str) -> dict[str, Any]:
    """Extract text from Excel documents."""
    methods_tried = []

    if extension == ".csv":
        # CSV handling
        import pandas as pd
        try:
            df = pd.read_csv(file_path)
            text = df.to_string()
            return {
                "text": text,
                "method_used": "pandas",
                "methods_tried": ["pandas"],
                "formatted_sections": [{"type": "table", "data": df.to_dict()}] if preserve_formatting else []
            }
        except Exception as e:
            raise OfficeFileError(f"CSV processing failed: {str(e)}")

    # Excel file handling
    text = ""
    formatted_sections = []
    method_used = None

    method_order = ["openpyxl", "pandas", "xlrd"] if extension == ".xlsx" else ["xlrd", "pandas", "openpyxl"]

    for method_name in method_order:
        try:
            methods_tried.append(method_name)

            if method_name == "openpyxl" and extension in [".xlsx", ".xlsm"]:
                import openpyxl
                wb = openpyxl.load_workbook(file_path, data_only=True)

                text_parts = []
                for sheet_name in wb.sheetnames:
                    ws = wb[sheet_name]
                    text_parts.append(f"Sheet: {sheet_name}")

                    for row in ws.iter_rows(values_only=True):
                        row_text = "\t".join(str(cell) if cell is not None else "" for cell in row)
                        if row_text.strip():
                            text_parts.append(row_text)

                    if preserve_formatting:
                        formatted_sections.append({
                            "type": "worksheet",
                            "name": sheet_name,
                            "data": [[str(cell.value) if cell.value is not None else "" for cell in row] for row in ws.iter_rows()]
                        })

                text = "\n".join(text_parts)
                method_used = "openpyxl"
                break

            elif method_name == "pandas":
                import pandas as pd

                if extension in [".xlsx", ".xlsm"]:
                    dfs = pd.read_excel(file_path, sheet_name=None)
                else:  # .xls
                    dfs = pd.read_excel(file_path, sheet_name=None, engine='xlrd')

                text_parts = []
                for sheet_name, df in dfs.items():
                    text_parts.append(f"Sheet: {sheet_name}")
                    text_parts.append(df.to_string())

                    if preserve_formatting:
                        formatted_sections.append({
                            "type": "dataframe",
                            "name": sheet_name,
                            "data": df.to_dict()
                        })

                text = "\n\n".join(text_parts)
                method_used = "pandas"
                break

            elif method_name == "xlrd" and extension == ".xls":
                import xlrd
                wb = xlrd.open_workbook(file_path)

                text_parts = []
                for sheet in wb.sheets():
                    text_parts.append(f"Sheet: {sheet.name}")
                    for row_idx in range(sheet.nrows):
                        row = sheet.row_values(row_idx)
                        row_text = "\t".join(str(cell) for cell in row)
                        text_parts.append(row_text)

                text = "\n".join(text_parts)
                method_used = "xlrd"
                break

        except ImportError:
            continue
        except Exception:
            continue

    if not method_used:
        raise OfficeFileError(f"Failed to extract text using methods: {', '.join(methods_tried)}")

    return {
        "text": text,
        "method_used": method_used,
        "methods_tried": methods_tried,
        "formatted_sections": formatted_sections
    }


async def _extract_excel_images(file_path: str, extension: str, output_format: str, min_width: int, min_height: int) -> list[dict[str, Any]]:
    """Extract images from Excel documents."""
    import io
    import os
    import tempfile
    import zipfile

    from PIL import Image

    images = []
    TEMP_DIR = os.environ.get("OFFICE_TEMP_DIR", tempfile.gettempdir())

    if extension in [".xlsx", ".xlsm"]:
        try:
            with zipfile.ZipFile(file_path, 'r') as zip_file:
                # Look for images in media folder
                image_files = [f for f in zip_file.namelist() if f.startswith('xl/media/')]

                for i, img_path in enumerate(image_files):
                    try:
                        img_data = zip_file.read(img_path)
                        img = Image.open(io.BytesIO(img_data))

                        # Size filtering
                        if img.width >= min_width and img.height >= min_height:
                            # Save to temp file
                            temp_path = os.path.join(TEMP_DIR, f"excel_image_{i}.{output_format}")
                            img.save(temp_path, format=output_format.upper())

                            images.append({
                                "index": i,
                                "filename": os.path.basename(img_path),
                                "path": temp_path,
                                "width": img.width,
                                "height": img.height,
                                "format": img.format,
                                "size_bytes": len(img_data)
                            })
                    except Exception:
                        continue
        except Exception as e:
            raise OfficeFileError(f"Excel image extraction failed: {str(e)}")

    return images


async def _extract_excel_metadata(file_path: str, extension: str) -> dict[str, Any]:
    """Extract Excel-specific metadata."""
    metadata = {"type": "excel", "extension": extension}

    if extension in [".xlsx", ".xlsm"]:
        try:
            import openpyxl
            wb = openpyxl.load_workbook(file_path)
            props = wb.properties

            metadata.update({
                "title": props.title,
                "creator": props.creator,
                "subject": props.subject,
                "description": props.description,
                "keywords": props.keywords,
                "created": str(props.created) if props.created else None,
                "modified": str(props.modified) if props.modified else None
            })

            # Workbook structure
            metadata.update({
                "worksheet_count": len(wb.worksheets),
                "worksheet_names": wb.sheetnames,
                "has_charts": any(len(ws._charts) > 0 for ws in wb.worksheets),
                "has_images": any(len(ws._images) > 0 for ws in wb.worksheets)
            })
        except Exception:
            pass

    return metadata
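
For illustration, a minimal driver for the text-extraction helper above; the call signature comes from the diff, while the import path and sample filename are placeholders:

import asyncio

# Import path and file name are assumptions for illustration only.
from office_server.utils.excel_processing import _extract_excel_text

async def main() -> None:
    # The helper walks its openpyxl -> pandas -> xlrd fallback chain internally
    # and reports which backend succeeded.
    result = await _extract_excel_text("report.xlsx", ".xlsx", preserve_formatting=True, method="auto")
    print(f"{result['method_used']}: {len(result['text'])} characters extracted")

if __name__ == "__main__":
    asyncio.run(main())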


@@ -0,0 +1,177 @@
"""PowerPoint document processing utilities.

This module provides helper functions for extracting text, images, and metadata
from PowerPoint documents (.pptx and .ppt files).
"""

import io
import os
import zipfile
from typing import Any

from PIL import Image

from . import OfficeFileError


async def _extract_powerpoint_text(
    file_path: str, extension: str, preserve_formatting: bool, method: str
) -> dict[str, Any]:
    """Extract text from PowerPoint documents."""
    methods_tried = []

    if extension == ".pptx":
        try:
            import pptx

            prs = pptx.Presentation(file_path)
            text_parts = []
            formatted_sections = []

            for slide_num, slide in enumerate(prs.slides, 1):
                slide_text_parts = []

                for shape in slide.shapes:
                    if hasattr(shape, "text") and shape.text:
                        slide_text_parts.append(shape.text)

                slide_text = "\n".join(slide_text_parts)
                text_parts.append(f"Slide {slide_num}:\n{slide_text}")

                if preserve_formatting:
                    formatted_sections.append(
                        {
                            "type": "slide",
                            "number": slide_num,
                            "text": slide_text,
                            "shapes": len(slide.shapes),
                        }
                    )

            text = "\n\n".join(text_parts)

            return {
                "text": text,
                "method_used": "python-pptx",
                "methods_tried": ["python-pptx"],
                "formatted_sections": formatted_sections,
            }
        except ImportError:
            methods_tried.append("python-pptx")
        except Exception:
            methods_tried.append("python-pptx")

    # Legacy .ppt handling would require additional libraries
    if extension == ".ppt":
        raise OfficeFileError(
            "Legacy PowerPoint (.ppt) text extraction requires additional setup"
        )

    raise OfficeFileError(
        f"Failed to extract text using methods: {', '.join(methods_tried)}"
    )


async def _extract_powerpoint_images(
    file_path: str,
    extension: str,
    output_format: str,
    min_width: int,
    min_height: int,
    temp_dir: str,
) -> list[dict[str, Any]]:
    """Extract images from PowerPoint documents."""
    images = []

    if extension == ".pptx":
        try:
            with zipfile.ZipFile(file_path, "r") as zip_file:
                # Look for images in media folder
                image_files = [
                    f for f in zip_file.namelist() if f.startswith("ppt/media/")
                ]

                for i, img_path in enumerate(image_files):
                    try:
                        img_data = zip_file.read(img_path)
                        img = Image.open(io.BytesIO(img_data))

                        # Size filtering
                        if img.width >= min_width and img.height >= min_height:
                            # Save to temp file
                            temp_path = os.path.join(
                                temp_dir, f"powerpoint_image_{i}.{output_format}"
                            )
                            img.save(temp_path, format=output_format.upper())

                            images.append(
                                {
                                    "index": i,
                                    "filename": os.path.basename(img_path),
                                    "path": temp_path,
                                    "width": img.width,
                                    "height": img.height,
                                    "format": img.format,
                                    "size_bytes": len(img_data),
                                }
                            )
                    except Exception:
                        continue
        except Exception as e:
            raise OfficeFileError(f"PowerPoint image extraction failed: {str(e)}")

    return images


async def _extract_powerpoint_metadata(
    file_path: str, extension: str
) -> dict[str, Any]:
    """Extract PowerPoint-specific metadata."""
    metadata = {"type": "powerpoint", "extension": extension}

    if extension == ".pptx":
        try:
            import pptx

            prs = pptx.Presentation(file_path)
            core_props = prs.core_properties

            metadata.update(
                {
                    "title": core_props.title,
                    "author": core_props.author,
                    "subject": core_props.subject,
                    "keywords": core_props.keywords,
                    "comments": core_props.comments,
                    "created": str(core_props.created) if core_props.created else None,
                    "modified": str(core_props.modified)
                    if core_props.modified
                    else None,
                }
            )

            # Presentation structure
            slide_layouts = set()
            total_shapes = 0

            for slide in prs.slides:
                slide_layouts.add(slide.slide_layout.name)
                total_shapes += len(slide.shapes)

            metadata.update(
                {
                    "slide_count": len(prs.slides),
                    "slide_layouts": list(slide_layouts),
                    "total_shapes": total_shapes,
                    "slide_width": prs.slide_width,
                    "slide_height": prs.slide_height,
                }
            )
        except Exception:
            pass

    return metadata
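
As a sketch only, calling the metadata helper above looks like this; the import path and deck name are placeholders:

import asyncio

from office_server.utils.powerpoint_processing import _extract_powerpoint_metadata  # path assumed

async def main() -> None:
    meta = await _extract_powerpoint_metadata("deck.pptx", ".pptx")  # placeholder file
    # Structure fields are only populated when python-pptx can open the file.
    print(meta.get("slide_count"), meta.get("slide_layouts"), meta.get("total_shapes"))

if __name__ == "__main__":
    asyncio.run(main())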


@@ -0,0 +1,228 @@
"""Universal processing helper functions for Office documents.

This module contains helper functions used across different document processing
operations including metadata extraction, health scoring, content truncation,
and page range parsing.
"""

import os
import tempfile
from typing import Any

# Configuration
TEMP_DIR = os.environ.get("OFFICE_TEMP_DIR", tempfile.gettempdir())
DEBUG = os.environ.get("DEBUG", "false").lower() == "true"


async def _extract_basic_metadata(file_path: str, extension: str, category: str) -> dict[str, Any]:
    """Extract basic metadata from Office documents."""
    metadata = {"category": category, "extension": extension}

    try:
        if extension in [".docx", ".xlsx", ".pptx"] and category in ["word", "excel", "powerpoint"]:
            import zipfile

            with zipfile.ZipFile(file_path, 'r') as zip_file:
                # Core properties
                if 'docProps/core.xml' in zip_file.namelist():
                    zip_file.read('docProps/core.xml').decode('utf-8')
                    metadata["has_core_properties"] = True

                # App properties
                if 'docProps/app.xml' in zip_file.namelist():
                    zip_file.read('docProps/app.xml').decode('utf-8')
                    metadata["has_app_properties"] = True
    except Exception:
        pass

    return metadata


def _calculate_health_score(validation: dict[str, Any], format_info: dict[str, Any]) -> int:
    """Calculate document health score (1-10)."""
    score = 10

    # Deduct for validation errors
    if not validation["is_valid"]:
        score -= 5
    if validation["errors"]:
        score -= len(validation["errors"]) * 2
    if validation["warnings"]:
        score -= len(validation["warnings"])

    # Deduct for problematic characteristics
    if validation.get("password_protected"):
        score -= 1
    if format_info.get("is_legacy"):
        score -= 1

    structure = format_info.get("structure", {})
    if structure.get("estimated_complexity") == "complex":
        score -= 1

    return max(1, min(10, score))


def _get_health_recommendations(validation: dict[str, Any], format_info: dict[str, Any]) -> list[str]:
    """Get health improvement recommendations."""
    recommendations = []

    if validation["errors"]:
        recommendations.append("Fix validation errors before processing")

    if validation.get("password_protected"):
        recommendations.append("Remove password protection if possible")

    if format_info.get("is_legacy"):
        recommendations.append("Consider converting to modern format (.docx, .xlsx, .pptx)")

    structure = format_info.get("structure", {})
    if structure.get("estimated_complexity") == "complex":
        recommendations.append("Complex document may require specialized processing")

    if not recommendations:
        recommendations.append("Document appears healthy and ready for processing")

    return recommendations


def _smart_truncate_content(content: str, max_chars: int) -> str:
    """Intelligently truncate content while preserving structure and readability."""
    if len(content) <= max_chars:
        return content

    lines = content.split('\n')
    truncated_lines = []
    current_length = 0

    # Try to preserve structure by stopping at a natural break point
    for line in lines:
        line_length = len(line) + 1  # +1 for newline

        # If adding this line would exceed limit
        if current_length + line_length > max_chars:
            # Try to find a good stopping point
            if truncated_lines:
                # Check if we're in the middle of a section
                last_lines = '\n'.join(truncated_lines[-3:]) if len(truncated_lines) >= 3 else '\n'.join(truncated_lines)

                # If we stopped mid-paragraph, remove incomplete paragraph
                if not (line.strip() == '' or line.startswith('#') or line.startswith('|')):
                    # Remove lines until we hit a natural break
                    while truncated_lines and not (
                        truncated_lines[-1].strip() == '' or
                        truncated_lines[-1].startswith('#') or
                        truncated_lines[-1].startswith('|') or
                        truncated_lines[-1].startswith('-') or
                        truncated_lines[-1].startswith('*')
                    ):
                        truncated_lines.pop()
            break

        truncated_lines.append(line)
        current_length += line_length

    # Add truncation notice
    result = '\n'.join(truncated_lines)
    result += f"\n\n---\n**[CONTENT TRUNCATED]**\nShowing {len(result):,} of {len(content):,} characters.\nUse smaller page ranges (e.g., 3-5 pages) for full content without truncation.\n---"

    return result


def _parse_page_range(page_range: str) -> list[int]:
    """Parse page range string into list of page numbers.

    Examples:
        "1-5" -> [1, 2, 3, 4, 5]
        "1,3,5" -> [1, 3, 5]
        "1-3,5,7-9" -> [1, 2, 3, 5, 7, 8, 9]
    """
    pages = set()

    for part in page_range.split(','):
        part = part.strip()
        if '-' in part:
            # Handle range like "1-5"
            start, end = part.split('-', 1)
            try:
                start_num = int(start.strip())
                end_num = int(end.strip())
                pages.update(range(start_num, end_num + 1))
            except ValueError:
                continue
        else:
            # Handle single page like "3"
            try:
                pages.add(int(part))
            except ValueError:
                continue

    return sorted(list(pages))


def _get_processing_recommendation(
    doc_analysis: dict[str, Any],
    page_range: str,
    summary_only: bool
) -> dict[str, Any]:
    """Generate intelligent processing recommendations based on document analysis."""
    estimated_pages = doc_analysis["estimated_pages"]
    content_size = doc_analysis["estimated_content_size"]

    recommendation = {
        "status": "optimal",
        "message": "",
        "suggested_workflow": [],
        "warnings": []
    }

    # Large document recommendations
    if content_size in ["large", "very_large"] and not page_range and not summary_only:
        recommendation["status"] = "suboptimal"
        recommendation["message"] = (
            f"⚠️ Large document detected ({estimated_pages} estimated pages). "
            "Consider using recommended workflow for better performance."
        )
        recommendation["suggested_workflow"] = [
            "1. First: Call with summary_only=true to get document overview and TOC",
            "2. Then: Use page_range to process specific sections (e.g., '1-5', '6-10', '15-20')",
            "3. Recommended: Use 3-8 page chunks to stay under 25k token MCP limit",
            "4. The tool auto-truncates if content is too large, but smaller ranges work better"
        ]
        recommendation["warnings"] = [
            "Page ranges >8 pages may hit 25k token response limit and get truncated",
            "Use smaller page ranges (3-5 pages) for dense content documents",
            "Auto-truncation preserves structure but loses content completeness"
        ]

    # Medium document recommendations
    elif content_size == "medium" and not page_range and not summary_only:
        recommendation["status"] = "caution"
        recommendation["message"] = (
            f"Medium document detected ({estimated_pages} estimated pages). "
            "Consider summary_only=true first if you encounter response size issues."
        )
        recommendation["suggested_workflow"] = [
            "Option 1: Try full processing (current approach)",
            "Option 2: Use summary_only=true first, then page_range if needed"
        ]

    # Optimal usage patterns
    elif summary_only:
        recommendation["message"] = "✅ Excellent! Using summary mode for initial document analysis."
        recommendation["suggested_workflow"] = [
            "After reviewing summary, use page_range to extract specific sections of interest"
        ]
    elif page_range and content_size in ["large", "very_large"]:
        recommendation["message"] = "✅ Perfect! Using page-range processing for efficient extraction."
    elif content_size == "small":
        recommendation["message"] = "✅ Small document - full processing is optimal."

    return recommendation
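
The page-range and truncation helpers above are pure functions, so their behavior can be checked directly; a small usage sketch follows (only the import path is assumed):

from office_server.utils.processing import _parse_page_range, _smart_truncate_content  # path assumed

assert _parse_page_range("1-3,5,7-9") == [1, 2, 3, 5, 7, 8, 9]
assert _parse_page_range("4,2,2") == [2, 4]        # duplicates collapse, result is sorted
assert _parse_page_range("a,3") == [3]             # malformed parts are silently skipped

# Content under the limit is returned unchanged; anything longer is cut at a
# structural break and annotated with a truncation notice.
short = _smart_truncate_content("line one\nline two", max_chars=1000)
assert short == "line one\nline two"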