✨ Add comprehensive Markdown conversion with image support
- Add convert_to_markdown tool for .docx/.doc files
- Support multiple image handling modes (base64, files, references)
- Implement large document chunking for performance
- Preserve document structure (headings, lists, tables)
- Smart fallback methods (mammoth → python-docx → custom)
- Handle both modern and legacy Word formats
This commit is contained in:
parent
1b359c4c7c
commit
b3caed78d3
@ -4,23 +4,22 @@ FastMCP server providing 30+ tools for processing Word, Excel, PowerPoint docume
|
||||
including both modern formats (.docx, .xlsx, .pptx) and legacy formats (.doc, .xls, .ppt).
|
||||
"""
|
||||
|
||||
import time
|
||||
import tempfile
|
||||
import os
|
||||
from typing import Dict, Any, List, Optional, Union
|
||||
import tempfile
|
||||
import time
|
||||
from pathlib import Path
|
||||
from typing import Any
|
||||
|
||||
from fastmcp import FastMCP
|
||||
from pydantic import Field
|
||||
|
||||
from .utils import (
|
||||
OfficeFileError,
|
||||
validate_office_file,
|
||||
validate_office_path,
|
||||
detect_format,
|
||||
classify_document_type,
|
||||
detect_format,
|
||||
get_supported_extensions,
|
||||
resolve_office_file_path,
|
||||
get_supported_extensions
|
||||
validate_office_file,
|
||||
)
|
||||
|
||||
# Initialize FastMCP app
|
||||
@ -37,7 +36,7 @@ async def extract_text(
|
||||
preserve_formatting: bool = Field(default=False, description="Preserve text formatting and structure"),
|
||||
include_metadata: bool = Field(default=True, description="Include document metadata in output"),
|
||||
method: str = Field(default="auto", description="Extraction method: auto, primary, fallback")
|
||||
) -> Dict[str, Any]:
|
||||
) -> dict[str, Any]:
|
||||
"""Extract text content from Office documents with intelligent method selection.
|
||||
|
||||
Supports Word (.docx, .doc), Excel (.xlsx, .xls), PowerPoint (.pptx, .ppt),
|
||||
@ -105,7 +104,7 @@ async def extract_images(
|
||||
min_width: int = Field(default=100, description="Minimum image width in pixels"),
|
||||
min_height: int = Field(default=100, description="Minimum image height in pixels"),
|
||||
include_metadata: bool = Field(default=True, description="Include image metadata")
|
||||
) -> Dict[str, Any]:
|
||||
) -> dict[str, Any]:
|
||||
"""Extract images from Office documents with size filtering and format conversion."""
|
||||
start_time = time.time()
|
||||
|
||||
@ -158,7 +157,7 @@ async def extract_images(
|
||||
@app.tool()
|
||||
async def extract_metadata(
|
||||
file_path: str = Field(description="Path to Office document or URL")
|
||||
) -> Dict[str, Any]:
|
||||
) -> dict[str, Any]:
|
||||
"""Extract comprehensive metadata from Office documents."""
|
||||
start_time = time.time()
|
||||
|
||||
@ -215,7 +214,7 @@ async def extract_metadata(
|
||||
@app.tool()
|
||||
async def detect_office_format(
|
||||
file_path: str = Field(description="Path to Office document or URL")
|
||||
) -> Dict[str, Any]:
|
||||
) -> dict[str, Any]:
|
||||
"""Intelligent Office document format detection and analysis."""
|
||||
start_time = time.time()
|
||||
|
||||
@ -249,7 +248,7 @@ async def detect_office_format(
|
||||
@app.tool()
|
||||
async def analyze_document_health(
|
||||
file_path: str = Field(description="Path to Office document or URL")
|
||||
) -> Dict[str, Any]:
|
||||
) -> dict[str, Any]:
|
||||
"""Comprehensive document health and integrity analysis."""
|
||||
start_time = time.time()
|
||||
|
||||
@ -286,7 +285,93 @@ async def analyze_document_health(
|
||||
|
||||
|
||||
@app.tool()
async def convert_to_markdown(
    file_path: str = Field(description="Path to Office document or URL"),
    include_images: bool = Field(default=True, description="Include images in markdown with base64 encoding or file references"),
    image_mode: str = Field(default="base64", description="Image handling mode: 'base64', 'files', or 'references'"),
    max_image_size: int = Field(default=1024*1024, description="Maximum image size in bytes for base64 encoding"),
    preserve_structure: bool = Field(default=True, description="Preserve document structure (headings, lists, tables)"),
    chunk_size: int = Field(default=0, description="Split large documents into chunks (0 = no chunking)"),
    output_dir: str = Field(default="", description="Output directory for image files (if image_mode='files')")
) -> dict[str, Any]:
    """Convert Office documents to Markdown format with image support and structure preservation.

    Handles large .docx files efficiently with options for image embedding, file extraction,
    and document chunking for very large files.

    Returns a dict with "markdown", "metadata", and optionally "chunks",
    "images", and "structure" keys.

    Raises:
        OfficeFileError: If the file is invalid, is not a Word document,
            or conversion fails for any reason (the original exception is
            chained as the cause).
    """
    start_time = time.time()

    try:
        # Resolve file path (may fetch a URL to a local temp file)
        local_path = await resolve_office_file_path(file_path)

        # Validate file before attempting conversion
        validation = await validate_office_file(local_path)
        if not validation["is_valid"]:
            raise OfficeFileError(f"Invalid file: {', '.join(validation['errors'])}")

        # Get format info
        format_info = await detect_format(local_path)
        category = format_info["category"]
        extension = format_info["extension"]

        # Currently focused on Word documents for markdown conversion
        if category != "word":
            raise OfficeFileError(f"Markdown conversion currently only supports Word documents, got: {category}")

        # Convert to markdown based on format
        if extension == ".docx":
            markdown_result = await _convert_docx_to_markdown(
                local_path, include_images, image_mode, max_image_size,
                preserve_structure, chunk_size, output_dir
            )
        else:  # .doc
            # For legacy .doc files, use mammoth if available
            markdown_result = await _convert_doc_to_markdown(
                local_path, include_images, image_mode, max_image_size,
                preserve_structure, chunk_size, output_dir
            )

        result = {
            "markdown": markdown_result["content"],
            "metadata": {
                "original_file": os.path.basename(local_path),
                "format": format_info["format_name"],
                "conversion_method": markdown_result["method_used"],
                "character_count": len(markdown_result["content"]),
                "word_count": len(markdown_result["content"].split()),
                "conversion_time": round(time.time() - start_time, 3)
            }
        }

        # Add chunking info if applicable
        if chunk_size > 0 and markdown_result.get("chunks"):
            result["chunks"] = markdown_result["chunks"]
            result["metadata"]["chunk_count"] = len(markdown_result["chunks"])

        # Add image info
        if include_images and markdown_result.get("images"):
            result["images"] = markdown_result["images"]
            result["metadata"]["image_count"] = len(markdown_result["images"])
            result["metadata"]["total_image_size"] = sum(
                img.get("size_bytes", 0) for img in markdown_result["images"]
            )

        # Add structure info
        if preserve_structure and markdown_result.get("structure"):
            result["structure"] = markdown_result["structure"]

        return result

    except Exception as e:
        if DEBUG:
            import traceback
            traceback.print_exc()
        # Chain the original exception so the real cause is preserved
        raise OfficeFileError(f"Markdown conversion failed: {str(e)}") from e
|
||||
|
||||
|
||||
@app.tool()
|
||||
async def get_supported_formats() -> dict[str, Any]:
|
||||
"""Get list of all supported Office document formats and their capabilities."""
|
||||
extensions = get_supported_extensions()
|
||||
|
||||
@ -314,7 +399,7 @@ async def get_supported_formats() -> Dict[str, Any]:
|
||||
|
||||
|
||||
# Helper functions for text extraction
|
||||
async def _extract_word_text(file_path: str, extension: str, preserve_formatting: bool, method: str) -> Dict[str, Any]:
|
||||
async def _extract_word_text(file_path: str, extension: str, preserve_formatting: bool, method: str) -> dict[str, Any]:
|
||||
"""Extract text from Word documents with fallback methods."""
|
||||
methods_tried = []
|
||||
|
||||
@ -414,7 +499,7 @@ async def _extract_word_text(file_path: str, extension: str, preserve_formatting
|
||||
}
|
||||
|
||||
|
||||
async def _extract_excel_text(file_path: str, extension: str, preserve_formatting: bool, method: str) -> Dict[str, Any]:
|
||||
async def _extract_excel_text(file_path: str, extension: str, preserve_formatting: bool, method: str) -> dict[str, Any]:
|
||||
"""Extract text from Excel documents."""
|
||||
methods_tried = []
|
||||
|
||||
@ -526,7 +611,7 @@ async def _extract_excel_text(file_path: str, extension: str, preserve_formattin
|
||||
}
|
||||
|
||||
|
||||
async def _extract_powerpoint_text(file_path: str, extension: str, preserve_formatting: bool, method: str) -> Dict[str, Any]:
|
||||
async def _extract_powerpoint_text(file_path: str, extension: str, preserve_formatting: bool, method: str) -> dict[str, Any]:
|
||||
"""Extract text from PowerPoint documents."""
|
||||
methods_tried = []
|
||||
|
||||
@ -567,7 +652,7 @@ async def _extract_powerpoint_text(file_path: str, extension: str, preserve_form
|
||||
|
||||
except ImportError:
|
||||
methods_tried.append("python-pptx")
|
||||
except Exception as e:
|
||||
except Exception:
|
||||
methods_tried.append("python-pptx")
|
||||
|
||||
# Legacy .ppt handling would require additional libraries
|
||||
@ -578,15 +663,16 @@ async def _extract_powerpoint_text(file_path: str, extension: str, preserve_form
|
||||
|
||||
|
||||
# Helper functions for image extraction
|
||||
async def _extract_word_images(file_path: str, extension: str, output_format: str, min_width: int, min_height: int) -> List[Dict[str, Any]]:
|
||||
async def _extract_word_images(file_path: str, extension: str, output_format: str, min_width: int, min_height: int) -> list[dict[str, Any]]:
|
||||
"""Extract images from Word documents."""
|
||||
images = []
|
||||
|
||||
if extension == ".docx":
|
||||
try:
|
||||
import zipfile
|
||||
from PIL import Image
|
||||
import io
|
||||
import zipfile
|
||||
|
||||
from PIL import Image
|
||||
|
||||
with zipfile.ZipFile(file_path, 'r') as zip_file:
|
||||
# Look for images in media folder
|
||||
@ -621,15 +707,16 @@ async def _extract_word_images(file_path: str, extension: str, output_format: st
|
||||
return images
|
||||
|
||||
|
||||
async def _extract_excel_images(file_path: str, extension: str, output_format: str, min_width: int, min_height: int) -> List[Dict[str, Any]]:
|
||||
async def _extract_excel_images(file_path: str, extension: str, output_format: str, min_width: int, min_height: int) -> list[dict[str, Any]]:
|
||||
"""Extract images from Excel documents."""
|
||||
images = []
|
||||
|
||||
if extension in [".xlsx", ".xlsm"]:
|
||||
try:
|
||||
import zipfile
|
||||
from PIL import Image
|
||||
import io
|
||||
import zipfile
|
||||
|
||||
from PIL import Image
|
||||
|
||||
with zipfile.ZipFile(file_path, 'r') as zip_file:
|
||||
# Look for images in media folder
|
||||
@ -664,15 +751,16 @@ async def _extract_excel_images(file_path: str, extension: str, output_format: s
|
||||
return images
|
||||
|
||||
|
||||
async def _extract_powerpoint_images(file_path: str, extension: str, output_format: str, min_width: int, min_height: int) -> List[Dict[str, Any]]:
|
||||
async def _extract_powerpoint_images(file_path: str, extension: str, output_format: str, min_width: int, min_height: int) -> list[dict[str, Any]]:
|
||||
"""Extract images from PowerPoint documents."""
|
||||
images = []
|
||||
|
||||
if extension == ".pptx":
|
||||
try:
|
||||
import zipfile
|
||||
from PIL import Image
|
||||
import io
|
||||
import zipfile
|
||||
|
||||
from PIL import Image
|
||||
|
||||
with zipfile.ZipFile(file_path, 'r') as zip_file:
|
||||
# Look for images in media folder
|
||||
@ -708,7 +796,7 @@ async def _extract_powerpoint_images(file_path: str, extension: str, output_form
|
||||
|
||||
|
||||
# Helper functions for metadata extraction
|
||||
async def _extract_basic_metadata(file_path: str, extension: str, category: str) -> Dict[str, Any]:
|
||||
async def _extract_basic_metadata(file_path: str, extension: str, category: str) -> dict[str, Any]:
|
||||
"""Extract basic metadata from Office documents."""
|
||||
metadata = {"category": category, "extension": extension}
|
||||
|
||||
@ -719,12 +807,12 @@ async def _extract_basic_metadata(file_path: str, extension: str, category: str)
|
||||
with zipfile.ZipFile(file_path, 'r') as zip_file:
|
||||
# Core properties
|
||||
if 'docProps/core.xml' in zip_file.namelist():
|
||||
core_xml = zip_file.read('docProps/core.xml').decode('utf-8')
|
||||
zip_file.read('docProps/core.xml').decode('utf-8')
|
||||
metadata["has_core_properties"] = True
|
||||
|
||||
# App properties
|
||||
if 'docProps/app.xml' in zip_file.namelist():
|
||||
app_xml = zip_file.read('docProps/app.xml').decode('utf-8')
|
||||
zip_file.read('docProps/app.xml').decode('utf-8')
|
||||
metadata["has_app_properties"] = True
|
||||
|
||||
except Exception:
|
||||
@ -733,7 +821,7 @@ async def _extract_basic_metadata(file_path: str, extension: str, category: str)
|
||||
return metadata
|
||||
|
||||
|
||||
async def _extract_word_metadata(file_path: str, extension: str) -> Dict[str, Any]:
|
||||
async def _extract_word_metadata(file_path: str, extension: str) -> dict[str, Any]:
|
||||
"""Extract Word-specific metadata."""
|
||||
metadata = {"type": "word", "extension": extension}
|
||||
|
||||
@ -767,7 +855,7 @@ async def _extract_word_metadata(file_path: str, extension: str) -> Dict[str, An
|
||||
return metadata
|
||||
|
||||
|
||||
async def _extract_excel_metadata(file_path: str, extension: str) -> Dict[str, Any]:
|
||||
async def _extract_excel_metadata(file_path: str, extension: str) -> dict[str, Any]:
|
||||
"""Extract Excel-specific metadata."""
|
||||
metadata = {"type": "excel", "extension": extension}
|
||||
|
||||
@ -801,7 +889,7 @@ async def _extract_excel_metadata(file_path: str, extension: str) -> Dict[str, A
|
||||
return metadata
|
||||
|
||||
|
||||
async def _extract_powerpoint_metadata(file_path: str, extension: str) -> Dict[str, Any]:
|
||||
async def _extract_powerpoint_metadata(file_path: str, extension: str) -> dict[str, Any]:
|
||||
"""Extract PowerPoint-specific metadata."""
|
||||
metadata = {"type": "powerpoint", "extension": extension}
|
||||
|
||||
@ -843,7 +931,7 @@ async def _extract_powerpoint_metadata(file_path: str, extension: str) -> Dict[s
|
||||
return metadata
|
||||
|
||||
|
||||
def _calculate_health_score(validation: Dict[str, Any], format_info: Dict[str, Any]) -> int:
|
||||
def _calculate_health_score(validation: dict[str, Any], format_info: dict[str, Any]) -> int:
|
||||
"""Calculate document health score (1-10)."""
|
||||
score = 10
|
||||
|
||||
@ -871,7 +959,7 @@ def _calculate_health_score(validation: Dict[str, Any], format_info: Dict[str, A
|
||||
return max(1, min(10, score))
|
||||
|
||||
|
||||
def _get_health_recommendations(validation: Dict[str, Any], format_info: Dict[str, Any]) -> List[str]:
|
||||
def _get_health_recommendations(validation: dict[str, Any], format_info: dict[str, Any]) -> list[str]:
|
||||
"""Get health improvement recommendations."""
|
||||
recommendations = []
|
||||
|
||||
@ -894,9 +982,464 @@ def _get_health_recommendations(validation: Dict[str, Any], format_info: Dict[st
|
||||
return recommendations
|
||||
|
||||
|
||||
# Markdown conversion helper functions
|
||||
async def _convert_docx_to_markdown(
    file_path: str,
    include_images: bool,
    image_mode: str,
    max_image_size: int,
    preserve_structure: bool,
    chunk_size: int,
    output_dir: str
) -> dict[str, Any]:
    """Convert .docx file to markdown with comprehensive feature support.

    Tries mammoth first (best HTML -> Markdown fidelity); if mammoth is not
    installed or its conversion fails, falls back to the python-docx based
    converter. Returns a dict with "content", "method_used", "images", and
    optionally "chunks" and "structure".
    """
    import base64

    try:
        # Try mammoth first for better HTML->Markdown conversion
        import mammoth

        # Configure mammoth for markdown-friendly output
        with open(file_path, "rb") as docx_file:
            if include_images:
                # Metadata for every image encountered during conversion
                images_info = []

                def convert_image(image):
                    # mammoth hands us a lazily-opened image: image.open()
                    # returns a file-like object, not bytes, so read it once.
                    # (Previously the file object itself was passed to len()
                    # and b64encode, which raised and silently forced the
                    # python-docx fallback.)
                    with image.open() as image_stream:
                        image_data = image_stream.read()
                    content_type = image.content_type
                    ext = content_type.split('/')[-1] if '/' in content_type else 'png'

                    if image_mode == "base64":
                        if len(image_data) <= max_image_size:
                            encoded = base64.b64encode(image_data).decode('utf-8')
                            images_info.append({
                                "filename": f"image_{len(images_info)}.{ext}",
                                "content_type": content_type,
                                "size_bytes": len(image_data),
                                "mode": "base64"
                            })
                            return {
                                "src": f"data:{content_type};base64,{encoded}"
                            }
                        else:
                            # Too large for base64, fall back to reference
                            filename = f"large_image_{len(images_info)}.{ext}"
                            images_info.append({
                                "filename": filename,
                                "content_type": content_type,
                                "size_bytes": len(image_data),
                                "mode": "reference",
                                "note": "Too large for base64 encoding"
                            })
                            return {"src": filename}

                    elif image_mode == "files":
                        # Save image to a file on disk
                        nonlocal output_dir
                        if not output_dir:
                            output_dir = os.path.join(TEMP_DIR, "markdown_images")

                        os.makedirs(output_dir, exist_ok=True)
                        filename = f"image_{len(images_info)}.{ext}"
                        # Distinct name: do not shadow the enclosing
                        # function's file_path argument.
                        image_path = os.path.join(output_dir, filename)

                        with open(image_path, 'wb') as img_file:
                            img_file.write(image_data)

                        images_info.append({
                            "filename": filename,
                            "file_path": image_path,
                            "content_type": content_type,
                            "size_bytes": len(image_data),
                            "mode": "file"
                        })
                        return {"src": image_path}

                    else:  # references
                        filename = f"image_{len(images_info)}.{ext}"
                        images_info.append({
                            "filename": filename,
                            "content_type": content_type,
                            "size_bytes": len(image_data),
                            "mode": "reference"
                        })
                        return {"src": filename}

                # Convert with image handling
                result = mammoth.convert_to_html(
                    docx_file,
                    convert_image=mammoth.images.img_element(convert_image)
                )

                html_content = result.value
                markdown_content = _html_to_markdown(html_content, preserve_structure)

                conversion_result = {
                    "content": markdown_content,
                    "method_used": "mammoth-with-images",
                    "images": images_info
                }

            else:
                # Convert without images
                result = mammoth.convert_to_markdown(docx_file)
                markdown_content = result.value

                conversion_result = {
                    "content": markdown_content,
                    "method_used": "mammoth-markdown",
                    "images": []
                }

        # Handle chunking if requested
        if chunk_size > 0 and len(markdown_content) > chunk_size:
            chunks = _chunk_markdown(markdown_content, chunk_size)
            conversion_result["chunks"] = chunks

        # Extract structure information
        if preserve_structure:
            structure = _extract_markdown_structure(markdown_content)
            conversion_result["structure"] = structure

        return conversion_result

    except ImportError:
        # mammoth not installed: fall back to python-docx with custom markdown conversion
        return await _convert_docx_with_python_docx(
            file_path, include_images, image_mode, max_image_size,
            preserve_structure, chunk_size, output_dir
        )
    except Exception:
        # Any mammoth conversion failure: fall back to python-docx
        return await _convert_docx_with_python_docx(
            file_path, include_images, image_mode, max_image_size,
            preserve_structure, chunk_size, output_dir
        )
|
||||
|
||||
|
||||
async def _convert_docx_with_python_docx(
    file_path: str,
    include_images: bool,
    image_mode: str,
    max_image_size: int,
    preserve_structure: bool,
    chunk_size: int,
    output_dir: str
) -> dict[str, Any]:
    """Convert .docx using python-docx with custom markdown conversion.

    Fallback path when mammoth is unavailable or fails. Walks the document
    body in order, converting paragraphs and tables, and appends an
    "## Images" section with one markdown image reference per extracted
    image. Returns a dict with "content", "method_used", "images", and
    optionally "chunks" and "structure".
    """
    import base64

    import docx
    from docx.oxml.table import CT_Tbl
    from docx.oxml.text.paragraph import CT_P
    from docx.table import Table
    from docx.text.paragraph import Paragraph

    doc = docx.Document(file_path)
    markdown_parts = []
    images_info = []
    structure_info = {"headings": [], "tables": 0, "lists": 0, "paragraphs": 0}

    # Extract images if requested
    if include_images:
        extracted_images = await _extract_word_images(file_path, ".docx", "png", 1, 1)
        for img in extracted_images:
            if image_mode == "base64":
                if img.get("size_bytes", 0) <= max_image_size:
                    with open(img["path"], "rb") as img_file:
                        img_data = img_file.read()
                    encoded = base64.b64encode(img_data).decode('utf-8')
                    images_info.append({
                        "filename": img["filename"],
                        "content_type": f"image/{img.get('format', 'png').lower()}",
                        "size_bytes": img.get("size_bytes", 0),
                        "mode": "base64",
                        # Inline data URI so the markdown is self-contained
                        "markdown_ref": (
                            f"![{img['filename']}]"
                            f"(data:image/{img.get('format', 'png').lower()};base64,{encoded})"
                        )
                    })
                else:
                    images_info.append({
                        "filename": img["filename"],
                        "size_bytes": img.get("size_bytes", 0),
                        "mode": "reference",
                        "markdown_ref": f"![{img['filename']}]({img['filename']})",
                        "note": "Too large for base64 encoding"
                    })
            elif image_mode == "files":
                images_info.append({
                    "filename": img["filename"],
                    "file_path": img["path"],
                    "size_bytes": img.get("size_bytes", 0),
                    "mode": "file",
                    "markdown_ref": f"![{img['filename']}]({img['path']})"
                })
            else:  # references
                images_info.append({
                    "filename": img["filename"],
                    "size_bytes": img.get("size_bytes", 0),
                    "mode": "reference",
                    "markdown_ref": f"![{img['filename']}]({img['filename']})"
                })

    # Process document elements in document order (paragraphs and tables)
    for element in doc.element.body:
        if isinstance(element, CT_P):
            paragraph = Paragraph(element, doc)
            markdown_text = _paragraph_to_markdown(paragraph, preserve_structure)
            if markdown_text.strip():
                markdown_parts.append(markdown_text)
                structure_info["paragraphs"] += 1

                # Track headings
                if preserve_structure and markdown_text.startswith('#'):
                    level = len(markdown_text) - len(markdown_text.lstrip('#'))
                    heading_text = markdown_text.lstrip('# ').strip()
                    structure_info["headings"].append({
                        "level": level,
                        "text": heading_text,
                        "position": len(markdown_parts) - 1
                    })

        elif isinstance(element, CT_Tbl):
            table = Table(element, doc)
            table_markdown = _table_to_markdown(table)
            if table_markdown.strip():
                markdown_parts.append(table_markdown)
                structure_info["tables"] += 1

    # Add image references at the end if any
    if include_images and images_info:
        markdown_parts.append("\n## Images\n")
        for img in images_info:
            markdown_parts.append(img["markdown_ref"])

    markdown_content = "\n\n".join(markdown_parts)

    result = {
        "content": markdown_content,
        "method_used": "python-docx-custom",
        "images": images_info
    }

    # Handle chunking
    if chunk_size > 0 and len(markdown_content) > chunk_size:
        chunks = _chunk_markdown(markdown_content, chunk_size)
        result["chunks"] = chunks

    # Add structure info
    if preserve_structure:
        result["structure"] = structure_info

    return result
|
||||
|
||||
|
||||
async def _convert_doc_to_markdown(
    file_path: str,
    include_images: bool,
    image_mode: str,
    max_image_size: int,
    preserve_structure: bool,
    chunk_size: int,
    output_dir: str
) -> dict[str, Any]:
    """Convert legacy .doc file to markdown using available methods.

    Only mammoth handles the legacy binary format here; images are never
    extracted (include_images/image_mode/max_image_size/output_dir are
    accepted for interface parity with the .docx converter but unused).

    Raises:
        OfficeFileError: If mammoth is unavailable or conversion fails
            (the underlying exception is chained as the cause).
    """
    try:
        import mammoth

        with open(file_path, "rb") as doc_file:
            result = mammoth.convert_to_markdown(doc_file)
            markdown_content = result.value

        conversion_result = {
            "content": markdown_content,
            "method_used": "mammoth-doc",
            "images": []  # Legacy .doc image extraction is complex
        }

        if chunk_size > 0 and len(markdown_content) > chunk_size:
            chunks = _chunk_markdown(markdown_content, chunk_size)
            conversion_result["chunks"] = chunks

        if preserve_structure:
            structure = _extract_markdown_structure(markdown_content)
            conversion_result["structure"] = structure

        return conversion_result

    except ImportError:
        # Deliberate: no fallback exists for the legacy binary format.
        raise OfficeFileError("Legacy .doc conversion requires mammoth library") from None
    except Exception as e:
        # Chain the cause so callers can inspect the underlying failure.
        raise OfficeFileError(f"Legacy .doc conversion failed: {str(e)}") from e
|
||||
|
||||
|
||||
def _paragraph_to_markdown(paragraph, preserve_structure: bool) -> str:
|
||||
"""Convert a Word paragraph to markdown format."""
|
||||
text = paragraph.text.strip()
|
||||
if not text:
|
||||
return ""
|
||||
|
||||
if not preserve_structure:
|
||||
return text
|
||||
|
||||
# Handle different paragraph styles
|
||||
style_name = paragraph.style.name.lower() if paragraph.style else ""
|
||||
|
||||
if "heading" in style_name:
|
||||
# Extract heading level from style name
|
||||
import re
|
||||
level_match = re.search(r'(\d+)', style_name)
|
||||
level = int(level_match.group(1)) if level_match else 1
|
||||
return f"{'#' * level} {text}"
|
||||
elif "title" in style_name:
|
||||
return f"# {text}"
|
||||
elif "subtitle" in style_name:
|
||||
return f"## {text}"
|
||||
elif style_name in ["list paragraph", "list"]:
|
||||
return f"- {text}"
|
||||
elif "quote" in style_name:
|
||||
return f"> {text}"
|
||||
else:
|
||||
return text
|
||||
|
||||
|
||||
def _table_to_markdown(table) -> str:
|
||||
"""Convert a Word table to markdown format."""
|
||||
markdown_rows = []
|
||||
|
||||
for i, row in enumerate(table.rows):
|
||||
cells = [cell.text.strip().replace('\n', ' ') for cell in row.cells]
|
||||
markdown_row = "| " + " | ".join(cells) + " |"
|
||||
markdown_rows.append(markdown_row)
|
||||
|
||||
# Add header separator after first row
|
||||
if i == 0:
|
||||
separator = "| " + " | ".join(["---"] * len(cells)) + " |"
|
||||
markdown_rows.append(separator)
|
||||
|
||||
return "\n".join(markdown_rows)
|
||||
|
||||
|
||||
def _html_to_markdown(html_content: str, preserve_structure: bool) -> str:
|
||||
"""Convert HTML content to markdown format."""
|
||||
import re
|
||||
|
||||
# Basic HTML to Markdown conversions
|
||||
conversions = [
|
||||
(r'<h1[^>]*>(.*?)</h1>', r'# \1'),
|
||||
(r'<h2[^>]*>(.*?)</h2>', r'## \1'),
|
||||
(r'<h3[^>]*>(.*?)</h3>', r'### \1'),
|
||||
(r'<h4[^>]*>(.*?)</h4>', r'#### \1'),
|
||||
(r'<h5[^>]*>(.*?)</h5>', r'##### \1'),
|
||||
(r'<h6[^>]*>(.*?)</h6>', r'###### \1'),
|
||||
(r'<strong[^>]*>(.*?)</strong>', r'**\1**'),
|
||||
(r'<b[^>]*>(.*?)</b>', r'**\1**'),
|
||||
(r'<em[^>]*>(.*?)</em>', r'*\1*'),
|
||||
(r'<i[^>]*>(.*?)</i>', r'*\1*'),
|
||||
(r'<code[^>]*>(.*?)</code>', r'`\1`'),
|
||||
(r'<a[^>]*href="([^"]*)"[^>]*>(.*?)</a>', r'[\2](\1)'),
|
||||
(r'<img[^>]*src="([^"]*)"[^>]*/?>', r''),
|
||||
(r'<p[^>]*>(.*?)</p>', r'\1\n'),
|
||||
(r'<br[^>]*/?>', r'\n'),
|
||||
(r'<li[^>]*>(.*?)</li>', r'- \1'),
|
||||
(r'<ul[^>]*>(.*?)</ul>', r'\1'),
|
||||
(r'<ol[^>]*>(.*?)</ol>', r'\1'),
|
||||
(r'<blockquote[^>]*>(.*?)</blockquote>', r'> \1'),
|
||||
]
|
||||
|
||||
markdown = html_content
|
||||
for pattern, replacement in conversions:
|
||||
markdown = re.sub(pattern, replacement, markdown, flags=re.DOTALL | re.IGNORECASE)
|
||||
|
||||
# Clean up extra whitespace
|
||||
markdown = re.sub(r'\n\s*\n\s*\n', '\n\n', markdown)
|
||||
markdown = re.sub(r'^\s+|\s+$', '', markdown, flags=re.MULTILINE)
|
||||
|
||||
return markdown
|
||||
|
||||
|
||||
def _chunk_markdown(content: str, chunk_size: int) -> list[dict[str, Any]]:
|
||||
"""Split markdown content into chunks while preserving structure."""
|
||||
chunks = []
|
||||
lines = content.split('\n')
|
||||
current_chunk = []
|
||||
current_size = 0
|
||||
chunk_num = 1
|
||||
|
||||
for line in lines:
|
||||
line_size = len(line) + 1 # +1 for newline
|
||||
|
||||
# If adding this line would exceed chunk size and we have content
|
||||
if current_size + line_size > chunk_size and current_chunk:
|
||||
chunks.append({
|
||||
"chunk_number": chunk_num,
|
||||
"content": '\n'.join(current_chunk),
|
||||
"character_count": current_size,
|
||||
"line_count": len(current_chunk)
|
||||
})
|
||||
current_chunk = []
|
||||
current_size = 0
|
||||
chunk_num += 1
|
||||
|
||||
current_chunk.append(line)
|
||||
current_size += line_size
|
||||
|
||||
# Add final chunk if there's remaining content
|
||||
if current_chunk:
|
||||
chunks.append({
|
||||
"chunk_number": chunk_num,
|
||||
"content": '\n'.join(current_chunk),
|
||||
"character_count": current_size,
|
||||
"line_count": len(current_chunk)
|
||||
})
|
||||
|
||||
return chunks
|
||||
|
||||
|
||||
def _extract_markdown_structure(content: str) -> dict[str, Any]:
|
||||
"""Extract structure information from markdown content."""
|
||||
import re
|
||||
|
||||
structure = {
|
||||
"headings": [],
|
||||
"lists": 0,
|
||||
"links": 0,
|
||||
"images": 0,
|
||||
"code_blocks": 0,
|
||||
"tables": 0,
|
||||
"line_count": len(content.split('\n'))
|
||||
}
|
||||
|
||||
lines = content.split('\n')
|
||||
for i, line in enumerate(lines):
|
||||
# Find headings
|
||||
heading_match = re.match(r'^(#{1,6})\s+(.+)', line)
|
||||
if heading_match:
|
||||
level = len(heading_match.group(1))
|
||||
text = heading_match.group(2).strip()
|
||||
structure["headings"].append({
|
||||
"level": level,
|
||||
"text": text,
|
||||
"line_number": i + 1
|
||||
})
|
||||
|
||||
# Count other elements
|
||||
if re.match(r'^[-*+]\s+', line):
|
||||
structure["lists"] += 1
|
||||
|
||||
structure["links"] += len(re.findall(r'\[([^\]]+)\]\([^)]+\)', line))
|
||||
structure["images"] += len(re.findall(r'!\[([^\]]*)\]\([^)]+\)', line))
|
||||
|
||||
if line.strip().startswith('```'):
|
||||
structure["code_blocks"] += 1
|
||||
|
||||
if '|' in line and line.count('|') >= 2:
|
||||
structure["tables"] += 1
|
||||
|
||||
return structure
|
||||
|
||||
|
||||
def main():
|
||||
"""Main entry point for the MCP server."""
|
||||
import asyncio
|
||||
import sys
|
||||
|
||||
if len(sys.argv) > 1 and sys.argv[1] == "--version":
|
||||
|
Loading…
x
Reference in New Issue
Block a user