Add chapter-based extraction for documents without bookmarks

- Add chapter_name parameter to convert_to_markdown tool
- Implement _find_chapter_content_range() for heading-based navigation
- Add _get_available_headings() to help users find chapter names
- Include chapter extraction metadata in results
- Enhance ultra-fast summary with available headings
- Provide an alternative to bookmark extraction when bookmarks are unavailable
2025-08-22 08:14:23 -06:00 · 2025-08-22 08:14:23 -06:00 · 778ef3a2d4
commit 778ef3a2d4
parent 6484036b69
1 changed files with 145 additions and 14 deletions
--- a/src/mcp_office_tools/server.py
+++ b/src/mcp_office_tools/server.py
@ -293,6 +293,7 @@ async def convert_to_markdown(
    preserve_structure: bool = Field(default=True, description="Preserve document structure (headings, lists, tables)"),
    page_range: str = Field(default="", description="Page range to convert (e.g., '1-5', '3', '1,3,5-10'). RECOMMENDED for large documents. Empty = all pages"),
    bookmark_name: str = Field(default="", description="Extract content for a specific bookmark/chapter (e.g., 'Chapter1_Start'). More reliable than page ranges."),
    chapter_name: str = Field(default="", description="Extract content for a chapter by heading text (e.g., 'Chapter 1', 'Introduction'). Works when bookmarks aren't available."),
    summary_only: bool = Field(default=False, description="Return only metadata and truncated summary. STRONGLY RECOMMENDED for large docs (>10 pages)"),
    output_dir: str = Field(default="", description="Output directory for image files (if image_mode='files')")
 ) -> dict[str, Any]:
@ -334,15 +335,15 @@ async def convert_to_markdown(
        # Parse page range if provided
        page_numbers = _parse_page_range(page_range) if page_range else None
-        # Prioritize bookmark extraction over page ranges
+        # Prioritize bookmark/chapter extraction over page ranges
-        if bookmark_name:
+        if bookmark_name or chapter_name:
-            page_numbers = None  # Ignore page ranges when bookmark is specified
+            page_numbers = None  # Ignore page ranges when bookmark or chapter is specified
        # Convert to markdown based on format
        if extension == ".docx":
            markdown_result = await _convert_docx_to_markdown(
                local_path, include_images, image_mode, max_image_size,
-                preserve_structure, page_numbers, summary_only, output_dir, bookmark_name
+                preserve_structure, page_numbers, summary_only, output_dir, bookmark_name, chapter_name
            )
        else:  # .doc
            # For legacy .doc files, use mammoth if available
@ -1059,7 +1060,8 @@ async def _convert_docx_to_markdown(
    page_numbers: list[int],
    summary_only: bool,
    output_dir: str,
-    bookmark_name: str = ""
+    bookmark_name: str = "",
    chapter_name: str = ""
 ) -> dict[str, Any]:
    """Convert .docx file to markdown with comprehensive feature support."""
    import base64
@ -1068,12 +1070,12 @@ async def _convert_docx_to_markdown(
    if summary_only:
        return await _get_ultra_fast_summary(file_path)
-    # If page_numbers or bookmark_name is specified, we need to use python-docx for targeted extraction
+    # If page_numbers, bookmark_name, or chapter_name is specified, we need to use python-docx for targeted extraction
    # as mammoth processes the entire document
-    if page_numbers or bookmark_name:
+    if page_numbers or bookmark_name or chapter_name:
        return await _convert_docx_with_python_docx(
            file_path, include_images, image_mode, max_image_size,
-            preserve_structure, page_numbers, summary_only, output_dir, bookmark_name
+            preserve_structure, page_numbers, summary_only, output_dir, bookmark_name, chapter_name
        )
    try:
@ -1192,13 +1194,13 @@ async def _convert_docx_to_markdown(
        # Fall back to python-docx with custom markdown conversion
        return await _convert_docx_with_python_docx(
            file_path, include_images, image_mode, max_image_size,
-            preserve_structure, page_numbers, summary_only, output_dir, bookmark_name
+            preserve_structure, page_numbers, summary_only, output_dir, bookmark_name, chapter_name
        )
    except Exception:
        # Fall back to python-docx
        return await _convert_docx_with_python_docx(
            file_path, include_images, image_mode, max_image_size,
-            preserve_structure, page_numbers, summary_only, output_dir, bookmark_name
+            preserve_structure, page_numbers, summary_only, output_dir, bookmark_name, chapter_name
        )
@ -1211,7 +1213,8 @@ async def _convert_docx_with_python_docx(
    page_numbers: list[int],
    summary_only: bool,
    output_dir: str,
-    bookmark_name: str = ""
+    bookmark_name: str = "",
    chapter_name: str = ""
 ) -> dict[str, Any]:
    """Convert .docx using python-docx with custom markdown conversion."""
    import base64
@ -1267,7 +1270,7 @@ async def _convert_docx_with_python_docx(
                    "markdown_ref": f"![Image {i+1}]({img['filename']})"
                })
-    # Handle bookmark-based extraction vs page-based vs full document
+    # Handle bookmark-based, chapter-based, or page-based extraction vs full document
    if bookmark_name:
        # For bookmark extraction, find the bookmark boundaries
        bookmark_range = await _find_bookmark_content_range(doc, bookmark_name)
@ -1280,6 +1283,21 @@ async def _convert_docx_with_python_docx(
            }
        max_paragraphs = 500  # Generous limit for bookmark sections
        max_chars = 100000
        chapter_range = None
    elif chapter_name:
        # For chapter extraction, find the heading boundaries
        chapter_range = await _find_chapter_content_range(doc, chapter_name)
        if not chapter_range:
            return {
                "content": f"Chapter '{chapter_name}' not found in document. Available headings will be listed in processing_limits.",
                "method_used": "python-docx-chapter-not-found", 
                "images": [],
                "chapter_error": True,
                "available_headings": await _get_available_headings(doc)
            }
        max_paragraphs = 500  # Generous limit for chapter sections
        max_chars = 100000
        bookmark_range = None
    elif page_numbers:
        # For page ranges, severely limit content extraction
        max_pages_requested = max(page_numbers) if page_numbers else 1
@ -1287,10 +1305,12 @@ async def _convert_docx_with_python_docx(
        max_paragraphs = min(max_pages_requested * 25, 100)  # Cap at 100 paragraphs max
        max_chars = min(max_pages_requested * 8000, 40000)  # Cap at 40k chars max
        bookmark_range = None
        chapter_range = None
    else:
        max_paragraphs = 1000  # Large limit for full document
        max_chars = 200000
        bookmark_range = None
        chapter_range = None
    current_page = 1
    processed_paragraphs = 0
@ -1303,9 +1323,11 @@ async def _convert_docx_with_python_docx(
        if processed_paragraphs >= max_paragraphs or total_chars >= max_chars:
            break
-        # Skip elements outside bookmark range if bookmark extraction is used
+        # Skip elements outside bookmark/chapter range if targeted extraction is used
        if bookmark_range and not (bookmark_range['start_idx'] <= element_idx <= bookmark_range['end_idx']):
            continue
        if chapter_range and not (chapter_range['start_idx'] <= element_idx <= chapter_range['end_idx']):
            continue
        if isinstance(element, CT_P):
            paragraph = Paragraph(element, doc)
@ -1398,6 +1420,12 @@ async def _convert_docx_with_python_docx(
            "elements_range": f"{bookmark_range['start_idx']}-{bookmark_range['end_idx']}",
            "extraction_note": bookmark_range["note"]
        }
    elif chapter_name and chapter_range:
        result["chapter_extraction"] = {
            "chapter_name": chapter_name,
            "elements_range": f"{chapter_range['start_idx']}-{chapter_range['end_idx']}",
            "extraction_note": chapter_range["note"]
        }
    elif page_numbers:
        result["pages_processed"] = page_numbers
        result["total_pages_in_range"] = len(page_numbers)
@ -1667,6 +1695,103 @@ async def _find_bookmark_content_range(doc, bookmark_name: str) -> dict[str, Any
        return None  # Error finding bookmark
 async def _find_chapter_content_range(doc, chapter_name: str) -> dict[str, Any]:
    """Locate the range of body elements that belongs to a chapter heading.

    The chapter start is the first paragraph whose text contains
    ``chapter_name`` (case-insensitive substring match) and whose paragraph
    style id contains ``heading`` or ``title``. Only when no styled heading
    matches does the first short (< 100 chars) paragraph without any style
    serve as the start, so a plain-text mention of the chapter earlier in the
    document cannot shadow the real heading.

    The chapter ends just before the next heading of the same or a higher
    level than the start heading (``title`` styles count as level 1), or at
    the last body element. A start that has no numbered heading style is
    treated as level 1.

    Args:
        doc: Document object exposing ``doc.element.body`` (the sequence of
            top-level body elements).
        chapter_name: Heading text, or a fragment of it, to search for.

    Returns:
        A dict with ``start_idx``, ``end_idx`` (inclusive element indices),
        ``chapter_name`` and a human-readable ``note``; ``None`` when the
        name is blank, no matching heading exists, or the lookup fails.
    """
    import re

    w_ns = '{http://schemas.openxmlformats.org/wordprocessingml/2006/main}'
    paragraph_tag = f'{w_ns}p'
    short_line_limit = 100  # unstyled lines shorter than this may act as headings

    # NOTE: the namespace-qualified ElementTree API (iter/find) is used instead
    # of ``element.xpath(..., namespaces=...)``. python-docx overrides xpath()
    # on its element classes to accept only the expression string, so the
    # keyword form raises TypeError there - which a per-element
    # ``except Exception: continue`` would silently turn into "not found".

    def paragraph_text(para) -> str:
        # Concatenated text of every w:t node inside the paragraph.
        return ''.join(node.text or '' for node in para.iter(f'{w_ns}t'))

    def paragraph_style(para) -> str:
        # Lower-cased paragraph style id, or '' when the paragraph has none.
        style_node = para.find(f'{w_ns}pPr/{w_ns}pStyle')
        if style_node is None:
            return ''
        return (style_node.get(f'{w_ns}val') or '').lower()

    def heading_rank(style_id: str):
        # Level of a style that may close a chapter: 1 for title styles, N for
        # "headingN"; None for anything else (body text, un-numbered headings).
        if 'title' in style_id:
            return 1
        numbered = re.search(r'heading\s*(\d+)', style_id)
        return int(numbered.group(1)) if numbered else None

    try:
        wanted = chapter_name.strip().casefold()
        if not wanted:
            return None  # A blank name would otherwise match every paragraph

        elements = list(doc.element.body)

        # Pass 1: find the start. Stop at the first styled heading that
        # matches; remember the first short unstyled match as a fallback.
        chapter_start_idx = None
        fallback_idx = None
        start_style = ''
        for elem_idx, element in enumerate(elements):
            if element.tag != paragraph_tag:
                continue
            text = paragraph_text(element).strip()
            if not text or wanted not in text.casefold():
                continue
            style_id = paragraph_style(element)
            if style_id:
                if 'heading' in style_id or 'title' in style_id:
                    chapter_start_idx = elem_idx
                    start_style = style_id
                    break
            elif fallback_idx is None and len(text) < short_line_limit:
                fallback_idx = elem_idx

        if chapter_start_idx is None:
            chapter_start_idx = fallback_idx
        if chapter_start_idx is None:
            return None  # Chapter heading not found

        # Pass 2: the chapter runs until the next heading at the same or a
        # higher level than the one it started with.
        boundary_rank = max(heading_rank(start_style) or 1, 1)
        chapter_end_idx = len(elements) - 1  # Default to end of document
        next_idx = chapter_start_idx + 1
        for elem_idx, element in enumerate(elements[next_idx:], next_idx):
            if element.tag != paragraph_tag:
                continue
            rank = heading_rank(paragraph_style(element))
            if rank is not None and rank <= boundary_rank:
                chapter_end_idx = elem_idx - 1
                break

        return {
            'start_idx': chapter_start_idx,
            'end_idx': chapter_end_idx,
            'chapter_name': chapter_name,
            'note': f"Extracting content for chapter '{chapter_name}' (elements {chapter_start_idx}-{chapter_end_idx})"
        }
    except Exception:
        # Best-effort lookup: callers treat None as "chapter not found".
        return None
 async def _get_available_headings(doc) -> list[str]:
    """Collect heading texts near the start of the document.

    Only the first 100 body elements are inspected and at most 20 headings
    are returned, to keep the response small. A paragraph counts as a heading
    when its paragraph style id contains ``heading`` or ``title`` (text is
    truncated to 100 characters), or when it has no paragraph style, is
    shorter than 100 characters and mentions one of a few heading-like
    keywords.

    Args:
        doc: Document object exposing ``doc.element.body`` (the sequence of
            top-level body elements).

    Returns:
        Heading texts in document order; an empty list when none are found
        or the scan fails.
    """
    w_ns = '{http://schemas.openxmlformats.org/wordprocessingml/2006/main}'
    paragraph_tag = f'{w_ns}p'
    heading_keywords = ('chapter', 'section', 'part', 'introduction', 'conclusion')
    max_headings = 20  # Cap the result to avoid token issues

    # NOTE: the namespace-qualified ElementTree API (iter/find) is used instead
    # of ``element.xpath(..., namespaces=...)``. python-docx overrides xpath()
    # on its element classes to accept only the expression string, so the
    # keyword form raises TypeError there - which a per-element
    # ``except Exception: continue`` would silently turn into an empty result.
    try:
        headings = []
        # Only check the first 100 elements to avoid token issues
        for element in doc.element.body[:100]:
            if len(headings) >= max_headings:
                break
            if element.tag != paragraph_tag:
                continue
            text = ''.join(node.text or '' for node in element.iter(f'{w_ns}t')).strip()
            if not text:
                continue
            style_node = element.find(f'{w_ns}pPr/{w_ns}pStyle')
            if style_node is not None:
                style_id = (style_node.get(f'{w_ns}val') or '').lower()
                if 'heading' in style_id or 'title' in style_id:
                    headings.append(text[:100])  # Limit heading length
            elif len(text) < 100:
                # Unstyled short line: accept only if it reads like a heading
                lowered = text.lower()
                if any(word in lowered for word in heading_keywords):
                    headings.append(text)
        return headings
    except Exception:
        # Best-effort helper: callers only use the list as a hint.
        return []
 async def _get_ultra_fast_summary(file_path: str) -> dict[str, Any]:
    """Ultra-fast summary that extracts minimal data to prevent MCP token limits."""
    try:
@ -1719,6 +1844,9 @@ async def _get_ultra_fast_summary(file_path: str) -> dict[str, Any]:
        # Create very basic summary
        summary_content = "\n\n".join(content_parts)
        # Extract available headings for chapter navigation
        available_headings = await _get_available_headings(doc)
        return {
            "content": summary_content,
            "method_used": "ultra-fast-summary", 
@ -1727,7 +1855,10 @@ async def _get_ultra_fast_summary(file_path: str) -> dict[str, Any]:
                "basic_info": f"Document has ~{total_paragraphs} paragraphs, {total_tables} tables, {heading_count} headings found in first scan",
                "bookmarks": bookmarks[:20] if bookmarks else [],  # Limit to first 20 bookmarks
                "bookmark_count": len(bookmarks),
-                "bookmark_note": "Bookmarks often indicate chapter starts. Use these as navigation hints for page_range extraction."
+                "bookmark_note": "Bookmarks often indicate chapter starts. Use these as navigation hints for page_range extraction.",
                "available_headings": available_headings[:10] if available_headings else [],  # Limit to first 10 headings
                "heading_count": len(available_headings),
                "heading_note": "Use these headings with chapter_name parameter for chapter-based extraction when bookmarks are not available."
            }
        }