Add chapter-based extraction for documents without bookmarks

- Add chapter_name parameter to the convert_to_markdown tool
- Implement _find_chapter_content_range() for heading-based navigation
- Add _get_available_headings() to help users find chapter names
- Include chapter extraction metadata in results
- Enhance the ultra-fast summary with available headings
- Provide an alternative to bookmark extraction when bookmarks are unavailable
2025-08-22 08:14:23 -06:00 · 2025-08-22 08:14:23 -06:00 · 778ef3a2d4
commit 778ef3a2d4
parent 6484036b69
1 changed files with 145 additions and 14 deletions
--- a/src/mcp_office_tools/server.py
+++ b/src/mcp_office_tools/server.py
@ -293,6 +293,7 @@ async def convert_to_markdown(
    preserve_structure: bool = Field(default=True, description="Preserve document structure (headings, lists, tables)"),
    page_range: str = Field(default="", description="Page range to convert (e.g., '1-5', '3', '1,3,5-10'). RECOMMENDED for large documents. Empty = all pages"),
    bookmark_name: str = Field(default="", description="Extract content for a specific bookmark/chapter (e.g., 'Chapter1_Start'). More reliable than page ranges."),
+    chapter_name: str = Field(default="", description="Extract content for a chapter by heading text (e.g., 'Chapter 1', 'Introduction'). Works when bookmarks aren't available."),
    summary_only: bool = Field(default=False, description="Return only metadata and truncated summary. STRONGLY RECOMMENDED for large docs (>10 pages)"),
    output_dir: str = Field(default="", description="Output directory for image files (if image_mode='files')")
 ) -> dict[str, Any]:
@ -334,15 +335,15 @@ async def convert_to_markdown(
        # Parse page range if provided
        page_numbers = _parse_page_range(page_range) if page_range else None
        
-        # Prioritize bookmark extraction over page ranges
-        if bookmark_name:
-            page_numbers = None  # Ignore page ranges when bookmark is specified
+        # Prioritize bookmark/chapter extraction over page ranges
+        if bookmark_name or chapter_name:
+            page_numbers = None  # Ignore page ranges when bookmark or chapter is specified
        
        # Convert to markdown based on format
        if extension == ".docx":
            markdown_result = await _convert_docx_to_markdown(
                local_path, include_images, image_mode, max_image_size,
-                preserve_structure, page_numbers, summary_only, output_dir, bookmark_name
+                preserve_structure, page_numbers, summary_only, output_dir, bookmark_name, chapter_name
            )
        else:  # .doc
            # For legacy .doc files, use mammoth if available
@ -1059,7 +1060,8 @@ async def _convert_docx_to_markdown(
    page_numbers: list[int],
    summary_only: bool,
    output_dir: str,
-    bookmark_name: str = ""
+    bookmark_name: str = "",
+    chapter_name: str = ""
 ) -> dict[str, Any]:
    """Convert .docx file to markdown with comprehensive feature support."""
    import base64
@ -1068,12 +1070,12 @@ async def _convert_docx_to_markdown(
    if summary_only:
        return await _get_ultra_fast_summary(file_path)
    
-    # If page_numbers or bookmark_name is specified, we need to use python-docx for targeted extraction
+    # If page_numbers, bookmark_name, or chapter_name is specified, we need to use python-docx for targeted extraction
    # as mammoth processes the entire document
-    if page_numbers or bookmark_name:
+    if page_numbers or bookmark_name or chapter_name:
        return await _convert_docx_with_python_docx(
            file_path, include_images, image_mode, max_image_size,
-            preserve_structure, page_numbers, summary_only, output_dir, bookmark_name
+            preserve_structure, page_numbers, summary_only, output_dir, bookmark_name, chapter_name
        )
    
    try:
@ -1192,13 +1194,13 @@ async def _convert_docx_to_markdown(
        # Fall back to python-docx with custom markdown conversion
        return await _convert_docx_with_python_docx(
            file_path, include_images, image_mode, max_image_size,
-            preserve_structure, page_numbers, summary_only, output_dir, bookmark_name
+            preserve_structure, page_numbers, summary_only, output_dir, bookmark_name, chapter_name
        )
    except Exception:
        # Fall back to python-docx
        return await _convert_docx_with_python_docx(
            file_path, include_images, image_mode, max_image_size,
-            preserve_structure, page_numbers, summary_only, output_dir, bookmark_name
+            preserve_structure, page_numbers, summary_only, output_dir, bookmark_name, chapter_name
        )


@ -1211,7 +1213,8 @@ async def _convert_docx_with_python_docx(
    page_numbers: list[int],
    summary_only: bool,
    output_dir: str,
-    bookmark_name: str = ""
+    bookmark_name: str = "",
+    chapter_name: str = ""
 ) -> dict[str, Any]:
    """Convert .docx using python-docx with custom markdown conversion."""
    import base64
@ -1267,7 +1270,7 @@ async def _convert_docx_with_python_docx(
                    "markdown_ref": f"![Image {i+1}]({img['filename']})"
                })

-    # Handle bookmark-based extraction vs page-based vs full document
+    # Handle bookmark-based, chapter-based, or page-based extraction vs full document
    if bookmark_name:
        # For bookmark extraction, find the bookmark boundaries
        bookmark_range = await _find_bookmark_content_range(doc, bookmark_name)
@ -1280,6 +1283,21 @@ async def _convert_docx_with_python_docx(
            }
        max_paragraphs = 500  # Generous limit for bookmark sections
        max_chars = 100000
+        chapter_range = None
+    elif chapter_name:
+        # For chapter extraction, find the heading boundaries
+        chapter_range = await _find_chapter_content_range(doc, chapter_name)
+        if not chapter_range:
+            return {
+                "content": f"Chapter '{chapter_name}' not found in document. Available headings will be listed in processing_limits.",
+                "method_used": "python-docx-chapter-not-found", 
+                "images": [],
+                "chapter_error": True,
+                "available_headings": await _get_available_headings(doc)
+            }
+        max_paragraphs = 500  # Generous limit for chapter sections
+        max_chars = 100000
+        bookmark_range = None
    elif page_numbers:
        # For page ranges, severely limit content extraction
        max_pages_requested = max(page_numbers) if page_numbers else 1
@ -1287,10 +1305,12 @@ async def _convert_docx_with_python_docx(
        max_paragraphs = min(max_pages_requested * 25, 100)  # Cap at 100 paragraphs max
        max_chars = min(max_pages_requested * 8000, 40000)  # Cap at 40k chars max
        bookmark_range = None
+        chapter_range = None
    else:
        max_paragraphs = 1000  # Large limit for full document
        max_chars = 200000
        bookmark_range = None
+        chapter_range = None
    
    current_page = 1
    processed_paragraphs = 0
@ -1303,9 +1323,11 @@ async def _convert_docx_with_python_docx(
        if processed_paragraphs >= max_paragraphs or total_chars >= max_chars:
            break
        
-        # Skip elements outside bookmark range if bookmark extraction is used
+        # Skip elements outside bookmark/chapter range if targeted extraction is used
        if bookmark_range and not (bookmark_range['start_idx'] <= element_idx <= bookmark_range['end_idx']):
            continue
+        if chapter_range and not (chapter_range['start_idx'] <= element_idx <= chapter_range['end_idx']):
+            continue
            
        if isinstance(element, CT_P):
            paragraph = Paragraph(element, doc)
@ -1398,6 +1420,12 @@ async def _convert_docx_with_python_docx(
            "elements_range": f"{bookmark_range['start_idx']}-{bookmark_range['end_idx']}",
            "extraction_note": bookmark_range["note"]
        }
+    elif chapter_name and chapter_range:
+        result["chapter_extraction"] = {
+            "chapter_name": chapter_name,
+            "elements_range": f"{chapter_range['start_idx']}-{chapter_range['end_idx']}",
+            "extraction_note": chapter_range["note"]
+        }
    elif page_numbers:
        result["pages_processed"] = page_numbers
        result["total_pages_in_range"] = len(page_numbers)
@ -1667,6 +1695,103 @@ async def _find_bookmark_content_range(doc, bookmark_name: str) -> dict[str, Any
        return None  # Error finding bookmark


+async def _find_chapter_content_range(doc, chapter_name: str) -> dict[str, Any]:
+    """Find the body-element range of a chapter identified by its heading text.
+
+    The chapter starts at the first paragraph whose text contains
+    ``chapter_name`` (case-insensitive substring match) and that either has a
+    paragraph style containing "heading" or "title", or has no paragraph style
+    at all and is shorter than 100 characters. It ends just before the next
+    title paragraph or the next numbered heading of the same or a higher
+    level, or at the last body element when there is none.
+
+    Args:
+        doc: Document object exposing its body elements as ``doc.element.body``.
+        chapter_name: Heading text (or a fragment of it) to look for.
+
+    Returns:
+        A dict with ``start_idx`` and ``end_idx`` (inclusive indices into
+        ``doc.element.body``), ``chapter_name`` and ``note``. ``None`` is
+        returned when ``chapter_name`` is blank, when no matching heading is
+        found, or when the document cannot be inspected.
+    """
+    # Clark-notation names work with the plain ElementTree API (iter/find/get).
+    # python-docx overrides xpath() on its elements so that it accepts only the
+    # expression; calling xpath(..., namespaces=...) raises TypeError there,
+    # which previously made every lookup fail silently.
+    w_ns = '{http://schemas.openxmlformats.org/wordprocessingml/2006/main}'
+    text_tag, style_tag, val_attr = w_ns + 't', w_ns + 'pStyle', w_ns + 'val'
+
+    def is_paragraph(element) -> bool:
+        # Comments and processing instructions carry a non-string tag.
+        return isinstance(element.tag, str) and element.tag.endswith('}p')
+
+    def style_of(element):
+        """Return the lower-cased paragraph style id, or None when unstyled."""
+        style_elem = element.find('.//' + style_tag)
+        return None if style_elem is None else style_elem.get(val_attr, '').lower()
+
+    def heading_level(style):
+        """Return the trailing number of a style id such as 'heading2', else None."""
+        digits = style[len(style.rstrip('0123456789')):]
+        return int(digits) if digits else None
+
+    try:
+        needle = chapter_name.strip().lower()
+        if not needle:
+            return None  # A blank name would match every paragraph
+
+        elements = list(doc.element.body)
+
+        # Locate the heading paragraph that opens the chapter.
+        start_idx = None
+        start_level = 1  # Unstyled and title matches are treated as top level
+        for idx, element in enumerate(elements):
+            if not is_paragraph(element):
+                continue
+            text = ''.join(node.text or '' for node in element.iter(text_tag)).strip()
+            if not text or needle not in text.lower():
+                continue
+            style = style_of(element)
+            if style is None:
+                # Unstyled short lines are accepted as potential headings.
+                if len(text) < 100:
+                    start_idx = idx
+                    break
+            elif 'heading' in style or 'title' in style:
+                start_idx = idx
+                start_level = heading_level(style) or 1
+                break
+
+        if start_idx is None:
+            return None  # Chapter heading not found
+
+        # The chapter runs until the next title or the next heading whose
+        # level is the same as, or higher than, the heading that opened it.
+        end_idx = len(elements) - 1  # Default to end of document
+        for idx, element in enumerate(elements[start_idx + 1:], start=start_idx + 1):
+            if not is_paragraph(element):
+                continue
+            style = style_of(element)
+            if style is None:
+                continue
+            level = heading_level(style)
+            is_peer_heading = 'heading' in style and level is not None and level <= start_level
+            if 'title' in style or is_peer_heading:
+                end_idx = idx - 1
+                break
+
+        return {
+            'start_idx': start_idx,
+            'end_idx': end_idx,
+            'chapter_name': chapter_name,
+            'note': f"Extracting content for chapter '{chapter_name}' (elements {start_idx}-{end_idx})"
+        }
+
+    except Exception:
+        # Deliberate best effort: callers treat None as "chapter not found".
+        return None
+
+
+async def _get_available_headings(doc) -> list[str]:
+    """Collect heading texts from the document to help users pick a chapter name.
+
+    Only the first 100 body elements are inspected and at most 20 headings are
+    returned, to keep the response small. A paragraph counts as a heading when
+    its paragraph style contains "heading" or "title" (text truncated to 100
+    characters), or when it has no paragraph style, is shorter than 100
+    characters and contains one of a few typical heading words.
+
+    Args:
+        doc: Document object exposing its body elements as ``doc.element.body``.
+
+    Returns:
+        Heading texts in document order; an empty list when none are found or
+        the document cannot be inspected.
+    """
+    # Clark-notation names work with the plain ElementTree API (iter/find/get).
+    # python-docx overrides xpath() on its elements so that it accepts only the
+    # expression; calling xpath(..., namespaces=...) raises TypeError there,
+    # which previously made this function always return an empty list.
+    w_ns = '{http://schemas.openxmlformats.org/wordprocessingml/2006/main}'
+    text_tag, style_tag, val_attr = w_ns + 't', w_ns + 'pStyle', w_ns + 'val'
+    heading_words = ('chapter', 'section', 'part', 'introduction', 'conclusion')
+
+    try:
+        headings = []
+
+        # Only check the first 100 elements to avoid token issues
+        for element in doc.element.body[:100]:
+            # Comments and processing instructions carry a non-string tag.
+            if not (isinstance(element.tag, str) and element.tag.endswith('}p')):
+                continue
+
+            text = ''.join(node.text or '' for node in element.iter(text_tag)).strip()
+            if not text:
+                continue
+
+            style_elem = element.find('.//' + style_tag)
+            if style_elem is not None:
+                style = style_elem.get(val_attr, '').lower()
+                if 'heading' in style or 'title' in style:
+                    headings.append(text[:100])  # Limit heading length
+            elif len(text) < 100:
+                # Unstyled short lines only count when they look like a heading
+                lowered = text.lower()
+                if any(word in lowered for word in heading_words):
+                    headings.append(text)
+
+        return headings[:20]  # Return max 20 headings to avoid token issues
+
+    except Exception:
+        # Deliberate best effort: a summary without headings is still useful.
+        return []
+
+
 async def _get_ultra_fast_summary(file_path: str) -> dict[str, Any]:
    """Ultra-fast summary that extracts minimal data to prevent MCP token limits."""
    try:
@ -1719,6 +1844,9 @@ async def _get_ultra_fast_summary(file_path: str) -> dict[str, Any]:
        # Create very basic summary
        summary_content = "\n\n".join(content_parts)
        
+        # Extract available headings for chapter navigation
+        available_headings = await _get_available_headings(doc)
+        
        return {
            "content": summary_content,
            "method_used": "ultra-fast-summary", 
@ -1727,7 +1855,10 @@ async def _get_ultra_fast_summary(file_path: str) -> dict[str, Any]:
                "basic_info": f"Document has ~{total_paragraphs} paragraphs, {total_tables} tables, {heading_count} headings found in first scan",
                "bookmarks": bookmarks[:20] if bookmarks else [],  # Limit to first 20 bookmarks
                "bookmark_count": len(bookmarks),
-                "bookmark_note": "Bookmarks often indicate chapter starts. Use these as navigation hints for page_range extraction."
+                "bookmark_note": "Bookmarks often indicate chapter starts. Use these as navigation hints for page_range extraction.",
+                "available_headings": available_headings[:10] if available_headings else [],  # Limit to first 10 headings
+                "heading_count": len(available_headings),
+                "heading_note": "Use these headings with chapter_name parameter for chapter-based extraction when bookmarks are not available."
            }
        }