📖 Add bookmark-based chapter extraction for precise content targeting
- Add `bookmark_name` parameter for extracting specific chapters/sections
- Implement bookmark boundary detection using the Word XML structure
- Extract content between bookmark start/end markers, with smart range extension
- More reliable than page ranges — bookmarks are anchored to exact locations
- Supports chapter extraction, e.g. bookmark_name='Chapter1_Start'
- Include bookmark metadata in the response, with element ranges
- Useful for extracting individual chapters from large documents
This commit is contained in:
parent
b2033fc239
commit
6484036b69
@ -292,6 +292,7 @@ async def convert_to_markdown(
|
||||
max_image_size: int = Field(default=1024*1024, description="Maximum image size in bytes for base64 encoding"),
|
||||
preserve_structure: bool = Field(default=True, description="Preserve document structure (headings, lists, tables)"),
|
||||
page_range: str = Field(default="", description="Page range to convert (e.g., '1-5', '3', '1,3,5-10'). RECOMMENDED for large documents. Empty = all pages"),
|
||||
bookmark_name: str = Field(default="", description="Extract content for a specific bookmark/chapter (e.g., 'Chapter1_Start'). More reliable than page ranges."),
|
||||
summary_only: bool = Field(default=False, description="Return only metadata and truncated summary. STRONGLY RECOMMENDED for large docs (>10 pages)"),
|
||||
output_dir: str = Field(default="", description="Output directory for image files (if image_mode='files')")
|
||||
) -> dict[str, Any]:
|
||||
@ -333,11 +334,15 @@ async def convert_to_markdown(
|
||||
# Parse page range if provided
|
||||
page_numbers = _parse_page_range(page_range) if page_range else None
|
||||
|
||||
# Prioritize bookmark extraction over page ranges
|
||||
if bookmark_name:
|
||||
page_numbers = None # Ignore page ranges when bookmark is specified
|
||||
|
||||
# Convert to markdown based on format
|
||||
if extension == ".docx":
|
||||
markdown_result = await _convert_docx_to_markdown(
|
||||
local_path, include_images, image_mode, max_image_size,
|
||||
preserve_structure, page_numbers, summary_only, output_dir
|
||||
preserve_structure, page_numbers, summary_only, output_dir, bookmark_name
|
||||
)
|
||||
else: # .doc
|
||||
# For legacy .doc files, use mammoth if available
|
||||
@ -1053,7 +1058,8 @@ async def _convert_docx_to_markdown(
|
||||
preserve_structure: bool,
|
||||
page_numbers: list[int],
|
||||
summary_only: bool,
|
||||
output_dir: str
|
||||
output_dir: str,
|
||||
bookmark_name: str = ""
|
||||
) -> dict[str, Any]:
|
||||
"""Convert .docx file to markdown with comprehensive feature support."""
|
||||
import base64
|
||||
@ -1062,12 +1068,12 @@ async def _convert_docx_to_markdown(
|
||||
if summary_only:
|
||||
return await _get_ultra_fast_summary(file_path)
|
||||
|
||||
# If page_numbers is specified, we need to use python-docx for page-based extraction
|
||||
# If page_numbers or bookmark_name is specified, we need to use python-docx for targeted extraction
|
||||
# as mammoth processes the entire document
|
||||
if page_numbers:
|
||||
if page_numbers or bookmark_name:
|
||||
return await _convert_docx_with_python_docx(
|
||||
file_path, include_images, image_mode, max_image_size,
|
||||
preserve_structure, page_numbers, summary_only, output_dir
|
||||
preserve_structure, page_numbers, summary_only, output_dir, bookmark_name
|
||||
)
|
||||
|
||||
try:
|
||||
@ -1186,13 +1192,13 @@ async def _convert_docx_to_markdown(
|
||||
# Fall back to python-docx with custom markdown conversion
|
||||
return await _convert_docx_with_python_docx(
|
||||
file_path, include_images, image_mode, max_image_size,
|
||||
preserve_structure, page_numbers, summary_only, output_dir
|
||||
preserve_structure, page_numbers, summary_only, output_dir, bookmark_name
|
||||
)
|
||||
except Exception:
|
||||
# Fall back to python-docx
|
||||
return await _convert_docx_with_python_docx(
|
||||
file_path, include_images, image_mode, max_image_size,
|
||||
preserve_structure, page_numbers, summary_only, output_dir
|
||||
preserve_structure, page_numbers, summary_only, output_dir, bookmark_name
|
||||
)
|
||||
|
||||
|
||||
@ -1204,7 +1210,8 @@ async def _convert_docx_with_python_docx(
|
||||
preserve_structure: bool,
|
||||
page_numbers: list[int],
|
||||
summary_only: bool,
|
||||
output_dir: str
|
||||
output_dir: str,
|
||||
bookmark_name: str = ""
|
||||
) -> dict[str, Any]:
|
||||
"""Convert .docx using python-docx with custom markdown conversion."""
|
||||
import base64
|
||||
@ -1260,17 +1267,30 @@ async def _convert_docx_with_python_docx(
|
||||
"markdown_ref": f""
|
||||
})
|
||||
|
||||
# Process document elements with aggressive content limiting
|
||||
# Since Word page detection is unreliable, use element-based limiting
|
||||
if page_numbers:
|
||||
# Handle bookmark-based extraction vs page-based vs full document
|
||||
if bookmark_name:
|
||||
# For bookmark extraction, find the bookmark boundaries
|
||||
bookmark_range = await _find_bookmark_content_range(doc, bookmark_name)
|
||||
if not bookmark_range:
|
||||
return {
|
||||
"content": f"Bookmark '{bookmark_name}' not found in document",
|
||||
"method_used": "python-docx-bookmark-not-found",
|
||||
"images": [],
|
||||
"bookmark_error": True
|
||||
}
|
||||
max_paragraphs = 500 # Generous limit for bookmark sections
|
||||
max_chars = 100000
|
||||
elif page_numbers:
|
||||
# For page ranges, severely limit content extraction
|
||||
max_pages_requested = max(page_numbers) if page_numbers else 1
|
||||
# Rough estimate: ~20-30 paragraphs per page
|
||||
max_paragraphs = min(max_pages_requested * 25, 100) # Cap at 100 paragraphs max
|
||||
max_chars = min(max_pages_requested * 8000, 40000) # Cap at 40k chars max
|
||||
bookmark_range = None
|
||||
else:
|
||||
max_paragraphs = 1000 # Large limit for full document
|
||||
max_chars = 200000
|
||||
bookmark_range = None
|
||||
|
||||
current_page = 1
|
||||
processed_paragraphs = 0
|
||||
@ -1278,10 +1298,15 @@ async def _convert_docx_with_python_docx(
|
||||
include_current_page = not page_numbers or current_page in page_numbers
|
||||
table_of_contents = [] # Track headings with page numbers for TOC
|
||||
|
||||
for element in doc.element.body:
|
||||
for element_idx, element in enumerate(doc.element.body):
|
||||
# Early termination if we've processed enough content
|
||||
if processed_paragraphs >= max_paragraphs or total_chars >= max_chars:
|
||||
break
|
||||
|
||||
# Skip elements outside bookmark range if bookmark extraction is used
|
||||
if bookmark_range and not (bookmark_range['start_idx'] <= element_idx <= bookmark_range['end_idx']):
|
||||
continue
|
||||
|
||||
if isinstance(element, CT_P):
|
||||
paragraph = Paragraph(element, doc)
|
||||
|
||||
@ -1366,8 +1391,14 @@ async def _convert_docx_with_python_docx(
|
||||
"note": f"Processed {processed_paragraphs}/{max_paragraphs} paragraphs, {total_chars:,}/{max_chars:,} chars"
|
||||
}
|
||||
|
||||
# Add page filtering info
|
||||
if page_numbers:
|
||||
# Add extraction method info
|
||||
if bookmark_name and bookmark_range:
|
||||
result["bookmark_extraction"] = {
|
||||
"bookmark_name": bookmark_name,
|
||||
"elements_range": f"{bookmark_range['start_idx']}-{bookmark_range['end_idx']}",
|
||||
"extraction_note": bookmark_range["note"]
|
||||
}
|
||||
elif page_numbers:
|
||||
result["pages_processed"] = page_numbers
|
||||
result["total_pages_in_range"] = len(page_numbers)
|
||||
|
||||
@ -1594,6 +1625,48 @@ def _extract_markdown_structure(content: str) -> dict[str, Any]:
|
||||
return structure
|
||||
|
||||
|
||||
async def _find_bookmark_content_range(doc, bookmark_name: str) -> dict[str, Any]:
|
||||
"""Find the content range for a specific bookmark."""
|
||||
try:
|
||||
# Find bookmark start and end positions in the document
|
||||
bookmark_starts = {}
|
||||
bookmark_ends = {}
|
||||
|
||||
# Look for bookmark markers in the document XML
|
||||
for elem_idx, element in enumerate(doc.element.body):
|
||||
# Look for bookmark start markers
|
||||
for bookmark_start in element.xpath('.//w:bookmarkStart', namespaces={'w': 'http://schemas.openxmlformats.org/wordprocessingml/2006/main'}):
|
||||
name = bookmark_start.get('{http://schemas.openxmlformats.org/wordprocessingml/2006/main}name')
|
||||
if name == bookmark_name:
|
||||
bookmark_id = bookmark_start.get('{http://schemas.openxmlformats.org/wordprocessingml/2006/main}id')
|
||||
bookmark_starts[bookmark_id] = elem_idx
|
||||
|
||||
# Look for bookmark end markers
|
||||
for bookmark_end in element.xpath('.//w:bookmarkEnd', namespaces={'w': 'http://schemas.openxmlformats.org/wordprocessingml/2006/main'}):
|
||||
bookmark_id = bookmark_end.get('{http://schemas.openxmlformats.org/wordprocessingml/2006/main}id')
|
||||
if bookmark_id in bookmark_starts:
|
||||
bookmark_ends[bookmark_id] = elem_idx
|
||||
break
|
||||
|
||||
# Find the bookmark range
|
||||
for bookmark_id, start_idx in bookmark_starts.items():
|
||||
if bookmark_id in bookmark_ends:
|
||||
end_idx = bookmark_ends[bookmark_id]
|
||||
# Extend range to capture full sections (look for next major heading)
|
||||
extended_end = min(end_idx + 50, len(doc.element.body) - 1) # Extend by 50 elements or end of doc
|
||||
return {
|
||||
'start_idx': start_idx,
|
||||
'end_idx': extended_end,
|
||||
'bookmark_id': bookmark_id,
|
||||
'note': f"Extracting content from bookmark '{bookmark_name}' (elements {start_idx}-{extended_end})"
|
||||
}
|
||||
|
||||
return None # Bookmark not found
|
||||
|
||||
except Exception:
|
||||
return None # Error finding bookmark
|
||||
|
||||
|
||||
async def _get_ultra_fast_summary(file_path: str) -> dict[str, Any]:
|
||||
"""Ultra-fast summary that extracts minimal data to prevent MCP token limits."""
|
||||
try:
|
||||
|
Loading…
x
Reference in New Issue
Block a user