From 6484036b69f615dc6b6636bebfd9dd70b7568eab Mon Sep 17 00:00:00 2001
From: Ryan Malloy
Date: Fri, 22 Aug 2025 08:02:50 -0600
Subject: [PATCH] 📖 Add bookmark-based chapter extraction for precise content targeting
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- Add bookmark_name parameter for extracting specific chapters/sections
- Implement bookmark boundary detection using Word XML structure
- Extract content between bookmark start/end markers with smart extension
- More reliable than page ranges - bookmarks are anchored to exact locations
- Support chapter extraction like bookmark_name='Chapter1_Start'
- Include bookmark metadata in response with element ranges
- Perfect for extracting individual chapters from large documents
---
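Note (below the --- cut line, so `git am` ignores it): to discover which
bookmark_name values a given document actually offers, one can enumerate its
w:bookmarkStart elements with python-docx, the same markers the new
_find_bookmark_content_range() helper walks. A minimal sketch, assuming only
that python-docx is installed; list_bookmarks and book.docx are illustrative
names, not part of this patch:

    # Hypothetical helper, not included in the patch: list bookmark names
    # so callers know what to pass as bookmark_name.
    from docx import Document
    from docx.oxml.ns import qn

    def list_bookmarks(path: str) -> list[str]:
        doc = Document(path)
        names = []
        for element in doc.element.body:
            # python-docx's xpath() pre-binds the w: namespace
            for start in element.xpath('.//w:bookmarkStart'):
                name = start.get(qn('w:name'))
                # Skip Word-internal bookmarks such as _GoBack
                if name and not name.startswith('_'):
                    names.append(name)
        return names

    if __name__ == '__main__':
        print(list_bookmarks('book.docx'))
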
 src/mcp_office_tools/server.py | 101 ++++++++++++++++++++++++++++-----
 1 file changed, 87 insertions(+), 14 deletions(-)

diff --git a/src/mcp_office_tools/server.py b/src/mcp_office_tools/server.py
index b26a578..620d822 100644
--- a/src/mcp_office_tools/server.py
+++ b/src/mcp_office_tools/server.py
@@ -292,6 +292,7 @@ async def convert_to_markdown(
     max_image_size: int = Field(default=1024*1024, description="Maximum image size in bytes for base64 encoding"),
     preserve_structure: bool = Field(default=True, description="Preserve document structure (headings, lists, tables)"),
     page_range: str = Field(default="", description="Page range to convert (e.g., '1-5', '3', '1,3,5-10'). RECOMMENDED for large documents. Empty = all pages"),
+    bookmark_name: str = Field(default="", description="Extract content for a specific bookmark/chapter (e.g., 'Chapter1_Start'). More reliable than page ranges."),
     summary_only: bool = Field(default=False, description="Return only metadata and truncated summary. STRONGLY RECOMMENDED for large docs (>10 pages)"),
     output_dir: str = Field(default="", description="Output directory for image files (if image_mode='files')")
 ) -> dict[str, Any]:
@@ -333,11 +334,15 @@ async def convert_to_markdown(
     # Parse page range if provided
     page_numbers = _parse_page_range(page_range) if page_range else None
 
+    # Prioritize bookmark extraction over page ranges
+    if bookmark_name:
+        page_numbers = None  # Ignore page ranges when bookmark is specified
+
     # Convert to markdown based on format
     if extension == ".docx":
         markdown_result = await _convert_docx_to_markdown(
             local_path, include_images, image_mode, max_image_size,
-            preserve_structure, page_numbers, summary_only, output_dir
+            preserve_structure, page_numbers, summary_only, output_dir, bookmark_name
         )
     else:  # .doc
         # For legacy .doc files, use mammoth if available
@@ -1053,7 +1058,8 @@ async def _convert_docx_to_markdown(
     preserve_structure: bool,
     page_numbers: list[int],
     summary_only: bool,
-    output_dir: str
+    output_dir: str,
+    bookmark_name: str = ""
 ) -> dict[str, Any]:
     """Convert .docx file to markdown with comprehensive feature support."""
     import base64
@@ -1062,12 +1068,12 @@ async def _convert_docx_to_markdown(
     if summary_only:
         return await _get_ultra_fast_summary(file_path)
 
-    # If page_numbers is specified, we need to use python-docx for page-based extraction
+    # If page_numbers or bookmark_name is specified, we need to use python-docx for targeted extraction
     # as mammoth processes the entire document
-    if page_numbers:
+    if page_numbers or bookmark_name:
         return await _convert_docx_with_python_docx(
             file_path, include_images, image_mode, max_image_size,
-            preserve_structure, page_numbers, summary_only, output_dir
+            preserve_structure, page_numbers, summary_only, output_dir, bookmark_name
         )
 
     try:
@@ -1186,13 +1192,13 @@ async def _convert_docx_to_markdown(
         # Fall back to python-docx with custom markdown conversion
         return await _convert_docx_with_python_docx(
             file_path, include_images, image_mode, max_image_size,
-            preserve_structure, page_numbers, summary_only, output_dir
+            preserve_structure, page_numbers, summary_only, output_dir, bookmark_name
         )
     except Exception:
         # Fall back to python-docx
         return await _convert_docx_with_python_docx(
             file_path, include_images, image_mode, max_image_size,
-            preserve_structure, page_numbers, summary_only, output_dir
+            preserve_structure, page_numbers, summary_only, output_dir, bookmark_name
         )
 
 
@@ -1204,7 +1210,8 @@ async def _convert_docx_with_python_docx(
     preserve_structure: bool,
     page_numbers: list[int],
     summary_only: bool,
-    output_dir: str
+    output_dir: str,
+    bookmark_name: str = ""
 ) -> dict[str, Any]:
     """Convert .docx using python-docx with custom markdown conversion."""
     import base64
@@ -1260,17 +1267,30 @@ async def _convert_docx_with_python_docx(
                 "markdown_ref": f"![Image {i+1}]({img['filename']})"
             })
 
-    # Process document elements with aggressive content limiting
-    # Since Word page detection is unreliable, use element-based limiting
-    if page_numbers:
+    # Handle bookmark-based extraction vs page-based vs full document
+    if bookmark_name:
+        # For bookmark extraction, find the bookmark boundaries
+        bookmark_range = await _find_bookmark_content_range(doc, bookmark_name)
+        if not bookmark_range:
+            return {
+                "content": f"Bookmark '{bookmark_name}' not found in document",
+                "method_used": "python-docx-bookmark-not-found",
+                "images": [],
+                "bookmark_error": True
+            }
+        max_paragraphs = 500  # Generous limit for bookmark sections
+        max_chars = 100000
+    elif page_numbers:
         # For page ranges, severely limit content extraction
         max_pages_requested = max(page_numbers) if page_numbers else 1
         # Rough estimate: ~20-30 paragraphs per page
         max_paragraphs = min(max_pages_requested * 25, 100)  # Cap at 100 paragraphs max
         max_chars = min(max_pages_requested * 8000, 40000)  # Cap at 40k chars max
+        bookmark_range = None
     else:
         max_paragraphs = 1000  # Large limit for full document
         max_chars = 200000
+        bookmark_range = None
 
     current_page = 1
     processed_paragraphs = 0
@@ -1278,10 +1298,15 @@ async def _convert_docx_with_python_docx(
     include_current_page = not page_numbers or current_page in page_numbers
     table_of_contents = []  # Track headings with page numbers for TOC
 
-    for element in doc.element.body:
+    for element_idx, element in enumerate(doc.element.body):
         # Early termination if we've processed enough content
         if processed_paragraphs >= max_paragraphs or total_chars >= max_chars:
             break
+
+        # Skip elements outside bookmark range if bookmark extraction is used
+        if bookmark_range and not (bookmark_range['start_idx'] <= element_idx <= bookmark_range['end_idx']):
+            continue
+
         if isinstance(element, CT_P):
             paragraph = Paragraph(element, doc)
 
@@ -1366,8 +1391,14 @@ async def _convert_docx_with_python_docx(
             "note": f"Processed {processed_paragraphs}/{max_paragraphs} paragraphs, {total_chars:,}/{max_chars:,} chars"
         }
 
-    # Add page filtering info
-    if page_numbers:
+    # Add extraction method info
+    if bookmark_name and bookmark_range:
+        result["bookmark_extraction"] = {
+            "bookmark_name": bookmark_name,
+            "elements_range": f"{bookmark_range['start_idx']}-{bookmark_range['end_idx']}",
+            "extraction_note": bookmark_range["note"]
+        }
+    elif page_numbers:
         result["pages_processed"] = page_numbers
         result["total_pages_in_range"] = len(page_numbers)
 
@@ -1594,6 +1625,48 @@ def _extract_markdown_structure(content: str) -> dict[str, Any]:
     return structure
 
 
+async def _find_bookmark_content_range(doc, bookmark_name: str) -> dict[str, Any] | None:
+    """Find the content range for a specific bookmark."""
+    try:
+        # Find bookmark start and end positions in the document
+        bookmark_starts = {}
+        bookmark_ends = {}
+
+        # Look for bookmark markers in the document XML
+        for elem_idx, element in enumerate(doc.element.body):
+            # Look for bookmark start markers (python-docx's xpath() pre-binds the w: namespace)
+            for bookmark_start in element.xpath('.//w:bookmarkStart'):
+                name = bookmark_start.get('{http://schemas.openxmlformats.org/wordprocessingml/2006/main}name')
+                if name == bookmark_name:
+                    bookmark_id = bookmark_start.get('{http://schemas.openxmlformats.org/wordprocessingml/2006/main}id')
+                    bookmark_starts[bookmark_id] = elem_idx
+
+            # Look for bookmark end markers
+            for bookmark_end in element.xpath('.//w:bookmarkEnd'):
+                bookmark_id = bookmark_end.get('{http://schemas.openxmlformats.org/wordprocessingml/2006/main}id')
+                if bookmark_id in bookmark_starts:
+                    bookmark_ends[bookmark_id] = elem_idx
+                    break
+
+        # Find the bookmark range
+        for bookmark_id, start_idx in bookmark_starts.items():
+            if bookmark_id in bookmark_ends:
+                end_idx = bookmark_ends[bookmark_id]
+                # Extend past the end marker; a chapter bookmark usually anchors the start, not the full extent
+                extended_end = min(end_idx + 50, len(doc.element.body) - 1)  # Extend by 50 elements or to end of doc
+                return {
+                    'start_idx': start_idx,
+                    'end_idx': extended_end,
+                    'bookmark_id': bookmark_id,
+                    'note': f"Extracting content from bookmark '{bookmark_name}' (elements {start_idx}-{extended_end})"
+                }
+
+        return None  # Bookmark not found
+
+    except Exception:
+        return None  # Error finding bookmark
+
+
 async def _get_ultra_fast_summary(file_path: str) -> dict[str, Any]:
     """Ultra-fast summary that extracts minimal data to prevent MCP token limits."""
     try:
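
Post-diff note for reviewers: the new code path can be exercised end to end
without a real manuscript by fabricating a document that contains a bookmark
and then walking doc.element.body the way _find_bookmark_content_range()
does. A sketch under the same assumptions (python-docx installed; every
identifier below is local to the script and not part of the patch; it leans
on the python-docx oxml internals `_p` and OxmlElement, which is acceptable
for a throwaway test):

    from docx import Document
    from docx.oxml import OxmlElement
    from docx.oxml.ns import qn

    # Build a throwaway document with a bookmark wrapping one paragraph.
    doc = Document()
    doc.add_paragraph("Front matter")
    para = doc.add_paragraph("Chapter 1 body text")
    start = OxmlElement('w:bookmarkStart')
    start.set(qn('w:id'), '1')
    start.set(qn('w:name'), 'Chapter1_Start')
    end = OxmlElement('w:bookmarkEnd')
    end.set(qn('w:id'), '1')
    para._p.insert(0, start)  # bookmark opens at the paragraph start
    para._p.append(end)       # and closes at its end

    # Locate the bookmark by body-element index, mirroring the patched helper.
    target = 'Chapter1_Start'
    start_idx = end_idx = bm_id = None
    for idx, element in enumerate(doc.element.body):
        for bs in element.xpath('.//w:bookmarkStart'):
            if bs.get(qn('w:name')) == target:
                bm_id = bs.get(qn('w:id'))
                start_idx = idx
        for be in element.xpath('.//w:bookmarkEnd'):
            if bm_id is not None and be.get(qn('w:id')) == bm_id:
                end_idx = idx

    print(f"{target} spans body elements {start_idx}-{end_idx}")

Running it should report the bookmark on body element 1 (element 0 is the
front-matter paragraph), which is the start_idx/end_idx pair the helper then
extends by up to 50 elements.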