From 6484036b69f615dc6b6636bebfd9dd70b7568eab Mon Sep 17 00:00:00 2001
From: Ryan Malloy
Date: Fri, 22 Aug 2025 08:02:50 -0600
Subject: [PATCH] 📖 Add bookmark-based chapter extraction for precise content targeting
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- Add bookmark_name parameter for extracting specific chapters/sections
- Implement bookmark boundary detection using Word XML structure
- Extract content between bookmark start/end markers with smart extension
- More reliable than page ranges - bookmarks are anchored to exact locations
- Support chapter extraction like bookmark_name='Chapter1_Start'
- Include bookmark metadata in response with element ranges
- Perfect for extracting individual chapters from large documents
---
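Note (below the --- cut line, so `git am` ignores it): to discover which
bookmark_name values a given document actually offers, one can enumerate its
w:bookmarkStart elements with python-docx, the same markers the new
_find_bookmark_content_range() helper walks. A minimal sketch, assuming only
that python-docx is installed; list_bookmarks and book.docx are illustrative
names, not part of this patch:

    # Hypothetical helper, not included in the patch: list bookmark names
    # so callers know what to pass as bookmark_name.
    from docx import Document
    from docx.oxml.ns import qn

    def list_bookmarks(path: str) -> list[str]:
        doc = Document(path)
        names = []
        for element in doc.element.body:
            # python-docx's xpath() pre-binds the w: namespace
            for start in element.xpath('.//w:bookmarkStart'):
                name = start.get(qn('w:name'))
                # Skip Word-internal bookmarks such as _GoBack
                if name and not name.startswith('_'):
                    names.append(name)
        return names

    if __name__ == '__main__':
        print(list_bookmarks('book.docx'))
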
 src/mcp_office_tools/server.py | 101 ++++++++++++++++++++++++++++-----
 1 file changed, 87 insertions(+), 14 deletions(-)

diff --git a/src/mcp_office_tools/server.py b/src/mcp_office_tools/server.py
index b26a578..620d822 100644
--- a/src/mcp_office_tools/server.py
+++ b/src/mcp_office_tools/server.py
@@ -292,6 +292,7 @@ async def convert_to_markdown(
     max_image_size: int = Field(default=1024*1024, description="Maximum image size in bytes for base64 encoding"),
     preserve_structure: bool = Field(default=True, description="Preserve document structure (headings, lists, tables)"),
     page_range: str = Field(default="", description="Page range to convert (e.g., '1-5', '3', '1,3,5-10'). RECOMMENDED for large documents. Empty = all pages"),
+    bookmark_name: str = Field(default="", description="Extract content for a specific bookmark/chapter (e.g., 'Chapter1_Start'). More reliable than page ranges."),
     summary_only: bool = Field(default=False, description="Return only metadata and truncated summary. STRONGLY RECOMMENDED for large docs (>10 pages)"),
     output_dir: str = Field(default="", description="Output directory for image files (if image_mode='files')")
 ) -> dict[str, Any]:
@@ -333,11 +334,15 @@ async def convert_to_markdown(
     # Parse page range if provided
     page_numbers = _parse_page_range(page_range) if page_range else None
 
+    # Prioritize bookmark extraction over page ranges
+    if bookmark_name:
+        page_numbers = None  # Ignore page ranges when bookmark is specified
+
     # Convert to markdown based on format
     if extension == ".docx":
         markdown_result = await _convert_docx_to_markdown(
             local_path, include_images, image_mode, max_image_size,
-            preserve_structure, page_numbers, summary_only, output_dir
+            preserve_structure, page_numbers, summary_only, output_dir, bookmark_name
         )
     else:  # .doc
         # For legacy .doc files, use mammoth if available
@@ -1053,7 +1058,8 @@ async def _convert_docx_to_markdown(
     preserve_structure: bool,
     page_numbers: list[int],
     summary_only: bool,
-    output_dir: str
+    output_dir: str,
+    bookmark_name: str = ""
 ) -> dict[str, Any]:
     """Convert .docx file to markdown with comprehensive feature support."""
     import base64
@@ -1062,12 +1068,12 @@ async def _convert_docx_to_markdown(
     if summary_only:
         return await _get_ultra_fast_summary(file_path)
 
-    # If page_numbers is specified, we need to use python-docx for page-based extraction
+    # If page_numbers or bookmark_name is specified, we need to use python-docx for targeted extraction
     # as mammoth processes the entire document
-    if page_numbers:
+    if page_numbers or bookmark_name:
         return await _convert_docx_with_python_docx(
             file_path, include_images, image_mode, max_image_size,
-            preserve_structure, page_numbers, summary_only, output_dir
+            preserve_structure, page_numbers, summary_only, output_dir, bookmark_name
         )
 
     try:
@@ -1186,13 +1192,13 @@ async def _convert_docx_to_markdown(
         # Fall back to python-docx with custom markdown conversion
         return await _convert_docx_with_python_docx(
             file_path, include_images, image_mode, max_image_size,
-            preserve_structure, page_numbers, summary_only, output_dir
+            preserve_structure, page_numbers, summary_only, output_dir, bookmark_name
         )
     except Exception:
         # Fall back to python-docx
         return await _convert_docx_with_python_docx(
             file_path, include_images, image_mode, max_image_size,
-            preserve_structure, page_numbers, summary_only, output_dir
+            preserve_structure, page_numbers, summary_only, output_dir, bookmark_name
         )
 
 
@@ -1204,7 +1210,8 @@ async def _convert_docx_with_python_docx(
     preserve_structure: bool,
     page_numbers: list[int],
     summary_only: bool,
-    output_dir: str
+    output_dir: str,
+    bookmark_name: str = ""
 ) -> dict[str, Any]:
     """Convert .docx using python-docx with custom markdown conversion."""
     import base64
@@ -1260,17 +1267,30 @@ async def _convert_docx_with_python_docx(
                 "markdown_ref": f"![Image {i+1}]({img['filename']})"
             })
 
-    # Process document elements with aggressive content limiting
-    # Since Word page detection is unreliable, use element-based limiting
-    if page_numbers:
+    # Handle bookmark-based extraction vs page-based vs full document
+    if bookmark_name:
+        # For bookmark extraction, find the bookmark boundaries
+        bookmark_range = await _find_bookmark_content_range(doc, bookmark_name)
+        if not bookmark_range:
+            return {
+                "content": f"Bookmark '{bookmark_name}' not found in document",
+                "method_used": "python-docx-bookmark-not-found",
+                "images": [],
+                "bookmark_error": True
+            }
+        max_paragraphs = 500  # Generous limit for bookmark sections
+        max_chars = 100000
+    elif page_numbers:
         # For page ranges, severely limit content extraction
         max_pages_requested = max(page_numbers) if page_numbers else 1
         # Rough estimate: ~20-30 paragraphs per page
         max_paragraphs = min(max_pages_requested * 25, 100)  # Cap at 100 paragraphs max
         max_chars = min(max_pages_requested * 8000, 40000)  # Cap at 40k chars max
+        bookmark_range = None
     else:
         max_paragraphs = 1000  # Large limit for full document
         max_chars = 200000
+        bookmark_range = None
 
     current_page = 1
     processed_paragraphs = 0
@@ -1278,10 +1298,15 @@ async def _convert_docx_with_python_docx(
     include_current_page = not page_numbers or current_page in page_numbers
     table_of_contents = []  # Track headings with page numbers for TOC
 
-    for element in doc.element.body:
+    for element_idx, element in enumerate(doc.element.body):
         # Early termination if we've processed enough content
         if processed_paragraphs >= max_paragraphs or total_chars >= max_chars:
             break
+
+        # Skip elements outside bookmark range if bookmark extraction is used
+        if bookmark_range and not (bookmark_range['start_idx'] <= element_idx <= bookmark_range['end_idx']):
+            continue
+
         if isinstance(element, CT_P):
             paragraph = Paragraph(element, doc)
 
@@ -1366,8 +1391,14 @@ async def _convert_docx_with_python_docx(
             "note": f"Processed {processed_paragraphs}/{max_paragraphs} paragraphs, {total_chars:,}/{max_chars:,} chars"
         }
 
-    # Add page filtering info
-    if page_numbers:
+    # Add extraction method info
+    if bookmark_name and bookmark_range:
+        result["bookmark_extraction"] = {
+            "bookmark_name": bookmark_name,
+            "elements_range": f"{bookmark_range['start_idx']}-{bookmark_range['end_idx']}",
+            "extraction_note": bookmark_range["note"]
+        }
+    elif page_numbers:
         result["pages_processed"] = page_numbers
         result["total_pages_in_range"] = len(page_numbers)
 
@@ -1594,6 +1625,48 @@ def _extract_markdown_structure(content: str) -> dict[str, Any]:
     return structure
 
 
+async def _find_bookmark_content_range(doc, bookmark_name: str) -> dict[str, Any] | None:
+    """Find the content range for a specific bookmark."""
+    try:
+        # Find bookmark start and end positions in the document
+        bookmark_starts = {}
+        bookmark_ends = {}
+
+        # Look for bookmark markers in the document XML
+        for elem_idx, element in enumerate(doc.element.body):
+            # Look for bookmark start markers (python-docx's xpath() pre-binds the w: namespace)
+            for bookmark_start in element.xpath('.//w:bookmarkStart'):
+                name = bookmark_start.get('{http://schemas.openxmlformats.org/wordprocessingml/2006/main}name')
+                if name == bookmark_name:
+                    bookmark_id = bookmark_start.get('{http://schemas.openxmlformats.org/wordprocessingml/2006/main}id')
+                    bookmark_starts[bookmark_id] = elem_idx
+
+            # Look for bookmark end markers
+            for bookmark_end in element.xpath('.//w:bookmarkEnd'):
+                bookmark_id = bookmark_end.get('{http://schemas.openxmlformats.org/wordprocessingml/2006/main}id')
+                if bookmark_id in bookmark_starts:
+                    bookmark_ends[bookmark_id] = elem_idx
+                    break
+
+        # Find the bookmark range
+        for bookmark_id, start_idx in bookmark_starts.items():
+            if bookmark_id in bookmark_ends:
+                end_idx = bookmark_ends[bookmark_id]
+                # Extend past the end marker; a chapter bookmark usually anchors the start, not the full extent
+                extended_end = min(end_idx + 50, len(doc.element.body) - 1)  # Extend by 50 elements or to end of doc
+                return {
+                    'start_idx': start_idx,
+                    'end_idx': extended_end,
+                    'bookmark_id': bookmark_id,
+                    'note': f"Extracting content from bookmark '{bookmark_name}' (elements {start_idx}-{extended_end})"
+                }
+
+        return None  # Bookmark not found
+
+    except Exception:
+        return None  # Error finding bookmark
+
+
 async def _get_ultra_fast_summary(file_path: str) -> dict[str, Any]:
     """Ultra-fast summary that extracts minimal data to prevent MCP token limits."""
     try:
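
Post-diff note for reviewers: the new code path can be exercised end to end
without a real manuscript by fabricating a document that contains a bookmark
and then walking doc.element.body the way _find_bookmark_content_range()
does. A sketch under the same assumptions (python-docx installed; every
identifier below is local to the script and not part of the patch; it leans
on the python-docx oxml internals `_p` and OxmlElement, which is acceptable
for a throwaway test):

    from docx import Document
    from docx.oxml import OxmlElement
    from docx.oxml.ns import qn

    # Build a throwaway document with a bookmark wrapping one paragraph.
    doc = Document()
    doc.add_paragraph("Front matter")
    para = doc.add_paragraph("Chapter 1 body text")
    start = OxmlElement('w:bookmarkStart')
    start.set(qn('w:id'), '1')
    start.set(qn('w:name'), 'Chapter1_Start')
    end = OxmlElement('w:bookmarkEnd')
    end.set(qn('w:id'), '1')
    para._p.insert(0, start)  # bookmark opens at the paragraph start
    para._p.append(end)       # and closes at its end

    # Locate the bookmark by body-element index, mirroring the patched helper.
    target = 'Chapter1_Start'
    start_idx = end_idx = bm_id = None
    for idx, element in enumerate(doc.element.body):
        for bs in element.xpath('.//w:bookmarkStart'):
            if bs.get(qn('w:name')) == target:
                bm_id = bs.get(qn('w:id'))
                start_idx = idx
        for be in element.xpath('.//w:bookmarkEnd'):
            if bm_id is not None and be.get(qn('w:id')) == bm_id:
                end_idx = idx

    print(f"{target} spans body elements {start_idx}-{end_idx}")

Running it should report the bookmark on body element 1 (element 0 is the
front-matter paragraph), which is the start_idx/end_idx pair the helper then
extends by up to 50 elements.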