diff --git a/src/mcp_office_tools/server.py b/src/mcp_office_tools/server.py
index 620d822..5f85e58 100644
--- a/src/mcp_office_tools/server.py
+++ b/src/mcp_office_tools/server.py
@@ -293,6 +293,7 @@ async def convert_to_markdown(
     preserve_structure: bool = Field(default=True, description="Preserve document structure (headings, lists, tables)"),
     page_range: str = Field(default="", description="Page range to convert (e.g., '1-5', '3', '1,3,5-10'). RECOMMENDED for large documents. Empty = all pages"),
     bookmark_name: str = Field(default="", description="Extract content for a specific bookmark/chapter (e.g., 'Chapter1_Start'). More reliable than page ranges."),
+    chapter_name: str = Field(default="", description="Extract content for a chapter by heading text (e.g., 'Chapter 1', 'Introduction'). Works when bookmarks aren't available."),
     summary_only: bool = Field(default=False, description="Return only metadata and truncated summary. STRONGLY RECOMMENDED for large docs (>10 pages)"),
     output_dir: str = Field(default="", description="Output directory for image files (if image_mode='files')")
 ) -> dict[str, Any]:
@@ -334,15 +335,15 @@ async def convert_to_markdown(
     # Parse page range if provided
     page_numbers = _parse_page_range(page_range) if page_range else None
 
-    # Prioritize bookmark extraction over page ranges
-    if bookmark_name:
-        page_numbers = None  # Ignore page ranges when bookmark is specified
+    # Prioritize bookmark/chapter extraction over page ranges
+    if bookmark_name or chapter_name:
+        page_numbers = None  # Ignore page ranges when bookmark or chapter is specified
 
     # Convert to markdown based on format
     if extension == ".docx":
         markdown_result = await _convert_docx_to_markdown(
             local_path, include_images, image_mode, max_image_size,
-            preserve_structure, page_numbers, summary_only, output_dir, bookmark_name
+            preserve_structure, page_numbers, summary_only, output_dir, bookmark_name, chapter_name
         )
     else:  # .doc
         # For legacy .doc files, use mammoth if available
@@ -1059,7 +1060,8 @@ async def _convert_docx_to_markdown(
     page_numbers: list[int],
     summary_only: bool,
     output_dir: str,
-    bookmark_name: str = ""
+    bookmark_name: str = "",
+    chapter_name: str = ""
 ) -> dict[str, Any]:
     """Convert .docx file to markdown with comprehensive feature support."""
     import base64
@@ -1068,12 +1070,12 @@ async def _convert_docx_to_markdown(
     if summary_only:
         return await _get_ultra_fast_summary(file_path)
 
-    # If page_numbers or bookmark_name is specified, we need to use python-docx for targeted extraction
+    # If page_numbers, bookmark_name, or chapter_name is specified, we need to use python-docx for targeted extraction
     # as mammoth processes the entire document
-    if page_numbers or bookmark_name:
+    if page_numbers or bookmark_name or chapter_name:
         return await _convert_docx_with_python_docx(
             file_path, include_images, image_mode, max_image_size,
-            preserve_structure, page_numbers, summary_only, output_dir, bookmark_name
+            preserve_structure, page_numbers, summary_only, output_dir, bookmark_name, chapter_name
         )
 
     try:
@@ -1192,13 +1194,13 @@ async def _convert_docx_to_markdown(
             # Fall back to python-docx with custom markdown conversion
             return await _convert_docx_with_python_docx(
                 file_path, include_images, image_mode, max_image_size,
-                preserve_structure, page_numbers, summary_only, output_dir, bookmark_name
+                preserve_structure, page_numbers, summary_only, output_dir, bookmark_name, chapter_name
             )
 
     except Exception:
         # Fall back to python-docx
         return await _convert_docx_with_python_docx(
             file_path, include_images, image_mode, max_image_size,
-            preserve_structure, page_numbers, summary_only, output_dir, bookmark_name
+            preserve_structure, page_numbers, summary_only, output_dir, bookmark_name, chapter_name
         )
 
@@ -1211,7 +1213,8 @@ async def _convert_docx_with_python_docx(
     page_numbers: list[int],
     summary_only: bool,
     output_dir: str,
-    bookmark_name: str = ""
+    bookmark_name: str = "",
+    chapter_name: str = ""
 ) -> dict[str, Any]:
     """Convert .docx using python-docx with custom markdown conversion."""
     import base64
@@ -1267,7 +1270,7 @@ async def _convert_docx_with_python_docx(
                 "markdown_ref": f"![Image {i+1}]({img['filename']})"
             })
 
-    # Handle bookmark-based extraction vs page-based vs full document
+    # Handle bookmark-based, chapter-based, or page-based extraction vs full document
     if bookmark_name:
         # For bookmark extraction, find the bookmark boundaries
         bookmark_range = await _find_bookmark_content_range(doc, bookmark_name)
@@ -1280,6 +1283,21 @@
            }
         max_paragraphs = 500  # Generous limit for bookmark sections
         max_chars = 100000
+        chapter_range = None
+    elif chapter_name:
+        # For chapter extraction, find the heading boundaries
+        chapter_range = await _find_chapter_content_range(doc, chapter_name)
+        if not chapter_range:
+            return {
+                "content": f"Chapter '{chapter_name}' not found in document. Available headings will be listed in processing_limits.",
+                "method_used": "python-docx-chapter-not-found",
+                "images": [],
+                "chapter_error": True,
+                "available_headings": await _get_available_headings(doc)
+            }
+        max_paragraphs = 500  # Generous limit for chapter sections
+        max_chars = 100000
     elif page_numbers:
         # For page ranges, severely limit content extraction
         max_pages_requested = max(page_numbers) if page_numbers else 1
@@ -1287,10 +1305,12 @@
         max_paragraphs = min(max_pages_requested * 25, 100)  # Cap at 100 paragraphs max
         max_chars = min(max_pages_requested * 8000, 40000)  # Cap at 40k chars max
         bookmark_range = None
+        chapter_range = None
     else:
         max_paragraphs = 1000  # Large limit for full document
         max_chars = 200000
         bookmark_range = None
+        chapter_range = None
 
     current_page = 1
     processed_paragraphs = 0
@@ -1303,9 +1323,11 @@
         if processed_paragraphs >= max_paragraphs or total_chars >= max_chars:
             break
 
-        # Skip elements outside bookmark range if bookmark extraction is used
+        # Skip elements outside bookmark/chapter range if targeted extraction is used
         if bookmark_range and not (bookmark_range['start_idx'] <= element_idx <= bookmark_range['end_idx']):
             continue
+        if chapter_range and not (chapter_range['start_idx'] <= element_idx <= chapter_range['end_idx']):
+            continue
 
         if isinstance(element, CT_P):
             paragraph = Paragraph(element, doc)
@@ -1398,6 +1420,12 @@
             "elements_range": f"{bookmark_range['start_idx']}-{bookmark_range['end_idx']}",
             "extraction_note": bookmark_range["note"]
         }
+    elif chapter_name and chapter_range:
+        result["chapter_extraction"] = {
+            "chapter_name": chapter_name,
+            "elements_range": f"{chapter_range['start_idx']}-{chapter_range['end_idx']}",
+            "extraction_note": chapter_range["note"]
+        }
     elif page_numbers:
         result["pages_processed"] = page_numbers
         result["total_pages_in_range"] = len(page_numbers)
@@ -1667,6 +1695,103 @@ async def _find_bookmark_content_range(doc, bookmark_name: str) -> dict[str, Any]
         return None  # Error finding bookmark
 
 
+async def _find_chapter_content_range(doc, chapter_name: str) -> dict[str, Any]:
+    """Find the content range for a specific chapter by heading text."""
+    try:
+        # Find heading that matches the chapter name
+        chapter_start_idx = None
+        chapter_end_idx = None
+
+        # Search through document elements for matching heading
+        for elem_idx, element in enumerate(doc.element.body):
+            # Check if this element is a paragraph with heading style
+            try:
+                para = element
+                if para.tag.endswith('}p'):  # Word paragraph element
+                    # Get the text content
+                    text_content = ''.join(text_elem.text or '' for text_elem in para.xpath('.//w:t', namespaces={'w': 'http://schemas.openxmlformats.org/wordprocessingml/2006/main'}))
+
+                    # Check if this matches our chapter name (case insensitive, flexible matching)
+                    if text_content.strip() and chapter_name.lower() in text_content.lower().strip():
+                        # Check if it's actually a heading by looking at paragraph style
+                        style_elem = para.xpath('.//w:pStyle', namespaces={'w': 'http://schemas.openxmlformats.org/wordprocessingml/2006/main'})
+                        if style_elem:
+                            style_val = style_elem[0].get('{http://schemas.openxmlformats.org/wordprocessingml/2006/main}val', '')
+                            if 'heading' in style_val.lower() or 'title' in style_val.lower():
+                                chapter_start_idx = elem_idx
+                                break
+                        # Also consider short text lines as potential headings
+                        elif len(text_content.strip()) < 100:
+                            chapter_start_idx = elem_idx
+                            break
+            except Exception:
+                continue
+
+        if chapter_start_idx is None:
+            return None  # Chapter heading not found
+
+        # Find the end of this chapter (next major heading or end of document)
+        chapter_end_idx = len(doc.element.body) - 1  # Default to end of document
+
+        # Look for the next major heading to determine chapter end
+        for elem_idx in range(chapter_start_idx + 1, len(doc.element.body)):
+            try:
+                para = doc.element.body[elem_idx]
+                if para.tag.endswith('}p'):
+                    # Check if this is a major heading (same level or higher than chapter start)
+                    style_elem = para.xpath('.//w:pStyle', namespaces={'w': 'http://schemas.openxmlformats.org/wordprocessingml/2006/main'})
+                    if style_elem:
+                        style_val = style_elem[0].get('{http://schemas.openxmlformats.org/wordprocessingml/2006/main}val', '')
+                        if 'heading1' in style_val.lower() or 'title' in style_val.lower():
+                            chapter_end_idx = elem_idx - 1
+                            break
+            except Exception:
+                continue
+
+        return {
+            'start_idx': chapter_start_idx,
+            'end_idx': chapter_end_idx,
+            'chapter_name': chapter_name,
+            'note': f"Extracting content for chapter '{chapter_name}' (elements {chapter_start_idx}-{chapter_end_idx})"
+        }
+
+    except Exception:
+        return None  # Error finding chapter
+
+
+async def _get_available_headings(doc) -> list[str]:
+    """Extract available headings from the document to help users find chapter names."""
+    try:
+        headings = []
+
+        # Search through document elements for headings
+        for element in doc.element.body[:100]:  # Only check first 100 elements to avoid token issues
+            try:
+                if element.tag.endswith('}p'):  # Word paragraph element
+                    # Get the text content
+                    text_content = ''.join(text_elem.text or '' for text_elem in element.xpath('.//w:t', namespaces={'w': 'http://schemas.openxmlformats.org/wordprocessingml/2006/main'}))
+
+                    if text_content.strip():
+                        # Check if it's a heading by looking at paragraph style
+                        style_elem = element.xpath('.//w:pStyle', namespaces={'w': 'http://schemas.openxmlformats.org/wordprocessingml/2006/main'})
+                        if style_elem:
+                            style_val = style_elem[0].get('{http://schemas.openxmlformats.org/wordprocessingml/2006/main}val', '')
+                            if 'heading' in style_val.lower() or 'title' in style_val.lower():
+                                headings.append(text_content.strip()[:100])  # Limit heading length
+                        # Also consider short text lines as potential headings
+                        elif len(text_content.strip()) < 100:
+                            # Only add if it looks like a heading (not just short random text)
+                            if any(word in text_content.lower() for word in ['chapter', 'section', 'part', 'introduction', 'conclusion']):
+                                headings.append(text_content.strip())
+            except Exception:
+                continue
+
+        return headings[:20]  # Return max 20 headings to avoid token issues
+
+    except Exception:
+        return []
+
+
 async def _get_ultra_fast_summary(file_path: str) -> dict[str, Any]:
     """Ultra-fast summary that extracts minimal data to prevent MCP token limits."""
     try:
@@ -1719,6 +1844,9 @@ async def _get_ultra_fast_summary(file_path: str) -> dict[str, Any]:
         # Create very basic summary
         summary_content = "\n\n".join(content_parts)
 
+        # Extract available headings for chapter navigation
+        available_headings = await _get_available_headings(doc)
+
         return {
             "content": summary_content,
             "method_used": "ultra-fast-summary",
@@ -1727,7 +1855,10 @@ async def _get_ultra_fast_summary(file_path: str) -> dict[str, Any]:
                 "basic_info": f"Document has ~{total_paragraphs} paragraphs, {total_tables} tables, {heading_count} headings found in first scan",
                 "bookmarks": bookmarks[:20] if bookmarks else [],  # Limit to first 20 bookmarks
                 "bookmark_count": len(bookmarks),
-                "bookmark_note": "Bookmarks often indicate chapter starts. Use these as navigation hints for page_range extraction."
+                "bookmark_note": "Bookmarks often indicate chapter starts. Use these as navigation hints for page_range extraction.",
+                "available_headings": available_headings[:10] if available_headings else [],  # Limit to first 10 headings
+                "heading_count": len(available_headings),
+                "heading_note": "Use these headings with chapter_name parameter for chapter-based extraction when bookmarks are not available."
             }
         }
 
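
For reviewers, a minimal sketch of the heading test this patch keys on. The snippet below is not part of the patch: it re-implements roughly what _get_available_headings does (collect paragraphs whose pStyle contains "heading" or "title", capped at 100 chars each and 20 headings total) using python-docx's public API, so the behaviour can be sanity-checked against a local .docx file. The script name and the file path argument are placeholders, and list_headings is an invented helper for this example only.

import sys
from docx import Document

def list_headings(path: str) -> list[str]:
    # Rough public-API equivalent of _get_available_headings(): heading/title paragraphs only
    doc = Document(path)
    headings = []
    for para in doc.paragraphs:
        style_name = (para.style.name or "").lower()
        text = para.text.strip()
        if text and ("heading" in style_name or "title" in style_name):
            headings.append(text[:100])  # same 100-char cap as the patch
    return headings[:20]  # same 20-heading cap as the patch

if __name__ == "__main__":
    # e.g. python list_headings.py report.docx
    for heading in list_headings(sys.argv[1]):
        print(heading)

Any of the printed strings should then be usable as the chapter_name argument to convert_to_markdown, since _find_chapter_content_range matches the heading text case-insensitively by substring.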