diff --git a/src/mcp_office_tools/server.py b/src/mcp_office_tools/server.py index 993b483..2fbaf44 100644 --- a/src/mcp_office_tools/server.py +++ b/src/mcp_office_tools/server.py @@ -1011,8 +1011,16 @@ async def _convert_docx_to_markdown( """Convert .docx file to markdown with comprehensive feature support.""" import base64 + # If page_numbers is specified, we need to use python-docx for page-based extraction + # as mammoth processes the entire document + if page_numbers: + return await _convert_docx_with_python_docx( + file_path, include_images, image_mode, max_image_size, + preserve_structure, page_numbers, summary_only, output_dir + ) + try: - # Try mammoth first for better HTML->Markdown conversion + # Try mammoth first for better HTML->Markdown conversion (full document only) import mammoth # Configure mammoth for markdown-friendly output @@ -1201,31 +1209,45 @@ async def _convert_docx_with_python_docx( "markdown_ref": f"![Image {i+1}]({img['filename']})" }) - # Process document elements + # Process document elements with page filtering if specified + current_page = 1 + include_current_page = not page_numbers or current_page in page_numbers + for element in doc.element.body: if isinstance(element, CT_P): paragraph = Paragraph(element, doc) - markdown_text = _paragraph_to_markdown(paragraph, preserve_structure) - if markdown_text.strip(): - markdown_parts.append(markdown_text) - structure_info["paragraphs"] += 1 + + # Check for page breaks + if _has_page_break(paragraph): + current_page += 1 + include_current_page = not page_numbers or current_page in page_numbers + continue + + # Only process content from specified pages + if include_current_page: + markdown_text = _paragraph_to_markdown(paragraph, preserve_structure) + if markdown_text.strip(): + markdown_parts.append(markdown_text) + structure_info["paragraphs"] += 1 - # Track headings - if preserve_structure and markdown_text.startswith('#'): - level = len(markdown_text) - len(markdown_text.lstrip('#')) - 
def _has_page_break(paragraph) -> bool:
    """Return True if any run of *paragraph* holds an explicit page break.

    A page break is a WordprocessingML ``<w:br>`` element whose ``w:type``
    attribute equals ``"page"``. Every ``<w:br>`` inside each run is
    inspected, so a run that contains a plain line break followed by a
    page break is still detected.

    Args:
        paragraph: Object exposing ``runs``; each run must expose its
            underlying XML element as ``_r``.

    Returns:
        True when at least one page-type break is found, otherwise False.
        False is also returned (best-effort) when *paragraph* or one of its
        runs does not expose the expected attributes.
    """
    w_ns = '{http://schemas.openxmlformats.org/wordprocessingml/2006/main}'
    br_tag = w_ns + 'br'
    type_attr = w_ns + 'type'

    try:
        # iter() walks *all* <w:br> descendants of the run; find() would stop
        # at the first one and miss a page break that follows a line break.
        return any(
            br.get(type_attr) == 'page'
            for run in paragraph.runs
            for br in run._r.iter(br_tag)
        )
    except (AttributeError, TypeError):
        # Best-effort: an object that does not look like a paragraph simply
        # has no detectable page break.
        return False