Implement true page-range filtering for efficient processing

- Add page break detection using Word XML structure
- Process only specified pages instead of full document + truncation
- Route page-range requests to python-docx for granular control
- Skip mammoth for page-specific processing (mammoth processes full doc)
- Add page metadata to results when filtering is used
- Significantly reduce memory usage and response size for large documents
This commit is contained in:
Ryan Malloy 2025-08-19 13:12:19 -06:00
parent f884c99bbd
commit a485e05759

View File

@ -1011,8 +1011,16 @@ async def _convert_docx_to_markdown(
"""Convert .docx file to markdown with comprehensive feature support."""
import base64
# If page_numbers is specified, we need to use python-docx for page-based extraction
# as mammoth processes the entire document
if page_numbers:
return await _convert_docx_with_python_docx(
file_path, include_images, image_mode, max_image_size,
preserve_structure, page_numbers, summary_only, output_dir
)
try:
# Try mammoth first for better HTML->Markdown conversion
# Try mammoth first for better HTML->Markdown conversion (full document only)
import mammoth
# Configure mammoth for markdown-friendly output
@ -1201,31 +1209,45 @@ async def _convert_docx_with_python_docx(
"markdown_ref": f"![Image {i+1}]({img['filename']})"
})
# Process document elements
# Process document elements with page filtering if specified
current_page = 1
include_current_page = not page_numbers or current_page in page_numbers
for element in doc.element.body:
if isinstance(element, CT_P):
paragraph = Paragraph(element, doc)
markdown_text = _paragraph_to_markdown(paragraph, preserve_structure)
if markdown_text.strip():
markdown_parts.append(markdown_text)
structure_info["paragraphs"] += 1
# Check for page breaks
if _has_page_break(paragraph):
current_page += 1
include_current_page = not page_numbers or current_page in page_numbers
continue
# Only process content from specified pages
if include_current_page:
markdown_text = _paragraph_to_markdown(paragraph, preserve_structure)
if markdown_text.strip():
markdown_parts.append(markdown_text)
structure_info["paragraphs"] += 1
# Track headings
if preserve_structure and markdown_text.startswith('#'):
level = len(markdown_text) - len(markdown_text.lstrip('#'))
heading_text = markdown_text.lstrip('# ').strip()
structure_info["headings"].append({
"level": level,
"text": heading_text,
"position": len(markdown_parts) - 1
})
# Track headings
if preserve_structure and markdown_text.startswith('#'):
level = len(markdown_text) - len(markdown_text.lstrip('#'))
heading_text = markdown_text.lstrip('# ').strip()
structure_info["headings"].append({
"level": level,
"text": heading_text,
"position": len(markdown_parts) - 1
})
elif isinstance(element, CT_Tbl):
table = Table(element, doc)
table_markdown = _table_to_markdown(table)
if table_markdown.strip():
markdown_parts.append(table_markdown)
structure_info["tables"] += 1
# Only process tables from specified pages
if include_current_page:
table = Table(element, doc)
table_markdown = _table_to_markdown(table)
if table_markdown.strip():
markdown_parts.append(table_markdown)
structure_info["tables"] += 1
# Add image references at the end if any
if include_images and images_info:
@ -1240,6 +1262,11 @@ async def _convert_docx_with_python_docx(
"method_used": "python-docx-custom",
"images": images_info
}
# Add page filtering info
if page_numbers:
result["pages_processed"] = page_numbers
result["total_pages_in_range"] = len(page_numbers)
# Handle summary mode
if summary_only and len(markdown_content) > 5000:
@ -1464,6 +1491,20 @@ def _extract_markdown_structure(content: str) -> dict[str, Any]:
return structure
def _has_page_break(paragraph) -> bool:
    """Return True if any run of *paragraph* holds an explicit page break.

    An explicit page break is a WordprocessingML ``<w:br>`` element whose
    ``w:type`` attribute equals ``"page"``. Every ``<w:br>`` inside every run
    is inspected, so a page break that follows a plain line break within the
    same run is still detected.

    Args:
        paragraph: Object exposing ``runs``; each run must expose its
            underlying XML element as ``_r``.

    Returns:
        True when at least one page-type break is found, otherwise False.
        Detection is best-effort: any error raised while walking the runs
        is treated as "no page break".
    """
    w_ns = "http://schemas.openxmlformats.org/wordprocessingml/2006/main"
    br_path = f".//{{{w_ns}}}br"
    type_attr = f"{{{w_ns}}}type"

    try:
        for run in paragraph.runs:
            # findall (not find): a run may contain several breaks, and the
            # page break is not necessarily the first one.
            for br_elem in run._r.findall(br_path):
                if br_elem.get(type_attr) == "page":
                    return True
    except Exception:
        # Deliberate best-effort: a malformed or unexpected paragraph object
        # must not abort the conversion, so report "no page break" instead.
        return False
    return False
def _parse_page_range(page_range: str) -> list[int]:
"""Parse page range string into list of page numbers.