⚡ Implement true page-range filtering for efficient processing
- Add page break detection using Word XML structure
- Process only the specified pages instead of processing the full document and truncating
- Route page-range requests to python-docx for granular control
- Skip mammoth for page-specific processing (mammoth processes the full document)
- Add page metadata to results when filtering is used
- Significantly reduce memory usage and response size for large documents
This commit is contained in:
parent
f884c99bbd
commit
a485e05759
@ -1011,8 +1011,16 @@ async def _convert_docx_to_markdown(
|
||||
"""Convert .docx file to markdown with comprehensive feature support."""
|
||||
import base64
|
||||
|
||||
# If page_numbers is specified, we need to use python-docx for page-based extraction
|
||||
# as mammoth processes the entire document
|
||||
if page_numbers:
|
||||
return await _convert_docx_with_python_docx(
|
||||
file_path, include_images, image_mode, max_image_size,
|
||||
preserve_structure, page_numbers, summary_only, output_dir
|
||||
)
|
||||
|
||||
try:
|
||||
# Try mammoth first for better HTML->Markdown conversion
|
||||
# Try mammoth first for better HTML->Markdown conversion (full document only)
|
||||
import mammoth
|
||||
|
||||
# Configure mammoth for markdown-friendly output
|
||||
@ -1201,10 +1209,22 @@ async def _convert_docx_with_python_docx(
|
||||
"markdown_ref": f""
|
||||
})
|
||||
|
||||
# Process document elements
|
||||
# Process document elements with page filtering if specified
|
||||
current_page = 1
|
||||
include_current_page = not page_numbers or current_page in page_numbers
|
||||
|
||||
for element in doc.element.body:
|
||||
if isinstance(element, CT_P):
|
||||
paragraph = Paragraph(element, doc)
|
||||
|
||||
# Check for page breaks
|
||||
if _has_page_break(paragraph):
|
||||
current_page += 1
|
||||
include_current_page = not page_numbers or current_page in page_numbers
|
||||
continue
|
||||
|
||||
# Only process content from specified pages
|
||||
if include_current_page:
|
||||
markdown_text = _paragraph_to_markdown(paragraph, preserve_structure)
|
||||
if markdown_text.strip():
|
||||
markdown_parts.append(markdown_text)
|
||||
@ -1221,6 +1241,8 @@ async def _convert_docx_with_python_docx(
|
||||
})
|
||||
|
||||
elif isinstance(element, CT_Tbl):
|
||||
# Only process tables from specified pages
|
||||
if include_current_page:
|
||||
table = Table(element, doc)
|
||||
table_markdown = _table_to_markdown(table)
|
||||
if table_markdown.strip():
|
||||
@ -1241,6 +1263,11 @@ async def _convert_docx_with_python_docx(
|
||||
"images": images_info
|
||||
}
|
||||
|
||||
# Add page filtering info
|
||||
if page_numbers:
|
||||
result["pages_processed"] = page_numbers
|
||||
result["total_pages_in_range"] = len(page_numbers)
|
||||
|
||||
# Handle summary mode
|
||||
if summary_only and len(markdown_content) > 5000:
|
||||
markdown_content = markdown_content[:5000] + "\n\n[Content truncated - use summary_only=false for full content]"
|
||||
@ -1464,6 +1491,20 @@ def _extract_markdown_structure(content: str) -> dict[str, Any]:
|
||||
return structure
|
||||
|
||||
|
||||
def _has_page_break(paragraph) -> bool:
    """Check if a paragraph contains an explicit page break.

    A page break is recognised as a ``<w:br w:type="page"/>`` element
    anywhere inside one of the runs exposed by ``paragraph.runs``.  Every
    ``<w:br>`` in every run is inspected, so a run that holds an ordinary
    line break followed by a page break is still detected.

    Only explicit break elements inside runs are considered; nothing else
    in the paragraph (e.g. paragraph-level properties) is examined.

    Args:
        paragraph: Object exposing ``runs``, each run exposing its XML
            element as ``_r`` (the caller passes a python-docx
            ``Paragraph``).

    Returns:
        True if any run contains a page-type break, otherwise False.
        Any error while inspecting the paragraph also yields False.
    """
    word_ns = "http://schemas.openxmlformats.org/wordprocessingml/2006/main"
    br_tag = f"{{{word_ns}}}br"
    type_attr = f"{{{word_ns}}}type"

    try:
        # iter() walks ALL <w:br> descendants of the run; find() would stop
        # at the first one and miss a page break that follows a line break.
        return any(
            br_elem.get(type_attr) == "page"
            for run in paragraph.runs
            for br_elem in run._r.iter(br_tag)
        )
    except Exception:
        # Deliberate best-effort: an uninspectable paragraph is treated as
        # having no page break rather than aborting the whole conversion.
        return False
|
||||
|
||||
|
||||
def _parse_page_range(page_range: str) -> list[int]:
|
||||
"""Parse page range string into list of page numbers.
|
||||
|
||||
|
Loading…
x
Reference in New Issue
Block a user