⚡ Implement true page-range filtering for efficient processing
- Add page break detection using Word XML structure
- Process only specified pages instead of full document + truncation
- Route page-range requests to python-docx for granular control
- Skip mammoth for page-specific processing (mammoth processes full doc)
- Add page metadata to results when filtering is used
- Significantly reduce memory usage and response size for large documents
This commit is contained in:
parent
f884c99bbd
commit
a485e05759
@ -1011,8 +1011,16 @@ async def _convert_docx_to_markdown(
|
|||||||
"""Convert .docx file to markdown with comprehensive feature support."""
|
"""Convert .docx file to markdown with comprehensive feature support."""
|
||||||
import base64
|
import base64
|
||||||
|
|
||||||
|
# If page_numbers is specified, we need to use python-docx for page-based extraction
|
||||||
|
# as mammoth processes the entire document
|
||||||
|
if page_numbers:
|
||||||
|
return await _convert_docx_with_python_docx(
|
||||||
|
file_path, include_images, image_mode, max_image_size,
|
||||||
|
preserve_structure, page_numbers, summary_only, output_dir
|
||||||
|
)
|
||||||
|
|
||||||
try:
|
try:
|
||||||
# Try mammoth first for better HTML->Markdown conversion
|
# Try mammoth first for better HTML->Markdown conversion (full document only)
|
||||||
import mammoth
|
import mammoth
|
||||||
|
|
||||||
# Configure mammoth for markdown-friendly output
|
# Configure mammoth for markdown-friendly output
|
||||||
@ -1201,31 +1209,45 @@ async def _convert_docx_with_python_docx(
|
|||||||
"markdown_ref": f""
|
"markdown_ref": f""
|
||||||
})
|
})
|
||||||
|
|
||||||
# Process document elements
|
# Process document elements with page filtering if specified
|
||||||
|
current_page = 1
|
||||||
|
include_current_page = not page_numbers or current_page in page_numbers
|
||||||
|
|
||||||
for element in doc.element.body:
|
for element in doc.element.body:
|
||||||
if isinstance(element, CT_P):
|
if isinstance(element, CT_P):
|
||||||
paragraph = Paragraph(element, doc)
|
paragraph = Paragraph(element, doc)
|
||||||
markdown_text = _paragraph_to_markdown(paragraph, preserve_structure)
|
|
||||||
if markdown_text.strip():
|
# Check for page breaks
|
||||||
markdown_parts.append(markdown_text)
|
if _has_page_break(paragraph):
|
||||||
structure_info["paragraphs"] += 1
|
current_page += 1
|
||||||
|
include_current_page = not page_numbers or current_page in page_numbers
|
||||||
|
continue
|
||||||
|
|
||||||
|
# Only process content from specified pages
|
||||||
|
if include_current_page:
|
||||||
|
markdown_text = _paragraph_to_markdown(paragraph, preserve_structure)
|
||||||
|
if markdown_text.strip():
|
||||||
|
markdown_parts.append(markdown_text)
|
||||||
|
structure_info["paragraphs"] += 1
|
||||||
|
|
||||||
# Track headings
|
# Track headings
|
||||||
if preserve_structure and markdown_text.startswith('#'):
|
if preserve_structure and markdown_text.startswith('#'):
|
||||||
level = len(markdown_text) - len(markdown_text.lstrip('#'))
|
level = len(markdown_text) - len(markdown_text.lstrip('#'))
|
||||||
heading_text = markdown_text.lstrip('# ').strip()
|
heading_text = markdown_text.lstrip('# ').strip()
|
||||||
structure_info["headings"].append({
|
structure_info["headings"].append({
|
||||||
"level": level,
|
"level": level,
|
||||||
"text": heading_text,
|
"text": heading_text,
|
||||||
"position": len(markdown_parts) - 1
|
"position": len(markdown_parts) - 1
|
||||||
})
|
})
|
||||||
|
|
||||||
elif isinstance(element, CT_Tbl):
|
elif isinstance(element, CT_Tbl):
|
||||||
table = Table(element, doc)
|
# Only process tables from specified pages
|
||||||
table_markdown = _table_to_markdown(table)
|
if include_current_page:
|
||||||
if table_markdown.strip():
|
table = Table(element, doc)
|
||||||
markdown_parts.append(table_markdown)
|
table_markdown = _table_to_markdown(table)
|
||||||
structure_info["tables"] += 1
|
if table_markdown.strip():
|
||||||
|
markdown_parts.append(table_markdown)
|
||||||
|
structure_info["tables"] += 1
|
||||||
|
|
||||||
# Add image references at the end if any
|
# Add image references at the end if any
|
||||||
if include_images and images_info:
|
if include_images and images_info:
|
||||||
@ -1240,6 +1262,11 @@ async def _convert_docx_with_python_docx(
|
|||||||
"method_used": "python-docx-custom",
|
"method_used": "python-docx-custom",
|
||||||
"images": images_info
|
"images": images_info
|
||||||
}
|
}
|
||||||
|
|
||||||
|
# Add page filtering info
|
||||||
|
if page_numbers:
|
||||||
|
result["pages_processed"] = page_numbers
|
||||||
|
result["total_pages_in_range"] = len(page_numbers)
|
||||||
|
|
||||||
# Handle summary mode
|
# Handle summary mode
|
||||||
if summary_only and len(markdown_content) > 5000:
|
if summary_only and len(markdown_content) > 5000:
|
||||||
@ -1464,6 +1491,20 @@ def _extract_markdown_structure(content: str) -> dict[str, Any]:
|
|||||||
return structure
|
return structure
|
||||||
|
|
||||||
|
|
||||||
|
def _has_page_break(paragraph) -> bool:
    """Check if a paragraph contains an explicit page break.

    Every ``w:br`` element in every run of *paragraph* is inspected, so a
    page break is found even when it is preceded by an ordinary line break
    inside the same run.

    Args:
        paragraph: Object exposing ``runs``, where each run exposes its
            underlying XML element as ``_r`` (the caller passes a
            python-docx ``Paragraph``).

    Returns:
        True if any ``w:br`` in the paragraph has ``w:type="page"``;
        False otherwise, including when the paragraph cannot be inspected.
    """
    w_ns = "{http://schemas.openxmlformats.org/wordprocessingml/2006/main}"
    br_path = f".//{w_ns}br"
    type_attr = f"{w_ns}type"

    try:
        # findall (not find): a run may hold several breaks, and only
        # looking at the first one misses `<w:br/><w:br w:type="page"/>`.
        return any(
            br.get(type_attr) == "page"
            for run in paragraph.runs
            for br in run._r.findall(br_path)
        )
    except Exception:
        # Deliberately best-effort: a paragraph that cannot be inspected is
        # treated as having no page break rather than aborting conversion.
        return False
|
||||||
|
|
||||||
|
|
||||||
def _parse_page_range(page_range: str) -> list[int]:
|
def _parse_page_range(page_range: str) -> list[int]:
|
||||||
"""Parse page range string into list of page numbers.
|
"""Parse page range string into list of page numbers.
|
||||||
|
|
||||||
|
Loading…
x
Reference in New Issue
Block a user