🔥 Fix critical issue: page_range was processing the entire document

- Replace unreliable Word page detection with element-based limiting
- Cap extraction at 25 paragraphs times the highest requested page number (max 100 total)
- Cap extraction at 8k chars times the highest requested page number (max 40k total)
- Add early termination when limits reached
- Add processing_limits metadata to show actual extraction stats
- Prevent 1.28M token responses by stopping at reasonable content limits
- A single-page request (page_range='1') is now limited to ~25 paragraphs / 8k chars
This commit is contained in:
Ryan Malloy 2025-08-22 08:00:02 -06:00
parent 431022e113
commit b2033fc239

View File

@ -1260,12 +1260,28 @@ async def _convert_docx_with_python_docx(
"markdown_ref": f"![Image {i+1}]({img['filename']})" "markdown_ref": f"![Image {i+1}]({img['filename']})"
}) })
# Process document elements with page filtering if specified # Process document elements with aggressive content limiting
# Since Word page detection is unreliable, use element-based limiting
if page_numbers:
# For page ranges, severely limit content extraction
max_pages_requested = max(page_numbers) if page_numbers else 1
# Rough estimate: ~20-30 paragraphs per page
max_paragraphs = min(max_pages_requested * 25, 100) # Cap at 100 paragraphs max
max_chars = min(max_pages_requested * 8000, 40000) # Cap at 40k chars max
else:
max_paragraphs = 1000 # Large limit for full document
max_chars = 200000
current_page = 1 current_page = 1
processed_paragraphs = 0
total_chars = 0
include_current_page = not page_numbers or current_page in page_numbers include_current_page = not page_numbers or current_page in page_numbers
table_of_contents = [] # Track headings with page numbers for TOC table_of_contents = [] # Track headings with page numbers for TOC
for element in doc.element.body: for element in doc.element.body:
# Early termination if we've processed enough content
if processed_paragraphs >= max_paragraphs or total_chars >= max_chars:
break
if isinstance(element, CT_P): if isinstance(element, CT_P):
paragraph = Paragraph(element, doc) paragraph = Paragraph(element, doc)
@ -1275,11 +1291,17 @@ async def _convert_docx_with_python_docx(
include_current_page = not page_numbers or current_page in page_numbers include_current_page = not page_numbers or current_page in page_numbers
continue continue
# Only process content from specified pages # Process content with strict limits
if include_current_page:
markdown_text = _paragraph_to_markdown(paragraph, preserve_structure) markdown_text = _paragraph_to_markdown(paragraph, preserve_structure)
if markdown_text.strip(): if markdown_text.strip():
# Check if adding this would exceed limits
text_length = len(markdown_text)
if total_chars + text_length > max_chars:
break # Stop processing
markdown_parts.append(markdown_text) markdown_parts.append(markdown_text)
processed_paragraphs += 1
total_chars += text_length
structure_info["paragraphs"] += 1 structure_info["paragraphs"] += 1
# Track headings for both structure and TOC # Track headings for both structure and TOC
@ -1303,12 +1325,17 @@ async def _convert_docx_with_python_docx(
}) })
elif isinstance(element, CT_Tbl): elif isinstance(element, CT_Tbl):
# Only process tables from specified pages # Process tables with strict limits
if include_current_page: if processed_paragraphs < max_paragraphs and total_chars < max_chars:
table = Table(element, doc) table = Table(element, doc)
table_markdown = _table_to_markdown(table) table_markdown = _table_to_markdown(table)
if table_markdown.strip(): if table_markdown.strip():
table_length = len(table_markdown)
if total_chars + table_length > max_chars:
break # Stop processing
markdown_parts.append(table_markdown) markdown_parts.append(table_markdown)
total_chars += table_length
structure_info["tables"] += 1 structure_info["tables"] += 1
# Add image references at the end if any # Add image references at the end if any
@ -1329,6 +1356,16 @@ async def _convert_docx_with_python_docx(
if table_of_contents: if table_of_contents:
result["table_of_contents"] = _optimize_toc_page_ranges(table_of_contents) result["table_of_contents"] = _optimize_toc_page_ranges(table_of_contents)
# Add processing limits info
result["processing_limits"] = {
"max_paragraphs_allowed": max_paragraphs,
"max_chars_allowed": max_chars,
"paragraphs_processed": processed_paragraphs,
"chars_processed": total_chars,
"content_truncated": processed_paragraphs >= max_paragraphs or total_chars >= max_chars,
"note": f"Processed {processed_paragraphs}/{max_paragraphs} paragraphs, {total_chars:,}/{max_chars:,} chars"
}
# Add page filtering info # Add page filtering info
if page_numbers: if page_numbers:
result["pages_processed"] = page_numbers result["pages_processed"] = page_numbers