🔥 Fix critical issue: page_range was processing the entire document
- Replace unreliable Word page detection with element-based limiting
- Cap extraction at 25 paragraphs per 'page' requested (max 100 total)
- Cap extraction at 8k chars per 'page' requested (max 40k total)
- Add early termination when limits are reached
- Add processing_limits metadata to show actual extraction stats
- Prevent 1.28M-token responses by stopping at reasonable content limits
- Single page (page_range='1') is now limited to ~25 paragraphs / 8k chars
This commit is contained in:
parent
431022e113
commit
b2033fc239
@ -1260,12 +1260,28 @@ async def _convert_docx_with_python_docx(
|
|||||||
"markdown_ref": f""
|
"markdown_ref": f""
|
||||||
})
|
})
|
||||||
|
|
||||||
# Process document elements with page filtering if specified
|
# Process document elements with aggressive content limiting
|
||||||
|
# Since Word page detection is unreliable, use element-based limiting
|
||||||
|
if page_numbers:
|
||||||
|
# For page ranges, severely limit content extraction
|
||||||
|
max_pages_requested = max(page_numbers) if page_numbers else 1
|
||||||
|
# Rough estimate: ~20-30 paragraphs per page
|
||||||
|
max_paragraphs = min(max_pages_requested * 25, 100) # Cap at 100 paragraphs max
|
||||||
|
max_chars = min(max_pages_requested * 8000, 40000) # Cap at 40k chars max
|
||||||
|
else:
|
||||||
|
max_paragraphs = 1000 # Large limit for full document
|
||||||
|
max_chars = 200000
|
||||||
|
|
||||||
current_page = 1
|
current_page = 1
|
||||||
|
processed_paragraphs = 0
|
||||||
|
total_chars = 0
|
||||||
include_current_page = not page_numbers or current_page in page_numbers
|
include_current_page = not page_numbers or current_page in page_numbers
|
||||||
table_of_contents = [] # Track headings with page numbers for TOC
|
table_of_contents = [] # Track headings with page numbers for TOC
|
||||||
|
|
||||||
for element in doc.element.body:
|
for element in doc.element.body:
|
||||||
|
# Early termination if we've processed enough content
|
||||||
|
if processed_paragraphs >= max_paragraphs or total_chars >= max_chars:
|
||||||
|
break
|
||||||
if isinstance(element, CT_P):
|
if isinstance(element, CT_P):
|
||||||
paragraph = Paragraph(element, doc)
|
paragraph = Paragraph(element, doc)
|
||||||
|
|
||||||
@ -1275,40 +1291,51 @@ async def _convert_docx_with_python_docx(
|
|||||||
include_current_page = not page_numbers or current_page in page_numbers
|
include_current_page = not page_numbers or current_page in page_numbers
|
||||||
continue
|
continue
|
||||||
|
|
||||||
# Only process content from specified pages
|
# Process content with strict limits
|
||||||
if include_current_page:
|
markdown_text = _paragraph_to_markdown(paragraph, preserve_structure)
|
||||||
markdown_text = _paragraph_to_markdown(paragraph, preserve_structure)
|
if markdown_text.strip():
|
||||||
if markdown_text.strip():
|
# Check if adding this would exceed limits
|
||||||
markdown_parts.append(markdown_text)
|
text_length = len(markdown_text)
|
||||||
structure_info["paragraphs"] += 1
|
if total_chars + text_length > max_chars:
|
||||||
|
break # Stop processing
|
||||||
|
|
||||||
# Track headings for both structure and TOC
|
markdown_parts.append(markdown_text)
|
||||||
if preserve_structure and markdown_text.startswith('#'):
|
processed_paragraphs += 1
|
||||||
level = len(markdown_text) - len(markdown_text.lstrip('#'))
|
total_chars += text_length
|
||||||
heading_text = markdown_text.lstrip('# ').strip()
|
structure_info["paragraphs"] += 1
|
||||||
heading_info = {
|
|
||||||
"level": level,
|
|
||||||
"text": heading_text,
|
|
||||||
"position": len(markdown_parts) - 1,
|
|
||||||
"page": current_page
|
|
||||||
}
|
|
||||||
structure_info["headings"].append(heading_info)
|
|
||||||
|
|
||||||
# Add to table of contents
|
# Track headings for both structure and TOC
|
||||||
table_of_contents.append({
|
if preserve_structure and markdown_text.startswith('#'):
|
||||||
"level": level,
|
level = len(markdown_text) - len(markdown_text.lstrip('#'))
|
||||||
"title": heading_text,
|
heading_text = markdown_text.lstrip('# ').strip()
|
||||||
"page": current_page,
|
heading_info = {
|
||||||
"suggested_page_range": f"{current_page}-{current_page + _estimate_section_length(level)}"
|
"level": level,
|
||||||
})
|
"text": heading_text,
|
||||||
|
"position": len(markdown_parts) - 1,
|
||||||
|
"page": current_page
|
||||||
|
}
|
||||||
|
structure_info["headings"].append(heading_info)
|
||||||
|
|
||||||
|
# Add to table of contents
|
||||||
|
table_of_contents.append({
|
||||||
|
"level": level,
|
||||||
|
"title": heading_text,
|
||||||
|
"page": current_page,
|
||||||
|
"suggested_page_range": f"{current_page}-{current_page + _estimate_section_length(level)}"
|
||||||
|
})
|
||||||
|
|
||||||
elif isinstance(element, CT_Tbl):
|
elif isinstance(element, CT_Tbl):
|
||||||
# Only process tables from specified pages
|
# Process tables with strict limits
|
||||||
if include_current_page:
|
if processed_paragraphs < max_paragraphs and total_chars < max_chars:
|
||||||
table = Table(element, doc)
|
table = Table(element, doc)
|
||||||
table_markdown = _table_to_markdown(table)
|
table_markdown = _table_to_markdown(table)
|
||||||
if table_markdown.strip():
|
if table_markdown.strip():
|
||||||
|
table_length = len(table_markdown)
|
||||||
|
if total_chars + table_length > max_chars:
|
||||||
|
break # Stop processing
|
||||||
|
|
||||||
markdown_parts.append(table_markdown)
|
markdown_parts.append(table_markdown)
|
||||||
|
total_chars += table_length
|
||||||
structure_info["tables"] += 1
|
structure_info["tables"] += 1
|
||||||
|
|
||||||
# Add image references at the end if any
|
# Add image references at the end if any
|
||||||
@ -1329,6 +1356,16 @@ async def _convert_docx_with_python_docx(
|
|||||||
if table_of_contents:
|
if table_of_contents:
|
||||||
result["table_of_contents"] = _optimize_toc_page_ranges(table_of_contents)
|
result["table_of_contents"] = _optimize_toc_page_ranges(table_of_contents)
|
||||||
|
|
||||||
|
# Add processing limits info
|
||||||
|
result["processing_limits"] = {
|
||||||
|
"max_paragraphs_allowed": max_paragraphs,
|
||||||
|
"max_chars_allowed": max_chars,
|
||||||
|
"paragraphs_processed": processed_paragraphs,
|
||||||
|
"chars_processed": total_chars,
|
||||||
|
"content_truncated": processed_paragraphs >= max_paragraphs or total_chars >= max_chars,
|
||||||
|
"note": f"Processed {processed_paragraphs}/{max_paragraphs} paragraphs, {total_chars:,}/{max_chars:,} chars"
|
||||||
|
}
|
||||||
|
|
||||||
# Add page filtering info
|
# Add page filtering info
|
||||||
if page_numbers:
|
if page_numbers:
|
||||||
result["pages_processed"] = page_numbers
|
result["pages_processed"] = page_numbers
|
||||||
|
Loading…
x
Reference in New Issue
Block a user