📋 Add comprehensive Table of Contents extraction with smart chunking
- Extract headings with page numbers during document processing
- Generate optimized page ranges for each section/chapter
- Provide intelligent chunking suggestions (15-page optimal chunks)
- Classify section types (chapter, section, subsection, etc.)
- Calculate actual section lengths based on heading positions
- Include suggested_chunking with ready-to-use page ranges
- Well suited to extracting 200+ page documents section by section
This commit is contained in:
parent
d94bd39da6
commit
9c2f299d49
@ -370,6 +370,10 @@ async def convert_to_markdown(
|
|||||||
result["metadata"]["character_count"] = len(markdown_result["content"])
|
result["metadata"]["character_count"] = len(markdown_result["content"])
|
||||||
result["metadata"]["word_count"] = len(markdown_result["content"].split())
|
result["metadata"]["word_count"] = len(markdown_result["content"].split())
|
||||||
result["summary"] = markdown_result["content"][:1000] + "..." if len(markdown_result["content"]) > 1000 else markdown_result["content"]
|
result["summary"] = markdown_result["content"][:1000] + "..." if len(markdown_result["content"]) > 1000 else markdown_result["content"]
|
||||||
|
|
||||||
|
# Add table of contents with page ranges for navigation
|
||||||
|
if "table_of_contents" in markdown_result:
|
||||||
|
result["table_of_contents"] = markdown_result["table_of_contents"]
|
||||||
else:
|
else:
|
||||||
# Include full content for smaller documents or page ranges
|
# Include full content for smaller documents or page ranges
|
||||||
result["markdown"] = markdown_result["content"]
|
result["markdown"] = markdown_result["content"]
|
||||||
@ -1224,6 +1228,7 @@ async def _convert_docx_with_python_docx(
|
|||||||
# Process document elements with page filtering if specified
|
# Process document elements with page filtering if specified
|
||||||
current_page = 1
|
current_page = 1
|
||||||
include_current_page = not page_numbers or current_page in page_numbers
|
include_current_page = not page_numbers or current_page in page_numbers
|
||||||
|
table_of_contents = [] # Track headings with page numbers for TOC
|
||||||
|
|
||||||
for element in doc.element.body:
|
for element in doc.element.body:
|
||||||
if isinstance(element, CT_P):
|
if isinstance(element, CT_P):
|
||||||
@ -1242,14 +1247,24 @@ async def _convert_docx_with_python_docx(
|
|||||||
markdown_parts.append(markdown_text)
|
markdown_parts.append(markdown_text)
|
||||||
structure_info["paragraphs"] += 1
|
structure_info["paragraphs"] += 1
|
||||||
|
|
||||||
# Track headings
|
# Track headings for both structure and TOC
|
||||||
if preserve_structure and markdown_text.startswith('#'):
|
if preserve_structure and markdown_text.startswith('#'):
|
||||||
level = len(markdown_text) - len(markdown_text.lstrip('#'))
|
level = len(markdown_text) - len(markdown_text.lstrip('#'))
|
||||||
heading_text = markdown_text.lstrip('# ').strip()
|
heading_text = markdown_text.lstrip('# ').strip()
|
||||||
structure_info["headings"].append({
|
heading_info = {
|
||||||
"level": level,
|
"level": level,
|
||||||
"text": heading_text,
|
"text": heading_text,
|
||||||
"position": len(markdown_parts) - 1
|
"position": len(markdown_parts) - 1,
|
||||||
|
"page": current_page
|
||||||
|
}
|
||||||
|
structure_info["headings"].append(heading_info)
|
||||||
|
|
||||||
|
# Add to table of contents
|
||||||
|
table_of_contents.append({
|
||||||
|
"level": level,
|
||||||
|
"title": heading_text,
|
||||||
|
"page": current_page,
|
||||||
|
"suggested_page_range": f"{current_page}-{current_page + _estimate_section_length(level)}"
|
||||||
})
|
})
|
||||||
|
|
||||||
elif isinstance(element, CT_Tbl):
|
elif isinstance(element, CT_Tbl):
|
||||||
@ -1275,6 +1290,10 @@ async def _convert_docx_with_python_docx(
|
|||||||
"images": images_info
|
"images": images_info
|
||||||
}
|
}
|
||||||
|
|
||||||
|
# Add table of contents for navigation
|
||||||
|
if table_of_contents:
|
||||||
|
result["table_of_contents"] = _optimize_toc_page_ranges(table_of_contents)
|
||||||
|
|
||||||
# Add page filtering info
|
# Add page filtering info
|
||||||
if page_numbers:
|
if page_numbers:
|
||||||
result["pages_processed"] = page_numbers
|
result["pages_processed"] = page_numbers
|
||||||
@ -1503,6 +1522,115 @@ def _extract_markdown_structure(content: str) -> dict[str, Any]:
|
|||||||
return structure
|
return structure
|
||||||
|
|
||||||
|
|
||||||
|
def _estimate_section_length(heading_level: int) -> int:
|
||||||
|
"""Estimate how many pages a section might span based on heading level."""
|
||||||
|
# Higher level headings (H1) tend to have longer sections
|
||||||
|
if heading_level == 1: # Major chapters
|
||||||
|
return 8
|
||||||
|
elif heading_level == 2: # Major sections
|
||||||
|
return 4
|
||||||
|
elif heading_level == 3: # Subsections
|
||||||
|
return 2
|
||||||
|
else: # Minor headings
|
||||||
|
return 1
|
||||||
|
|
||||||
|
|
||||||
|
def _optimize_toc_page_ranges(toc_entries: list) -> dict[str, Any]:
    """Optimize table of contents page ranges based on actual heading positions.

    Each entry's end page is derived from where the *next* heading starts
    (clamped so it never precedes the entry's own start page); the final
    entry falls back to a level-based length estimate. The result bundles
    the per-section ranges with chunking suggestions for extraction.
    """
    sections: list[dict[str, Any]] = []
    total = len(toc_entries)

    for idx, entry in enumerate(toc_entries):
        start = entry["page"]
        if idx + 1 < total:
            # End one page before the next heading, but never before our own start.
            end = max(start, toc_entries[idx + 1]["page"] - 1)
        else:
            # Last section has no successor — use the heading-level estimate.
            end = start + _estimate_section_length(entry["level"])

        sections.append({
            "level": entry["level"],
            "title": entry["title"],
            "start_page": start,
            "estimated_end_page": end,
            "suggested_page_range": f"{start}-{end}",
            "section_type": _classify_section_type(entry["level"], entry["title"]),
        })

    return {
        "sections": sections,
        "total_sections": total,
        "suggested_chunking": _generate_chunking_suggestions(sections),
    }
|
||||||
|
|
||||||
|
|
||||||
|
def _classify_section_type(level: int, title: str) -> str:
|
||||||
|
"""Classify section type based on level and title patterns."""
|
||||||
|
title_lower = title.lower()
|
||||||
|
|
||||||
|
if level == 1:
|
||||||
|
if any(word in title_lower for word in ["chapter", "part", "section"]):
|
||||||
|
return "chapter"
|
||||||
|
elif any(word in title_lower for word in ["introduction", "conclusion", "summary"]):
|
||||||
|
return "special_section"
|
||||||
|
else:
|
||||||
|
return "major_section"
|
||||||
|
elif level == 2:
|
||||||
|
return "section"
|
||||||
|
elif level == 3:
|
||||||
|
return "subsection"
|
||||||
|
else:
|
||||||
|
return "minor_heading"
|
||||||
|
|
||||||
|
|
||||||
|
def _generate_chunking_suggestions(sections: list) -> list[dict[str, Any]]:
|
||||||
|
"""Generate smart chunking suggestions based on document structure."""
|
||||||
|
suggestions = []
|
||||||
|
current_chunk_pages = 0
|
||||||
|
chunk_start = 1
|
||||||
|
chunk_sections = []
|
||||||
|
|
||||||
|
for section in sections:
|
||||||
|
section_pages = section["estimated_end_page"] - section["start_page"] + 1
|
||||||
|
|
||||||
|
# If adding this section would make chunk too large, finalize current chunk
|
||||||
|
if current_chunk_pages + section_pages > 15 and chunk_sections:
|
||||||
|
suggestions.append({
|
||||||
|
"chunk_number": len(suggestions) + 1,
|
||||||
|
"page_range": f"{chunk_start}-{chunk_sections[-1]['estimated_end_page']}",
|
||||||
|
"sections_included": [s["title"] for s in chunk_sections],
|
||||||
|
"estimated_pages": current_chunk_pages,
|
||||||
|
"description": f"Chunk {len(suggestions) + 1}: {chunk_sections[0]['title']}" +
|
||||||
|
(f" + {len(chunk_sections)-1} more sections" if len(chunk_sections) > 1 else "")
|
||||||
|
})
|
||||||
|
|
||||||
|
# Start new chunk
|
||||||
|
chunk_start = section["start_page"]
|
||||||
|
current_chunk_pages = section_pages
|
||||||
|
chunk_sections = [section]
|
||||||
|
else:
|
||||||
|
# Add to current chunk
|
||||||
|
current_chunk_pages += section_pages
|
||||||
|
chunk_sections.append(section)
|
||||||
|
|
||||||
|
# Add final chunk if any sections remain
|
||||||
|
if chunk_sections:
|
||||||
|
suggestions.append({
|
||||||
|
"chunk_number": len(suggestions) + 1,
|
||||||
|
"page_range": f"{chunk_start}-{chunk_sections[-1]['estimated_end_page']}",
|
||||||
|
"sections_included": [s["title"] for s in chunk_sections],
|
||||||
|
"estimated_pages": current_chunk_pages,
|
||||||
|
"description": f"Chunk {len(suggestions) + 1}: {chunk_sections[0]['title']}" +
|
||||||
|
(f" + {len(chunk_sections)-1} more sections" if len(chunk_sections) > 1 else "")
|
||||||
|
})
|
||||||
|
|
||||||
|
return suggestions
|
||||||
|
|
||||||
|
|
||||||
def _has_page_break(paragraph) -> bool:
|
def _has_page_break(paragraph) -> bool:
|
||||||
"""Check if a paragraph contains a page break."""
|
"""Check if a paragraph contains a page break."""
|
||||||
try:
|
try:
|
||||||
|
Loading…
x
Reference in New Issue
Block a user