📋 Add comprehensive Table of Contents extraction with smart chunking

- Extract headings with page numbers during document processing
- Generate optimized page ranges for each section/chapter
- Provide intelligent chunking suggestions (15-page optimal chunks)
- Classify section types (chapter, section, subsection, etc.)
- Calculate actual section lengths based on heading positions
- Include suggested_chunking with ready-to-use page ranges
- Perfect for extracting 200+ page documents section by section
This commit is contained in:
Ryan Malloy 2025-08-21 02:47:01 -06:00
parent d94bd39da6
commit 9c2f299d49

View File

@ -370,6 +370,10 @@ async def convert_to_markdown(
result["metadata"]["character_count"] = len(markdown_result["content"]) result["metadata"]["character_count"] = len(markdown_result["content"])
result["metadata"]["word_count"] = len(markdown_result["content"].split()) result["metadata"]["word_count"] = len(markdown_result["content"].split())
result["summary"] = markdown_result["content"][:1000] + "..." if len(markdown_result["content"]) > 1000 else markdown_result["content"] result["summary"] = markdown_result["content"][:1000] + "..." if len(markdown_result["content"]) > 1000 else markdown_result["content"]
# Add table of contents with page ranges for navigation
if "table_of_contents" in markdown_result:
result["table_of_contents"] = markdown_result["table_of_contents"]
else: else:
# Include full content for smaller documents or page ranges # Include full content for smaller documents or page ranges
result["markdown"] = markdown_result["content"] result["markdown"] = markdown_result["content"]
@ -1224,6 +1228,7 @@ async def _convert_docx_with_python_docx(
# Process document elements with page filtering if specified # Process document elements with page filtering if specified
current_page = 1 current_page = 1
include_current_page = not page_numbers or current_page in page_numbers include_current_page = not page_numbers or current_page in page_numbers
table_of_contents = [] # Track headings with page numbers for TOC
for element in doc.element.body: for element in doc.element.body:
if isinstance(element, CT_P): if isinstance(element, CT_P):
@ -1242,14 +1247,24 @@ async def _convert_docx_with_python_docx(
markdown_parts.append(markdown_text) markdown_parts.append(markdown_text)
structure_info["paragraphs"] += 1 structure_info["paragraphs"] += 1
# Track headings # Track headings for both structure and TOC
if preserve_structure and markdown_text.startswith('#'): if preserve_structure and markdown_text.startswith('#'):
level = len(markdown_text) - len(markdown_text.lstrip('#')) level = len(markdown_text) - len(markdown_text.lstrip('#'))
heading_text = markdown_text.lstrip('# ').strip() heading_text = markdown_text.lstrip('# ').strip()
structure_info["headings"].append({ heading_info = {
"level": level, "level": level,
"text": heading_text, "text": heading_text,
"position": len(markdown_parts) - 1 "position": len(markdown_parts) - 1,
"page": current_page
}
structure_info["headings"].append(heading_info)
# Add to table of contents
table_of_contents.append({
"level": level,
"title": heading_text,
"page": current_page,
"suggested_page_range": f"{current_page}-{current_page + _estimate_section_length(level)}"
}) })
elif isinstance(element, CT_Tbl): elif isinstance(element, CT_Tbl):
@ -1275,6 +1290,10 @@ async def _convert_docx_with_python_docx(
"images": images_info "images": images_info
} }
# Add table of contents for navigation
if table_of_contents:
result["table_of_contents"] = _optimize_toc_page_ranges(table_of_contents)
# Add page filtering info # Add page filtering info
if page_numbers: if page_numbers:
result["pages_processed"] = page_numbers result["pages_processed"] = page_numbers
@ -1503,6 +1522,115 @@ def _extract_markdown_structure(content: str) -> dict[str, Any]:
return structure return structure
def _estimate_section_length(heading_level: int) -> int:
"""Estimate how many pages a section might span based on heading level."""
# Higher level headings (H1) tend to have longer sections
if heading_level == 1: # Major chapters
return 8
elif heading_level == 2: # Major sections
return 4
elif heading_level == 3: # Subsections
return 2
else: # Minor headings
return 1
def _optimize_toc_page_ranges(toc_entries: list) -> dict[str, Any]:
    """Build a table of contents whose page ranges follow real heading positions.

    Each entry in ``toc_entries`` is read for its ``"level"``, ``"title"``
    and ``"page"`` keys. A section ends on the page before the next
    heading (never earlier than its own start page); the final section has
    no successor, so its end is the start page plus the level-based
    estimate from ``_estimate_section_length``.

    Returns a dict with ``"sections"`` (one record per heading),
    ``"total_sections"`` and ``"suggested_chunking"`` (the result of
    ``_generate_chunking_suggestions`` over those sections).
    """
    total = len(toc_entries)
    sections = []

    for index, heading in enumerate(toc_entries):
        start_page = heading["page"]
        following = index + 1

        if following >= total:
            # No later heading to bound this section: fall back to the estimate.
            end_page = start_page + _estimate_section_length(heading["level"])
        else:
            # Stop just before the next heading, but never before our own start
            # (two headings can share a page).
            end_page = max(start_page, toc_entries[following]["page"] - 1)

        sections.append({
            "level": heading["level"],
            "title": heading["title"],
            "start_page": start_page,
            "estimated_end_page": end_page,
            "suggested_page_range": f"{start_page}-{end_page}",
            "section_type": _classify_section_type(heading["level"], heading["title"]),
        })

    return {
        "sections": sections,
        "total_sections": total,
        "suggested_chunking": _generate_chunking_suggestions(sections),
    }
def _classify_section_type(level: int, title: str) -> str:
"""Classify section type based on level and title patterns."""
title_lower = title.lower()
if level == 1:
if any(word in title_lower for word in ["chapter", "part", "section"]):
return "chapter"
elif any(word in title_lower for word in ["introduction", "conclusion", "summary"]):
return "special_section"
else:
return "major_section"
elif level == 2:
return "section"
elif level == 3:
return "subsection"
else:
return "minor_heading"
def _generate_chunking_suggestions(sections: list) -> list[dict[str, Any]]:
"""Generate smart chunking suggestions based on document structure."""
suggestions = []
current_chunk_pages = 0
chunk_start = 1
chunk_sections = []
for section in sections:
section_pages = section["estimated_end_page"] - section["start_page"] + 1
# If adding this section would make chunk too large, finalize current chunk
if current_chunk_pages + section_pages > 15 and chunk_sections:
suggestions.append({
"chunk_number": len(suggestions) + 1,
"page_range": f"{chunk_start}-{chunk_sections[-1]['estimated_end_page']}",
"sections_included": [s["title"] for s in chunk_sections],
"estimated_pages": current_chunk_pages,
"description": f"Chunk {len(suggestions) + 1}: {chunk_sections[0]['title']}" +
(f" + {len(chunk_sections)-1} more sections" if len(chunk_sections) > 1 else "")
})
# Start new chunk
chunk_start = section["start_page"]
current_chunk_pages = section_pages
chunk_sections = [section]
else:
# Add to current chunk
current_chunk_pages += section_pages
chunk_sections.append(section)
# Add final chunk if any sections remain
if chunk_sections:
suggestions.append({
"chunk_number": len(suggestions) + 1,
"page_range": f"{chunk_start}-{chunk_sections[-1]['estimated_end_page']}",
"sections_included": [s["title"] for s in chunk_sections],
"estimated_pages": current_chunk_pages,
"description": f"Chunk {len(suggestions) + 1}: {chunk_sections[0]['title']}" +
(f" + {len(chunk_sections)-1} more sections" if len(chunk_sections) > 1 else "")
})
return suggestions
def _has_page_break(paragraph) -> bool: def _has_page_break(paragraph) -> bool:
"""Check if a paragraph contains a page break.""" """Check if a paragraph contains a page break."""
try: try: