📋 Add comprehensive Table of Contents extraction with smart chunking
- Extract headings with page numbers during document processing
- Generate optimized page ranges for each section/chapter
- Provide intelligent chunking suggestions (15-page optimal chunks)
- Classify section types (chapter, section, subsection, etc.)
- Calculate actual section lengths based on heading positions
- Include suggested_chunking with ready-to-use page ranges
- Well suited to extracting 200+ page documents section by section
This commit is contained in:
parent
d94bd39da6
commit
9c2f299d49
@ -370,6 +370,10 @@ async def convert_to_markdown(
|
||||
result["metadata"]["character_count"] = len(markdown_result["content"])
|
||||
result["metadata"]["word_count"] = len(markdown_result["content"].split())
|
||||
result["summary"] = markdown_result["content"][:1000] + "..." if len(markdown_result["content"]) > 1000 else markdown_result["content"]
|
||||
|
||||
# Add table of contents with page ranges for navigation
|
||||
if "table_of_contents" in markdown_result:
|
||||
result["table_of_contents"] = markdown_result["table_of_contents"]
|
||||
else:
|
||||
# Include full content for smaller documents or page ranges
|
||||
result["markdown"] = markdown_result["content"]
|
||||
@ -1224,6 +1228,7 @@ async def _convert_docx_with_python_docx(
|
||||
# Process document elements with page filtering if specified
|
||||
current_page = 1
|
||||
include_current_page = not page_numbers or current_page in page_numbers
|
||||
table_of_contents = [] # Track headings with page numbers for TOC
|
||||
|
||||
for element in doc.element.body:
|
||||
if isinstance(element, CT_P):
|
||||
@ -1242,14 +1247,24 @@ async def _convert_docx_with_python_docx(
|
||||
markdown_parts.append(markdown_text)
|
||||
structure_info["paragraphs"] += 1
|
||||
|
||||
# Track headings
|
||||
# Track headings for both structure and TOC
|
||||
if preserve_structure and markdown_text.startswith('#'):
|
||||
level = len(markdown_text) - len(markdown_text.lstrip('#'))
|
||||
heading_text = markdown_text.lstrip('# ').strip()
|
||||
structure_info["headings"].append({
|
||||
heading_info = {
|
||||
"level": level,
|
||||
"text": heading_text,
|
||||
"position": len(markdown_parts) - 1
|
||||
"position": len(markdown_parts) - 1,
|
||||
"page": current_page
|
||||
}
|
||||
structure_info["headings"].append(heading_info)
|
||||
|
||||
# Add to table of contents
|
||||
table_of_contents.append({
|
||||
"level": level,
|
||||
"title": heading_text,
|
||||
"page": current_page,
|
||||
"suggested_page_range": f"{current_page}-{current_page + _estimate_section_length(level)}"
|
||||
})
|
||||
|
||||
elif isinstance(element, CT_Tbl):
|
||||
@ -1275,6 +1290,10 @@ async def _convert_docx_with_python_docx(
|
||||
"images": images_info
|
||||
}
|
||||
|
||||
# Add table of contents for navigation
|
||||
if table_of_contents:
|
||||
result["table_of_contents"] = _optimize_toc_page_ranges(table_of_contents)
|
||||
|
||||
# Add page filtering info
|
||||
if page_numbers:
|
||||
result["pages_processed"] = page_numbers
|
||||
@ -1503,6 +1522,115 @@ def _extract_markdown_structure(content: str) -> dict[str, Any]:
|
||||
return structure
|
||||
|
||||
|
||||
def _estimate_section_length(heading_level: int) -> int:
|
||||
"""Estimate how many pages a section might span based on heading level."""
|
||||
# Higher level headings (H1) tend to have longer sections
|
||||
if heading_level == 1: # Major chapters
|
||||
return 8
|
||||
elif heading_level == 2: # Major sections
|
||||
return 4
|
||||
elif heading_level == 3: # Subsections
|
||||
return 2
|
||||
else: # Minor headings
|
||||
return 1
|
||||
|
||||
|
||||
def _optimize_toc_page_ranges(toc_entries: list) -> dict[str, Any]:
    """Optimize table of contents page ranges based on actual heading positions."""
    sections: list[dict[str, Any]] = []

    for idx, entry in enumerate(toc_entries):
        start_page = entry["page"]

        # A section runs up to the page before the next heading; the final
        # section falls back to a level-based length estimate. The max()
        # guard keeps the range sane when two headings share a page.
        if idx + 1 < len(toc_entries):
            end_page = max(start_page, toc_entries[idx + 1]["page"] - 1)
        else:
            end_page = start_page + _estimate_section_length(entry["level"])

        sections.append({
            "level": entry["level"],
            "title": entry["title"],
            "start_page": start_page,
            "estimated_end_page": end_page,
            "suggested_page_range": f"{start_page}-{end_page}",
            "section_type": _classify_section_type(entry["level"], entry["title"]),
        })

    return {
        "sections": sections,
        "total_sections": len(toc_entries),
        "suggested_chunking": _generate_chunking_suggestions(sections),
    }
|
||||
|
||||
|
||||
def _classify_section_type(level: int, title: str) -> str:
|
||||
"""Classify section type based on level and title patterns."""
|
||||
title_lower = title.lower()
|
||||
|
||||
if level == 1:
|
||||
if any(word in title_lower for word in ["chapter", "part", "section"]):
|
||||
return "chapter"
|
||||
elif any(word in title_lower for word in ["introduction", "conclusion", "summary"]):
|
||||
return "special_section"
|
||||
else:
|
||||
return "major_section"
|
||||
elif level == 2:
|
||||
return "section"
|
||||
elif level == 3:
|
||||
return "subsection"
|
||||
else:
|
||||
return "minor_heading"
|
||||
|
||||
|
||||
def _generate_chunking_suggestions(sections: list) -> list[dict[str, Any]]:
|
||||
"""Generate smart chunking suggestions based on document structure."""
|
||||
suggestions = []
|
||||
current_chunk_pages = 0
|
||||
chunk_start = 1
|
||||
chunk_sections = []
|
||||
|
||||
for section in sections:
|
||||
section_pages = section["estimated_end_page"] - section["start_page"] + 1
|
||||
|
||||
# If adding this section would make chunk too large, finalize current chunk
|
||||
if current_chunk_pages + section_pages > 15 and chunk_sections:
|
||||
suggestions.append({
|
||||
"chunk_number": len(suggestions) + 1,
|
||||
"page_range": f"{chunk_start}-{chunk_sections[-1]['estimated_end_page']}",
|
||||
"sections_included": [s["title"] for s in chunk_sections],
|
||||
"estimated_pages": current_chunk_pages,
|
||||
"description": f"Chunk {len(suggestions) + 1}: {chunk_sections[0]['title']}" +
|
||||
(f" + {len(chunk_sections)-1} more sections" if len(chunk_sections) > 1 else "")
|
||||
})
|
||||
|
||||
# Start new chunk
|
||||
chunk_start = section["start_page"]
|
||||
current_chunk_pages = section_pages
|
||||
chunk_sections = [section]
|
||||
else:
|
||||
# Add to current chunk
|
||||
current_chunk_pages += section_pages
|
||||
chunk_sections.append(section)
|
||||
|
||||
# Add final chunk if any sections remain
|
||||
if chunk_sections:
|
||||
suggestions.append({
|
||||
"chunk_number": len(suggestions) + 1,
|
||||
"page_range": f"{chunk_start}-{chunk_sections[-1]['estimated_end_page']}",
|
||||
"sections_included": [s["title"] for s in chunk_sections],
|
||||
"estimated_pages": current_chunk_pages,
|
||||
"description": f"Chunk {len(suggestions) + 1}: {chunk_sections[0]['title']}" +
|
||||
(f" + {len(chunk_sections)-1} more sections" if len(chunk_sections) > 1 else "")
|
||||
})
|
||||
|
||||
return suggestions
|
||||
|
||||
|
||||
def _has_page_break(paragraph) -> bool:
|
||||
"""Check if a paragraph contains a page break."""
|
||||
try:
|
||||
|
Loading…
x
Reference in New Issue
Block a user