📋 Add comprehensive Table of Contents extraction with smart chunking

- Extract headings with page numbers during document processing
- Generate optimized page ranges for each section/chapter
- Provide intelligent chunking suggestions (15-page optimal chunks)
- Classify section types (chapter, section, subsection, etc.)
- Calculate actual section lengths based on heading positions
- Include suggested_chunking with ready-to-use page ranges
- Perfect for extracting 200+ page documents section by section
2025-08-21 02:47:01 -06:00 · 2025-08-21 02:47:01 -06:00 · 9c2f299d49
commit 9c2f299d49
parent d94bd39da6
1 changed files with 131 additions and 3 deletions
--- a/src/mcp_office_tools/server.py
+++ b/src/mcp_office_tools/server.py
@ -370,6 +370,10 @@ async def convert_to_markdown(
            result["metadata"]["character_count"] = len(markdown_result["content"])
            result["metadata"]["word_count"] = len(markdown_result["content"].split())
            result["summary"] = markdown_result["content"][:1000] + "..." if len(markdown_result["content"]) > 1000 else markdown_result["content"]
            # Add table of contents with page ranges for navigation
            if "table_of_contents" in markdown_result:
                result["table_of_contents"] = markdown_result["table_of_contents"]
        else:
            # Include full content for smaller documents or page ranges
            result["markdown"] = markdown_result["content"]
@ -1224,6 +1228,7 @@ async def _convert_docx_with_python_docx(
    # Process document elements with page filtering if specified
    current_page = 1
    include_current_page = not page_numbers or current_page in page_numbers
    table_of_contents = []  # Track headings with page numbers for TOC
    for element in doc.element.body:
        if isinstance(element, CT_P):
@ -1242,14 +1247,24 @@ async def _convert_docx_with_python_docx(
                    markdown_parts.append(markdown_text)
                    structure_info["paragraphs"] += 1
-                    # Track headings
+                    # Track headings for both structure and TOC
                    if preserve_structure and markdown_text.startswith('#'):
                        level = len(markdown_text) - len(markdown_text.lstrip('#'))
                        heading_text = markdown_text.lstrip('# ').strip()
-                        structure_info["headings"].append({
+                        heading_info = {
                            "level": level,
                            "text": heading_text,
-                            "position": len(markdown_parts) - 1
+                            "position": len(markdown_parts) - 1,
                            "page": current_page
                        }
                        structure_info["headings"].append(heading_info)
                        # Add to table of contents
                        table_of_contents.append({
                            "level": level,
                            "title": heading_text,
                            "page": current_page,
                            "suggested_page_range": f"{current_page}-{current_page + _estimate_section_length(level)}"
                        })
        elif isinstance(element, CT_Tbl):
@ -1275,6 +1290,10 @@ async def _convert_docx_with_python_docx(
        "images": images_info
    }
    # Add table of contents for navigation
    if table_of_contents:
        result["table_of_contents"] = _optimize_toc_page_ranges(table_of_contents)
    # Add page filtering info
    if page_numbers:
        result["pages_processed"] = page_numbers
@ -1503,6 +1522,115 @@ def _extract_markdown_structure(content: str) -> dict[str, Any]:
    return structure
 def _estimate_section_length(heading_level: int) -> int:
    """Estimate how many pages a section might span based on heading level."""
    # Higher level headings (H1) tend to have longer sections
    if heading_level == 1:  # Major chapters
        return 8
    elif heading_level == 2:  # Major sections
        return 4
    elif heading_level == 3:  # Subsections
        return 2
    else:  # Minor headings
        return 1
 def _optimize_toc_page_ranges(toc_entries: list) -> dict[str, Any]:
    """Turn raw heading entries into a table of contents with page ranges.

    Each entry in ``toc_entries`` is read for its "level", "title" and "page"
    keys. The returned dict has three keys:

    * "sections": one dict per entry with the start page, the computed end
      page, a "start-end" range string and a section type.
    * "total_sections": the number of entries received.
    * "suggested_chunking": the result of ``_generate_chunking_suggestions``
      applied to the sections list.
    """
    entry_count = len(toc_entries)
    sections = []

    for index, heading in enumerate(toc_entries):
        first_page = heading["page"]
        if index + 1 < entry_count:
            # A section stops on the page before the following heading
            # (whatever its level), but never before its own start page.
            last_page = max(first_page, toc_entries[index + 1]["page"] - 1)
        else:
            # No following heading to measure against: fall back to the
            # level-based length estimate.
            last_page = first_page + _estimate_section_length(heading["level"])

        sections.append({
            "level": heading["level"],
            "title": heading["title"],
            "start_page": first_page,
            "estimated_end_page": last_page,
            "suggested_page_range": f"{first_page}-{last_page}",
            "section_type": _classify_section_type(heading["level"], heading["title"]),
        })

    return {
        "sections": sections,
        "total_sections": entry_count,
        "suggested_chunking": _generate_chunking_suggestions(sections),
    }
 def _classify_section_type(level: int, title: str) -> str:
    """Classify section type based on level and title patterns."""
    title_lower = title.lower()
    if level == 1:
        if any(word in title_lower for word in ["chapter", "part", "section"]):
            return "chapter"
        elif any(word in title_lower for word in ["introduction", "conclusion", "summary"]):
            return "special_section"
        else:
            return "major_section"
    elif level == 2:
        return "section"
    elif level == 3:
        return "subsection"
    else:
        return "minor_heading"
 def _generate_chunking_suggestions(sections: list) -> list[dict[str, Any]]:
    """Generate smart chunking suggestions based on document structure."""
    suggestions = []
    current_chunk_pages = 0
    chunk_start = 1
    chunk_sections = []
    for section in sections:
        section_pages = section["estimated_end_page"] - section["start_page"] + 1
        # If adding this section would make chunk too large, finalize current chunk
        if current_chunk_pages + section_pages > 15 and chunk_sections:
            suggestions.append({
                "chunk_number": len(suggestions) + 1,
                "page_range": f"{chunk_start}-{chunk_sections[-1]['estimated_end_page']}",
                "sections_included": [s["title"] for s in chunk_sections],
                "estimated_pages": current_chunk_pages,
                "description": f"Chunk {len(suggestions) + 1}: {chunk_sections[0]['title']}" + 
                              (f" + {len(chunk_sections)-1} more sections" if len(chunk_sections) > 1 else "")
            })
            # Start new chunk
            chunk_start = section["start_page"]
            current_chunk_pages = section_pages
            chunk_sections = [section]
        else:
            # Add to current chunk
            current_chunk_pages += section_pages
            chunk_sections.append(section)
    # Add final chunk if any sections remain
    if chunk_sections:
        suggestions.append({
            "chunk_number": len(suggestions) + 1,
            "page_range": f"{chunk_start}-{chunk_sections[-1]['estimated_end_page']}",
            "sections_included": [s["title"] for s in chunk_sections],
            "estimated_pages": current_chunk_pages,
            "description": f"Chunk {len(suggestions) + 1}: {chunk_sections[0]['title']}" + 
                          (f" + {len(chunk_sections)-1} more sections" if len(chunk_sections) > 1 else "")
        })
    return suggestions
 def _has_page_break(paragraph) -> bool:
    """Check if a paragraph contains a page break."""
    try: