📋 Add comprehensive Table of Contents extraction with smart chunking
- Extract headings with page numbers during document processing
- Generate optimized page ranges for each section/chapter
- Provide intelligent chunking suggestions (15-page optimal chunks)
- Classify section types (chapter, section, subsection, etc.)
- Calculate actual section lengths based on heading positions
- Include suggested_chunking with ready-to-use page ranges
- Well suited to extracting 200+ page documents section by section
This commit is contained in:
parent
d94bd39da6
commit
9c2f299d49
@ -370,6 +370,10 @@ async def convert_to_markdown(
|
||||
result["metadata"]["character_count"] = len(markdown_result["content"])
|
||||
result["metadata"]["word_count"] = len(markdown_result["content"].split())
|
||||
result["summary"] = markdown_result["content"][:1000] + "..." if len(markdown_result["content"]) > 1000 else markdown_result["content"]
|
||||
|
||||
# Add table of contents with page ranges for navigation
|
||||
if "table_of_contents" in markdown_result:
|
||||
result["table_of_contents"] = markdown_result["table_of_contents"]
|
||||
else:
|
||||
# Include full content for smaller documents or page ranges
|
||||
result["markdown"] = markdown_result["content"]
|
||||
@ -1224,6 +1228,7 @@ async def _convert_docx_with_python_docx(
|
||||
# Process document elements with page filtering if specified
|
||||
current_page = 1
|
||||
include_current_page = not page_numbers or current_page in page_numbers
|
||||
table_of_contents = [] # Track headings with page numbers for TOC
|
||||
|
||||
for element in doc.element.body:
|
||||
if isinstance(element, CT_P):
|
||||
@ -1242,14 +1247,24 @@ async def _convert_docx_with_python_docx(
|
||||
markdown_parts.append(markdown_text)
|
||||
structure_info["paragraphs"] += 1
|
||||
|
||||
# Track headings
|
||||
# Track headings for both structure and TOC
|
||||
if preserve_structure and markdown_text.startswith('#'):
|
||||
level = len(markdown_text) - len(markdown_text.lstrip('#'))
|
||||
heading_text = markdown_text.lstrip('# ').strip()
|
||||
structure_info["headings"].append({
|
||||
heading_info = {
|
||||
"level": level,
|
||||
"text": heading_text,
|
||||
"position": len(markdown_parts) - 1
|
||||
"position": len(markdown_parts) - 1,
|
||||
"page": current_page
|
||||
}
|
||||
structure_info["headings"].append(heading_info)
|
||||
|
||||
# Add to table of contents
|
||||
table_of_contents.append({
|
||||
"level": level,
|
||||
"title": heading_text,
|
||||
"page": current_page,
|
||||
"suggested_page_range": f"{current_page}-{current_page + _estimate_section_length(level)}"
|
||||
})
|
||||
|
||||
elif isinstance(element, CT_Tbl):
|
||||
@ -1275,6 +1290,10 @@ async def _convert_docx_with_python_docx(
|
||||
"images": images_info
|
||||
}
|
||||
|
||||
# Add table of contents for navigation
|
||||
if table_of_contents:
|
||||
result["table_of_contents"] = _optimize_toc_page_ranges(table_of_contents)
|
||||
|
||||
# Add page filtering info
|
||||
if page_numbers:
|
||||
result["pages_processed"] = page_numbers
|
||||
@ -1503,6 +1522,115 @@ def _extract_markdown_structure(content: str) -> dict[str, Any]:
|
||||
return structure
|
||||
|
||||
|
||||
def _estimate_section_length(heading_level: int) -> int:
|
||||
"""Estimate how many pages a section might span based on heading level."""
|
||||
# Higher level headings (H1) tend to have longer sections
|
||||
if heading_level == 1: # Major chapters
|
||||
return 8
|
||||
elif heading_level == 2: # Major sections
|
||||
return 4
|
||||
elif heading_level == 3: # Subsections
|
||||
return 2
|
||||
else: # Minor headings
|
||||
return 1
|
||||
|
||||
|
||||
def _optimize_toc_page_ranges(toc_entries: list) -> dict[str, Any]:
    """Optimize table of contents page ranges based on actual heading positions."""
    sections: list[dict[str, Any]] = []

    for idx, entry in enumerate(toc_entries):
        start_page = entry["page"]

        # A section runs up to the page before the next heading; the final
        # section falls back to a level-based length estimate. The max()
        # guard keeps the range sane when two headings share a page.
        if idx + 1 < len(toc_entries):
            end_page = max(start_page, toc_entries[idx + 1]["page"] - 1)
        else:
            end_page = start_page + _estimate_section_length(entry["level"])

        sections.append({
            "level": entry["level"],
            "title": entry["title"],
            "start_page": start_page,
            "estimated_end_page": end_page,
            "suggested_page_range": f"{start_page}-{end_page}",
            "section_type": _classify_section_type(entry["level"], entry["title"]),
        })

    return {
        "sections": sections,
        "total_sections": len(toc_entries),
        "suggested_chunking": _generate_chunking_suggestions(sections),
    }
|
||||
|
||||
|
||||
def _classify_section_type(level: int, title: str) -> str:
|
||||
"""Classify section type based on level and title patterns."""
|
||||
title_lower = title.lower()
|
||||
|
||||
if level == 1:
|
||||
if any(word in title_lower for word in ["chapter", "part", "section"]):
|
||||
return "chapter"
|
||||
elif any(word in title_lower for word in ["introduction", "conclusion", "summary"]):
|
||||
return "special_section"
|
||||
else:
|
||||
return "major_section"
|
||||
elif level == 2:
|
||||
return "section"
|
||||
elif level == 3:
|
||||
return "subsection"
|
||||
else:
|
||||
return "minor_heading"
|
||||
|
||||
|
||||
def _generate_chunking_suggestions(sections: list) -> list[dict[str, Any]]:
|
||||
"""Generate smart chunking suggestions based on document structure."""
|
||||
suggestions = []
|
||||
current_chunk_pages = 0
|
||||
chunk_start = 1
|
||||
chunk_sections = []
|
||||
|
||||
for section in sections:
|
||||
section_pages = section["estimated_end_page"] - section["start_page"] + 1
|
||||
|
||||
# If adding this section would make chunk too large, finalize current chunk
|
||||
if current_chunk_pages + section_pages > 15 and chunk_sections:
|
||||
suggestions.append({
|
||||
"chunk_number": len(suggestions) + 1,
|
||||
"page_range": f"{chunk_start}-{chunk_sections[-1]['estimated_end_page']}",
|
||||
"sections_included": [s["title"] for s in chunk_sections],
|
||||
"estimated_pages": current_chunk_pages,
|
||||
"description": f"Chunk {len(suggestions) + 1}: {chunk_sections[0]['title']}" +
|
||||
(f" + {len(chunk_sections)-1} more sections" if len(chunk_sections) > 1 else "")
|
||||
})
|
||||
|
||||
# Start new chunk
|
||||
chunk_start = section["start_page"]
|
||||
current_chunk_pages = section_pages
|
||||
chunk_sections = [section]
|
||||
else:
|
||||
# Add to current chunk
|
||||
current_chunk_pages += section_pages
|
||||
chunk_sections.append(section)
|
||||
|
||||
# Add final chunk if any sections remain
|
||||
if chunk_sections:
|
||||
suggestions.append({
|
||||
"chunk_number": len(suggestions) + 1,
|
||||
"page_range": f"{chunk_start}-{chunk_sections[-1]['estimated_end_page']}",
|
||||
"sections_included": [s["title"] for s in chunk_sections],
|
||||
"estimated_pages": current_chunk_pages,
|
||||
"description": f"Chunk {len(suggestions) + 1}: {chunk_sections[0]['title']}" +
|
||||
(f" + {len(chunk_sections)-1} more sections" if len(chunk_sections) > 1 else "")
|
||||
})
|
||||
|
||||
return suggestions
|
||||
|
||||
|
||||
def _has_page_break(paragraph) -> bool:
|
||||
"""Check if a paragraph contains a page break."""
|
||||
try:
|
||||
|
Loading…
x
Reference in New Issue
Block a user