diff --git a/src/mcp_office_tools/server.py b/src/mcp_office_tools/server.py
index 83fe70c..80ba6b1 100644
--- a/src/mcp_office_tools/server.py
+++ b/src/mcp_office_tools/server.py
@@ -370,6 +370,10 @@ async def convert_to_markdown(
             result["metadata"]["character_count"] = len(markdown_result["content"])
             result["metadata"]["word_count"] = len(markdown_result["content"].split())
             result["summary"] = markdown_result["content"][:1000] + "..." if len(markdown_result["content"]) > 1000 else markdown_result["content"]
+
+            # Add table of contents with page ranges for navigation
+            if "table_of_contents" in markdown_result:
+                result["table_of_contents"] = markdown_result["table_of_contents"]
         else:
             # Include full content for smaller documents or page ranges
             result["markdown"] = markdown_result["content"]
@@ -1224,6 +1228,7 @@ async def _convert_docx_with_python_docx(
     # Process document elements with page filtering if specified
     current_page = 1
     include_current_page = not page_numbers or current_page in page_numbers
+    table_of_contents = []  # Track headings with page numbers for TOC
 
     for element in doc.element.body:
         if isinstance(element, CT_P):
@@ -1242,14 +1247,24 @@
                markdown_parts.append(markdown_text)
                structure_info["paragraphs"] += 1
 
-               # Track headings
+               # Track headings for both structure and TOC
               if preserve_structure and markdown_text.startswith('#'):
                    level = len(markdown_text) - len(markdown_text.lstrip('#'))
                    heading_text = markdown_text.lstrip('# ').strip()
-                   structure_info["headings"].append({
+                   heading_info = {
                        "level": level,
                        "text": heading_text,
-                       "position": len(markdown_parts) - 1
+                       "position": len(markdown_parts) - 1,
+                       "page": current_page
+                   }
+                   structure_info["headings"].append(heading_info)
+
+                   # Add to table of contents
+                   table_of_contents.append({
+                       "level": level,
+                       "title": heading_text,
+                       "page": current_page,
+                       "suggested_page_range": f"{current_page}-{current_page + _estimate_section_length(level)}"
                    })
 
        elif isinstance(element, CT_Tbl):
@@ -1275,6 +1290,10 @@
         "images": images_info
     }
 
+    # Add table of contents for navigation
+    if table_of_contents:
+        result["table_of_contents"] = _optimize_toc_page_ranges(table_of_contents)
+
     # Add page filtering info
     if page_numbers:
         result["pages_processed"] = page_numbers
@@ -1503,6 +1522,115 @@ def _extract_markdown_structure(content: str) -> dict[str, Any]:
     return structure
 
 
+def _estimate_section_length(heading_level: int) -> int:
+    """Estimate how many pages a section might span based on heading level."""
+    # Higher level headings (H1) tend to have longer sections
+    if heading_level == 1:  # Major chapters
+        return 8
+    elif heading_level == 2:  # Major sections
+        return 4
+    elif heading_level == 3:  # Subsections
+        return 2
+    else:  # Minor headings
+        return 1
+
+
+def _optimize_toc_page_ranges(toc_entries: list) -> dict[str, Any]:
+    """Optimize table of contents page ranges based on actual heading positions."""
+    optimized_toc = {
+        "sections": [],
+        "total_sections": len(toc_entries),
+        "suggested_chunking": []
+    }
+
+    for i, entry in enumerate(toc_entries):
+        # Calculate actual end page based on next heading or document end
+        if i + 1 < len(toc_entries):
+            next_page = toc_entries[i + 1]["page"]
+            actual_end_page = max(entry["page"], next_page - 1)
+        else:
+            # Last section - use estimated length
+            actual_end_page = entry["page"] + _estimate_section_length(entry["level"])
+
+        optimized_entry = {
+            "level": entry["level"],
+            "title": entry["title"],
+            "start_page": entry["page"],
+            "estimated_end_page": actual_end_page,
+            "suggested_page_range": f"{entry['page']}-{actual_end_page}",
+            "section_type": _classify_section_type(entry["level"], entry["title"])
+        }
+        optimized_toc["sections"].append(optimized_entry)
+
+    # Generate chunking suggestions
+    optimized_toc["suggested_chunking"] = _generate_chunking_suggestions(optimized_toc["sections"])
+
+    return optimized_toc
+
+
+def _classify_section_type(level: int, title: str) -> str:
+    """Classify section type based on level and title patterns."""
+    title_lower = title.lower()
+
+    if level == 1:
+        if any(word in title_lower for word in ["chapter", "part", "section"]):
+            return "chapter"
+        elif any(word in title_lower for word in ["introduction", "conclusion", "summary"]):
+            return "special_section"
+        else:
+            return "major_section"
+    elif level == 2:
+        return "section"
+    elif level == 3:
+        return "subsection"
+    else:
+        return "minor_heading"
+
+
+def _generate_chunking_suggestions(sections: list) -> list[dict[str, Any]]:
+    """Generate smart chunking suggestions based on document structure."""
+    suggestions = []
+    current_chunk_pages = 0
+    chunk_start = 1
+    chunk_sections = []
+
+    for section in sections:
+        section_pages = section["estimated_end_page"] - section["start_page"] + 1
+
+        # If adding this section would make chunk too large, finalize current chunk
+        if current_chunk_pages + section_pages > 15 and chunk_sections:
+            suggestions.append({
+                "chunk_number": len(suggestions) + 1,
+                "page_range": f"{chunk_start}-{chunk_sections[-1]['estimated_end_page']}",
+                "sections_included": [s["title"] for s in chunk_sections],
+                "estimated_pages": current_chunk_pages,
+                "description": f"Chunk {len(suggestions) + 1}: {chunk_sections[0]['title']}" +
+                               (f" + {len(chunk_sections)-1} more sections" if len(chunk_sections) > 1 else "")
+            })
+
+            # Start new chunk
+            chunk_start = section["start_page"]
+            current_chunk_pages = section_pages
+            chunk_sections = [section]
+        else:
+            # Add to current chunk
+            current_chunk_pages += section_pages
+            chunk_sections.append(section)
+
+    # Add final chunk if any sections remain
+    if chunk_sections:
+        suggestions.append({
+            "chunk_number": len(suggestions) + 1,
+            "page_range": f"{chunk_start}-{chunk_sections[-1]['estimated_end_page']}",
+            "sections_included": [s["title"] for s in chunk_sections],
+            "estimated_pages": current_chunk_pages,
+            "description": f"Chunk {len(suggestions) + 1}: {chunk_sections[0]['title']}" +
+                           (f" + {len(chunk_sections)-1} more sections" if len(chunk_sections) > 1 else "")
+        })
+
+    return suggestions
+
+
 def _has_page_break(paragraph) -> bool:
     """Check if a paragraph contains a page break."""
     try: