diff --git a/reports/test_results.json b/reports/test_results.json index 45bc471..17f47cb 100644 --- a/reports/test_results.json +++ b/reports/test_results.json @@ -1,9 +1,9 @@ { "metadata": { - "start_time": "2026-01-11T07:15:14.417108", + "start_time": "2026-01-11T09:40:29.164041", "pytest_version": "9.0.2", - "end_time": "2026-01-11T07:15:15.173732", - "duration": 0.7566196918487549, + "end_time": "2026-01-11T09:40:30.048909", + "duration": 0.8848621845245361, "exit_status": 0 }, "summary": { diff --git a/src/mcp_office_tools/mixins/universal.py b/src/mcp_office_tools/mixins/universal.py index 0c4645f..5062da6 100644 --- a/src/mcp_office_tools/mixins/universal.py +++ b/src/mcp_office_tools/mixins/universal.py @@ -353,7 +353,8 @@ class UniversalMixin(MCPMixin): include_images: bool = Field(default=True, description="Index embedded images"), include_chapters: bool = Field(default=True, description="Index chapters/sections (Word docs)"), include_sheets: bool = Field(default=True, description="Index sheets (Excel docs)"), - include_slides: bool = Field(default=True, description="Index slides (PowerPoint docs)") + include_slides: bool = Field(default=True, description="Index slides (PowerPoint docs)"), + text_patterns_only: bool = Field(default=False, description="Ignore heading styles, detect chapters by 'Chapter X' text patterns only") ) -> dict[str, Any]: """Scan document and populate resource store with available content. @@ -415,7 +416,7 @@ class UniversalMixin(MCPMixin): # Index chapters (Word documents) if include_chapters and category == "word": try: - chapters = await self._index_word_chapters(local_path, doc_id) + chapters = await self._index_word_chapters(local_path, doc_id, text_patterns_only) indexed["resources"]["chapter"] = chapters except Exception as e: indexed["resources"]["chapter"] = {"error": str(e)} @@ -444,23 +445,39 @@ class UniversalMixin(MCPMixin): return indexed - async def _index_word_chapters(self, file_path: str, doc_id: str) -> list[dict]: - """Extract and index chapters from a Word document.""" + async def _index_word_chapters(self, file_path: str, doc_id: str, text_patterns_only: bool = False) -> list[dict]: + """Extract and index chapters/sections from a Word document. + + Detection strategy (in order): + 1. Primary: Heading 1 styles (structured, reliable) → section://doc/N + 2. Fallback: "Chapter X" text pattern (books, manuscripts) → chapter://doc/N + + If text_patterns_only=True, skips heading styles and uses only text patterns. + """ import re from docx import Document doc = Document(file_path) chapters = [] - current_chapter = None + current_section = None current_paragraphs = [] - chapter_pattern = re.compile(r'^chapter\s*(\d+)', re.IGNORECASE) + section_num = 0 - def save_chapter(): - nonlocal current_chapter, current_paragraphs - if current_chapter is not None: + # Detection patterns + chapter_pattern = re.compile(r'^chapter\s*(\d+)', re.IGNORECASE) + heading_styles = {'Heading 1', 'Heading1', 'Title', 'Titre', 'Überschrift 1'} + + def is_heading(para) -> bool: + """Check if paragraph is a heading style.""" + style_name = para.style.name if para.style else '' + return style_name in heading_styles or style_name.startswith('Heading 1') + + def save_section(resource_type: str = "chapter"): + nonlocal current_section, current_paragraphs, section_num + if current_section is not None and current_paragraphs: # Convert to markdown markdown_lines = [] - markdown_lines.append(f"# {current_chapter['title']}\n") + markdown_lines.append(f"# {current_section['title']}\n") for para in current_paragraphs: text = para.strip() if text: @@ -469,11 +486,11 @@ class UniversalMixin(MCPMixin): content = "\n".join(markdown_lines) resource = EmbeddedResource( - resource_id=str(current_chapter["number"]), - resource_type="chapter", + resource_id=str(current_section["number"]), + resource_type=resource_type, mime_type="text/markdown", data=content, - name=current_chapter["title"], + name=current_section["title"], metadata={ "word_count": len(content.split()), "paragraph_count": len(current_paragraphs) @@ -482,28 +499,51 @@ class UniversalMixin(MCPMixin): resource_store.store(doc_id, resource, file_path) chapters.append({ - "id": str(current_chapter["number"]), - "title": current_chapter["title"], - "uri": f"chapter://{doc_id}/{current_chapter['number']}", + "id": str(current_section["number"]), + "title": current_section["title"], + "uri": f"{resource_type}://{doc_id}/{current_section['number']}", "word_count": len(content.split()) }) - for para in doc.paragraphs: - text = para.text.strip() - match = chapter_pattern.match(text) + # Primary: detect by Heading 1 styles (structured, reliable) + # Skip if text_patterns_only=True (for messy docs with inconsistent styles) + if not text_patterns_only: + for para in doc.paragraphs: + text = para.text.strip() - if match: - save_chapter() - current_chapter = { - "number": int(match.group(1)), - "title": text[:100] - } - current_paragraphs = [] - elif current_chapter is not None: - current_paragraphs.append(text) + if is_heading(para) and text: + save_section("section") + section_num += 1 + current_section = { + "number": section_num, + "title": text[:100] + } + current_paragraphs = [] + elif current_section is not None: + current_paragraphs.append(text) - # Save last chapter - save_chapter() + save_section("section") + + # Fallback: try "Chapter X" text pattern (for docs without heading styles) + if not chapters: + current_section = None + current_paragraphs = [] + + for para in doc.paragraphs: + text = para.text.strip() + match = chapter_pattern.match(text) + + if match: + save_section("chapter") + current_section = { + "number": int(match.group(1)), + "title": text[:100] + } + current_paragraphs = [] + elif current_section is not None: + current_paragraphs.append(text) + + save_section("chapter") return chapters