Improve section detection with heading styles + fallback
Some checks are pending
Test Dashboard / test-and-dashboard (push) Waiting to run
Some checks are pending
Test Dashboard / test-and-dashboard (push) Waiting to run
- Primary: Detect sections via Heading 1 styles (structured)
- Fallback: Detect chapters via "Chapter X" text patterns
- Add text_patterns_only flag to skip heading styles (for messy docs)

This handles both well-structured business documents (manuals, PRDs) and narrative content (books with explicit chapter headings).
This commit is contained in:
parent
d569034fa3
commit
89ad0c849d
@ -1,9 +1,9 @@
|
||||
{
|
||||
"metadata": {
|
||||
"start_time": "2026-01-11T07:15:14.417108",
|
||||
"start_time": "2026-01-11T09:40:29.164041",
|
||||
"pytest_version": "9.0.2",
|
||||
"end_time": "2026-01-11T07:15:15.173732",
|
||||
"duration": 0.7566196918487549,
|
||||
"end_time": "2026-01-11T09:40:30.048909",
|
||||
"duration": 0.8848621845245361,
|
||||
"exit_status": 0
|
||||
},
|
||||
"summary": {
|
||||
|
||||
@ -353,7 +353,8 @@ class UniversalMixin(MCPMixin):
|
||||
include_images: bool = Field(default=True, description="Index embedded images"),
|
||||
include_chapters: bool = Field(default=True, description="Index chapters/sections (Word docs)"),
|
||||
include_sheets: bool = Field(default=True, description="Index sheets (Excel docs)"),
|
||||
include_slides: bool = Field(default=True, description="Index slides (PowerPoint docs)")
|
||||
include_slides: bool = Field(default=True, description="Index slides (PowerPoint docs)"),
|
||||
text_patterns_only: bool = Field(default=False, description="Ignore heading styles, detect chapters by 'Chapter X' text patterns only")
|
||||
) -> dict[str, Any]:
|
||||
"""Scan document and populate resource store with available content.
|
||||
|
||||
@ -415,7 +416,7 @@ class UniversalMixin(MCPMixin):
|
||||
# Index chapters (Word documents)
|
||||
if include_chapters and category == "word":
|
||||
try:
|
||||
chapters = await self._index_word_chapters(local_path, doc_id)
|
||||
chapters = await self._index_word_chapters(local_path, doc_id, text_patterns_only)
|
||||
indexed["resources"]["chapter"] = chapters
|
||||
except Exception as e:
|
||||
indexed["resources"]["chapter"] = {"error": str(e)}
|
||||
@ -444,23 +445,39 @@ class UniversalMixin(MCPMixin):
|
||||
|
||||
return indexed
|
||||
|
||||
async def _index_word_chapters(self, file_path: str, doc_id: str) -> list[dict]:
|
||||
"""Extract and index chapters from a Word document."""
|
||||
async def _index_word_chapters(self, file_path: str, doc_id: str, text_patterns_only: bool = False) -> list[dict]:
|
||||
"""Extract and index chapters/sections from a Word document.
|
||||
|
||||
Detection strategy (in order):
|
||||
1. Primary: Heading 1 styles (structured, reliable) → section://doc/N
|
||||
2. Fallback: "Chapter X" text pattern (books, manuscripts) → chapter://doc/N
|
||||
|
||||
If text_patterns_only=True, skips heading styles and uses only text patterns.
|
||||
"""
|
||||
import re
|
||||
from docx import Document
|
||||
|
||||
doc = Document(file_path)
|
||||
chapters = []
|
||||
current_chapter = None
|
||||
current_section = None
|
||||
current_paragraphs = []
|
||||
chapter_pattern = re.compile(r'^chapter\s*(\d+)', re.IGNORECASE)
|
||||
section_num = 0
|
||||
|
||||
def save_chapter():
|
||||
nonlocal current_chapter, current_paragraphs
|
||||
if current_chapter is not None:
|
||||
# Detection patterns
|
||||
chapter_pattern = re.compile(r'^chapter\s*(\d+)', re.IGNORECASE)
|
||||
heading_styles = {'Heading 1', 'Heading1', 'Title', 'Titre', 'Überschrift 1'}
|
||||
|
||||
def is_heading(para) -> bool:
|
||||
"""Check if paragraph is a heading style."""
|
||||
style_name = para.style.name if para.style else ''
|
||||
return style_name in heading_styles or style_name.startswith('Heading 1')
|
||||
|
||||
def save_section(resource_type: str = "chapter"):
|
||||
nonlocal current_section, current_paragraphs, section_num
|
||||
if current_section is not None and current_paragraphs:
|
||||
# Convert to markdown
|
||||
markdown_lines = []
|
||||
markdown_lines.append(f"# {current_chapter['title']}\n")
|
||||
markdown_lines.append(f"# {current_section['title']}\n")
|
||||
for para in current_paragraphs:
|
||||
text = para.strip()
|
||||
if text:
|
||||
@ -469,11 +486,11 @@ class UniversalMixin(MCPMixin):
|
||||
content = "\n".join(markdown_lines)
|
||||
|
||||
resource = EmbeddedResource(
|
||||
resource_id=str(current_chapter["number"]),
|
||||
resource_type="chapter",
|
||||
resource_id=str(current_section["number"]),
|
||||
resource_type=resource_type,
|
||||
mime_type="text/markdown",
|
||||
data=content,
|
||||
name=current_chapter["title"],
|
||||
name=current_section["title"],
|
||||
metadata={
|
||||
"word_count": len(content.split()),
|
||||
"paragraph_count": len(current_paragraphs)
|
||||
@ -482,28 +499,51 @@ class UniversalMixin(MCPMixin):
|
||||
resource_store.store(doc_id, resource, file_path)
|
||||
|
||||
chapters.append({
|
||||
"id": str(current_chapter["number"]),
|
||||
"title": current_chapter["title"],
|
||||
"uri": f"chapter://{doc_id}/{current_chapter['number']}",
|
||||
"id": str(current_section["number"]),
|
||||
"title": current_section["title"],
|
||||
"uri": f"{resource_type}://{doc_id}/{current_section['number']}",
|
||||
"word_count": len(content.split())
|
||||
})
|
||||
|
||||
for para in doc.paragraphs:
|
||||
text = para.text.strip()
|
||||
match = chapter_pattern.match(text)
|
||||
# Primary: detect by Heading 1 styles (structured, reliable)
|
||||
# Skip if text_patterns_only=True (for messy docs with inconsistent styles)
|
||||
if not text_patterns_only:
|
||||
for para in doc.paragraphs:
|
||||
text = para.text.strip()
|
||||
|
||||
if match:
|
||||
save_chapter()
|
||||
current_chapter = {
|
||||
"number": int(match.group(1)),
|
||||
"title": text[:100]
|
||||
}
|
||||
current_paragraphs = []
|
||||
elif current_chapter is not None:
|
||||
current_paragraphs.append(text)
|
||||
if is_heading(para) and text:
|
||||
save_section("section")
|
||||
section_num += 1
|
||||
current_section = {
|
||||
"number": section_num,
|
||||
"title": text[:100]
|
||||
}
|
||||
current_paragraphs = []
|
||||
elif current_section is not None:
|
||||
current_paragraphs.append(text)
|
||||
|
||||
# Save last chapter
|
||||
save_chapter()
|
||||
save_section("section")
|
||||
|
||||
# Fallback: try "Chapter X" text pattern (for docs without heading styles)
|
||||
if not chapters:
|
||||
current_section = None
|
||||
current_paragraphs = []
|
||||
|
||||
for para in doc.paragraphs:
|
||||
text = para.text.strip()
|
||||
match = chapter_pattern.match(text)
|
||||
|
||||
if match:
|
||||
save_section("chapter")
|
||||
current_section = {
|
||||
"number": int(match.group(1)),
|
||||
"title": text[:100]
|
||||
}
|
||||
current_paragraphs = []
|
||||
elif current_section is not None:
|
||||
current_paragraphs.append(text)
|
||||
|
||||
save_section("chapter")
|
||||
|
||||
return chapters
|
||||
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user