Improve section detection with heading styles + fallback
Some checks are pending
Test Dashboard / test-and-dashboard (push) Waiting to run
Some checks are pending
Test Dashboard / test-and-dashboard (push) Waiting to run
- Primary: Detect sections via Heading 1 styles (structured)
- Fallback: Detect chapters via "Chapter X" text patterns
- Add text_patterns_only flag to skip heading styles (for messy docs)

This handles both well-structured business documents (manuals, PRDs) and narrative content (books with explicit chapter headings).
This commit is contained in:
parent
d569034fa3
commit
89ad0c849d
@ -1,9 +1,9 @@
|
|||||||
{
|
{
|
||||||
"metadata": {
|
"metadata": {
|
||||||
"start_time": "2026-01-11T07:15:14.417108",
|
"start_time": "2026-01-11T09:40:29.164041",
|
||||||
"pytest_version": "9.0.2",
|
"pytest_version": "9.0.2",
|
||||||
"end_time": "2026-01-11T07:15:15.173732",
|
"end_time": "2026-01-11T09:40:30.048909",
|
||||||
"duration": 0.7566196918487549,
|
"duration": 0.8848621845245361,
|
||||||
"exit_status": 0
|
"exit_status": 0
|
||||||
},
|
},
|
||||||
"summary": {
|
"summary": {
|
||||||
|
|||||||
@ -353,7 +353,8 @@ class UniversalMixin(MCPMixin):
|
|||||||
include_images: bool = Field(default=True, description="Index embedded images"),
|
include_images: bool = Field(default=True, description="Index embedded images"),
|
||||||
include_chapters: bool = Field(default=True, description="Index chapters/sections (Word docs)"),
|
include_chapters: bool = Field(default=True, description="Index chapters/sections (Word docs)"),
|
||||||
include_sheets: bool = Field(default=True, description="Index sheets (Excel docs)"),
|
include_sheets: bool = Field(default=True, description="Index sheets (Excel docs)"),
|
||||||
include_slides: bool = Field(default=True, description="Index slides (PowerPoint docs)")
|
include_slides: bool = Field(default=True, description="Index slides (PowerPoint docs)"),
|
||||||
|
text_patterns_only: bool = Field(default=False, description="Ignore heading styles, detect chapters by 'Chapter X' text patterns only")
|
||||||
) -> dict[str, Any]:
|
) -> dict[str, Any]:
|
||||||
"""Scan document and populate resource store with available content.
|
"""Scan document and populate resource store with available content.
|
||||||
|
|
||||||
@ -415,7 +416,7 @@ class UniversalMixin(MCPMixin):
|
|||||||
# Index chapters (Word documents)
|
# Index chapters (Word documents)
|
||||||
if include_chapters and category == "word":
|
if include_chapters and category == "word":
|
||||||
try:
|
try:
|
||||||
chapters = await self._index_word_chapters(local_path, doc_id)
|
chapters = await self._index_word_chapters(local_path, doc_id, text_patterns_only)
|
||||||
indexed["resources"]["chapter"] = chapters
|
indexed["resources"]["chapter"] = chapters
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
indexed["resources"]["chapter"] = {"error": str(e)}
|
indexed["resources"]["chapter"] = {"error": str(e)}
|
||||||
@ -444,23 +445,39 @@ class UniversalMixin(MCPMixin):
|
|||||||
|
|
||||||
return indexed
|
return indexed
|
||||||
|
|
||||||
async def _index_word_chapters(self, file_path: str, doc_id: str) -> list[dict]:
|
async def _index_word_chapters(self, file_path: str, doc_id: str, text_patterns_only: bool = False) -> list[dict]:
|
||||||
"""Extract and index chapters from a Word document."""
|
"""Extract and index chapters/sections from a Word document.
|
||||||
|
|
||||||
|
Detection strategy (in order):
|
||||||
|
1. Primary: Heading 1 styles (structured, reliable) → section://doc/N
|
||||||
|
2. Fallback: "Chapter X" text pattern (books, manuscripts) → chapter://doc/N
|
||||||
|
|
||||||
|
If text_patterns_only=True, skips heading styles and uses only text patterns.
|
||||||
|
"""
|
||||||
import re
|
import re
|
||||||
from docx import Document
|
from docx import Document
|
||||||
|
|
||||||
doc = Document(file_path)
|
doc = Document(file_path)
|
||||||
chapters = []
|
chapters = []
|
||||||
current_chapter = None
|
current_section = None
|
||||||
current_paragraphs = []
|
current_paragraphs = []
|
||||||
chapter_pattern = re.compile(r'^chapter\s*(\d+)', re.IGNORECASE)
|
section_num = 0
|
||||||
|
|
||||||
def save_chapter():
|
# Detection patterns
|
||||||
nonlocal current_chapter, current_paragraphs
|
chapter_pattern = re.compile(r'^chapter\s*(\d+)', re.IGNORECASE)
|
||||||
if current_chapter is not None:
|
heading_styles = {'Heading 1', 'Heading1', 'Title', 'Titre', 'Überschrift 1'}
|
||||||
|
|
||||||
|
def is_heading(para) -> bool:
|
||||||
|
"""Check if paragraph is a heading style."""
|
||||||
|
style_name = para.style.name if para.style else ''
|
||||||
|
return style_name in heading_styles or style_name.startswith('Heading 1')
|
||||||
|
|
||||||
|
def save_section(resource_type: str = "chapter"):
|
||||||
|
nonlocal current_section, current_paragraphs, section_num
|
||||||
|
if current_section is not None and current_paragraphs:
|
||||||
# Convert to markdown
|
# Convert to markdown
|
||||||
markdown_lines = []
|
markdown_lines = []
|
||||||
markdown_lines.append(f"# {current_chapter['title']}\n")
|
markdown_lines.append(f"# {current_section['title']}\n")
|
||||||
for para in current_paragraphs:
|
for para in current_paragraphs:
|
||||||
text = para.strip()
|
text = para.strip()
|
||||||
if text:
|
if text:
|
||||||
@ -469,11 +486,11 @@ class UniversalMixin(MCPMixin):
|
|||||||
content = "\n".join(markdown_lines)
|
content = "\n".join(markdown_lines)
|
||||||
|
|
||||||
resource = EmbeddedResource(
|
resource = EmbeddedResource(
|
||||||
resource_id=str(current_chapter["number"]),
|
resource_id=str(current_section["number"]),
|
||||||
resource_type="chapter",
|
resource_type=resource_type,
|
||||||
mime_type="text/markdown",
|
mime_type="text/markdown",
|
||||||
data=content,
|
data=content,
|
||||||
name=current_chapter["title"],
|
name=current_section["title"],
|
||||||
metadata={
|
metadata={
|
||||||
"word_count": len(content.split()),
|
"word_count": len(content.split()),
|
||||||
"paragraph_count": len(current_paragraphs)
|
"paragraph_count": len(current_paragraphs)
|
||||||
@ -482,28 +499,51 @@ class UniversalMixin(MCPMixin):
|
|||||||
resource_store.store(doc_id, resource, file_path)
|
resource_store.store(doc_id, resource, file_path)
|
||||||
|
|
||||||
chapters.append({
|
chapters.append({
|
||||||
"id": str(current_chapter["number"]),
|
"id": str(current_section["number"]),
|
||||||
"title": current_chapter["title"],
|
"title": current_section["title"],
|
||||||
"uri": f"chapter://{doc_id}/{current_chapter['number']}",
|
"uri": f"{resource_type}://{doc_id}/{current_section['number']}",
|
||||||
"word_count": len(content.split())
|
"word_count": len(content.split())
|
||||||
})
|
})
|
||||||
|
|
||||||
for para in doc.paragraphs:
|
# Primary: detect by Heading 1 styles (structured, reliable)
|
||||||
text = para.text.strip()
|
# Skip if text_patterns_only=True (for messy docs with inconsistent styles)
|
||||||
match = chapter_pattern.match(text)
|
if not text_patterns_only:
|
||||||
|
for para in doc.paragraphs:
|
||||||
|
text = para.text.strip()
|
||||||
|
|
||||||
if match:
|
if is_heading(para) and text:
|
||||||
save_chapter()
|
save_section("section")
|
||||||
current_chapter = {
|
section_num += 1
|
||||||
"number": int(match.group(1)),
|
current_section = {
|
||||||
"title": text[:100]
|
"number": section_num,
|
||||||
}
|
"title": text[:100]
|
||||||
current_paragraphs = []
|
}
|
||||||
elif current_chapter is not None:
|
current_paragraphs = []
|
||||||
current_paragraphs.append(text)
|
elif current_section is not None:
|
||||||
|
current_paragraphs.append(text)
|
||||||
|
|
||||||
# Save last chapter
|
save_section("section")
|
||||||
save_chapter()
|
|
||||||
|
# Fallback: try "Chapter X" text pattern (for docs without heading styles)
|
||||||
|
if not chapters:
|
||||||
|
current_section = None
|
||||||
|
current_paragraphs = []
|
||||||
|
|
||||||
|
for para in doc.paragraphs:
|
||||||
|
text = para.text.strip()
|
||||||
|
match = chapter_pattern.match(text)
|
||||||
|
|
||||||
|
if match:
|
||||||
|
save_section("chapter")
|
||||||
|
current_section = {
|
||||||
|
"number": int(match.group(1)),
|
||||||
|
"title": text[:100]
|
||||||
|
}
|
||||||
|
current_paragraphs = []
|
||||||
|
elif current_section is not None:
|
||||||
|
current_paragraphs.append(text)
|
||||||
|
|
||||||
|
save_section("chapter")
|
||||||
|
|
||||||
return chapters
|
return chapters
|
||||||
|
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user