Fix critical xpath API bug breaking chapter/heading detection

python-docx elements don't support xpath() with a namespaces kwarg. The calls failed silently inside try/except blocks, so chapter search and heading detection never found any matches.

Fixed by replacing xpath(..., namespaces={...}) with:
- findall('.//' + qn('w:t')) for text elements
- find(qn('w:pPr')) + find(qn('w:pStyle')) for style detection
- get(qn('w:val')) for attribute values

Also fixed a logic bug where an elif prevented the short-text fallback from running when a non-heading style existed on the paragraph.
2026-01-11 05:20:05 -07:00 · 2026-01-11 05:20:05 -07:00 · 2f39c4ec5b
commit 2f39c4ec5b
parent af6aadf559
2 changed files with 63 additions and 180 deletions
--- a/reports/test_results.json
+++ b/reports/test_results.json
@ -1,154 +1,18 @@
 {
  "metadata": {
-    "start_time": "2026-01-11T00:28:31.202459",
-    "end_time": "2026-01-11T00:28:33.718606",
-    "duration": 1.2442383766174316,
-    "exit_status": 0,
+    "start_time": "2026-01-11T05:19:25.816074",
    "pytest_version": "9.0.2",
-    "test_types": [
-      "pytest",
-      "torture_test"
-    ]
+    "end_time": "2026-01-11T05:19:26.468770",
+    "duration": 0.6526906490325928,
+    "exit_status": 0
  },
  "summary": {
-    "total": 6,
-    "passed": 5,
+    "total": 0,
+    "passed": 0,
    "failed": 0,
-    "skipped": 1,
-    "pass_rate": 83.33333333333334
+    "skipped": 0,
+    "pass_rate": 0
  },
-  "categories": {
-    "Excel": {
-      "total": 4,
-      "passed": 3,
-      "failed": 0,
-      "skipped": 1
-    },
-    "Word": {
-      "total": 2,
-      "passed": 2,
-      "failed": 0,
-      "skipped": 0
-    }
-  },
-  "tests": [
-    {
-      "name": "Excel Data Analysis",
-      "nodeid": "torture_test.py::test_excel_data_analysis",
-      "category": "Excel",
-      "outcome": "passed",
-      "duration": 0.17873024940490723,
-      "timestamp": "2026-01-11T00:28:33.696485",
-      "module": "torture_test",
-      "class": null,
-      "function": "test_excel_data_analysis",
-      "inputs": {
-        "file": "test_files/test_data.xlsx"
-      },
-      "outputs": {
-        "sheets_analyzed": [
-          "Test Data"
-        ]
-      },
-      "error": null,
-      "traceback": null
-    },
-    {
-      "name": "Excel Formula Extraction",
-      "nodeid": "torture_test.py::test_excel_formula_extraction",
-      "category": "Excel",
-      "outcome": "passed",
-      "duration": 0.0032067298889160156,
-      "timestamp": "2026-01-11T00:28:33.699697",
-      "module": "torture_test",
-      "class": null,
-      "function": "test_excel_formula_extraction",
-      "inputs": {
-        "file": "test_files/test_data.xlsx"
-      },
-      "outputs": {
-        "total_formulas": 8
-      },
-      "error": null,
-      "traceback": null
-    },
-    {
-      "name": "Excel Chart Data Generation",
-      "nodeid": "torture_test.py::test_excel_chart_generation",
-      "category": "Excel",
-      "outcome": "passed",
-      "duration": 0.0025446414947509766,
-      "timestamp": "2026-01-11T00:28:33.702246",
-      "module": "torture_test",
-      "class": null,
-      "function": "test_excel_chart_generation",
-      "inputs": {
-        "file": "test_files/test_data.xlsx",
-        "x_column": "Category",
-        "y_columns": [
-          "Value"
-        ]
-      },
-      "outputs": {
-        "chart_libraries": 2
-      },
-      "error": null,
-      "traceback": null
-    },
-    {
-      "name": "Word Structure Analysis",
-      "nodeid": "torture_test.py::test_word_structure_analysis",
-      "category": "Word",
-      "outcome": "passed",
-      "duration": 0.010314226150512695,
-      "timestamp": "2026-01-11T00:28:33.712565",
-      "module": "torture_test",
-      "class": null,
-      "function": "test_word_structure_analysis",
-      "inputs": {
-        "file": "test_files/test_document.docx"
-      },
-      "outputs": {
-        "total_headings": 0
-      },
-      "error": null,
-      "traceback": null
-    },
-    {
-      "name": "Word Table Extraction",
-      "nodeid": "torture_test.py::test_word_table_extraction",
-      "category": "Word",
-      "outcome": "passed",
-      "duration": 0.005824089050292969,
-      "timestamp": "2026-01-11T00:28:33.718393",
-      "module": "torture_test",
-      "class": null,
-      "function": "test_word_table_extraction",
-      "inputs": {
-        "file": "test_files/test_document.docx"
-      },
-      "outputs": {
-        "total_tables": 0
-      },
-      "error": null,
-      "traceback": null
-    },
-    {
-      "name": "Real Excel File Analysis (FORScan)",
-      "nodeid": "torture_test.py::test_real_excel_analysis",
-      "category": "Excel",
-      "outcome": "skipped",
-      "duration": 0,
-      "timestamp": "2026-01-11T00:28:33.718405",
-      "module": "torture_test",
-      "class": null,
-      "function": "test_real_excel_analysis",
-      "inputs": {
-        "file": "/home/rpm/FORScan Lite spreadsheets v1.1/FORScan Lite spreadsheet - PIDs.xlsx"
-      },
-      "outputs": null,
-      "error": "File not found: /home/rpm/FORScan Lite spreadsheets v1.1/FORScan Lite spreadsheet - PIDs.xlsx",
-      "traceback": null
-    }
-  ]
+  "categories": {},
+  "tests": []
 }
--- a/src/mcp_office_tools/utils/word_processing.py
+++ b/src/mcp_office_tools/utils/word_processing.py
@ -13,6 +13,7 @@ import zipfile
 from pathlib import Path
 from typing import Any

+from docx.oxml.ns import qn
 from PIL import Image

 # Temp directory configuration
@ -687,9 +688,10 @@ async def _get_ultra_fast_summary(file_path: str) -> dict[str, Any]:
        # Extract bookmarks (chapter markers)
        bookmarks = []
        try:
-            # Access document's bookmarks through the XML
-            for bookmark in doc.element.xpath('//w:bookmarkStart', namespaces={'w': 'http://schemas.openxmlformats.org/wordprocessingml/2006/main'}):
-                bookmark_name = bookmark.get('{http://schemas.openxmlformats.org/wordprocessingml/2006/main}name')
+            # Access document's bookmarks through the XML using findall with qn()
+            # Note: xpath() with namespaces kwarg doesn't work on python-docx elements
+            for bookmark in doc.element.body.findall('.//' + qn('w:bookmarkStart')):
+                bookmark_name = bookmark.get(qn('w:name'))
                if bookmark_name and not bookmark_name.startswith('_'):  # Skip system bookmarks
                    bookmarks.append(bookmark_name)
        except Exception:
@ -752,18 +754,19 @@ async def _find_bookmark_content_range(doc, bookmark_name: str) -> dict[str, Any
        bookmark_starts = {}
        bookmark_ends = {}

-        # Look for bookmark markers in the document XML
+        # Look for bookmark markers in the document XML using findall with qn()
+        # Note: xpath() with namespaces kwarg doesn't work on python-docx elements
        for elem_idx, element in enumerate(doc.element.body):
            # Look for bookmark start markers
-            for bookmark_start in element.xpath('.//w:bookmarkStart', namespaces={'w': 'http://schemas.openxmlformats.org/wordprocessingml/2006/main'}):
-                name = bookmark_start.get('{http://schemas.openxmlformats.org/wordprocessingml/2006/main}name')
+            for bookmark_start in element.findall('.//' + qn('w:bookmarkStart')):
+                name = bookmark_start.get(qn('w:name'))
                if name == bookmark_name:
-                    bookmark_id = bookmark_start.get('{http://schemas.openxmlformats.org/wordprocessingml/2006/main}id')
+                    bookmark_id = bookmark_start.get(qn('w:id'))
                    bookmark_starts[bookmark_id] = elem_idx

            # Look for bookmark end markers
-            for bookmark_end in element.xpath('.//w:bookmarkEnd', namespaces={'w': 'http://schemas.openxmlformats.org/wordprocessingml/2006/main'}):
-                bookmark_id = bookmark_end.get('{http://schemas.openxmlformats.org/wordprocessingml/2006/main}id')
+            for bookmark_end in element.findall('.//' + qn('w:bookmarkEnd')):
+                bookmark_id = bookmark_end.get(qn('w:id'))
                if bookmark_id in bookmark_starts:
                    bookmark_ends[bookmark_id] = elem_idx
                    break
@ -794,26 +797,31 @@ async def _find_chapter_content_range(doc, chapter_name: str) -> dict[str, Any]:
        chapter_start_idx = None
        chapter_end_idx = None

-        # Search through document elements for matching heading
+        # Search through document elements for matching heading using findall with qn()
+        # Note: xpath() with namespaces kwarg doesn't work on python-docx elements
        for elem_idx, element in enumerate(doc.element.body):
            # Check if this element is a paragraph with heading style
            try:
                para = element
                if para.tag.endswith('}p'):  # Word paragraph element
-                    # Get the text content
-                    text_content = ''.join(text_elem.text or '' for text_elem in para.xpath('.//w:t', namespaces={'w': 'http://schemas.openxmlformats.org/wordprocessingml/2006/main'}))
+                    # Get the text content using findall instead of xpath
+                    text_content = ''.join(text_elem.text or '' for text_elem in para.findall('.//' + qn('w:t')))

                    # Check if this matches our chapter name (case insensitive, flexible matching)
                    if text_content.strip() and chapter_name.lower() in text_content.lower().strip():
                        # Check if it's actually a heading by looking at paragraph style
-                        style_elem = para.xpath('.//w:pStyle', namespaces={'w': 'http://schemas.openxmlformats.org/wordprocessingml/2006/main'})
-                        if style_elem:
-                            style_val = style_elem[0].get('{http://schemas.openxmlformats.org/wordprocessingml/2006/main}val', '')
-                            if 'heading' in style_val.lower() or 'title' in style_val.lower():
-                                chapter_start_idx = elem_idx
-                                break
-                        # Also consider short text lines as potential headings
-                        elif len(text_content.strip()) < 100:
+                        # pStyle is inside pPr element
+                        pPr = para.find(qn('w:pPr'))
+                        is_heading_style = False
+                        if pPr is not None:
+                            pStyle = pPr.find(qn('w:pStyle'))
+                            if pStyle is not None:
+                                style_val = pStyle.get(qn('w:val'), '')
+                                is_heading_style = 'heading' in style_val.lower() or 'title' in style_val.lower()
+
+                        # Accept if it's a heading style OR if it's short text (potential heading)
+                        # FIX: Previously used elif which skipped short text check when style existed but wasn't heading
+                        if is_heading_style or len(text_content.strip()) < 100:
                            chapter_start_idx = elem_idx
                            break
            except Exception:
@ -831,12 +839,14 @@ async def _find_chapter_content_range(doc, chapter_name: str) -> dict[str, Any]:
                para = doc.element.body[elem_idx]
                if para.tag.endswith('}p'):
                    # Check if this is a major heading (same level or higher than chapter start)
-                    style_elem = para.xpath('.//w:pStyle', namespaces={'w': 'http://schemas.openxmlformats.org/wordprocessingml/2006/main'})
-                    if style_elem:
-                        style_val = style_elem[0].get('{http://schemas.openxmlformats.org/wordprocessingml/2006/main}val', '')
-                        if 'heading1' in style_val.lower() or 'title' in style_val.lower():
-                            chapter_end_idx = elem_idx - 1
-                            break
+                    pPr = para.find(qn('w:pPr'))
+                    if pPr is not None:
+                        pStyle = pPr.find(qn('w:pStyle'))
+                        if pStyle is not None:
+                            style_val = pStyle.get(qn('w:val'), '')
+                            if 'heading1' in style_val.lower() or 'title' in style_val.lower():
+                                chapter_end_idx = elem_idx - 1
+                                break
            except Exception:
                continue

@ -869,17 +879,26 @@ async def _get_available_headings(doc) -> list[str]:

            try:
                if element.tag.endswith('}p'):  # Word paragraph element
-                    # Get the text content
-                    text_content = ''.join(text_elem.text or '' for text_elem in element.xpath('.//w:t', namespaces={'w': 'http://schemas.openxmlformats.org/wordprocessingml/2006/main'}))
+                    # Get the text content using findall instead of xpath
+                    # Note: xpath() with namespaces kwarg doesn't work on python-docx elements
+                    text_content = ''.join(text_elem.text or '' for text_elem in element.findall('.//' + qn('w:t')))

                    if text_content.strip():
                        # Check if it's a heading by looking at paragraph style
-                        style_elem = element.xpath('.//w:pStyle', namespaces={'w': 'http://schemas.openxmlformats.org/wordprocessingml/2006/main'})
-                        if style_elem:
-                            style_val = style_elem[0].get('{http://schemas.openxmlformats.org/wordprocessingml/2006/main}val', '')
-                            if 'heading' in style_val.lower() or 'title' in style_val.lower():
-                                headings.append(text_content.strip()[:100])  # Limit heading length
-                        # Also consider short text lines as potential headings
+                        # pStyle is inside pPr element
+                        pPr = element.find(qn('w:pPr'))
+                        is_heading_style = False
+                        if pPr is not None:
+                            pStyle = pPr.find(qn('w:pStyle'))
+                            if pStyle is not None:
+                                style_val = pStyle.get(qn('w:val'), '')
+                                is_heading_style = 'heading' in style_val.lower() or 'title' in style_val.lower()
+
+                        # Add if it's a heading style
+                        if is_heading_style:
+                            headings.append(text_content.strip()[:100])  # Limit heading length
+                        # Also consider short text lines as potential headings (independent of style check)
+                        # FIX: Previously used elif which skipped this when style existed but wasn't heading
                        elif len(text_content.strip()) < 100:
                            # Only add if it looks like a heading (not just short random text)
                            if any(word in text_content.lower() for word in ['chapter', 'section', 'part', 'introduction', 'conclusion']):