Fix critical xpath API bug breaking chapter/heading detection

python-docx elements don't support xpath() with a namespaces kwarg. The calls failed silently inside try/except blocks, so chapter search and heading detection never found any matches.

Fixed by replacing xpath(..., namespaces={...}) with:
- findall('.//' + qn('w:t')) for text elements
- find(qn('w:pPr')) + find(qn('w:pStyle')) for style detection
- get(qn('w:val')) for attribute values

Also fixed a logic bug where an elif prevented the short-text fallback from running when a non-heading style existed on the paragraph.
2026-01-11 05:20:05 -07:00 · 2026-01-11 05:20:05 -07:00 · 2f39c4ec5b
commit 2f39c4ec5b
parent af6aadf559
2 changed files with 63 additions and 180 deletions
--- a/reports/test_results.json
+++ b/reports/test_results.json
@ -1,154 +1,18 @@
 {
  "metadata": {
-    "start_time": "2026-01-11T00:28:31.202459",
+    "start_time": "2026-01-11T05:19:25.816074",
    "end_time": "2026-01-11T00:28:33.718606",
    "duration": 1.2442383766174316,
    "exit_status": 0,
    "pytest_version": "9.0.2",
-    "test_types": [
+    "end_time": "2026-01-11T05:19:26.468770",
-      "pytest",
+    "duration": 0.6526906490325928,
-      "torture_test"
+    "exit_status": 0
    ]
  },
  "summary": {
-    "total": 6,
+    "total": 0,
-    "passed": 5,
+    "passed": 0,
    "failed": 0,
-    "skipped": 1,
+    "skipped": 0,
-    "pass_rate": 83.33333333333334
+    "pass_rate": 0
  },
-  "categories": {
+  "categories": {},
-    "Excel": {
+  "tests": []
      "total": 4,
      "passed": 3,
      "failed": 0,
      "skipped": 1
    },
    "Word": {
      "total": 2,
      "passed": 2,
      "failed": 0,
      "skipped": 0
    }
  },
  "tests": [
    {
      "name": "Excel Data Analysis",
      "nodeid": "torture_test.py::test_excel_data_analysis",
      "category": "Excel",
      "outcome": "passed",
      "duration": 0.17873024940490723,
      "timestamp": "2026-01-11T00:28:33.696485",
      "module": "torture_test",
      "class": null,
      "function": "test_excel_data_analysis",
      "inputs": {
        "file": "test_files/test_data.xlsx"
      },
      "outputs": {
        "sheets_analyzed": [
          "Test Data"
        ]
      },
      "error": null,
      "traceback": null
    },
    {
      "name": "Excel Formula Extraction",
      "nodeid": "torture_test.py::test_excel_formula_extraction",
      "category": "Excel",
      "outcome": "passed",
      "duration": 0.0032067298889160156,
      "timestamp": "2026-01-11T00:28:33.699697",
      "module": "torture_test",
      "class": null,
      "function": "test_excel_formula_extraction",
      "inputs": {
        "file": "test_files/test_data.xlsx"
      },
      "outputs": {
        "total_formulas": 8
      },
      "error": null,
      "traceback": null
    },
    {
      "name": "Excel Chart Data Generation",
      "nodeid": "torture_test.py::test_excel_chart_generation",
      "category": "Excel",
      "outcome": "passed",
      "duration": 0.0025446414947509766,
      "timestamp": "2026-01-11T00:28:33.702246",
      "module": "torture_test",
      "class": null,
      "function": "test_excel_chart_generation",
      "inputs": {
        "file": "test_files/test_data.xlsx",
        "x_column": "Category",
        "y_columns": [
          "Value"
        ]
      },
      "outputs": {
        "chart_libraries": 2
      },
      "error": null,
      "traceback": null
    },
    {
      "name": "Word Structure Analysis",
      "nodeid": "torture_test.py::test_word_structure_analysis",
      "category": "Word",
      "outcome": "passed",
      "duration": 0.010314226150512695,
      "timestamp": "2026-01-11T00:28:33.712565",
      "module": "torture_test",
      "class": null,
      "function": "test_word_structure_analysis",
      "inputs": {
        "file": "test_files/test_document.docx"
      },
      "outputs": {
        "total_headings": 0
      },
      "error": null,
      "traceback": null
    },
    {
      "name": "Word Table Extraction",
      "nodeid": "torture_test.py::test_word_table_extraction",
      "category": "Word",
      "outcome": "passed",
      "duration": 0.005824089050292969,
      "timestamp": "2026-01-11T00:28:33.718393",
      "module": "torture_test",
      "class": null,
      "function": "test_word_table_extraction",
      "inputs": {
        "file": "test_files/test_document.docx"
      },
      "outputs": {
        "total_tables": 0
      },
      "error": null,
      "traceback": null
    },
    {
      "name": "Real Excel File Analysis (FORScan)",
      "nodeid": "torture_test.py::test_real_excel_analysis",
      "category": "Excel",
      "outcome": "skipped",
      "duration": 0,
      "timestamp": "2026-01-11T00:28:33.718405",
      "module": "torture_test",
      "class": null,
      "function": "test_real_excel_analysis",
      "inputs": {
        "file": "/home/rpm/FORScan Lite spreadsheets v1.1/FORScan Lite spreadsheet - PIDs.xlsx"
      },
      "outputs": null,
      "error": "File not found: /home/rpm/FORScan Lite spreadsheets v1.1/FORScan Lite spreadsheet - PIDs.xlsx",
      "traceback": null
    }
  ]
 }
--- a/src/mcp_office_tools/utils/word_processing.py
+++ b/src/mcp_office_tools/utils/word_processing.py
@ -13,6 +13,7 @@ import zipfile
 from pathlib import Path
 from typing import Any
 from docx.oxml.ns import qn
 from PIL import Image
 # Temp directory configuration
@ -687,9 +688,10 @@ async def _get_ultra_fast_summary(file_path: str) -> dict[str, Any]:
        # Extract bookmarks (chapter markers)
        bookmarks = []
        try:
-            # Access document's bookmarks through the XML
+            # Access document's bookmarks through the XML using findall with qn()
-            for bookmark in doc.element.xpath('//w:bookmarkStart', namespaces={'w': 'http://schemas.openxmlformats.org/wordprocessingml/2006/main'}):
+            # Note: xpath() with namespaces kwarg doesn't work on python-docx elements
-                bookmark_name = bookmark.get('{http://schemas.openxmlformats.org/wordprocessingml/2006/main}name')
+            for bookmark in doc.element.body.findall('.//' + qn('w:bookmarkStart')):
                bookmark_name = bookmark.get(qn('w:name'))
                if bookmark_name and not bookmark_name.startswith('_'):  # Skip system bookmarks
                    bookmarks.append(bookmark_name)
        except Exception:
@ -752,18 +754,19 @@ async def _find_bookmark_content_range(doc, bookmark_name: str) -> dict[str, Any
        bookmark_starts = {}
        bookmark_ends = {}
-        # Look for bookmark markers in the document XML
+        # Look for bookmark markers in the document XML using findall with qn()
        # Note: xpath() with namespaces kwarg doesn't work on python-docx elements
        for elem_idx, element in enumerate(doc.element.body):
            # Look for bookmark start markers
-            for bookmark_start in element.xpath('.//w:bookmarkStart', namespaces={'w': 'http://schemas.openxmlformats.org/wordprocessingml/2006/main'}):
+            for bookmark_start in element.findall('.//' + qn('w:bookmarkStart')):
-                name = bookmark_start.get('{http://schemas.openxmlformats.org/wordprocessingml/2006/main}name')
+                name = bookmark_start.get(qn('w:name'))
                if name == bookmark_name:
-                    bookmark_id = bookmark_start.get('{http://schemas.openxmlformats.org/wordprocessingml/2006/main}id')
+                    bookmark_id = bookmark_start.get(qn('w:id'))
                    bookmark_starts[bookmark_id] = elem_idx
            # Look for bookmark end markers
-            for bookmark_end in element.xpath('.//w:bookmarkEnd', namespaces={'w': 'http://schemas.openxmlformats.org/wordprocessingml/2006/main'}):
+            for bookmark_end in element.findall('.//' + qn('w:bookmarkEnd')):
-                bookmark_id = bookmark_end.get('{http://schemas.openxmlformats.org/wordprocessingml/2006/main}id')
+                bookmark_id = bookmark_end.get(qn('w:id'))
                if bookmark_id in bookmark_starts:
                    bookmark_ends[bookmark_id] = elem_idx
                    break
@ -794,26 +797,31 @@ async def _find_chapter_content_range(doc, chapter_name: str) -> dict[str, Any]:
        chapter_start_idx = None
        chapter_end_idx = None
-        # Search through document elements for matching heading
+        # Search through document elements for matching heading using findall with qn()
        # Note: xpath() with namespaces kwarg doesn't work on python-docx elements
        for elem_idx, element in enumerate(doc.element.body):
            # Check if this element is a paragraph with heading style
            try:
                para = element
                if para.tag.endswith('}p'):  # Word paragraph element
-                    # Get the text content
+                    # Get the text content using findall instead of xpath
-                    text_content = ''.join(text_elem.text or '' for text_elem in para.xpath('.//w:t', namespaces={'w': 'http://schemas.openxmlformats.org/wordprocessingml/2006/main'}))
+                    text_content = ''.join(text_elem.text or '' for text_elem in para.findall('.//' + qn('w:t')))
                    # Check if this matches our chapter name (case insensitive, flexible matching)
                    if text_content.strip() and chapter_name.lower() in text_content.lower().strip():
                        # Check if it's actually a heading by looking at paragraph style
-                        style_elem = para.xpath('.//w:pStyle', namespaces={'w': 'http://schemas.openxmlformats.org/wordprocessingml/2006/main'})
+                        # pStyle is inside pPr element
-                        if style_elem:
+                        pPr = para.find(qn('w:pPr'))
-                            style_val = style_elem[0].get('{http://schemas.openxmlformats.org/wordprocessingml/2006/main}val', '')
+                        is_heading_style = False
-                            if 'heading' in style_val.lower() or 'title' in style_val.lower():
+                        if pPr is not None:
-                                chapter_start_idx = elem_idx
+                            pStyle = pPr.find(qn('w:pStyle'))
-                                break
+                            if pStyle is not None:
-                        # Also consider short text lines as potential headings
+                                style_val = pStyle.get(qn('w:val'), '')
-                        elif len(text_content.strip()) < 100:
+                                is_heading_style = 'heading' in style_val.lower() or 'title' in style_val.lower()
                        # Accept if it's a heading style OR if it's short text (potential heading)
                        # FIX: Previously used elif which skipped short text check when style existed but wasn't heading
                        if is_heading_style or len(text_content.strip()) < 100:
                            chapter_start_idx = elem_idx
                            break
            except Exception:
@ -831,9 +839,11 @@ async def _find_chapter_content_range(doc, chapter_name: str) -> dict[str, Any]:
                para = doc.element.body[elem_idx]
                if para.tag.endswith('}p'):
                    # Check if this is a major heading (same level or higher than chapter start)
-                    style_elem = para.xpath('.//w:pStyle', namespaces={'w': 'http://schemas.openxmlformats.org/wordprocessingml/2006/main'})
+                    pPr = para.find(qn('w:pPr'))
-                    if style_elem:
+                    if pPr is not None:
-                        style_val = style_elem[0].get('{http://schemas.openxmlformats.org/wordprocessingml/2006/main}val', '')
+                        pStyle = pPr.find(qn('w:pStyle'))
                        if pStyle is not None:
                            style_val = pStyle.get(qn('w:val'), '')
                            if 'heading1' in style_val.lower() or 'title' in style_val.lower():
                                chapter_end_idx = elem_idx - 1
                                break
@ -869,17 +879,26 @@ async def _get_available_headings(doc) -> list[str]:
            try:
                if element.tag.endswith('}p'):  # Word paragraph element
-                    # Get the text content
+                    # Get the text content using findall instead of xpath
-                    text_content = ''.join(text_elem.text or '' for text_elem in element.xpath('.//w:t', namespaces={'w': 'http://schemas.openxmlformats.org/wordprocessingml/2006/main'}))
+                    # Note: xpath() with namespaces kwarg doesn't work on python-docx elements
                    text_content = ''.join(text_elem.text or '' for text_elem in element.findall('.//' + qn('w:t')))
                    if text_content.strip():
                        # Check if it's a heading by looking at paragraph style
-                        style_elem = element.xpath('.//w:pStyle', namespaces={'w': 'http://schemas.openxmlformats.org/wordprocessingml/2006/main'})
+                        # pStyle is inside pPr element
-                        if style_elem:
+                        pPr = element.find(qn('w:pPr'))
-                            style_val = style_elem[0].get('{http://schemas.openxmlformats.org/wordprocessingml/2006/main}val', '')
+                        is_heading_style = False
-                            if 'heading' in style_val.lower() or 'title' in style_val.lower():
+                        if pPr is not None:
                            pStyle = pPr.find(qn('w:pStyle'))
                            if pStyle is not None:
                                style_val = pStyle.get(qn('w:val'), '')
                                is_heading_style = 'heading' in style_val.lower() or 'title' in style_val.lower()
                        # Add if it's a heading style
                        if is_heading_style:
                            headings.append(text_content.strip()[:100])  # Limit heading length
-                        # Also consider short text lines as potential headings
+                        # Also consider short text lines as potential headings (independent of style check)
                        # FIX: Previously this elif was attached to the style-existence check, so it was skipped
                        # whenever a style existed but wasn't a heading; it now follows the is_heading_style test
                        elif len(text_content.strip()) < 100:
                            # Only add if it looks like a heading (not just short random text)
                            if any(word in text_content.lower() for word in ['chapter', 'section', 'part', 'introduction', 'conclusion']):