Fix critical xpath API bug breaking chapter/heading detection

python-docx elements don't support xpath() with namespaces kwarg.
The calls silently failed in try/except blocks, causing chapter search
and heading detection to never find matches.

Fixed by replacing xpath(..., namespaces={...}) with:
- findall('.//' + qn('w:t')) for text elements
- find(qn('w:pPr')) + find(qn('w:pStyle')) for style detection
- get(qn('w:val')) for attribute values

Also fixed logic bug where elif prevented short-text fallback from
running when a non-heading style existed on the paragraph.
This commit is contained in:
Ryan Malloy 2026-01-11 05:20:05 -07:00
parent af6aadf559
commit 2f39c4ec5b
2 changed files with 63 additions and 180 deletions

View File

@ -1,154 +1,18 @@
{
"metadata": {
"start_time": "2026-01-11T00:28:31.202459",
"end_time": "2026-01-11T00:28:33.718606",
"duration": 1.2442383766174316,
"exit_status": 0,
"start_time": "2026-01-11T05:19:25.816074",
"pytest_version": "9.0.2",
"test_types": [
"pytest",
"torture_test"
]
"end_time": "2026-01-11T05:19:26.468770",
"duration": 0.6526906490325928,
"exit_status": 0
},
"summary": {
"total": 6,
"passed": 5,
"total": 0,
"passed": 0,
"failed": 0,
"skipped": 1,
"pass_rate": 83.33333333333334
"skipped": 0,
"pass_rate": 0
},
"categories": {
"Excel": {
"total": 4,
"passed": 3,
"failed": 0,
"skipped": 1
},
"Word": {
"total": 2,
"passed": 2,
"failed": 0,
"skipped": 0
}
},
"tests": [
{
"name": "Excel Data Analysis",
"nodeid": "torture_test.py::test_excel_data_analysis",
"category": "Excel",
"outcome": "passed",
"duration": 0.17873024940490723,
"timestamp": "2026-01-11T00:28:33.696485",
"module": "torture_test",
"class": null,
"function": "test_excel_data_analysis",
"inputs": {
"file": "test_files/test_data.xlsx"
},
"outputs": {
"sheets_analyzed": [
"Test Data"
]
},
"error": null,
"traceback": null
},
{
"name": "Excel Formula Extraction",
"nodeid": "torture_test.py::test_excel_formula_extraction",
"category": "Excel",
"outcome": "passed",
"duration": 0.0032067298889160156,
"timestamp": "2026-01-11T00:28:33.699697",
"module": "torture_test",
"class": null,
"function": "test_excel_formula_extraction",
"inputs": {
"file": "test_files/test_data.xlsx"
},
"outputs": {
"total_formulas": 8
},
"error": null,
"traceback": null
},
{
"name": "Excel Chart Data Generation",
"nodeid": "torture_test.py::test_excel_chart_generation",
"category": "Excel",
"outcome": "passed",
"duration": 0.0025446414947509766,
"timestamp": "2026-01-11T00:28:33.702246",
"module": "torture_test",
"class": null,
"function": "test_excel_chart_generation",
"inputs": {
"file": "test_files/test_data.xlsx",
"x_column": "Category",
"y_columns": [
"Value"
]
},
"outputs": {
"chart_libraries": 2
},
"error": null,
"traceback": null
},
{
"name": "Word Structure Analysis",
"nodeid": "torture_test.py::test_word_structure_analysis",
"category": "Word",
"outcome": "passed",
"duration": 0.010314226150512695,
"timestamp": "2026-01-11T00:28:33.712565",
"module": "torture_test",
"class": null,
"function": "test_word_structure_analysis",
"inputs": {
"file": "test_files/test_document.docx"
},
"outputs": {
"total_headings": 0
},
"error": null,
"traceback": null
},
{
"name": "Word Table Extraction",
"nodeid": "torture_test.py::test_word_table_extraction",
"category": "Word",
"outcome": "passed",
"duration": 0.005824089050292969,
"timestamp": "2026-01-11T00:28:33.718393",
"module": "torture_test",
"class": null,
"function": "test_word_table_extraction",
"inputs": {
"file": "test_files/test_document.docx"
},
"outputs": {
"total_tables": 0
},
"error": null,
"traceback": null
},
{
"name": "Real Excel File Analysis (FORScan)",
"nodeid": "torture_test.py::test_real_excel_analysis",
"category": "Excel",
"outcome": "skipped",
"duration": 0,
"timestamp": "2026-01-11T00:28:33.718405",
"module": "torture_test",
"class": null,
"function": "test_real_excel_analysis",
"inputs": {
"file": "/home/rpm/FORScan Lite spreadsheets v1.1/FORScan Lite spreadsheet - PIDs.xlsx"
},
"outputs": null,
"error": "File not found: /home/rpm/FORScan Lite spreadsheets v1.1/FORScan Lite spreadsheet - PIDs.xlsx",
"traceback": null
}
]
"categories": {},
"tests": []
}

View File

@ -13,6 +13,7 @@ import zipfile
from pathlib import Path
from typing import Any
from docx.oxml.ns import qn
from PIL import Image
# Temp directory configuration
@ -687,9 +688,10 @@ async def _get_ultra_fast_summary(file_path: str) -> dict[str, Any]:
# Extract bookmarks (chapter markers)
bookmarks = []
try:
# Access document's bookmarks through the XML
for bookmark in doc.element.xpath('//w:bookmarkStart', namespaces={'w': 'http://schemas.openxmlformats.org/wordprocessingml/2006/main'}):
bookmark_name = bookmark.get('{http://schemas.openxmlformats.org/wordprocessingml/2006/main}name')
# Access document's bookmarks through the XML using findall with qn()
# Note: xpath() with namespaces kwarg doesn't work on python-docx elements
for bookmark in doc.element.body.findall('.//' + qn('w:bookmarkStart')):
bookmark_name = bookmark.get(qn('w:name'))
if bookmark_name and not bookmark_name.startswith('_'): # Skip system bookmarks
bookmarks.append(bookmark_name)
except Exception:
@ -752,18 +754,19 @@ async def _find_bookmark_content_range(doc, bookmark_name: str) -> dict[str, Any
bookmark_starts = {}
bookmark_ends = {}
# Look for bookmark markers in the document XML
# Look for bookmark markers in the document XML using findall with qn()
# Note: xpath() with namespaces kwarg doesn't work on python-docx elements
for elem_idx, element in enumerate(doc.element.body):
# Look for bookmark start markers
for bookmark_start in element.xpath('.//w:bookmarkStart', namespaces={'w': 'http://schemas.openxmlformats.org/wordprocessingml/2006/main'}):
name = bookmark_start.get('{http://schemas.openxmlformats.org/wordprocessingml/2006/main}name')
for bookmark_start in element.findall('.//' + qn('w:bookmarkStart')):
name = bookmark_start.get(qn('w:name'))
if name == bookmark_name:
bookmark_id = bookmark_start.get('{http://schemas.openxmlformats.org/wordprocessingml/2006/main}id')
bookmark_id = bookmark_start.get(qn('w:id'))
bookmark_starts[bookmark_id] = elem_idx
# Look for bookmark end markers
for bookmark_end in element.xpath('.//w:bookmarkEnd', namespaces={'w': 'http://schemas.openxmlformats.org/wordprocessingml/2006/main'}):
bookmark_id = bookmark_end.get('{http://schemas.openxmlformats.org/wordprocessingml/2006/main}id')
for bookmark_end in element.findall('.//' + qn('w:bookmarkEnd')):
bookmark_id = bookmark_end.get(qn('w:id'))
if bookmark_id in bookmark_starts:
bookmark_ends[bookmark_id] = elem_idx
break
@ -794,26 +797,31 @@ async def _find_chapter_content_range(doc, chapter_name: str) -> dict[str, Any]:
chapter_start_idx = None
chapter_end_idx = None
# Search through document elements for matching heading
# Search through document elements for matching heading using findall with qn()
# Note: xpath() with namespaces kwarg doesn't work on python-docx elements
for elem_idx, element in enumerate(doc.element.body):
# Check if this element is a paragraph with heading style
try:
para = element
if para.tag.endswith('}p'): # Word paragraph element
# Get the text content
text_content = ''.join(text_elem.text or '' for text_elem in para.xpath('.//w:t', namespaces={'w': 'http://schemas.openxmlformats.org/wordprocessingml/2006/main'}))
# Get the text content using findall instead of xpath
text_content = ''.join(text_elem.text or '' for text_elem in para.findall('.//' + qn('w:t')))
# Check if this matches our chapter name (case insensitive, flexible matching)
if text_content.strip() and chapter_name.lower() in text_content.lower().strip():
# Check if it's actually a heading by looking at paragraph style
style_elem = para.xpath('.//w:pStyle', namespaces={'w': 'http://schemas.openxmlformats.org/wordprocessingml/2006/main'})
if style_elem:
style_val = style_elem[0].get('{http://schemas.openxmlformats.org/wordprocessingml/2006/main}val', '')
if 'heading' in style_val.lower() or 'title' in style_val.lower():
chapter_start_idx = elem_idx
break
# Also consider short text lines as potential headings
elif len(text_content.strip()) < 100:
# pStyle is inside pPr element
pPr = para.find(qn('w:pPr'))
is_heading_style = False
if pPr is not None:
pStyle = pPr.find(qn('w:pStyle'))
if pStyle is not None:
style_val = pStyle.get(qn('w:val'), '')
is_heading_style = 'heading' in style_val.lower() or 'title' in style_val.lower()
# Accept if it's a heading style OR if it's short text (potential heading)
# FIX: Previously used elif which skipped short text check when style existed but wasn't heading
if is_heading_style or len(text_content.strip()) < 100:
chapter_start_idx = elem_idx
break
except Exception:
@ -831,12 +839,14 @@ async def _find_chapter_content_range(doc, chapter_name: str) -> dict[str, Any]:
para = doc.element.body[elem_idx]
if para.tag.endswith('}p'):
# Check if this is a major heading (same level or higher than chapter start)
style_elem = para.xpath('.//w:pStyle', namespaces={'w': 'http://schemas.openxmlformats.org/wordprocessingml/2006/main'})
if style_elem:
style_val = style_elem[0].get('{http://schemas.openxmlformats.org/wordprocessingml/2006/main}val', '')
if 'heading1' in style_val.lower() or 'title' in style_val.lower():
chapter_end_idx = elem_idx - 1
break
pPr = para.find(qn('w:pPr'))
if pPr is not None:
pStyle = pPr.find(qn('w:pStyle'))
if pStyle is not None:
style_val = pStyle.get(qn('w:val'), '')
if 'heading1' in style_val.lower() or 'title' in style_val.lower():
chapter_end_idx = elem_idx - 1
break
except Exception:
continue
@ -869,17 +879,26 @@ async def _get_available_headings(doc) -> list[str]:
try:
if element.tag.endswith('}p'): # Word paragraph element
# Get the text content
text_content = ''.join(text_elem.text or '' for text_elem in element.xpath('.//w:t', namespaces={'w': 'http://schemas.openxmlformats.org/wordprocessingml/2006/main'}))
# Get the text content using findall instead of xpath
# Note: xpath() with namespaces kwarg doesn't work on python-docx elements
text_content = ''.join(text_elem.text or '' for text_elem in element.findall('.//' + qn('w:t')))
if text_content.strip():
# Check if it's a heading by looking at paragraph style
style_elem = element.xpath('.//w:pStyle', namespaces={'w': 'http://schemas.openxmlformats.org/wordprocessingml/2006/main'})
if style_elem:
style_val = style_elem[0].get('{http://schemas.openxmlformats.org/wordprocessingml/2006/main}val', '')
if 'heading' in style_val.lower() or 'title' in style_val.lower():
headings.append(text_content.strip()[:100]) # Limit heading length
# Also consider short text lines as potential headings
# pStyle is inside pPr element
pPr = element.find(qn('w:pPr'))
is_heading_style = False
if pPr is not None:
pStyle = pPr.find(qn('w:pStyle'))
if pStyle is not None:
style_val = pStyle.get(qn('w:val'), '')
is_heading_style = 'heading' in style_val.lower() or 'title' in style_val.lower()
# Add if it's a heading style
if is_heading_style:
headings.append(text_content.strip()[:100]) # Limit heading length
# Otherwise, fall back to treating short text lines as potential headings
# FIX: this elif used to pair with `if style_elem:`, so it was skipped whenever any style existed, heading or not
elif len(text_content.strip()) < 100:
# Only add if it looks like a heading (not just short random text)
if any(word in text_content.lower() for word in ['chapter', 'section', 'part', 'introduction', 'conclusion']):