Fix critical xpath API bug breaking chapter/heading detection

python-docx elements don't support xpath() with a namespaces kwarg.
The calls raised inside try/except blocks that swallowed the error, so
chapter search and heading detection never found any matches.

Fixed by replacing xpath(..., namespaces={...}) with:
- findall('.//' + qn('w:t')) for text elements (same pattern for
  w:bookmarkStart / w:bookmarkEnd)
- find(qn('w:pPr')) + find(qn('w:pStyle')) for style detection
- get(qn('w:val')) for attribute values

Also fixed a logic bug where an elif prevented the short-text fallback
from running when the paragraph had a style that was not a heading style.
This commit is contained in:
Ryan Malloy 2026-01-11 05:20:05 -07:00
parent af6aadf559
commit 2f39c4ec5b
2 changed files with 63 additions and 180 deletions

View File

@ -1,154 +1,18 @@
{ {
"metadata": { "metadata": {
"start_time": "2026-01-11T00:28:31.202459", "start_time": "2026-01-11T05:19:25.816074",
"end_time": "2026-01-11T00:28:33.718606",
"duration": 1.2442383766174316,
"exit_status": 0,
"pytest_version": "9.0.2", "pytest_version": "9.0.2",
"test_types": [ "end_time": "2026-01-11T05:19:26.468770",
"pytest", "duration": 0.6526906490325928,
"torture_test" "exit_status": 0
]
}, },
"summary": { "summary": {
"total": 6, "total": 0,
"passed": 5, "passed": 0,
"failed": 0, "failed": 0,
"skipped": 1, "skipped": 0,
"pass_rate": 83.33333333333334 "pass_rate": 0
}, },
"categories": { "categories": {},
"Excel": { "tests": []
"total": 4,
"passed": 3,
"failed": 0,
"skipped": 1
},
"Word": {
"total": 2,
"passed": 2,
"failed": 0,
"skipped": 0
}
},
"tests": [
{
"name": "Excel Data Analysis",
"nodeid": "torture_test.py::test_excel_data_analysis",
"category": "Excel",
"outcome": "passed",
"duration": 0.17873024940490723,
"timestamp": "2026-01-11T00:28:33.696485",
"module": "torture_test",
"class": null,
"function": "test_excel_data_analysis",
"inputs": {
"file": "test_files/test_data.xlsx"
},
"outputs": {
"sheets_analyzed": [
"Test Data"
]
},
"error": null,
"traceback": null
},
{
"name": "Excel Formula Extraction",
"nodeid": "torture_test.py::test_excel_formula_extraction",
"category": "Excel",
"outcome": "passed",
"duration": 0.0032067298889160156,
"timestamp": "2026-01-11T00:28:33.699697",
"module": "torture_test",
"class": null,
"function": "test_excel_formula_extraction",
"inputs": {
"file": "test_files/test_data.xlsx"
},
"outputs": {
"total_formulas": 8
},
"error": null,
"traceback": null
},
{
"name": "Excel Chart Data Generation",
"nodeid": "torture_test.py::test_excel_chart_generation",
"category": "Excel",
"outcome": "passed",
"duration": 0.0025446414947509766,
"timestamp": "2026-01-11T00:28:33.702246",
"module": "torture_test",
"class": null,
"function": "test_excel_chart_generation",
"inputs": {
"file": "test_files/test_data.xlsx",
"x_column": "Category",
"y_columns": [
"Value"
]
},
"outputs": {
"chart_libraries": 2
},
"error": null,
"traceback": null
},
{
"name": "Word Structure Analysis",
"nodeid": "torture_test.py::test_word_structure_analysis",
"category": "Word",
"outcome": "passed",
"duration": 0.010314226150512695,
"timestamp": "2026-01-11T00:28:33.712565",
"module": "torture_test",
"class": null,
"function": "test_word_structure_analysis",
"inputs": {
"file": "test_files/test_document.docx"
},
"outputs": {
"total_headings": 0
},
"error": null,
"traceback": null
},
{
"name": "Word Table Extraction",
"nodeid": "torture_test.py::test_word_table_extraction",
"category": "Word",
"outcome": "passed",
"duration": 0.005824089050292969,
"timestamp": "2026-01-11T00:28:33.718393",
"module": "torture_test",
"class": null,
"function": "test_word_table_extraction",
"inputs": {
"file": "test_files/test_document.docx"
},
"outputs": {
"total_tables": 0
},
"error": null,
"traceback": null
},
{
"name": "Real Excel File Analysis (FORScan)",
"nodeid": "torture_test.py::test_real_excel_analysis",
"category": "Excel",
"outcome": "skipped",
"duration": 0,
"timestamp": "2026-01-11T00:28:33.718405",
"module": "torture_test",
"class": null,
"function": "test_real_excel_analysis",
"inputs": {
"file": "/home/rpm/FORScan Lite spreadsheets v1.1/FORScan Lite spreadsheet - PIDs.xlsx"
},
"outputs": null,
"error": "File not found: /home/rpm/FORScan Lite spreadsheets v1.1/FORScan Lite spreadsheet - PIDs.xlsx",
"traceback": null
}
]
} }

View File

@ -13,6 +13,7 @@ import zipfile
from pathlib import Path from pathlib import Path
from typing import Any from typing import Any
from docx.oxml.ns import qn
from PIL import Image from PIL import Image
# Temp directory configuration # Temp directory configuration
@ -687,9 +688,10 @@ async def _get_ultra_fast_summary(file_path: str) -> dict[str, Any]:
# Extract bookmarks (chapter markers) # Extract bookmarks (chapter markers)
bookmarks = [] bookmarks = []
try: try:
# Access document's bookmarks through the XML # Access document's bookmarks through the XML using findall with qn()
for bookmark in doc.element.xpath('//w:bookmarkStart', namespaces={'w': 'http://schemas.openxmlformats.org/wordprocessingml/2006/main'}): # Note: xpath() with namespaces kwarg doesn't work on python-docx elements
bookmark_name = bookmark.get('{http://schemas.openxmlformats.org/wordprocessingml/2006/main}name') for bookmark in doc.element.body.findall('.//' + qn('w:bookmarkStart')):
bookmark_name = bookmark.get(qn('w:name'))
if bookmark_name and not bookmark_name.startswith('_'): # Skip system bookmarks if bookmark_name and not bookmark_name.startswith('_'): # Skip system bookmarks
bookmarks.append(bookmark_name) bookmarks.append(bookmark_name)
except Exception: except Exception:
@ -752,18 +754,19 @@ async def _find_bookmark_content_range(doc, bookmark_name: str) -> dict[str, Any
bookmark_starts = {} bookmark_starts = {}
bookmark_ends = {} bookmark_ends = {}
# Look for bookmark markers in the document XML # Look for bookmark markers in the document XML using findall with qn()
# Note: xpath() with namespaces kwarg doesn't work on python-docx elements
for elem_idx, element in enumerate(doc.element.body): for elem_idx, element in enumerate(doc.element.body):
# Look for bookmark start markers # Look for bookmark start markers
for bookmark_start in element.xpath('.//w:bookmarkStart', namespaces={'w': 'http://schemas.openxmlformats.org/wordprocessingml/2006/main'}): for bookmark_start in element.findall('.//' + qn('w:bookmarkStart')):
name = bookmark_start.get('{http://schemas.openxmlformats.org/wordprocessingml/2006/main}name') name = bookmark_start.get(qn('w:name'))
if name == bookmark_name: if name == bookmark_name:
bookmark_id = bookmark_start.get('{http://schemas.openxmlformats.org/wordprocessingml/2006/main}id') bookmark_id = bookmark_start.get(qn('w:id'))
bookmark_starts[bookmark_id] = elem_idx bookmark_starts[bookmark_id] = elem_idx
# Look for bookmark end markers # Look for bookmark end markers
for bookmark_end in element.xpath('.//w:bookmarkEnd', namespaces={'w': 'http://schemas.openxmlformats.org/wordprocessingml/2006/main'}): for bookmark_end in element.findall('.//' + qn('w:bookmarkEnd')):
bookmark_id = bookmark_end.get('{http://schemas.openxmlformats.org/wordprocessingml/2006/main}id') bookmark_id = bookmark_end.get(qn('w:id'))
if bookmark_id in bookmark_starts: if bookmark_id in bookmark_starts:
bookmark_ends[bookmark_id] = elem_idx bookmark_ends[bookmark_id] = elem_idx
break break
@ -794,26 +797,31 @@ async def _find_chapter_content_range(doc, chapter_name: str) -> dict[str, Any]:
chapter_start_idx = None chapter_start_idx = None
chapter_end_idx = None chapter_end_idx = None
# Search through document elements for matching heading # Search through document elements for matching heading using findall with qn()
# Note: xpath() with namespaces kwarg doesn't work on python-docx elements
for elem_idx, element in enumerate(doc.element.body): for elem_idx, element in enumerate(doc.element.body):
# Check if this element is a paragraph with heading style # Check if this element is a paragraph with heading style
try: try:
para = element para = element
if para.tag.endswith('}p'): # Word paragraph element if para.tag.endswith('}p'): # Word paragraph element
# Get the text content # Get the text content using findall instead of xpath
text_content = ''.join(text_elem.text or '' for text_elem in para.xpath('.//w:t', namespaces={'w': 'http://schemas.openxmlformats.org/wordprocessingml/2006/main'})) text_content = ''.join(text_elem.text or '' for text_elem in para.findall('.//' + qn('w:t')))
# Check if this matches our chapter name (case insensitive, flexible matching) # Check if this matches our chapter name (case insensitive, flexible matching)
if text_content.strip() and chapter_name.lower() in text_content.lower().strip(): if text_content.strip() and chapter_name.lower() in text_content.lower().strip():
# Check if it's actually a heading by looking at paragraph style # Check if it's actually a heading by looking at paragraph style
style_elem = para.xpath('.//w:pStyle', namespaces={'w': 'http://schemas.openxmlformats.org/wordprocessingml/2006/main'}) # pStyle is inside pPr element
if style_elem: pPr = para.find(qn('w:pPr'))
style_val = style_elem[0].get('{http://schemas.openxmlformats.org/wordprocessingml/2006/main}val', '') is_heading_style = False
if 'heading' in style_val.lower() or 'title' in style_val.lower(): if pPr is not None:
chapter_start_idx = elem_idx pStyle = pPr.find(qn('w:pStyle'))
break if pStyle is not None:
# Also consider short text lines as potential headings style_val = pStyle.get(qn('w:val'), '')
elif len(text_content.strip()) < 100: is_heading_style = 'heading' in style_val.lower() or 'title' in style_val.lower()
# Accept if it's a heading style OR if it's short text (potential heading)
# FIX: Previously used elif which skipped short text check when style existed but wasn't heading
if is_heading_style or len(text_content.strip()) < 100:
chapter_start_idx = elem_idx chapter_start_idx = elem_idx
break break
except Exception: except Exception:
@ -831,9 +839,11 @@ async def _find_chapter_content_range(doc, chapter_name: str) -> dict[str, Any]:
para = doc.element.body[elem_idx] para = doc.element.body[elem_idx]
if para.tag.endswith('}p'): if para.tag.endswith('}p'):
# Check if this is a major heading (same level or higher than chapter start) # Check if this is a major heading (same level or higher than chapter start)
style_elem = para.xpath('.//w:pStyle', namespaces={'w': 'http://schemas.openxmlformats.org/wordprocessingml/2006/main'}) pPr = para.find(qn('w:pPr'))
if style_elem: if pPr is not None:
style_val = style_elem[0].get('{http://schemas.openxmlformats.org/wordprocessingml/2006/main}val', '') pStyle = pPr.find(qn('w:pStyle'))
if pStyle is not None:
style_val = pStyle.get(qn('w:val'), '')
if 'heading1' in style_val.lower() or 'title' in style_val.lower(): if 'heading1' in style_val.lower() or 'title' in style_val.lower():
chapter_end_idx = elem_idx - 1 chapter_end_idx = elem_idx - 1
break break
@ -869,17 +879,26 @@ async def _get_available_headings(doc) -> list[str]:
try: try:
if element.tag.endswith('}p'): # Word paragraph element if element.tag.endswith('}p'): # Word paragraph element
# Get the text content # Get the text content using findall instead of xpath
text_content = ''.join(text_elem.text or '' for text_elem in element.xpath('.//w:t', namespaces={'w': 'http://schemas.openxmlformats.org/wordprocessingml/2006/main'})) # Note: xpath() with namespaces kwarg doesn't work on python-docx elements
text_content = ''.join(text_elem.text or '' for text_elem in element.findall('.//' + qn('w:t')))
if text_content.strip(): if text_content.strip():
# Check if it's a heading by looking at paragraph style # Check if it's a heading by looking at paragraph style
style_elem = element.xpath('.//w:pStyle', namespaces={'w': 'http://schemas.openxmlformats.org/wordprocessingml/2006/main'}) # pStyle is inside pPr element
if style_elem: pPr = element.find(qn('w:pPr'))
style_val = style_elem[0].get('{http://schemas.openxmlformats.org/wordprocessingml/2006/main}val', '') is_heading_style = False
if 'heading' in style_val.lower() or 'title' in style_val.lower(): if pPr is not None:
pStyle = pPr.find(qn('w:pStyle'))
if pStyle is not None:
style_val = pStyle.get(qn('w:val'), '')
is_heading_style = 'heading' in style_val.lower() or 'title' in style_val.lower()
# Add if it's a heading style
if is_heading_style:
headings.append(text_content.strip()[:100]) # Limit heading length headings.append(text_content.strip()[:100]) # Limit heading length
# Also consider short text lines as potential headings # Also consider short text lines as potential headings (independent of style check)
# FIX: Previously used elif which skipped this when style existed but wasn't heading
elif len(text_content.strip()) < 100: elif len(text_content.strip()) < 100:
# Only add if it looks like a heading (not just short random text) # Only add if it looks like a heading (not just short random text)
if any(word in text_content.lower() for word in ['chapter', 'section', 'part', 'introduction', 'conclusion']): if any(word in text_content.lower() for word in ['chapter', 'section', 'part', 'introduction', 'conclusion']):