Fix critical xpath API bug breaking chapter/heading detection
python-docx elements don't support calling xpath() with a namespaces kwarg.
Those calls raised inside try/except blocks and the errors were swallowed,
so chapter search and heading detection never found any matches.
Fixed by replacing xpath(..., namespaces={...}) with:
- findall('.//' + qn('w:t')) for text elements
- find(qn('w:pPr')) + find(qn('w:pStyle')) for style detection
- get(qn('w:val')) for attribute values
Also fixed a logic bug where an elif prevented the short-text fallback from
running when the paragraph had a style that was not a heading style.
This commit is contained in:
parent
af6aadf559
commit
2f39c4ec5b
@ -1,154 +1,18 @@
|
||||
{
|
||||
"metadata": {
|
||||
"start_time": "2026-01-11T00:28:31.202459",
|
||||
"end_time": "2026-01-11T00:28:33.718606",
|
||||
"duration": 1.2442383766174316,
|
||||
"exit_status": 0,
|
||||
"start_time": "2026-01-11T05:19:25.816074",
|
||||
"pytest_version": "9.0.2",
|
||||
"test_types": [
|
||||
"pytest",
|
||||
"torture_test"
|
||||
]
|
||||
"end_time": "2026-01-11T05:19:26.468770",
|
||||
"duration": 0.6526906490325928,
|
||||
"exit_status": 0
|
||||
},
|
||||
"summary": {
|
||||
"total": 6,
|
||||
"passed": 5,
|
||||
"total": 0,
|
||||
"passed": 0,
|
||||
"failed": 0,
|
||||
"skipped": 1,
|
||||
"pass_rate": 83.33333333333334
|
||||
"skipped": 0,
|
||||
"pass_rate": 0
|
||||
},
|
||||
"categories": {
|
||||
"Excel": {
|
||||
"total": 4,
|
||||
"passed": 3,
|
||||
"failed": 0,
|
||||
"skipped": 1
|
||||
},
|
||||
"Word": {
|
||||
"total": 2,
|
||||
"passed": 2,
|
||||
"failed": 0,
|
||||
"skipped": 0
|
||||
}
|
||||
},
|
||||
"tests": [
|
||||
{
|
||||
"name": "Excel Data Analysis",
|
||||
"nodeid": "torture_test.py::test_excel_data_analysis",
|
||||
"category": "Excel",
|
||||
"outcome": "passed",
|
||||
"duration": 0.17873024940490723,
|
||||
"timestamp": "2026-01-11T00:28:33.696485",
|
||||
"module": "torture_test",
|
||||
"class": null,
|
||||
"function": "test_excel_data_analysis",
|
||||
"inputs": {
|
||||
"file": "test_files/test_data.xlsx"
|
||||
},
|
||||
"outputs": {
|
||||
"sheets_analyzed": [
|
||||
"Test Data"
|
||||
]
|
||||
},
|
||||
"error": null,
|
||||
"traceback": null
|
||||
},
|
||||
{
|
||||
"name": "Excel Formula Extraction",
|
||||
"nodeid": "torture_test.py::test_excel_formula_extraction",
|
||||
"category": "Excel",
|
||||
"outcome": "passed",
|
||||
"duration": 0.0032067298889160156,
|
||||
"timestamp": "2026-01-11T00:28:33.699697",
|
||||
"module": "torture_test",
|
||||
"class": null,
|
||||
"function": "test_excel_formula_extraction",
|
||||
"inputs": {
|
||||
"file": "test_files/test_data.xlsx"
|
||||
},
|
||||
"outputs": {
|
||||
"total_formulas": 8
|
||||
},
|
||||
"error": null,
|
||||
"traceback": null
|
||||
},
|
||||
{
|
||||
"name": "Excel Chart Data Generation",
|
||||
"nodeid": "torture_test.py::test_excel_chart_generation",
|
||||
"category": "Excel",
|
||||
"outcome": "passed",
|
||||
"duration": 0.0025446414947509766,
|
||||
"timestamp": "2026-01-11T00:28:33.702246",
|
||||
"module": "torture_test",
|
||||
"class": null,
|
||||
"function": "test_excel_chart_generation",
|
||||
"inputs": {
|
||||
"file": "test_files/test_data.xlsx",
|
||||
"x_column": "Category",
|
||||
"y_columns": [
|
||||
"Value"
|
||||
]
|
||||
},
|
||||
"outputs": {
|
||||
"chart_libraries": 2
|
||||
},
|
||||
"error": null,
|
||||
"traceback": null
|
||||
},
|
||||
{
|
||||
"name": "Word Structure Analysis",
|
||||
"nodeid": "torture_test.py::test_word_structure_analysis",
|
||||
"category": "Word",
|
||||
"outcome": "passed",
|
||||
"duration": 0.010314226150512695,
|
||||
"timestamp": "2026-01-11T00:28:33.712565",
|
||||
"module": "torture_test",
|
||||
"class": null,
|
||||
"function": "test_word_structure_analysis",
|
||||
"inputs": {
|
||||
"file": "test_files/test_document.docx"
|
||||
},
|
||||
"outputs": {
|
||||
"total_headings": 0
|
||||
},
|
||||
"error": null,
|
||||
"traceback": null
|
||||
},
|
||||
{
|
||||
"name": "Word Table Extraction",
|
||||
"nodeid": "torture_test.py::test_word_table_extraction",
|
||||
"category": "Word",
|
||||
"outcome": "passed",
|
||||
"duration": 0.005824089050292969,
|
||||
"timestamp": "2026-01-11T00:28:33.718393",
|
||||
"module": "torture_test",
|
||||
"class": null,
|
||||
"function": "test_word_table_extraction",
|
||||
"inputs": {
|
||||
"file": "test_files/test_document.docx"
|
||||
},
|
||||
"outputs": {
|
||||
"total_tables": 0
|
||||
},
|
||||
"error": null,
|
||||
"traceback": null
|
||||
},
|
||||
{
|
||||
"name": "Real Excel File Analysis (FORScan)",
|
||||
"nodeid": "torture_test.py::test_real_excel_analysis",
|
||||
"category": "Excel",
|
||||
"outcome": "skipped",
|
||||
"duration": 0,
|
||||
"timestamp": "2026-01-11T00:28:33.718405",
|
||||
"module": "torture_test",
|
||||
"class": null,
|
||||
"function": "test_real_excel_analysis",
|
||||
"inputs": {
|
||||
"file": "/home/rpm/FORScan Lite spreadsheets v1.1/FORScan Lite spreadsheet - PIDs.xlsx"
|
||||
},
|
||||
"outputs": null,
|
||||
"error": "File not found: /home/rpm/FORScan Lite spreadsheets v1.1/FORScan Lite spreadsheet - PIDs.xlsx",
|
||||
"traceback": null
|
||||
}
|
||||
]
|
||||
"categories": {},
|
||||
"tests": []
|
||||
}
|
||||
@ -13,6 +13,7 @@ import zipfile
|
||||
from pathlib import Path
|
||||
from typing import Any
|
||||
|
||||
from docx.oxml.ns import qn
|
||||
from PIL import Image
|
||||
|
||||
# Temp directory configuration
|
||||
@ -687,9 +688,10 @@ async def _get_ultra_fast_summary(file_path: str) -> dict[str, Any]:
|
||||
# Extract bookmarks (chapter markers)
|
||||
bookmarks = []
|
||||
try:
|
||||
# Access document's bookmarks through the XML
|
||||
for bookmark in doc.element.xpath('//w:bookmarkStart', namespaces={'w': 'http://schemas.openxmlformats.org/wordprocessingml/2006/main'}):
|
||||
bookmark_name = bookmark.get('{http://schemas.openxmlformats.org/wordprocessingml/2006/main}name')
|
||||
# Access document's bookmarks through the XML using findall with qn()
|
||||
# Note: xpath() with namespaces kwarg doesn't work on python-docx elements
|
||||
for bookmark in doc.element.body.findall('.//' + qn('w:bookmarkStart')):
|
||||
bookmark_name = bookmark.get(qn('w:name'))
|
||||
if bookmark_name and not bookmark_name.startswith('_'): # Skip system bookmarks
|
||||
bookmarks.append(bookmark_name)
|
||||
except Exception:
|
||||
@ -752,18 +754,19 @@ async def _find_bookmark_content_range(doc, bookmark_name: str) -> dict[str, Any
|
||||
bookmark_starts = {}
|
||||
bookmark_ends = {}
|
||||
|
||||
# Look for bookmark markers in the document XML
|
||||
# Look for bookmark markers in the document XML using findall with qn()
|
||||
# Note: xpath() with namespaces kwarg doesn't work on python-docx elements
|
||||
for elem_idx, element in enumerate(doc.element.body):
|
||||
# Look for bookmark start markers
|
||||
for bookmark_start in element.xpath('.//w:bookmarkStart', namespaces={'w': 'http://schemas.openxmlformats.org/wordprocessingml/2006/main'}):
|
||||
name = bookmark_start.get('{http://schemas.openxmlformats.org/wordprocessingml/2006/main}name')
|
||||
for bookmark_start in element.findall('.//' + qn('w:bookmarkStart')):
|
||||
name = bookmark_start.get(qn('w:name'))
|
||||
if name == bookmark_name:
|
||||
bookmark_id = bookmark_start.get('{http://schemas.openxmlformats.org/wordprocessingml/2006/main}id')
|
||||
bookmark_id = bookmark_start.get(qn('w:id'))
|
||||
bookmark_starts[bookmark_id] = elem_idx
|
||||
|
||||
# Look for bookmark end markers
|
||||
for bookmark_end in element.xpath('.//w:bookmarkEnd', namespaces={'w': 'http://schemas.openxmlformats.org/wordprocessingml/2006/main'}):
|
||||
bookmark_id = bookmark_end.get('{http://schemas.openxmlformats.org/wordprocessingml/2006/main}id')
|
||||
for bookmark_end in element.findall('.//' + qn('w:bookmarkEnd')):
|
||||
bookmark_id = bookmark_end.get(qn('w:id'))
|
||||
if bookmark_id in bookmark_starts:
|
||||
bookmark_ends[bookmark_id] = elem_idx
|
||||
break
|
||||
@ -794,26 +797,31 @@ async def _find_chapter_content_range(doc, chapter_name: str) -> dict[str, Any]:
|
||||
chapter_start_idx = None
|
||||
chapter_end_idx = None
|
||||
|
||||
# Search through document elements for matching heading
|
||||
# Search through document elements for matching heading using findall with qn()
|
||||
# Note: xpath() with namespaces kwarg doesn't work on python-docx elements
|
||||
for elem_idx, element in enumerate(doc.element.body):
|
||||
# Check if this element is a paragraph with heading style
|
||||
try:
|
||||
para = element
|
||||
if para.tag.endswith('}p'): # Word paragraph element
|
||||
# Get the text content
|
||||
text_content = ''.join(text_elem.text or '' for text_elem in para.xpath('.//w:t', namespaces={'w': 'http://schemas.openxmlformats.org/wordprocessingml/2006/main'}))
|
||||
# Get the text content using findall instead of xpath
|
||||
text_content = ''.join(text_elem.text or '' for text_elem in para.findall('.//' + qn('w:t')))
|
||||
|
||||
# Check if this matches our chapter name (case insensitive, flexible matching)
|
||||
if text_content.strip() and chapter_name.lower() in text_content.lower().strip():
|
||||
# Check if it's actually a heading by looking at paragraph style
|
||||
style_elem = para.xpath('.//w:pStyle', namespaces={'w': 'http://schemas.openxmlformats.org/wordprocessingml/2006/main'})
|
||||
if style_elem:
|
||||
style_val = style_elem[0].get('{http://schemas.openxmlformats.org/wordprocessingml/2006/main}val', '')
|
||||
if 'heading' in style_val.lower() or 'title' in style_val.lower():
|
||||
chapter_start_idx = elem_idx
|
||||
break
|
||||
# Also consider short text lines as potential headings
|
||||
elif len(text_content.strip()) < 100:
|
||||
# pStyle is inside pPr element
|
||||
pPr = para.find(qn('w:pPr'))
|
||||
is_heading_style = False
|
||||
if pPr is not None:
|
||||
pStyle = pPr.find(qn('w:pStyle'))
|
||||
if pStyle is not None:
|
||||
style_val = pStyle.get(qn('w:val'), '')
|
||||
is_heading_style = 'heading' in style_val.lower() or 'title' in style_val.lower()
|
||||
|
||||
# Accept if it's a heading style OR if it's short text (potential heading)
|
||||
# FIX: Previously used elif which skipped short text check when style existed but wasn't heading
|
||||
if is_heading_style or len(text_content.strip()) < 100:
|
||||
chapter_start_idx = elem_idx
|
||||
break
|
||||
except Exception:
|
||||
@ -831,9 +839,11 @@ async def _find_chapter_content_range(doc, chapter_name: str) -> dict[str, Any]:
|
||||
para = doc.element.body[elem_idx]
|
||||
if para.tag.endswith('}p'):
|
||||
# Check if this is a major heading (same level or higher than chapter start)
|
||||
style_elem = para.xpath('.//w:pStyle', namespaces={'w': 'http://schemas.openxmlformats.org/wordprocessingml/2006/main'})
|
||||
if style_elem:
|
||||
style_val = style_elem[0].get('{http://schemas.openxmlformats.org/wordprocessingml/2006/main}val', '')
|
||||
pPr = para.find(qn('w:pPr'))
|
||||
if pPr is not None:
|
||||
pStyle = pPr.find(qn('w:pStyle'))
|
||||
if pStyle is not None:
|
||||
style_val = pStyle.get(qn('w:val'), '')
|
||||
if 'heading1' in style_val.lower() or 'title' in style_val.lower():
|
||||
chapter_end_idx = elem_idx - 1
|
||||
break
|
||||
@ -869,17 +879,26 @@ async def _get_available_headings(doc) -> list[str]:
|
||||
|
||||
try:
|
||||
if element.tag.endswith('}p'): # Word paragraph element
|
||||
# Get the text content
|
||||
text_content = ''.join(text_elem.text or '' for text_elem in element.xpath('.//w:t', namespaces={'w': 'http://schemas.openxmlformats.org/wordprocessingml/2006/main'}))
|
||||
# Get the text content using findall instead of xpath
|
||||
# Note: xpath() with namespaces kwarg doesn't work on python-docx elements
|
||||
text_content = ''.join(text_elem.text or '' for text_elem in element.findall('.//' + qn('w:t')))
|
||||
|
||||
if text_content.strip():
|
||||
# Check if it's a heading by looking at paragraph style
|
||||
style_elem = element.xpath('.//w:pStyle', namespaces={'w': 'http://schemas.openxmlformats.org/wordprocessingml/2006/main'})
|
||||
if style_elem:
|
||||
style_val = style_elem[0].get('{http://schemas.openxmlformats.org/wordprocessingml/2006/main}val', '')
|
||||
if 'heading' in style_val.lower() or 'title' in style_val.lower():
|
||||
# pStyle is inside pPr element
|
||||
pPr = element.find(qn('w:pPr'))
|
||||
is_heading_style = False
|
||||
if pPr is not None:
|
||||
pStyle = pPr.find(qn('w:pStyle'))
|
||||
if pStyle is not None:
|
||||
style_val = pStyle.get(qn('w:val'), '')
|
||||
is_heading_style = 'heading' in style_val.lower() or 'title' in style_val.lower()
|
||||
|
||||
# Add if it's a heading style
|
||||
if is_heading_style:
|
||||
headings.append(text_content.strip()[:100]) # Limit heading length
|
||||
# Also consider short text lines as potential headings
|
||||
# Also consider short text lines as potential headings (independent of style check)
|
||||
# FIX: Previously used elif which skipped this when style existed but wasn't heading
|
||||
elif len(text_content.strip()) < 100:
|
||||
# Only add if it looks like a heading (not just short random text)
|
||||
if any(word in text_content.lower() for word in ['chapter', 'section', 'part', 'introduction', 'conclusion']):
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user