Fix critical xpath API bug breaking chapter/heading detection
python-docx elements don't support xpath() with namespaces kwarg.
The calls silently failed in try/except blocks, causing chapter search
and heading detection to never find matches.
Fixed by replacing xpath(..., namespaces={...}) with:
- findall('.//' + qn(...)) for descendant lookups (w:t text elements, w:bookmarkStart / w:bookmarkEnd markers)
- find(qn('w:pPr')) + find(qn('w:pStyle')) for style detection
- get(qn('w:val')) for attribute values
Also fixed logic bug where elif prevented short-text fallback from
running when a non-heading style existed on the paragraph.
This commit is contained in:
parent
af6aadf559
commit
2f39c4ec5b
@ -1,154 +1,18 @@
|
|||||||
{
|
{
|
||||||
"metadata": {
|
"metadata": {
|
||||||
"start_time": "2026-01-11T00:28:31.202459",
|
"start_time": "2026-01-11T05:19:25.816074",
|
||||||
"end_time": "2026-01-11T00:28:33.718606",
|
|
||||||
"duration": 1.2442383766174316,
|
|
||||||
"exit_status": 0,
|
|
||||||
"pytest_version": "9.0.2",
|
"pytest_version": "9.0.2",
|
||||||
"test_types": [
|
"end_time": "2026-01-11T05:19:26.468770",
|
||||||
"pytest",
|
"duration": 0.6526906490325928,
|
||||||
"torture_test"
|
"exit_status": 0
|
||||||
]
|
|
||||||
},
|
},
|
||||||
"summary": {
|
"summary": {
|
||||||
"total": 6,
|
"total": 0,
|
||||||
"passed": 5,
|
"passed": 0,
|
||||||
"failed": 0,
|
"failed": 0,
|
||||||
"skipped": 1,
|
"skipped": 0,
|
||||||
"pass_rate": 83.33333333333334
|
"pass_rate": 0
|
||||||
},
|
},
|
||||||
"categories": {
|
"categories": {},
|
||||||
"Excel": {
|
"tests": []
|
||||||
"total": 4,
|
|
||||||
"passed": 3,
|
|
||||||
"failed": 0,
|
|
||||||
"skipped": 1
|
|
||||||
},
|
|
||||||
"Word": {
|
|
||||||
"total": 2,
|
|
||||||
"passed": 2,
|
|
||||||
"failed": 0,
|
|
||||||
"skipped": 0
|
|
||||||
}
|
|
||||||
},
|
|
||||||
"tests": [
|
|
||||||
{
|
|
||||||
"name": "Excel Data Analysis",
|
|
||||||
"nodeid": "torture_test.py::test_excel_data_analysis",
|
|
||||||
"category": "Excel",
|
|
||||||
"outcome": "passed",
|
|
||||||
"duration": 0.17873024940490723,
|
|
||||||
"timestamp": "2026-01-11T00:28:33.696485",
|
|
||||||
"module": "torture_test",
|
|
||||||
"class": null,
|
|
||||||
"function": "test_excel_data_analysis",
|
|
||||||
"inputs": {
|
|
||||||
"file": "test_files/test_data.xlsx"
|
|
||||||
},
|
|
||||||
"outputs": {
|
|
||||||
"sheets_analyzed": [
|
|
||||||
"Test Data"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
"error": null,
|
|
||||||
"traceback": null
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"name": "Excel Formula Extraction",
|
|
||||||
"nodeid": "torture_test.py::test_excel_formula_extraction",
|
|
||||||
"category": "Excel",
|
|
||||||
"outcome": "passed",
|
|
||||||
"duration": 0.0032067298889160156,
|
|
||||||
"timestamp": "2026-01-11T00:28:33.699697",
|
|
||||||
"module": "torture_test",
|
|
||||||
"class": null,
|
|
||||||
"function": "test_excel_formula_extraction",
|
|
||||||
"inputs": {
|
|
||||||
"file": "test_files/test_data.xlsx"
|
|
||||||
},
|
|
||||||
"outputs": {
|
|
||||||
"total_formulas": 8
|
|
||||||
},
|
|
||||||
"error": null,
|
|
||||||
"traceback": null
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"name": "Excel Chart Data Generation",
|
|
||||||
"nodeid": "torture_test.py::test_excel_chart_generation",
|
|
||||||
"category": "Excel",
|
|
||||||
"outcome": "passed",
|
|
||||||
"duration": 0.0025446414947509766,
|
|
||||||
"timestamp": "2026-01-11T00:28:33.702246",
|
|
||||||
"module": "torture_test",
|
|
||||||
"class": null,
|
|
||||||
"function": "test_excel_chart_generation",
|
|
||||||
"inputs": {
|
|
||||||
"file": "test_files/test_data.xlsx",
|
|
||||||
"x_column": "Category",
|
|
||||||
"y_columns": [
|
|
||||||
"Value"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
"outputs": {
|
|
||||||
"chart_libraries": 2
|
|
||||||
},
|
|
||||||
"error": null,
|
|
||||||
"traceback": null
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"name": "Word Structure Analysis",
|
|
||||||
"nodeid": "torture_test.py::test_word_structure_analysis",
|
|
||||||
"category": "Word",
|
|
||||||
"outcome": "passed",
|
|
||||||
"duration": 0.010314226150512695,
|
|
||||||
"timestamp": "2026-01-11T00:28:33.712565",
|
|
||||||
"module": "torture_test",
|
|
||||||
"class": null,
|
|
||||||
"function": "test_word_structure_analysis",
|
|
||||||
"inputs": {
|
|
||||||
"file": "test_files/test_document.docx"
|
|
||||||
},
|
|
||||||
"outputs": {
|
|
||||||
"total_headings": 0
|
|
||||||
},
|
|
||||||
"error": null,
|
|
||||||
"traceback": null
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"name": "Word Table Extraction",
|
|
||||||
"nodeid": "torture_test.py::test_word_table_extraction",
|
|
||||||
"category": "Word",
|
|
||||||
"outcome": "passed",
|
|
||||||
"duration": 0.005824089050292969,
|
|
||||||
"timestamp": "2026-01-11T00:28:33.718393",
|
|
||||||
"module": "torture_test",
|
|
||||||
"class": null,
|
|
||||||
"function": "test_word_table_extraction",
|
|
||||||
"inputs": {
|
|
||||||
"file": "test_files/test_document.docx"
|
|
||||||
},
|
|
||||||
"outputs": {
|
|
||||||
"total_tables": 0
|
|
||||||
},
|
|
||||||
"error": null,
|
|
||||||
"traceback": null
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"name": "Real Excel File Analysis (FORScan)",
|
|
||||||
"nodeid": "torture_test.py::test_real_excel_analysis",
|
|
||||||
"category": "Excel",
|
|
||||||
"outcome": "skipped",
|
|
||||||
"duration": 0,
|
|
||||||
"timestamp": "2026-01-11T00:28:33.718405",
|
|
||||||
"module": "torture_test",
|
|
||||||
"class": null,
|
|
||||||
"function": "test_real_excel_analysis",
|
|
||||||
"inputs": {
|
|
||||||
"file": "/home/rpm/FORScan Lite spreadsheets v1.1/FORScan Lite spreadsheet - PIDs.xlsx"
|
|
||||||
},
|
|
||||||
"outputs": null,
|
|
||||||
"error": "File not found: /home/rpm/FORScan Lite spreadsheets v1.1/FORScan Lite spreadsheet - PIDs.xlsx",
|
|
||||||
"traceback": null
|
|
||||||
}
|
|
||||||
]
|
|
||||||
}
|
}
|
||||||
@ -13,6 +13,7 @@ import zipfile
|
|||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import Any
|
from typing import Any
|
||||||
|
|
||||||
|
from docx.oxml.ns import qn
|
||||||
from PIL import Image
|
from PIL import Image
|
||||||
|
|
||||||
# Temp directory configuration
|
# Temp directory configuration
|
||||||
@ -687,9 +688,10 @@ async def _get_ultra_fast_summary(file_path: str) -> dict[str, Any]:
|
|||||||
# Extract bookmarks (chapter markers)
|
# Extract bookmarks (chapter markers)
|
||||||
bookmarks = []
|
bookmarks = []
|
||||||
try:
|
try:
|
||||||
# Access document's bookmarks through the XML
|
# Access document's bookmarks through the XML using findall with qn()
|
||||||
for bookmark in doc.element.xpath('//w:bookmarkStart', namespaces={'w': 'http://schemas.openxmlformats.org/wordprocessingml/2006/main'}):
|
# Note: xpath() with namespaces kwarg doesn't work on python-docx elements
|
||||||
bookmark_name = bookmark.get('{http://schemas.openxmlformats.org/wordprocessingml/2006/main}name')
|
for bookmark in doc.element.body.findall('.//' + qn('w:bookmarkStart')):
|
||||||
|
bookmark_name = bookmark.get(qn('w:name'))
|
||||||
if bookmark_name and not bookmark_name.startswith('_'): # Skip system bookmarks
|
if bookmark_name and not bookmark_name.startswith('_'): # Skip system bookmarks
|
||||||
bookmarks.append(bookmark_name)
|
bookmarks.append(bookmark_name)
|
||||||
except Exception:
|
except Exception:
|
||||||
@ -752,18 +754,19 @@ async def _find_bookmark_content_range(doc, bookmark_name: str) -> dict[str, Any
|
|||||||
bookmark_starts = {}
|
bookmark_starts = {}
|
||||||
bookmark_ends = {}
|
bookmark_ends = {}
|
||||||
|
|
||||||
# Look for bookmark markers in the document XML
|
# Look for bookmark markers in the document XML using findall with qn()
|
||||||
|
# Note: xpath() with namespaces kwarg doesn't work on python-docx elements
|
||||||
for elem_idx, element in enumerate(doc.element.body):
|
for elem_idx, element in enumerate(doc.element.body):
|
||||||
# Look for bookmark start markers
|
# Look for bookmark start markers
|
||||||
for bookmark_start in element.xpath('.//w:bookmarkStart', namespaces={'w': 'http://schemas.openxmlformats.org/wordprocessingml/2006/main'}):
|
for bookmark_start in element.findall('.//' + qn('w:bookmarkStart')):
|
||||||
name = bookmark_start.get('{http://schemas.openxmlformats.org/wordprocessingml/2006/main}name')
|
name = bookmark_start.get(qn('w:name'))
|
||||||
if name == bookmark_name:
|
if name == bookmark_name:
|
||||||
bookmark_id = bookmark_start.get('{http://schemas.openxmlformats.org/wordprocessingml/2006/main}id')
|
bookmark_id = bookmark_start.get(qn('w:id'))
|
||||||
bookmark_starts[bookmark_id] = elem_idx
|
bookmark_starts[bookmark_id] = elem_idx
|
||||||
|
|
||||||
# Look for bookmark end markers
|
# Look for bookmark end markers
|
||||||
for bookmark_end in element.xpath('.//w:bookmarkEnd', namespaces={'w': 'http://schemas.openxmlformats.org/wordprocessingml/2006/main'}):
|
for bookmark_end in element.findall('.//' + qn('w:bookmarkEnd')):
|
||||||
bookmark_id = bookmark_end.get('{http://schemas.openxmlformats.org/wordprocessingml/2006/main}id')
|
bookmark_id = bookmark_end.get(qn('w:id'))
|
||||||
if bookmark_id in bookmark_starts:
|
if bookmark_id in bookmark_starts:
|
||||||
bookmark_ends[bookmark_id] = elem_idx
|
bookmark_ends[bookmark_id] = elem_idx
|
||||||
break
|
break
|
||||||
@ -794,26 +797,31 @@ async def _find_chapter_content_range(doc, chapter_name: str) -> dict[str, Any]:
|
|||||||
chapter_start_idx = None
|
chapter_start_idx = None
|
||||||
chapter_end_idx = None
|
chapter_end_idx = None
|
||||||
|
|
||||||
# Search through document elements for matching heading
|
# Search through document elements for matching heading using findall with qn()
|
||||||
|
# Note: xpath() with namespaces kwarg doesn't work on python-docx elements
|
||||||
for elem_idx, element in enumerate(doc.element.body):
|
for elem_idx, element in enumerate(doc.element.body):
|
||||||
# Check if this element is a paragraph with heading style
|
# Check if this element is a paragraph with heading style
|
||||||
try:
|
try:
|
||||||
para = element
|
para = element
|
||||||
if para.tag.endswith('}p'): # Word paragraph element
|
if para.tag.endswith('}p'): # Word paragraph element
|
||||||
# Get the text content
|
# Get the text content using findall instead of xpath
|
||||||
text_content = ''.join(text_elem.text or '' for text_elem in para.xpath('.//w:t', namespaces={'w': 'http://schemas.openxmlformats.org/wordprocessingml/2006/main'}))
|
text_content = ''.join(text_elem.text or '' for text_elem in para.findall('.//' + qn('w:t')))
|
||||||
|
|
||||||
# Check if this matches our chapter name (case insensitive, flexible matching)
|
# Check if this matches our chapter name (case insensitive, flexible matching)
|
||||||
if text_content.strip() and chapter_name.lower() in text_content.lower().strip():
|
if text_content.strip() and chapter_name.lower() in text_content.lower().strip():
|
||||||
# Check if it's actually a heading by looking at paragraph style
|
# Check if it's actually a heading by looking at paragraph style
|
||||||
style_elem = para.xpath('.//w:pStyle', namespaces={'w': 'http://schemas.openxmlformats.org/wordprocessingml/2006/main'})
|
# pStyle is inside pPr element
|
||||||
if style_elem:
|
pPr = para.find(qn('w:pPr'))
|
||||||
style_val = style_elem[0].get('{http://schemas.openxmlformats.org/wordprocessingml/2006/main}val', '')
|
is_heading_style = False
|
||||||
if 'heading' in style_val.lower() or 'title' in style_val.lower():
|
if pPr is not None:
|
||||||
chapter_start_idx = elem_idx
|
pStyle = pPr.find(qn('w:pStyle'))
|
||||||
break
|
if pStyle is not None:
|
||||||
# Also consider short text lines as potential headings
|
style_val = pStyle.get(qn('w:val'), '')
|
||||||
elif len(text_content.strip()) < 100:
|
is_heading_style = 'heading' in style_val.lower() or 'title' in style_val.lower()
|
||||||
|
|
||||||
|
# Accept if it's a heading style OR if it's short text (potential heading)
|
||||||
|
# FIX: Previously used elif which skipped short text check when style existed but wasn't heading
|
||||||
|
if is_heading_style or len(text_content.strip()) < 100:
|
||||||
chapter_start_idx = elem_idx
|
chapter_start_idx = elem_idx
|
||||||
break
|
break
|
||||||
except Exception:
|
except Exception:
|
||||||
@ -831,12 +839,14 @@ async def _find_chapter_content_range(doc, chapter_name: str) -> dict[str, Any]:
|
|||||||
para = doc.element.body[elem_idx]
|
para = doc.element.body[elem_idx]
|
||||||
if para.tag.endswith('}p'):
|
if para.tag.endswith('}p'):
|
||||||
# Check if this is a major heading (same level or higher than chapter start)
|
# Check if this is a major heading (same level or higher than chapter start)
|
||||||
style_elem = para.xpath('.//w:pStyle', namespaces={'w': 'http://schemas.openxmlformats.org/wordprocessingml/2006/main'})
|
pPr = para.find(qn('w:pPr'))
|
||||||
if style_elem:
|
if pPr is not None:
|
||||||
style_val = style_elem[0].get('{http://schemas.openxmlformats.org/wordprocessingml/2006/main}val', '')
|
pStyle = pPr.find(qn('w:pStyle'))
|
||||||
if 'heading1' in style_val.lower() or 'title' in style_val.lower():
|
if pStyle is not None:
|
||||||
chapter_end_idx = elem_idx - 1
|
style_val = pStyle.get(qn('w:val'), '')
|
||||||
break
|
if 'heading1' in style_val.lower() or 'title' in style_val.lower():
|
||||||
|
chapter_end_idx = elem_idx - 1
|
||||||
|
break
|
||||||
except Exception:
|
except Exception:
|
||||||
continue
|
continue
|
||||||
|
|
||||||
@ -869,17 +879,26 @@ async def _get_available_headings(doc) -> list[str]:
|
|||||||
|
|
||||||
try:
|
try:
|
||||||
if element.tag.endswith('}p'): # Word paragraph element
|
if element.tag.endswith('}p'): # Word paragraph element
|
||||||
# Get the text content
|
# Get the text content using findall instead of xpath
|
||||||
text_content = ''.join(text_elem.text or '' for text_elem in element.xpath('.//w:t', namespaces={'w': 'http://schemas.openxmlformats.org/wordprocessingml/2006/main'}))
|
# Note: xpath() with namespaces kwarg doesn't work on python-docx elements
|
||||||
|
text_content = ''.join(text_elem.text or '' for text_elem in element.findall('.//' + qn('w:t')))
|
||||||
|
|
||||||
if text_content.strip():
|
if text_content.strip():
|
||||||
# Check if it's a heading by looking at paragraph style
|
# Check if it's a heading by looking at paragraph style
|
||||||
style_elem = element.xpath('.//w:pStyle', namespaces={'w': 'http://schemas.openxmlformats.org/wordprocessingml/2006/main'})
|
# pStyle is inside pPr element
|
||||||
if style_elem:
|
pPr = element.find(qn('w:pPr'))
|
||||||
style_val = style_elem[0].get('{http://schemas.openxmlformats.org/wordprocessingml/2006/main}val', '')
|
is_heading_style = False
|
||||||
if 'heading' in style_val.lower() or 'title' in style_val.lower():
|
if pPr is not None:
|
||||||
headings.append(text_content.strip()[:100]) # Limit heading length
|
pStyle = pPr.find(qn('w:pStyle'))
|
||||||
# Also consider short text lines as potential headings
|
if pStyle is not None:
|
||||||
|
style_val = pStyle.get(qn('w:val'), '')
|
||||||
|
is_heading_style = 'heading' in style_val.lower() or 'title' in style_val.lower()
|
||||||
|
|
||||||
|
# Add if it's a heading style
|
||||||
|
if is_heading_style:
|
||||||
|
headings.append(text_content.strip()[:100]) # Limit heading length
|
||||||
|
# Otherwise, consider short text lines as potential headings (fallback for any paragraph that is not heading-styled, whether or not it has a style)
|
||||||
|
# FIX: this elif was previously paired with the "style exists" check, so it was skipped when a style existed but wasn't a heading; it is now paired with is_heading_style
|
||||||
elif len(text_content.strip()) < 100:
|
elif len(text_content.strip()) < 100:
|
||||||
# Only add if it looks like a heading (not just short random text)
|
# Only add if it looks like a heading (not just short random text)
|
||||||
if any(word in text_content.lower() for word in ['chapter', 'section', 'part', 'introduction', 'conclusion']):
|
if any(word in text_content.lower() for word in ['chapter', 'section', 'part', 'introduction', 'conclusion']):
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user