diff --git a/reports/test_results.json b/reports/test_results.json index 99faf0d..009899e 100644 --- a/reports/test_results.json +++ b/reports/test_results.json @@ -1,154 +1,18 @@ { "metadata": { - "start_time": "2026-01-11T00:28:31.202459", - "end_time": "2026-01-11T00:28:33.718606", - "duration": 1.2442383766174316, - "exit_status": 0, + "start_time": "2026-01-11T05:19:25.816074", "pytest_version": "9.0.2", - "test_types": [ - "pytest", - "torture_test" - ] + "end_time": "2026-01-11T05:19:26.468770", + "duration": 0.6526906490325928, + "exit_status": 0 }, "summary": { - "total": 6, - "passed": 5, + "total": 0, + "passed": 0, "failed": 0, - "skipped": 1, - "pass_rate": 83.33333333333334 + "skipped": 0, + "pass_rate": 0 }, - "categories": { - "Excel": { - "total": 4, - "passed": 3, - "failed": 0, - "skipped": 1 - }, - "Word": { - "total": 2, - "passed": 2, - "failed": 0, - "skipped": 0 - } - }, - "tests": [ - { - "name": "Excel Data Analysis", - "nodeid": "torture_test.py::test_excel_data_analysis", - "category": "Excel", - "outcome": "passed", - "duration": 0.17873024940490723, - "timestamp": "2026-01-11T00:28:33.696485", - "module": "torture_test", - "class": null, - "function": "test_excel_data_analysis", - "inputs": { - "file": "test_files/test_data.xlsx" - }, - "outputs": { - "sheets_analyzed": [ - "Test Data" - ] - }, - "error": null, - "traceback": null - }, - { - "name": "Excel Formula Extraction", - "nodeid": "torture_test.py::test_excel_formula_extraction", - "category": "Excel", - "outcome": "passed", - "duration": 0.0032067298889160156, - "timestamp": "2026-01-11T00:28:33.699697", - "module": "torture_test", - "class": null, - "function": "test_excel_formula_extraction", - "inputs": { - "file": "test_files/test_data.xlsx" - }, - "outputs": { - "total_formulas": 8 - }, - "error": null, - "traceback": null - }, - { - "name": "Excel Chart Data Generation", - "nodeid": "torture_test.py::test_excel_chart_generation", - "category": "Excel", - "outcome": "passed", - "duration": 0.0025446414947509766, - "timestamp": "2026-01-11T00:28:33.702246", - "module": "torture_test", - "class": null, - "function": "test_excel_chart_generation", - "inputs": { - "file": "test_files/test_data.xlsx", - "x_column": "Category", - "y_columns": [ - "Value" - ] - }, - "outputs": { - "chart_libraries": 2 - }, - "error": null, - "traceback": null - }, - { - "name": "Word Structure Analysis", - "nodeid": "torture_test.py::test_word_structure_analysis", - "category": "Word", - "outcome": "passed", - "duration": 0.010314226150512695, - "timestamp": "2026-01-11T00:28:33.712565", - "module": "torture_test", - "class": null, - "function": "test_word_structure_analysis", - "inputs": { - "file": "test_files/test_document.docx" - }, - "outputs": { - "total_headings": 0 - }, - "error": null, - "traceback": null - }, - { - "name": "Word Table Extraction", - "nodeid": "torture_test.py::test_word_table_extraction", - "category": "Word", - "outcome": "passed", - "duration": 0.005824089050292969, - "timestamp": "2026-01-11T00:28:33.718393", - "module": "torture_test", - "class": null, - "function": "test_word_table_extraction", - "inputs": { - "file": "test_files/test_document.docx" - }, - "outputs": { - "total_tables": 0 - }, - "error": null, - "traceback": null - }, - { - "name": "Real Excel File Analysis (FORScan)", - "nodeid": "torture_test.py::test_real_excel_analysis", - "category": "Excel", - "outcome": "skipped", - "duration": 0, - "timestamp": "2026-01-11T00:28:33.718405", - "module": "torture_test", - "class": null, - "function": "test_real_excel_analysis", - "inputs": { - "file": "/home/rpm/FORScan Lite spreadsheets v1.1/FORScan Lite spreadsheet - PIDs.xlsx" - }, - "outputs": null, - "error": "File not found: /home/rpm/FORScan Lite spreadsheets v1.1/FORScan Lite spreadsheet - PIDs.xlsx", - "traceback": null - } - ] + "categories": {}, + "tests": [] } \ No newline at end of file diff --git a/src/mcp_office_tools/utils/word_processing.py b/src/mcp_office_tools/utils/word_processing.py index 9a68ca3..57140ee 100644 --- a/src/mcp_office_tools/utils/word_processing.py +++ b/src/mcp_office_tools/utils/word_processing.py @@ -13,6 +13,7 @@ import zipfile from pathlib import Path from typing import Any +from docx.oxml.ns import qn from PIL import Image # Temp directory configuration @@ -687,9 +688,10 @@ async def _get_ultra_fast_summary(file_path: str) -> dict[str, Any]: # Extract bookmarks (chapter markers) bookmarks = [] try: - # Access document's bookmarks through the XML - for bookmark in doc.element.xpath('//w:bookmarkStart', namespaces={'w': 'http://schemas.openxmlformats.org/wordprocessingml/2006/main'}): - bookmark_name = bookmark.get('{http://schemas.openxmlformats.org/wordprocessingml/2006/main}name') + # Access document's bookmarks through the XML using findall with qn() + # Note: xpath() with namespaces kwarg doesn't work on python-docx elements + for bookmark in doc.element.body.findall('.//' + qn('w:bookmarkStart')): + bookmark_name = bookmark.get(qn('w:name')) if bookmark_name and not bookmark_name.startswith('_'): # Skip system bookmarks bookmarks.append(bookmark_name) except Exception: @@ -752,18 +754,19 @@ async def _find_bookmark_content_range(doc, bookmark_name: str) -> dict[str, Any bookmark_starts = {} bookmark_ends = {} - # Look for bookmark markers in the document XML + # Look for bookmark markers in the document XML using findall with qn() + # Note: xpath() with namespaces kwarg doesn't work on python-docx elements for elem_idx, element in enumerate(doc.element.body): # Look for bookmark start markers - for bookmark_start in element.xpath('.//w:bookmarkStart', namespaces={'w': 'http://schemas.openxmlformats.org/wordprocessingml/2006/main'}): - name = bookmark_start.get('{http://schemas.openxmlformats.org/wordprocessingml/2006/main}name') + for bookmark_start in element.findall('.//' + qn('w:bookmarkStart')): + name = bookmark_start.get(qn('w:name')) if name == bookmark_name: - bookmark_id = bookmark_start.get('{http://schemas.openxmlformats.org/wordprocessingml/2006/main}id') + bookmark_id = bookmark_start.get(qn('w:id')) bookmark_starts[bookmark_id] = elem_idx # Look for bookmark end markers - for bookmark_end in element.xpath('.//w:bookmarkEnd', namespaces={'w': 'http://schemas.openxmlformats.org/wordprocessingml/2006/main'}): - bookmark_id = bookmark_end.get('{http://schemas.openxmlformats.org/wordprocessingml/2006/main}id') + for bookmark_end in element.findall('.//' + qn('w:bookmarkEnd')): + bookmark_id = bookmark_end.get(qn('w:id')) if bookmark_id in bookmark_starts: bookmark_ends[bookmark_id] = elem_idx break @@ -794,26 +797,31 @@ async def _find_chapter_content_range(doc, chapter_name: str) -> dict[str, Any]: chapter_start_idx = None chapter_end_idx = None - # Search through document elements for matching heading + # Search through document elements for matching heading using findall with qn() + # Note: xpath() with namespaces kwarg doesn't work on python-docx elements for elem_idx, element in enumerate(doc.element.body): # Check if this element is a paragraph with heading style try: para = element if para.tag.endswith('}p'): # Word paragraph element - # Get the text content - text_content = ''.join(text_elem.text or '' for text_elem in para.xpath('.//w:t', namespaces={'w': 'http://schemas.openxmlformats.org/wordprocessingml/2006/main'})) + # Get the text content using findall instead of xpath + text_content = ''.join(text_elem.text or '' for text_elem in para.findall('.//' + qn('w:t'))) # Check if this matches our chapter name (case insensitive, flexible matching) if text_content.strip() and chapter_name.lower() in text_content.lower().strip(): # Check if it's actually a heading by looking at paragraph style - style_elem = para.xpath('.//w:pStyle', namespaces={'w': 'http://schemas.openxmlformats.org/wordprocessingml/2006/main'}) - if style_elem: - style_val = style_elem[0].get('{http://schemas.openxmlformats.org/wordprocessingml/2006/main}val', '') - if 'heading' in style_val.lower() or 'title' in style_val.lower(): - chapter_start_idx = elem_idx - break - # Also consider short text lines as potential headings - elif len(text_content.strip()) < 100: + # pStyle is inside pPr element + pPr = para.find(qn('w:pPr')) + is_heading_style = False + if pPr is not None: + pStyle = pPr.find(qn('w:pStyle')) + if pStyle is not None: + style_val = pStyle.get(qn('w:val'), '') + is_heading_style = 'heading' in style_val.lower() or 'title' in style_val.lower() + + # Accept if it's a heading style OR if it's short text (potential heading) + # FIX: Previously used elif which skipped short text check when style existed but wasn't heading + if is_heading_style or len(text_content.strip()) < 100: chapter_start_idx = elem_idx break except Exception: @@ -831,12 +839,14 @@ async def _find_chapter_content_range(doc, chapter_name: str) -> dict[str, Any]: para = doc.element.body[elem_idx] if para.tag.endswith('}p'): # Check if this is a major heading (same level or higher than chapter start) - style_elem = para.xpath('.//w:pStyle', namespaces={'w': 'http://schemas.openxmlformats.org/wordprocessingml/2006/main'}) - if style_elem: - style_val = style_elem[0].get('{http://schemas.openxmlformats.org/wordprocessingml/2006/main}val', '') - if 'heading1' in style_val.lower() or 'title' in style_val.lower(): - chapter_end_idx = elem_idx - 1 - break + pPr = para.find(qn('w:pPr')) + if pPr is not None: + pStyle = pPr.find(qn('w:pStyle')) + if pStyle is not None: + style_val = pStyle.get(qn('w:val'), '') + if 'heading1' in style_val.lower() or 'title' in style_val.lower(): + chapter_end_idx = elem_idx - 1 + break except Exception: continue @@ -869,17 +879,26 @@ async def _get_available_headings(doc) -> list[str]: try: if element.tag.endswith('}p'): # Word paragraph element - # Get the text content - text_content = ''.join(text_elem.text or '' for text_elem in element.xpath('.//w:t', namespaces={'w': 'http://schemas.openxmlformats.org/wordprocessingml/2006/main'})) + # Get the text content using findall instead of xpath + # Note: xpath() with namespaces kwarg doesn't work on python-docx elements + text_content = ''.join(text_elem.text or '' for text_elem in element.findall('.//' + qn('w:t'))) if text_content.strip(): # Check if it's a heading by looking at paragraph style - style_elem = element.xpath('.//w:pStyle', namespaces={'w': 'http://schemas.openxmlformats.org/wordprocessingml/2006/main'}) - if style_elem: - style_val = style_elem[0].get('{http://schemas.openxmlformats.org/wordprocessingml/2006/main}val', '') - if 'heading' in style_val.lower() or 'title' in style_val.lower(): - headings.append(text_content.strip()[:100]) # Limit heading length - # Also consider short text lines as potential headings + # pStyle is inside pPr element + pPr = element.find(qn('w:pPr')) + is_heading_style = False + if pPr is not None: + pStyle = pPr.find(qn('w:pStyle')) + if pStyle is not None: + style_val = pStyle.get(qn('w:val'), '') + is_heading_style = 'heading' in style_val.lower() or 'title' in style_val.lower() + + # Add if it's a heading style + if is_heading_style: + headings.append(text_content.strip()[:100]) # Limit heading length + # Also consider short text lines as potential headings (independent of style check) + # FIX: Previously used elif which skipped this when style existed but wasn't heading elif len(text_content.strip()) < 100: # Only add if it looks like a heading (not just short random text) if any(word in text_content.lower() for word in ['chapter', 'section', 'part', 'introduction', 'conclusion']):