Some checks are pending
Test Dashboard / test-and-dashboard (push) Waiting to run
Named for Milton Waddams, who was relocated to the basement with boxes of legacy documents. He handles the .doc and .xls files from 1997 that nobody else wants to touch. - Rename package from mcp-office-tools to mcwaddams - Update author to Ryan Malloy - Update all imports and references - Add Office Space themed README narrative - All 53 tests passing
493 lines
24 KiB
Python
493 lines
24 KiB
Python
"""Focused tests for WordMixin functionality.

This module tests the WordMixin in isolation, focusing on:
- Word-specific tool functionality
- Markdown conversion capabilities
- Chapter and bookmark extraction
- Parameter validation for Word-specific features
"""
|
|
|
|
import pytest
|
|
from unittest.mock import AsyncMock, MagicMock, patch
|
|
from pathlib import Path
|
|
|
|
from fastmcp import FastMCP
|
|
# FastMCP testing - using direct tool access
|
|
|
|
from mcwaddams.mixins.word import WordMixin
|
|
from mcwaddams.utils import OfficeFileError
|
|
|
|
|
|
class TestWordMixinRegistration:
    """Check that WordMixin registers its tools on a FastMCP application."""

    def test_mixin_initialization(self):
        """A freshly registered WordMixin exposes exactly ten tools."""
        server = FastMCP("Test Word")
        word_mixin = WordMixin()
        word_mixin.register_all(server)

        assert word_mixin is not None

        # The ten tools counted here: convert_to_markdown, extract_word_tables,
        # analyze_word_structure, get_document_outline, check_style_consistency,
        # search_document, extract_entities, get_chapter_summaries,
        # save_reading_progress, get_reading_progress
        tool_registry = server._tool_manager._tools
        assert len(tool_registry) == 10

    def test_tool_names_registered(self):
        """Every expected Word tool name is present in the tool registry."""
        server = FastMCP("Test Word")
        WordMixin().register_all(server)

        expected_tools = {
            "convert_to_markdown",
            "extract_word_tables",
            "analyze_word_structure",
            "get_document_outline",
            "check_style_consistency",
            "search_document",
            "extract_entities",
            "get_chapter_summaries",
            "save_reading_progress",
            "get_reading_progress",
        }
        registered_tools = set(server._tool_manager._tools.keys())

        # Subset check: additional registered tools are tolerated.
        assert expected_tools <= registered_tools
|
|
|
|
|
|
class TestConvertToMarkdown:
    """Exercise the convert_to_markdown tool with its collaborators mocked.

    Path resolution, validation and format detection are patched by name in
    ``mcwaddams.mixins.word``; the size-analysis, recommendation, page-range
    and conversion helpers are patched on the mixin instance itself.
    """

    @staticmethod
    def _page_numbers_passed_to(mock_convert):
        """Return the page_numbers value from the converter mock's last call.

        Reads positional index 5 and falls back to the ``page_numbers``
        keyword, so it yields None when neither is present.
        """
        args, kwargs = mock_convert.call_args
        return args[5] if len(args) > 5 else kwargs.get('page_numbers')

    @pytest.fixture
    def mixin(self):
        """Create WordMixin for testing."""
        app = FastMCP("Test")
        mixin = WordMixin()
        mixin.register_all(app)
        return mixin

    @pytest.mark.asyncio
    async def test_convert_to_markdown_nonexistent_file(self, mixin):
        """A path that does not exist raises OfficeFileError."""
        with pytest.raises(OfficeFileError):
            await mixin.convert_to_markdown("/nonexistent/file.docx")

    @pytest.mark.asyncio
    @patch('mcwaddams.mixins.word.resolve_office_file_path')
    @patch('mcwaddams.mixins.word.validate_office_file')
    @patch('mcwaddams.mixins.word.detect_format')
    async def test_convert_to_markdown_validation_failure(self, mock_detect, mock_validate, mock_resolve, mixin):
        """A failed validation surfaces its error text in OfficeFileError."""
        mock_resolve.return_value = "/test.docx"
        mock_validate.return_value = {
            "is_valid": False,
            "errors": ["File is password protected"],
        }

        with pytest.raises(OfficeFileError, match="Invalid file: File is password protected"):
            await mixin.convert_to_markdown("/test.docx")

    @pytest.mark.asyncio
    @patch('mcwaddams.mixins.word.resolve_office_file_path')
    @patch('mcwaddams.mixins.word.validate_office_file')
    @patch('mcwaddams.mixins.word.detect_format')
    async def test_convert_to_markdown_non_word_document(self, mock_detect, mock_validate, mock_resolve, mixin):
        """A file detected as a non-Word category is rejected."""
        mock_resolve.return_value = "/test.xlsx"
        mock_validate.return_value = {"is_valid": True, "errors": []}
        mock_detect.return_value = {
            "category": "excel",
            "extension": ".xlsx",
            "format_name": "Excel",
        }

        with pytest.raises(OfficeFileError, match="Markdown conversion currently only supports Word documents"):
            await mixin.convert_to_markdown("/test.xlsx")

    @pytest.mark.asyncio
    @patch('mcwaddams.mixins.word.resolve_office_file_path')
    @patch('mcwaddams.mixins.word.validate_office_file')
    @patch('mcwaddams.mixins.word.detect_format')
    async def test_convert_to_markdown_docx_success(self, mock_detect, mock_validate, mock_resolve, mixin):
        """A valid .docx yields a result with markdown and metadata."""
        mock_resolve.return_value = "/test.docx"
        mock_validate.return_value = {"is_valid": True, "errors": []}
        mock_detect.return_value = {
            "category": "word",
            "extension": ".docx",
            "format_name": "Word Document",
        }

        # Replace the mixin's internal helpers so no real document is read.
        with patch.object(mixin, '_analyze_document_size') as mock_analyze, \
                patch.object(mixin, '_get_processing_recommendation') as mock_recommendation, \
                patch.object(mixin, '_convert_docx_to_markdown') as mock_convert:
            mock_analyze.return_value = {
                "estimated_pages": 5,
                "estimated_size": "medium",
                "has_images": True,
                "has_complex_formatting": False,
            }
            mock_recommendation.return_value = {
                "recommendation": "proceed",
                "message": "Document size is manageable for full conversion",
            }
            mock_convert.return_value = {
                "content": "# Test Document\n\nThis is test content.",
                "method_used": "python-docx",
                "images": [],
                "processing_notes": [],
            }

            result = await mixin.convert_to_markdown("/test.docx")

        # Top-level structure of the result.
        assert "markdown" in result
        assert "metadata" in result

        # Content and metadata carried through from the stubs.
        assert "# Test Document" in result["markdown"]
        assert result["metadata"]["format"] == "Word Document"
        assert "conversion_time" in result["metadata"]
        assert "conversion_method" in result["metadata"]

    @pytest.mark.asyncio
    @patch('mcwaddams.mixins.word.resolve_office_file_path')
    @patch('mcwaddams.mixins.word.validate_office_file')
    @patch('mcwaddams.mixins.word.detect_format')
    async def test_convert_to_markdown_parameter_handling(self, mock_detect, mock_validate, mock_resolve, mixin):
        """Every optional parameter can be supplied in a single call.

        Both page_range and bookmark_name are given, so the converter is
        expected to receive no page numbers (the bookmark takes precedence
        over the page range).
        """
        mock_resolve.return_value = "/test.docx"
        mock_validate.return_value = {"is_valid": True, "errors": []}
        mock_detect.return_value = {"category": "word", "extension": ".docx", "format_name": "Word"}

        with patch.object(mixin, '_analyze_document_size') as mock_analyze, \
                patch.object(mixin, '_get_processing_recommendation') as mock_recommendation, \
                patch.object(mixin, '_parse_page_range') as mock_parse_range, \
                patch.object(mixin, '_convert_docx_to_markdown') as mock_convert:
            mock_analyze.return_value = {"estimated_pages": 10}
            mock_recommendation.return_value = {"recommendation": "proceed"}
            mock_parse_range.return_value = [1, 2, 3, 4, 5]
            mock_convert.return_value = {
                "content": "# Test",
                "method_used": "python-docx",
                "images": [],
                "processing_notes": [],
            }

            result = await mixin.convert_to_markdown(
                file_path="/test.docx",
                include_images=False,
                image_mode="files",
                max_image_size=512000,
                preserve_structure=False,
                page_range="1-5",
                bookmark_name="Chapter1",
                chapter_name="Introduction",
                summary_only=False,
                output_dir="/output",
            )

        # The converter runs exactly once, and the parsed page list must not
        # reach it because a bookmark was requested as well.
        mock_convert.assert_called_once()
        assert self._page_numbers_passed_to(mock_convert) is None
        assert "markdown" in result

    @pytest.mark.asyncio
    @patch('mcwaddams.mixins.word.resolve_office_file_path')
    @patch('mcwaddams.mixins.word.validate_office_file')
    @patch('mcwaddams.mixins.word.detect_format')
    async def test_convert_to_markdown_bookmark_priority(self, mock_detect, mock_validate, mock_resolve, mixin):
        """Bookmark extraction takes priority over a page range."""
        mock_resolve.return_value = "/test.docx"
        mock_validate.return_value = {"is_valid": True, "errors": []}
        mock_detect.return_value = {"category": "word", "extension": ".docx", "format_name": "Word"}

        with patch.object(mixin, '_analyze_document_size') as mock_analyze, \
                patch.object(mixin, '_get_processing_recommendation') as mock_recommendation, \
                patch.object(mixin, '_parse_page_range') as mock_parse_range, \
                patch.object(mixin, '_convert_docx_to_markdown') as mock_convert:
            mock_analyze.return_value = {"estimated_pages": 10}
            mock_recommendation.return_value = {"status": "optimal"}
            mock_convert.return_value = {
                "content": "# Chapter Content",
                "method_used": "python-docx",
                "images": [],
                "processing_notes": [],
            }

            # Supply both page_range and bookmark_name.
            result = await mixin.convert_to_markdown(
                "/test.docx",
                page_range="1-10",
                bookmark_name="Chapter1",
            )

        # page_range IS still parsed (the parser mock is called) ...
        mock_parse_range.assert_called_once()

        # ... but the parsed value must be discarded before conversion: the
        # parser mock returns a MagicMock here, so anything other than None
        # means the page range leaked through.
        mock_convert.assert_called_once()
        assert self._page_numbers_passed_to(mock_convert) is None

        # The result should still carry content.
        assert "markdown" in result

    @pytest.mark.asyncio
    @patch('mcwaddams.mixins.word.resolve_office_file_path')
    @patch('mcwaddams.mixins.word.validate_office_file')
    @patch('mcwaddams.mixins.word.detect_format')
    async def test_convert_to_markdown_summary_mode(self, mock_detect, mock_validate, mock_resolve, mixin):
        """summary_only=True returns a "summary" entry and flags the metadata."""
        mock_resolve.return_value = "/test.docx"
        mock_validate.return_value = {"is_valid": True, "errors": []}
        mock_detect.return_value = {"category": "word", "extension": ".docx", "format_name": "Word"}

        # The conversion helper is mocked as well; summary mode still calls it.
        with patch.object(mixin, '_analyze_document_size') as mock_analyze, \
                patch.object(mixin, '_get_processing_recommendation') as mock_recommendation, \
                patch.object(mixin, '_convert_docx_to_markdown') as mock_convert:
            mock_analyze.return_value = {
                "estimated_pages": 25,
                "estimated_size": "large",
                "has_images": True,
            }
            mock_recommendation.return_value = {
                "recommendation": "summary_recommended",
                "message": "Large document - summary mode recommended",
            }
            mock_convert.return_value = {
                "content": "# Summary Document\n\nThis is a summary of the content.",
                "method_used": "python-docx",
                "images": [],
                "table_of_contents": {"note": "Summary mode"},
            }

            result = await mixin.convert_to_markdown(
                "/test.docx",
                summary_only=True,
            )

        # Summary mode returns "summary" rather than "markdown".
        assert "metadata" in result
        assert "summary" in result
        assert result["metadata"]["summary_only"] is True
|
|
|
|
|
|
class TestWordSpecificHelpers:
    """Unit tests for WordMixin's private helper methods."""

    @pytest.fixture
    def mixin(self):
        """Provide a WordMixin whose tools are registered on a scratch app."""
        scratch_app = FastMCP("Test")
        word_mixin = WordMixin()
        word_mixin.register_all(scratch_app)
        return word_mixin

    def test_parse_page_range_single_page(self, mixin):
        """A lone page number parses to a one-element list."""
        assert mixin._parse_page_range("5") == [5]

    def test_parse_page_range_range(self, mixin):
        """A dashed range expands to every page, both ends included."""
        assert mixin._parse_page_range("1-5") == [1, 2, 3, 4, 5]

    def test_parse_page_range_complex(self, mixin):
        """Comma-separated pages and ranges can be mixed in one spec."""
        pages = mixin._parse_page_range("1,3,5-7,10")
        assert pages == [1, 3, 5, 6, 7, 10]

    def test_parse_page_range_invalid(self, mixin):
        """Unusable specs produce an empty list instead of an exception."""
        # Non-numeric text is handled gracefully.
        assert mixin._parse_page_range("invalid") == []

        # A range whose end precedes its start is empty
        # (range(10, 6) produces no values).
        assert mixin._parse_page_range("10-5") == []

    def test_get_processing_recommendation(self, mixin):
        """The recommendation status follows document size and request options.

        The helper reads 'estimated_content_size' (not 'estimated_size') and
        returns a dict with 'status', 'message', 'suggested_workflow' and
        'warnings'.
        """
        def large_doc():
            # A fresh dict per call, as each scenario builds its own analysis.
            return {"estimated_pages": 25, "estimated_content_size": "large"}

        # Small document -> optimal.
        small_doc = {"estimated_pages": 3, "estimated_content_size": "small"}
        outcome = mixin._get_processing_recommendation(small_doc, "", False)
        assert outcome["status"] == "optimal"

        # Large document, no page range -> suboptimal, with workflow hints.
        outcome = mixin._get_processing_recommendation(large_doc(), "", False)
        assert outcome["status"] == "suboptimal"
        assert len(outcome["suggested_workflow"]) > 0

        # Large document restricted to a page range -> optimal.
        outcome = mixin._get_processing_recommendation(large_doc(), "1-5", False)
        assert outcome["status"] == "optimal"

        # Large document in summary mode -> optimal.
        outcome = mixin._get_processing_recommendation(large_doc(), "", True)
        assert outcome["status"] == "optimal"
|
|
|
|
|
|
class TestDirectToolAccess:
    """Call the registered convert_to_markdown tool function directly."""

    @pytest.mark.asyncio
    async def test_tool_execution_direct(self):
        """The registered tool's fn raises OfficeFileError for a missing file."""
        server = FastMCP("Test App")
        WordMixin().register_all(server)

        # Fetch the tool from the registry and invoke its underlying callable.
        tool = server._tool_manager._tools["convert_to_markdown"]
        with pytest.raises(OfficeFileError):
            await tool.fn(file_path="/nonexistent/file.docx")

    @pytest.mark.asyncio
    async def test_tool_parameter_validation_direct(self):
        """The tool's fn raises OfficeFileError for an .xlsx path with options set."""
        server = FastMCP("Test App")
        WordMixin().register_all(server)
        tool = server._tool_manager._tools["convert_to_markdown"]

        # Intended to hit the format validation with a wrong file type.
        # NOTE(review): /test.xlsx is neither created nor mocked here, so the
        # OfficeFileError may be raised for the missing file before any format
        # check runs -- confirm which path this actually exercises.
        call_kwargs = {
            "file_path": "/test.xlsx",  # Wrong file type
            "include_images": True,
            "image_mode": "base64",
            "preserve_structure": True,
        }
        with pytest.raises(OfficeFileError):
            await tool.fn(**call_kwargs)
|
|
|
|
|
|
class TestLegacyWordSupport:
    """Cover conversion of legacy binary Word documents (.doc)."""

    @pytest.fixture
    def mixin(self):
        """Provide a WordMixin whose tools are registered on a scratch app."""
        scratch_app = FastMCP("Test")
        word_mixin = WordMixin()
        word_mixin.register_all(scratch_app)
        return word_mixin

    @pytest.mark.asyncio
    @patch('mcwaddams.mixins.word.resolve_office_file_path')
    @patch('mcwaddams.mixins.word.validate_office_file')
    @patch('mcwaddams.mixins.word.detect_format')
    async def test_convert_legacy_doc_to_markdown(self, mock_detect, mock_validate, mock_resolve, mixin):
        """A .doc file is converted through the _convert_doc_to_markdown helper."""
        mock_resolve.return_value = "/test.doc"
        mock_validate.return_value = {"is_valid": True, "errors": []}
        mock_detect.return_value = {
            "category": "word",
            "extension": ".doc",
            "format_name": "Word Document (Legacy)",
        }

        legacy_payload = {
            "content": "# Legacy Document\n\nContent from .doc file",
            "method_used": "legacy-parser",
            "images": [],
            "processing_notes": ["Converted from legacy format"],
        }

        # Stub the internal helpers; note the .doc-specific converter is patched.
        with patch.object(mixin, '_analyze_document_size') as mock_analyze, \
                patch.object(mixin, '_get_processing_recommendation') as mock_recommendation, \
                patch.object(mixin, '_convert_doc_to_markdown') as mock_convert:
            mock_analyze.return_value = {"estimated_pages": 3}
            mock_recommendation.return_value = {"recommendation": "proceed"}
            mock_convert.return_value = legacy_payload

            result = await mixin.convert_to_markdown("/test.doc")

        # The stubbed legacy content and method name come back in the result.
        assert "# Legacy Document" in result["markdown"]
        assert "legacy-parser" in str(result["metadata"])

        # processing_notes stay internal to the conversion and are not
        # checked on the result; only the metadata is.
        assert "metadata" in result
        assert "conversion_method" in result["metadata"]
|
|
|
|
|
|
class TestPageRangeFiltering:
    """Test page_range content filtering for convert_to_markdown.

    These tests verify that the page_range parameter correctly filters
    content based on either explicit page breaks or estimated paragraph counts.
    """

    @pytest.fixture
    def mixin(self):
        """Create WordMixin for testing."""
        app = FastMCP("Test")
        mixin = WordMixin()
        mixin.register_all(app)
        return mixin

    @pytest.mark.asyncio
    @patch('mcwaddams.mixins.word.resolve_office_file_path')
    @patch('mcwaddams.mixins.word.validate_office_file')
    @patch('mcwaddams.mixins.word.detect_format')
    async def test_page_range_filters_different_content(self, mock_detect, mock_validate, mock_resolve, mixin):
        """Test that different page_range values return different content.

        This is the key regression test for the page_range bug where
        include_current_page was set but never used to filter content.

        The DOCX converter is replaced by a stub that picks its output from
        the page list it receives, so what is checked here is that the parsed
        page_range reaches the converter unchanged.
        """
        mock_resolve.return_value = "/test.docx"
        mock_validate.return_value = {"is_valid": True, "errors": []}
        mock_detect.return_value = {"category": "word", "extension": ".docx", "format_name": "Word Document"}

        def canned_result(markdown, paragraphs):
            """Build a converter result dict around the given markdown."""
            return {
                "content": markdown,
                "method_used": "python-docx-custom",
                "images": [],
                "structure": {"headings": [], "tables": 0, "lists": 0, "paragraphs": paragraphs},
            }

        def mock_convert_side_effect(*args, **kwargs):
            """Return page-specific content keyed on the page_numbers argument."""
            # page_numbers is read from positional index 5, else the keyword.
            page_numbers = args[5] if len(args) > 5 else kwargs.get('page_numbers')
            if page_numbers == [1, 2]:
                return canned_result("# Page 1-2 Content\n\nThis is from pages 1 and 2.", 5)
            if page_numbers == [10, 11]:
                return canned_result("# Page 10-11 Content\n\nThis is from pages 10 and 11.", 5)
            # Any other page list (or none) falls back to the full document.
            return canned_result("# Full Content", 20)

        with patch.object(mixin, '_analyze_document_size') as mock_analyze, \
                patch.object(mixin, '_get_processing_recommendation') as mock_recommend, \
                patch.object(mixin, '_convert_docx_to_markdown', side_effect=mock_convert_side_effect) as mock_convert:
            mock_analyze.return_value = {"estimated_pages": 10}
            mock_recommend.return_value = {"status": "optimal", "message": "", "suggested_workflow": [], "warnings": []}

            # Test page_range 1-2
            result_1_2 = await mixin.convert_to_markdown(
                file_path="/test.docx",
                page_range="1-2",
            )

            # Test page_range 10-11
            result_10_11 = await mixin.convert_to_markdown(
                file_path="/test.docx",
                page_range="10-11",
            )

        # One converter call per convert_to_markdown invocation.
        assert mock_convert.call_count == 2

        # The content should be different for different page ranges.
        assert "Page 1-2" in result_1_2["markdown"]
        assert "Page 10-11" in result_10_11["markdown"]
        assert result_1_2["markdown"] != result_10_11["markdown"]
|
|
|
|
|
|
if __name__ == "__main__":
    # Run this file's tests verbosely when executed directly, and exit with
    # pytest's return code so a failing run does not report success.
    raise SystemExit(pytest.main([__file__, "-v"]))