mcp-office-tools/tests/test_word_mixin.py
Ryan Malloy 31948d6ffc
Some checks are pending
Test Dashboard / test-and-dashboard (push) Waiting to run
Rename package to mcwaddams
Named for Milton Waddams, who was relocated to the basement with
boxes of legacy documents. He handles the .doc and .xls files from
1997 that nobody else wants to touch.

- Rename package from mcp-office-tools to mcwaddams
- Update author to Ryan Malloy
- Update all imports and references
- Add Office Space themed README narrative
- All 53 tests passing
2026-01-11 11:35:35 -07:00

493 lines
24 KiB
Python

"""Focused tests for WordMixin functionality.
This module tests the WordMixin in isolation, focusing on:
- Word-specific tool functionality
- Markdown conversion capabilities
- Chapter and bookmark extraction
- Parameter validation for Word-specific features
"""
import pytest
from unittest.mock import AsyncMock, MagicMock, patch
from pathlib import Path
from fastmcp import FastMCP
# FastMCP testing - using direct tool access
from mcwaddams.mixins.word import WordMixin
from mcwaddams.utils import OfficeFileError
class TestWordMixinRegistration:
    """Check that WordMixin registers its tools on a FastMCP app."""

    def test_mixin_initialization(self):
        """Registering a fresh WordMixin leaves exactly ten tools on the app."""
        server = FastMCP("Test Word")
        word_mixin = WordMixin()
        word_mixin.register_all(server)

        assert word_mixin is not None

        # Tools counted here: convert_to_markdown, extract_word_tables,
        # analyze_word_structure, get_document_outline,
        # check_style_consistency, search_document, extract_entities,
        # get_chapter_summaries, save_reading_progress, get_reading_progress
        tool_table = server._tool_manager._tools
        assert len(tool_table) == 10

    def test_tool_names_registered(self):
        """Every expected Word tool name is present in the app's tool table."""
        server = FastMCP("Test Word")
        WordMixin().register_all(server)

        wanted = {
            "convert_to_markdown",
            "extract_word_tables",
            "analyze_word_structure",
            "get_document_outline",
            "check_style_consistency",
            "search_document",
            "extract_entities",
            "get_chapter_summaries",
            "save_reading_progress",
            "get_reading_progress",
        }
        present = set(server._tool_manager._tools.keys())

        # Extra registered tools are allowed; only missing ones fail the test.
        missing = wanted - present
        assert not missing
class TestConvertToMarkdown:
    """Test convert_to_markdown tool functionality.

    The module-level helpers (``resolve_office_file_path``,
    ``validate_office_file``, ``detect_format``) are replaced with ``@patch``
    decorators. The mixin's private methods are patched with ``patch.object``
    inside each test because they need the fixture instance.

    NOTE(review): several stubs for ``_get_processing_recommendation`` return
    ``{"recommendation": ...}``, while the helper tests in this file describe
    the real return value as keyed by ``status`` / ``message`` /
    ``suggested_workflow`` / ``warnings``. Confirm the stubs against the
    implementation before relying on them.
    """

    @pytest.fixture
    def mixin(self):
        """Create a WordMixin registered on a throwaway FastMCP app."""
        app = FastMCP("Test")
        mixin = WordMixin()
        mixin.register_all(app)
        return mixin

    @pytest.mark.asyncio
    async def test_convert_to_markdown_nonexistent_file(self, mixin):
        """Test convert_to_markdown with nonexistent file."""
        with pytest.raises(OfficeFileError):
            await mixin.convert_to_markdown("/nonexistent/file.docx")

    @pytest.mark.asyncio
    @patch('mcwaddams.mixins.word.resolve_office_file_path')
    @patch('mcwaddams.mixins.word.validate_office_file')
    @patch('mcwaddams.mixins.word.detect_format')
    async def test_convert_to_markdown_validation_failure(self, mock_detect, mock_validate, mock_resolve, mixin):
        """Test convert_to_markdown with validation failure."""
        mock_resolve.return_value = "/test.docx"
        mock_validate.return_value = {
            "is_valid": False,
            "errors": ["File is password protected"],
        }

        with pytest.raises(OfficeFileError, match="Invalid file: File is password protected"):
            await mixin.convert_to_markdown("/test.docx")

    @pytest.mark.asyncio
    @patch('mcwaddams.mixins.word.resolve_office_file_path')
    @patch('mcwaddams.mixins.word.validate_office_file')
    @patch('mcwaddams.mixins.word.detect_format')
    async def test_convert_to_markdown_non_word_document(self, mock_detect, mock_validate, mock_resolve, mixin):
        """Test that non-Word documents are rejected."""
        mock_resolve.return_value = "/test.xlsx"
        mock_validate.return_value = {"is_valid": True, "errors": []}
        mock_detect.return_value = {
            "category": "excel",
            "extension": ".xlsx",
            "format_name": "Excel",
        }

        with pytest.raises(OfficeFileError, match="Markdown conversion currently only supports Word documents"):
            await mixin.convert_to_markdown("/test.xlsx")

    @pytest.mark.asyncio
    @patch('mcwaddams.mixins.word.resolve_office_file_path')
    @patch('mcwaddams.mixins.word.validate_office_file')
    @patch('mcwaddams.mixins.word.detect_format')
    async def test_convert_to_markdown_docx_success(self, mock_detect, mock_validate, mock_resolve, mixin):
        """Test successful DOCX to markdown conversion."""
        # Setup mocks
        mock_resolve.return_value = "/test.docx"
        mock_validate.return_value = {"is_valid": True, "errors": []}
        mock_detect.return_value = {
            "category": "word",
            "extension": ".docx",
            "format_name": "Word Document",
        }

        # Mock internal methods
        with patch.object(mixin, '_analyze_document_size') as mock_analyze, \
                patch.object(mixin, '_get_processing_recommendation') as mock_recommendation, \
                patch.object(mixin, '_convert_docx_to_markdown') as mock_convert:
            mock_analyze.return_value = {
                "estimated_pages": 5,
                "estimated_size": "medium",
                "has_images": True,
                "has_complex_formatting": False,
            }
            mock_recommendation.return_value = {
                "recommendation": "proceed",
                "message": "Document size is manageable for full conversion",
            }
            mock_convert.return_value = {
                "content": "# Test Document\n\nThis is test content.",
                "method_used": "python-docx",
                "images": [],
                "processing_notes": [],
            }

            result = await mixin.convert_to_markdown("/test.docx")

        # Verify structure - actual implementation uses these keys
        assert "markdown" in result
        assert "metadata" in result

        # Verify content
        assert "# Test Document" in result["markdown"]
        assert result["metadata"]["format"] == "Word Document"
        assert "conversion_time" in result["metadata"]
        assert "conversion_method" in result["metadata"]

    @pytest.mark.asyncio
    @patch('mcwaddams.mixins.word.resolve_office_file_path')
    @patch('mcwaddams.mixins.word.validate_office_file')
    @patch('mcwaddams.mixins.word.detect_format')
    async def test_convert_to_markdown_parameter_handling(self, mock_detect, mock_validate, mock_resolve, mixin):
        """Test that a call with every optional parameter set reaches conversion once."""
        mock_resolve.return_value = "/test.docx"
        mock_validate.return_value = {"is_valid": True, "errors": []}
        mock_detect.return_value = {"category": "word", "extension": ".docx", "format_name": "Word"}

        with patch.object(mixin, '_analyze_document_size') as mock_analyze, \
                patch.object(mixin, '_get_processing_recommendation') as mock_recommendation, \
                patch.object(mixin, '_parse_page_range') as mock_parse_range, \
                patch.object(mixin, '_convert_docx_to_markdown') as mock_convert:
            mock_analyze.return_value = {"estimated_pages": 10}
            mock_recommendation.return_value = {"recommendation": "proceed"}
            mock_parse_range.return_value = [1, 2, 3, 4, 5]
            mock_convert.return_value = {
                "content": "# Test",
                "method_used": "python-docx",
                "images": [],
                "processing_notes": [],
            }

            # Test with specific parameters
            await mixin.convert_to_markdown(
                file_path="/test.docx",
                include_images=False,
                image_mode="files",
                max_image_size=512000,
                preserve_structure=False,
                page_range="1-5",
                bookmark_name="Chapter1",
                chapter_name="Introduction",
                summary_only=False,
                output_dir="/output",
            )

        # Only the fact that conversion ran exactly once is checked here.
        # TODO(review): bookmark_name is expected to take precedence over
        # page_range (page_numbers passed as None); assert that on
        # mock_convert.call_args once the converter's signature is confirmed.
        mock_convert.assert_called_once()

    @pytest.mark.asyncio
    @patch('mcwaddams.mixins.word.resolve_office_file_path')
    @patch('mcwaddams.mixins.word.validate_office_file')
    @patch('mcwaddams.mixins.word.detect_format')
    async def test_convert_to_markdown_bookmark_priority(self, mock_detect, mock_validate, mock_resolve, mixin):
        """Test that bookmark extraction takes priority over page ranges."""
        mock_resolve.return_value = "/test.docx"
        mock_validate.return_value = {"is_valid": True, "errors": []}
        mock_detect.return_value = {"category": "word", "extension": ".docx", "format_name": "Word"}

        with patch.object(mixin, '_analyze_document_size') as mock_analyze, \
                patch.object(mixin, '_get_processing_recommendation') as mock_recommendation, \
                patch.object(mixin, '_parse_page_range') as mock_parse_range, \
                patch.object(mixin, '_convert_docx_to_markdown') as mock_convert:
            mock_analyze.return_value = {"estimated_pages": 10}
            mock_recommendation.return_value = {"status": "optimal"}
            mock_convert.return_value = {
                "content": "# Chapter Content",
                "method_used": "python-docx",
                "images": [],
                "processing_notes": [],
            }

            # Call with both page_range and bookmark_name
            result = await mixin.convert_to_markdown(
                "/test.docx",
                page_range="1-10",
                bookmark_name="Chapter1",
            )

        # Note: page_range IS parsed (mock_parse_range is called)
        # but when bookmark_name is provided, the page_numbers are
        # set to None to prioritize bookmark extraction
        mock_parse_range.assert_called_once()

        # Verify the conversion was called with bookmark (not page_numbers)
        mock_convert.assert_called_once()

        # Result should have content
        assert "markdown" in result

    @pytest.mark.asyncio
    @patch('mcwaddams.mixins.word.resolve_office_file_path')
    @patch('mcwaddams.mixins.word.validate_office_file')
    @patch('mcwaddams.mixins.word.detect_format')
    async def test_convert_to_markdown_summary_mode(self, mock_detect, mock_validate, mock_resolve, mixin):
        """Test summary_only mode functionality."""
        mock_resolve.return_value = "/test.docx"
        mock_validate.return_value = {"is_valid": True, "errors": []}
        mock_detect.return_value = {"category": "word", "extension": ".docx", "format_name": "Word"}

        # The conversion method is stubbed as well; summary mode still calls it.
        with patch.object(mixin, '_analyze_document_size') as mock_analyze, \
                patch.object(mixin, '_get_processing_recommendation') as mock_recommendation, \
                patch.object(mixin, '_convert_docx_to_markdown') as mock_convert:
            mock_analyze.return_value = {
                "estimated_pages": 25,
                "estimated_size": "large",
                "has_images": True,
            }
            mock_recommendation.return_value = {
                "recommendation": "summary_recommended",
                "message": "Large document - summary mode recommended",
            }
            mock_convert.return_value = {
                "content": "# Summary Document\n\nThis is a summary of the content.",
                "method_used": "python-docx",
                "images": [],
                "table_of_contents": {"note": "Summary mode"},
            }

            result = await mixin.convert_to_markdown(
                "/test.docx",
                summary_only=True,
            )

        # Verify that summary information is returned
        assert "metadata" in result
        assert "summary" in result  # Summary mode returns "summary" not "markdown"
        assert result["metadata"]["summary_only"] is True
class TestWordSpecificHelpers:
    """Exercise WordMixin's private helper methods directly."""

    @pytest.fixture
    def mixin(self):
        """Provide a WordMixin that has been registered on a FastMCP app."""
        server = FastMCP("Test")
        word_mixin = WordMixin()
        word_mixin.register_all(server)
        return word_mixin

    def test_parse_page_range_single_page(self, mixin):
        """A lone page number parses to a one-element list."""
        assert mixin._parse_page_range("5") == [5]

    def test_parse_page_range_range(self, mixin):
        """A dash-separated range expands to every page, ends included."""
        assert mixin._parse_page_range("1-5") == [1, 2, 3, 4, 5]

    def test_parse_page_range_complex(self, mixin):
        """Comma-separated pages and ranges are combined in order."""
        pages = mixin._parse_page_range("1,3,5-7,10")
        assert pages == [1, 3, 5, 6, 7, 10]

    def test_parse_page_range_invalid(self, mixin):
        """Unparseable or reversed ranges yield an empty list, not an error."""
        # A non-numeric string is handled gracefully.
        garbage = mixin._parse_page_range("invalid")
        assert garbage == []

        # End before start: range(10, 6) produces no values.
        reversed_range = mixin._parse_page_range("10-5")
        assert reversed_range == []

    def test_get_processing_recommendation(self, mixin):
        """Check the recommendation status for small, large and guided requests."""
        # The actual function reads 'estimated_content_size' (not
        # 'estimated_size') and returns a dict with 'status', 'message',
        # 'suggested_workflow' and 'warnings'.

        def large_doc():
            # Build a fresh dict per call, as each scenario did originally.
            return {"estimated_pages": 25, "estimated_content_size": "large"}

        # Small document - optimal status
        small_doc = {"estimated_pages": 3, "estimated_content_size": "small"}
        small_verdict = mixin._get_processing_recommendation(small_doc, "", False)
        assert small_verdict["status"] == "optimal"

        # Large document without page range - suboptimal status
        unguided_verdict = mixin._get_processing_recommendation(large_doc(), "", False)
        assert unguided_verdict["status"] == "suboptimal"
        assert len(unguided_verdict["suggested_workflow"]) > 0

        # Large document with page range - optimal status
        ranged_verdict = mixin._get_processing_recommendation(large_doc(), "1-5", False)
        assert ranged_verdict["status"] == "optimal"

        # Summary mode requested - optimal status
        summary_verdict = mixin._get_processing_recommendation(large_doc(), "", True)
        assert summary_verdict["status"] == "optimal"
class TestDirectToolAccess:
    """Test WordMixin integration with direct tool access.

    Tools are looked up in the FastMCP app's tool table and invoked through
    the registered callable (``tool.fn``) rather than through the mixin.
    """

    @pytest.mark.asyncio
    async def test_tool_execution_direct(self):
        """Test Word tool execution through direct tool access."""
        app = FastMCP("Test App")
        WordMixin().register_all(app)

        # Test error handling via direct access (nonexistent file)
        convert_to_markdown_tool = app._tool_manager._tools["convert_to_markdown"]
        with pytest.raises(OfficeFileError):
            await convert_to_markdown_tool.fn(file_path="/nonexistent/file.docx")

    @pytest.mark.asyncio
    @patch('mcwaddams.mixins.word.resolve_office_file_path')
    @patch('mcwaddams.mixins.word.validate_office_file')
    @patch('mcwaddams.mixins.word.detect_format')
    async def test_tool_parameter_validation_direct(self, mock_detect, mock_validate, mock_resolve):
        """Test that a non-Word file is rejected when called through direct access.

        The path, validation and format helpers are stubbed so the call gets
        as far as the file-type check. Without the stubs, "/test.xlsx" does
        not exist and the test would duplicate the nonexistent-file case above.
        """
        mock_resolve.return_value = "/test.xlsx"
        mock_validate.return_value = {"is_valid": True, "errors": []}
        mock_detect.return_value = {
            "category": "excel",
            "extension": ".xlsx",
            "format_name": "Excel",
        }

        app = FastMCP("Test App")
        WordMixin().register_all(app)
        convert_to_markdown_tool = app._tool_manager._tools["convert_to_markdown"]

        # Pin the message so an unrelated OfficeFileError cannot pass the test.
        with pytest.raises(OfficeFileError, match="Markdown conversion currently only supports Word documents"):
            await convert_to_markdown_tool.fn(
                file_path="/test.xlsx",  # Wrong file type
                include_images=True,
                image_mode="base64",
                preserve_structure=True,
            )
class TestLegacyWordSupport:
    """Cover conversion of legacy Word documents (.doc)."""

    @pytest.fixture
    def mixin(self):
        """Provide a WordMixin that has been registered on a FastMCP app."""
        server = FastMCP("Test")
        word_mixin = WordMixin()
        word_mixin.register_all(server)
        return word_mixin

    @pytest.mark.asyncio
    @patch('mcwaddams.mixins.word.resolve_office_file_path')
    @patch('mcwaddams.mixins.word.validate_office_file')
    @patch('mcwaddams.mixins.word.detect_format')
    async def test_convert_legacy_doc_to_markdown(self, mock_detect, mock_validate, mock_resolve, mixin):
        """A .doc file goes through the stubbed legacy converter and yields markdown."""
        legacy_format = {
            "category": "word",
            "extension": ".doc",
            "format_name": "Word Document (Legacy)",
        }
        mock_resolve.return_value = "/test.doc"
        mock_validate.return_value = {"is_valid": True, "errors": []}
        mock_detect.return_value = legacy_format

        # Stub the mixin's internals, including the .doc-specific converter.
        with patch.object(mixin, '_analyze_document_size') as size_stub, \
                patch.object(mixin, '_get_processing_recommendation') as advice_stub, \
                patch.object(mixin, '_convert_doc_to_markdown') as legacy_stub:
            size_stub.return_value = {"estimated_pages": 3}
            advice_stub.return_value = {"recommendation": "proceed"}
            legacy_stub.return_value = {
                "content": "# Legacy Document\n\nContent from .doc file",
                "method_used": "legacy-parser",
                "images": [],
                "processing_notes": ["Converted from legacy format"],
            }

            outcome = await mixin.convert_to_markdown("/test.doc")

            # The stubbed content and method name must surface in the result.
            assert "# Legacy Document" in outcome["markdown"]
            assert "legacy-parser" in str(outcome["metadata"])

            # processing_notes stay internal to the conversion step; only the
            # metadata block is checked here.
            assert "metadata" in outcome
            assert "conversion_method" in outcome["metadata"]
class TestPageRangeFiltering:
    """Test page_range handling for convert_to_markdown.

    ``_convert_docx_to_markdown`` is stubbed here, so these tests check that
    the ``page_range`` string is turned into page numbers and handed to the
    converter. The converter's own content filtering is not exercised.
    """

    @pytest.fixture
    def mixin(self):
        """Create WordMixin for testing."""
        app = FastMCP("Test")
        mixin = WordMixin()
        mixin.register_all(app)
        return mixin

    @pytest.mark.asyncio
    @patch('mcwaddams.mixins.word.resolve_office_file_path')
    @patch('mcwaddams.mixins.word.validate_office_file')
    @patch('mcwaddams.mixins.word.detect_format')
    async def test_page_range_filters_different_content(self, mock_detect, mock_validate, mock_resolve, mixin):
        """Test that different page_range values return different content.

        Written as a regression test for the page_range bug where
        include_current_page was set but never used to filter content.
        """
        mock_resolve.return_value = "/test.docx"
        mock_validate.return_value = {"is_valid": True, "errors": []}
        mock_detect.return_value = {"category": "word", "extension": ".docx", "format_name": "Word Document"}

        def mock_convert_side_effect(*args, **kwargs):
            """Return canned content chosen by the page numbers received."""
            # page_numbers is read from the sixth positional argument, or from
            # the keyword of the same name when fewer positionals are passed.
            page_numbers = args[5] if len(args) > 5 else kwargs.get('page_numbers')

            if page_numbers == [1, 2]:
                return {
                    "content": "# Page 1-2 Content\n\nThis is from pages 1 and 2.",
                    "method_used": "python-docx-custom",
                    "images": [],
                    "structure": {"headings": [], "tables": 0, "lists": 0, "paragraphs": 5},
                }
            if page_numbers == [10, 11]:
                return {
                    "content": "# Page 10-11 Content\n\nThis is from pages 10 and 11.",
                    "method_used": "python-docx-custom",
                    "images": [],
                    "structure": {"headings": [], "tables": 0, "lists": 0, "paragraphs": 5},
                }
            # Any other page selection falls back to the whole document.
            return {
                "content": "# Full Content",
                "method_used": "python-docx-custom",
                "images": [],
                "structure": {"headings": [], "tables": 0, "lists": 0, "paragraphs": 20},
            }

        with patch.object(mixin, '_analyze_document_size') as mock_analyze, \
                patch.object(mixin, '_get_processing_recommendation') as mock_recommend, \
                patch.object(mixin, '_convert_docx_to_markdown', side_effect=mock_convert_side_effect):
            mock_analyze.return_value = {"estimated_pages": 10}
            mock_recommend.return_value = {"status": "optimal", "message": "", "suggested_workflow": [], "warnings": []}

            # Test page_range 1-2
            result_1_2 = await mixin.convert_to_markdown(
                file_path="/test.docx",
                page_range="1-2",
            )

            # Test page_range 10-11
            result_10_11 = await mixin.convert_to_markdown(
                file_path="/test.docx",
                page_range="10-11",
            )

        # The content should be different for different page ranges
        assert "Page 1-2" in result_1_2["markdown"]
        assert "Page 10-11" in result_10_11["markdown"]
        assert result_1_2["markdown"] != result_10_11["markdown"]
if __name__ == "__main__":
    # Propagate pytest's exit status so running this file directly exits
    # non-zero when a test fails, instead of always exiting 0.
    raise SystemExit(pytest.main([__file__, "-v"]))