New tools for Word document analysis: - extract_entities: Pattern-based extraction of people, places, organizations - get_chapter_summaries: Chapter previews with opening sentences and word counts - save_reading_progress: Bookmark reading position to JSON file - get_reading_progress: Resume reading from saved position New MCP prompts (basic to advanced workflows): - explore-document: Get started with a new document - find-character: Track character mentions - chapter-preview: Quick chapter overviews - resume-reading: Continue where you left off - document-analysis: Comprehensive multi-tool analysis - character-journey: Track character arc through narrative - document-comparison: Compare entities between chapters - full-reading-session: Guided reading with bookmarking - manuscript-review: Complete editorial workflow Updated test counts for 19 total tools (6 universal + 10 word + 3 excel)
493 lines
24 KiB
Python
493 lines
24 KiB
Python
"""Focused tests for WordMixin functionality.
|
|
|
|
This module tests the WordMixin in isolation, focusing on:
|
|
- Word-specific tool functionality
|
|
- Markdown conversion capabilities
|
|
- Chapter and bookmark extraction
|
|
- Parameter validation for Word-specific features
|
|
"""
|
|
|
|
import pytest
|
|
from unittest.mock import AsyncMock, MagicMock, patch
|
|
from pathlib import Path
|
|
|
|
from fastmcp import FastMCP
|
|
# FastMCP testing - using direct tool access
|
|
|
|
from mcp_office_tools.mixins.word import WordMixin
|
|
from mcp_office_tools.utils import OfficeFileError
|
|
|
|
|
|
class TestWordMixinRegistration:
    """Verify that WordMixin registers its tools on a FastMCP application."""

    def test_mixin_initialization(self):
        """Registering a fresh WordMixin leaves exactly ten tools on the app."""
        server = FastMCP("Test Word")
        word_mixin = WordMixin()
        word_mixin.register_all(server)

        assert word_mixin is not None

        # Expected tools: convert_to_markdown, extract_word_tables,
        # analyze_word_structure, get_document_outline,
        # check_style_consistency, search_document, extract_entities,
        # get_chapter_summaries, save_reading_progress, get_reading_progress
        tool_registry = server._tool_manager._tools
        assert len(tool_registry) == 10

    def test_tool_names_registered(self):
        """Every Word-specific tool name appears in the app's tool registry."""
        server = FastMCP("Test Word")
        WordMixin().register_all(server)

        expected_tools = {
            "convert_to_markdown",
            "extract_word_tables",
            "analyze_word_structure",
            "get_document_outline",
            "check_style_consistency",
            "search_document",
            "extract_entities",
            "get_chapter_summaries",
            "save_reading_progress",
            "get_reading_progress",
        }
        registered_tools = set(server._tool_manager._tools.keys())

        # Subset check: extra registered tools are tolerated, missing ones are not.
        assert expected_tools <= registered_tools
|
|
|
|
|
|
class TestConvertToMarkdown:
    """Exercise the convert_to_markdown tool with the file helpers mocked out."""

    @pytest.fixture
    def mixin(self):
        """Return a WordMixin that has been registered on a scratch FastMCP app."""
        server = FastMCP("Test")
        word_mixin = WordMixin()
        word_mixin.register_all(server)
        return word_mixin

    @pytest.mark.asyncio
    async def test_convert_to_markdown_nonexistent_file(self, mixin):
        """A path that does not exist raises OfficeFileError."""
        missing_path = "/nonexistent/file.docx"
        with pytest.raises(OfficeFileError):
            await mixin.convert_to_markdown(missing_path)

    @pytest.mark.asyncio
    @patch('mcp_office_tools.mixins.word.resolve_office_file_path')
    @patch('mcp_office_tools.mixins.word.validate_office_file')
    @patch('mcp_office_tools.mixins.word.detect_format')
    async def test_convert_to_markdown_validation_failure(self, mock_detect, mock_validate, mock_resolve, mixin):
        """A failed validation report is surfaced as an OfficeFileError."""
        failed_validation = {"is_valid": False, "errors": ["File is password protected"]}
        mock_resolve.return_value = "/test.docx"
        mock_validate.return_value = failed_validation

        expected_message = "Invalid file: File is password protected"
        with pytest.raises(OfficeFileError, match=expected_message):
            await mixin.convert_to_markdown("/test.docx")

    @pytest.mark.asyncio
    @patch('mcp_office_tools.mixins.word.resolve_office_file_path')
    @patch('mcp_office_tools.mixins.word.validate_office_file')
    @patch('mcp_office_tools.mixins.word.detect_format')
    async def test_convert_to_markdown_non_word_document(self, mock_detect, mock_validate, mock_resolve, mixin):
        """A file detected as Excel is rejected by the Word-only converter."""
        excel_format = {"category": "excel", "extension": ".xlsx", "format_name": "Excel"}
        mock_resolve.return_value = "/test.xlsx"
        mock_validate.return_value = {"is_valid": True, "errors": []}
        mock_detect.return_value = excel_format

        expected_message = "Markdown conversion currently only supports Word documents"
        with pytest.raises(OfficeFileError, match=expected_message):
            await mixin.convert_to_markdown("/test.xlsx")

    @pytest.mark.asyncio
    @patch('mcp_office_tools.mixins.word.resolve_office_file_path')
    @patch('mcp_office_tools.mixins.word.validate_office_file')
    @patch('mcp_office_tools.mixins.word.detect_format')
    async def test_convert_to_markdown_docx_success(self, mock_detect, mock_validate, mock_resolve, mixin):
        """A valid DOCX yields a result carrying markdown plus conversion metadata."""
        # Module-level helpers: pretend the file resolves, validates and is a DOCX.
        mock_resolve.return_value = "/test.docx"
        mock_validate.return_value = {"is_valid": True, "errors": []}
        mock_detect.return_value = {
            "category": "word",
            "extension": ".docx",
            "format_name": "Word Document",
        }

        size_report = {
            "estimated_pages": 5,
            "estimated_size": "medium",
            "has_images": True,
            "has_complex_formatting": False,
        }
        recommendation = {
            "recommendation": "proceed",
            "message": "Document size is manageable for full conversion",
        }
        converted = {
            "content": "# Test Document\n\nThis is test content.",
            "method_used": "python-docx",
            "images": [],
            "processing_notes": [],
        }

        # Internal mixin methods are replaced so no real document is parsed.
        with patch.object(mixin, '_analyze_document_size') as mock_analyze, \
                patch.object(mixin, '_get_processing_recommendation') as mock_recommendation, \
                patch.object(mixin, '_convert_docx_to_markdown') as mock_convert:
            mock_analyze.return_value = size_report
            mock_recommendation.return_value = recommendation
            mock_convert.return_value = converted

            result = await mixin.convert_to_markdown("/test.docx")

        # Top-level structure of the result.
        for top_level_key in ("markdown", "metadata"):
            assert top_level_key in result

        # Content and metadata details.
        metadata = result["metadata"]
        assert "# Test Document" in result["markdown"]
        assert metadata["format"] == "Word Document"
        for metadata_key in ("conversion_time", "conversion_method"):
            assert metadata_key in metadata

    @pytest.mark.asyncio
    async def test_convert_to_markdown_parameter_handling(self, mixin):
        """Calling with every optional parameter triggers exactly one conversion."""
        converted = {
            "content": "# Test",
            "method_used": "python-docx",
            "images": [],
            "processing_notes": [],
        }

        # Every dependency is mocked so only parameter plumbing is exercised.
        with patch('mcp_office_tools.mixins.word.resolve_office_file_path') as mock_resolve, \
                patch('mcp_office_tools.mixins.word.validate_office_file') as mock_validate, \
                patch('mcp_office_tools.mixins.word.detect_format') as mock_detect, \
                patch.object(mixin, '_analyze_document_size') as mock_analyze, \
                patch.object(mixin, '_get_processing_recommendation') as mock_recommendation, \
                patch.object(mixin, '_parse_page_range') as mock_parse_range, \
                patch.object(mixin, '_convert_docx_to_markdown') as mock_convert:
            mock_resolve.return_value = "/test.docx"
            mock_validate.return_value = {"is_valid": True, "errors": []}
            mock_detect.return_value = {"category": "word", "extension": ".docx", "format_name": "Word"}
            mock_analyze.return_value = {"estimated_pages": 10}
            mock_recommendation.return_value = {"recommendation": "proceed"}
            mock_parse_range.return_value = [1, 2, 3, 4, 5]
            mock_convert.return_value = converted

            # Invoke with explicit values for the optional parameters.
            result = await mixin.convert_to_markdown(
                file_path="/test.docx",
                include_images=False,
                image_mode="files",
                max_image_size=512000,
                preserve_structure=False,
                page_range="1-5",
                bookmark_name="Chapter1",
                chapter_name="Introduction",
                summary_only=False,
                output_dir="/output",
            )

            # The converter must have been reached exactly once.
            mock_convert.assert_called_once()
            args, kwargs = mock_convert.call_args
            # Note: since bookmark_name is provided, page_numbers should be None
            # (bookmark takes precedence over page_range). The captured call
            # arguments are not asserted on here.

    @pytest.mark.asyncio
    async def test_convert_to_markdown_bookmark_priority(self, mixin):
        """Supplying both page_range and bookmark_name still parses the range and converts once."""
        converted = {
            "content": "# Chapter Content",
            "method_used": "python-docx",
            "images": [],
            "processing_notes": [],
        }

        with patch('mcp_office_tools.mixins.word.resolve_office_file_path') as mock_resolve, \
                patch('mcp_office_tools.mixins.word.validate_office_file') as mock_validate, \
                patch('mcp_office_tools.mixins.word.detect_format') as mock_detect, \
                patch.object(mixin, '_analyze_document_size') as mock_analyze, \
                patch.object(mixin, '_get_processing_recommendation') as mock_recommendation, \
                patch.object(mixin, '_parse_page_range') as mock_parse_range, \
                patch.object(mixin, '_convert_docx_to_markdown') as mock_convert:
            mock_resolve.return_value = "/test.docx"
            mock_validate.return_value = {"is_valid": True, "errors": []}
            mock_detect.return_value = {"category": "word", "extension": ".docx", "format_name": "Word"}
            mock_analyze.return_value = {"estimated_pages": 10}
            mock_recommendation.return_value = {"status": "optimal"}
            mock_convert.return_value = converted

            # Both selectors are passed at once.
            result = await mixin.convert_to_markdown(
                "/test.docx",
                page_range="1-10",
                bookmark_name="Chapter1",
            )

            # Note: page_range IS parsed (the parser mock is called), but per the
            # original test's notes the page numbers are then set to None so
            # that bookmark extraction takes priority.
            mock_parse_range.assert_called_once()

            # The conversion itself ran exactly once.
            mock_convert.assert_called_once()

        # The result should carry converted content.
        assert "markdown" in result

    @pytest.mark.asyncio
    async def test_convert_to_markdown_summary_mode(self, mixin):
        """summary_only=True returns a summary and flags it in the metadata."""
        size_report = {
            "estimated_pages": 25,
            "estimated_size": "large",
            "has_images": True,
        }
        recommendation = {
            "recommendation": "summary_recommended",
            "message": "Large document - summary mode recommended",
        }
        # The conversion method is mocked for summary mode as well.
        converted = {
            "content": "# Summary Document\n\nThis is a summary of the content.",
            "method_used": "python-docx",
            "images": [],
            "table_of_contents": {"note": "Summary mode"},
        }

        with patch('mcp_office_tools.mixins.word.resolve_office_file_path') as mock_resolve, \
                patch('mcp_office_tools.mixins.word.validate_office_file') as mock_validate, \
                patch('mcp_office_tools.mixins.word.detect_format') as mock_detect, \
                patch.object(mixin, '_analyze_document_size') as mock_analyze, \
                patch.object(mixin, '_get_processing_recommendation') as mock_recommendation, \
                patch.object(mixin, '_convert_docx_to_markdown') as mock_convert:
            mock_resolve.return_value = "/test.docx"
            mock_validate.return_value = {"is_valid": True, "errors": []}
            mock_detect.return_value = {"category": "word", "extension": ".docx", "format_name": "Word"}
            mock_analyze.return_value = size_report
            mock_recommendation.return_value = recommendation
            mock_convert.return_value = converted

            result = await mixin.convert_to_markdown("/test.docx", summary_only=True)

        # Summary mode returns "summary" rather than "markdown".
        assert "metadata" in result
        assert "summary" in result
        assert result["metadata"]["summary_only"] is True
|
|
|
|
|
|
class TestWordSpecificHelpers:
    """Cover the private helper methods that back the Word tools."""

    @pytest.fixture
    def mixin(self):
        """Return a WordMixin that has been registered on a scratch FastMCP app."""
        server = FastMCP("Test")
        word_mixin = WordMixin()
        word_mixin.register_all(server)
        return word_mixin

    def test_parse_page_range_single_page(self, mixin):
        """A lone page number parses to a one-element list."""
        pages = mixin._parse_page_range("5")
        assert pages == [5]

    def test_parse_page_range_range(self, mixin):
        """A dashed range expands to every page, both ends included."""
        pages = mixin._parse_page_range("1-5")
        assert pages == [1, 2, 3, 4, 5]

    def test_parse_page_range_complex(self, mixin):
        """Comma-separated pages and ranges are combined in order."""
        wanted = [1, 3, 5, 6, 7, 10]
        pages = mixin._parse_page_range("1,3,5-7,10")
        assert pages == wanted

    def test_parse_page_range_invalid(self, mixin):
        """Unparseable or reversed ranges yield an empty list instead of raising."""
        # A non-numeric string is handled gracefully.
        unparseable = mixin._parse_page_range("invalid")
        assert unparseable == []

        # End before start: per the original note, range(10, 6) has no values.
        reversed_range = mixin._parse_page_range("10-5")
        assert reversed_range == []

    def test_get_processing_recommendation(self, mixin):
        """The recommendation status follows document size, page range and summary mode."""
        # The helper reads 'estimated_content_size' (not 'estimated_size') and
        # returns a dict with 'status', 'message', 'suggested_workflow', 'warnings'.

        def large_document():
            # A fresh dict per call, mirroring the original's separate literals.
            return {"estimated_pages": 25, "estimated_content_size": "large"}

        # Small document -> optimal.
        small_document = {"estimated_pages": 3, "estimated_content_size": "small"}
        small_outcome = mixin._get_processing_recommendation(small_document, "", False)
        assert small_outcome["status"] == "optimal"

        # Large document, no page range -> suboptimal, with workflow suggestions.
        unbounded_outcome = mixin._get_processing_recommendation(large_document(), "", False)
        assert unbounded_outcome["status"] == "suboptimal"
        assert len(unbounded_outcome["suggested_workflow"]) > 0

        # Large document restricted to a page range -> optimal.
        ranged_outcome = mixin._get_processing_recommendation(large_document(), "1-5", False)
        assert ranged_outcome["status"] == "optimal"

        # Large document in summary mode -> optimal.
        summary_outcome = mixin._get_processing_recommendation(large_document(), "", True)
        assert summary_outcome["status"] == "optimal"
|
|
|
|
|
|
class TestDirectToolAccess:
    """Call registered Word tools straight from the FastMCP tool registry."""

    @pytest.mark.asyncio
    async def test_tool_execution_direct(self):
        """The registered convert_to_markdown tool raises for a missing file."""
        server = FastMCP("Test App")
        WordMixin().register_all(server)

        # Reach the tool through the registry and trigger its error handling.
        markdown_tool = server._tool_manager._tools["convert_to_markdown"]
        with pytest.raises(OfficeFileError):
            await markdown_tool.fn(file_path="/nonexistent/file.docx")

    @pytest.mark.asyncio
    async def test_tool_parameter_validation_direct(self):
        """The registered tool raises OfficeFileError for an .xlsx path plus extra options.

        NOTE(review): the original comment attributes the error to format
        validation, but "/test.xlsx" is not created by this test, so the error
        may come from path resolution instead - confirm.
        """
        server = FastMCP("Test App")
        WordMixin().register_all(server)

        markdown_tool = server._tool_manager._tools["convert_to_markdown"]

        call_options = {
            "file_path": "/test.xlsx",  # Wrong file type
            "include_images": True,
            "image_mode": "base64",
            "preserve_structure": True,
        }
        with pytest.raises(OfficeFileError):
            await markdown_tool.fn(**call_options)
|
|
|
|
|
|
class TestLegacyWordSupport:
    """Cover conversion of legacy binary Word documents (.doc)."""

    @pytest.fixture
    def mixin(self):
        """Return a WordMixin that has been registered on a scratch FastMCP app."""
        server = FastMCP("Test")
        word_mixin = WordMixin()
        word_mixin.register_all(server)
        return word_mixin

    @pytest.mark.asyncio
    @patch('mcp_office_tools.mixins.word.resolve_office_file_path')
    @patch('mcp_office_tools.mixins.word.validate_office_file')
    @patch('mcp_office_tools.mixins.word.detect_format')
    async def test_convert_legacy_doc_to_markdown(self, mock_detect, mock_validate, mock_resolve, mixin):
        """A .doc file goes through the legacy converter and reports its method."""
        mock_resolve.return_value = "/test.doc"
        mock_validate.return_value = {"is_valid": True, "errors": []}
        mock_detect.return_value = {
            "category": "word",
            "extension": ".doc",
            "format_name": "Word Document (Legacy)",
        }

        legacy_output = {
            "content": "# Legacy Document\n\nContent from .doc file",
            "method_used": "legacy-parser",
            "images": [],
            "processing_notes": ["Converted from legacy format"],
        }

        # Internal methods are mocked; the legacy converter is the one patched.
        with patch.object(mixin, '_analyze_document_size') as mock_analyze, \
                patch.object(mixin, '_get_processing_recommendation') as mock_recommendation, \
                patch.object(mixin, '_convert_doc_to_markdown') as mock_convert:
            mock_analyze.return_value = {"estimated_pages": 3}
            mock_recommendation.return_value = {"recommendation": "proceed"}
            mock_convert.return_value = legacy_output

            result = await mixin.convert_to_markdown("/test.doc")

        # The legacy content and the converter name made it into the result.
        assert "# Legacy Document" in result["markdown"]
        assert "legacy-parser" in str(result["metadata"])
        # Note: processing_notes stay internal to the conversion and are not
        # part of the returned result.
        assert "metadata" in result
        assert "conversion_method" in result["metadata"]
|
|
|
|
|
|
class TestPageRangeFiltering:
    """Check page_range content filtering in convert_to_markdown.

    The tests confirm that the page_range parameter filters content, whether
    the pages come from explicit page breaks or from estimated paragraph
    counts.
    """

    @pytest.fixture
    def mixin(self):
        """Return a WordMixin that has been registered on a scratch FastMCP app."""
        server = FastMCP("Test")
        word_mixin = WordMixin()
        word_mixin.register_all(server)
        return word_mixin

    @pytest.mark.asyncio
    @patch('mcp_office_tools.mixins.word.resolve_office_file_path')
    @patch('mcp_office_tools.mixins.word.validate_office_file')
    @patch('mcp_office_tools.mixins.word.detect_format')
    async def test_page_range_filters_different_content(self, mock_detect, mock_validate, mock_resolve, mixin):
        """Distinct page_range values must produce distinct markdown.

        Key regression test for the page_range bug in which
        include_current_page was set but never used to filter content.
        """
        mock_resolve.return_value = "/test.docx"
        mock_validate.return_value = {"is_valid": True, "errors": []}
        mock_detect.return_value = {"category": "word", "extension": ".docx", "format_name": "Word Document"}

        def conversion_payload(markdown_text, paragraph_count):
            # Shared shape of every fake conversion result.
            return {
                "content": markdown_text,
                "method_used": "python-docx-custom",
                "images": [],
                "structure": {"headings": [], "tables": 0, "lists": 0, "paragraphs": paragraph_count},
            }

        # Mutable cell so the nested function can count its invocations.
        call_count = [0]

        def mock_convert_side_effect(*args, **kwargs):
            call_count[0] += 1
            # Page numbers arrive as the sixth positional argument or by keyword.
            if len(args) > 5:
                page_numbers = args[5]
            else:
                page_numbers = kwargs.get('page_numbers')

            if page_numbers == [1, 2]:
                return conversion_payload("# Page 1-2 Content\n\nThis is from pages 1 and 2.", 5)
            if page_numbers == [10, 11]:
                return conversion_payload("# Page 10-11 Content\n\nThis is from pages 10 and 11.", 5)
            return conversion_payload("# Full Content", 20)

        with patch.object(mixin, '_analyze_document_size') as mock_analyze, \
                patch.object(mixin, '_get_processing_recommendation') as mock_recommend:
            mock_analyze.return_value = {"estimated_pages": 10}
            mock_recommend.return_value = {"status": "optimal", "message": "", "suggested_workflow": [], "warnings": []}

            with patch.object(mixin, '_convert_docx_to_markdown', side_effect=mock_convert_side_effect):
                # First request: pages 1-2.
                result_1_2 = await mixin.convert_to_markdown(
                    file_path="/test.docx",
                    page_range="1-2",
                )

                # Second request: pages 10-11.
                result_10_11 = await mixin.convert_to_markdown(
                    file_path="/test.docx",
                    page_range="10-11",
                )

        # Each range must surface its own content, and the two must differ.
        first_markdown = result_1_2["markdown"]
        second_markdown = result_10_11["markdown"]
        assert "Page 1-2" in first_markdown
        assert "Page 10-11" in second_markdown
        assert first_markdown != second_markdown
|
|
|
|
|
|
if __name__ == "__main__":
    # Allow running this test module directly: hand this file to pytest in verbose mode.
    cli_arguments = [__file__, "-v"]
    pytest.main(cli_arguments)