# mcp-office-tools/tests/test_word_mixin.py

"""Focused tests for WordMixin functionality.

This module tests the WordMixin in isolation, focusing on:
- Word-specific tool functionality
- Markdown conversion capabilities
- Chapter and bookmark extraction
- Parameter validation for Word-specific features
"""

import pytest
from unittest.mock import AsyncMock, MagicMock, patch
from pathlib import Path

from fastmcp import FastMCP
# FastMCP testing - using direct tool access

from mcwaddams.mixins.word import WordMixin
from mcwaddams.utils import OfficeFileError


class TestWordMixinRegistration:
    """Verify that WordMixin wires its tools into a FastMCP application."""

    # Tool names WordMixin is expected to expose after register_all().
    EXPECTED_TOOL_NAMES = (
        "convert_to_markdown",
        "extract_word_tables",
        "analyze_word_structure",
        "get_document_outline",
        "check_style_consistency",
        "search_document",
        "extract_entities",
        "get_chapter_summaries",
        "save_reading_progress",
        "get_reading_progress",
    )

    def test_mixin_initialization(self):
        """A fresh WordMixin registers exactly ten tools on the app."""
        server = FastMCP("Test Word")
        word_mixin = WordMixin()
        word_mixin.register_all(server)

        tool_registry = server._tool_manager._tools
        assert word_mixin is not None
        # One entry per name listed in EXPECTED_TOOL_NAMES.
        assert len(tool_registry) == 10

    def test_tool_names_registered(self):
        """Each expected Word tool name appears in the app's tool registry."""
        server = FastMCP("Test Word")
        WordMixin().register_all(server)

        registered_names = set(server._tool_manager._tools.keys())
        # Subset comparison: extra registered tools are tolerated here.
        assert set(self.EXPECTED_TOOL_NAMES) <= registered_names


class TestConvertToMarkdown:
    """Exercise the convert_to_markdown tool with its collaborators mocked.

    Module-level helpers looked up in ``mcwaddams.mixins.word`` (path
    resolution, file validation, format detection) are replaced with
    ``@patch`` decorators. Decorators are applied bottom-up, so the mock
    parameters arrive as ``mock_detect, mock_validate, mock_resolve``.
    Private mixin methods are replaced with ``patch.object`` on the
    fixture instance inside a single flat ``with`` statement.
    """

    @pytest.fixture
    def mixin(self):
        """Create a WordMixin registered on a throwaway FastMCP app."""
        app = FastMCP("Test")
        mixin = WordMixin()
        mixin.register_all(app)
        return mixin

    @pytest.mark.asyncio
    async def test_convert_to_markdown_nonexistent_file(self, mixin):
        """A path that does not exist raises OfficeFileError."""
        with pytest.raises(OfficeFileError):
            await mixin.convert_to_markdown("/nonexistent/file.docx")

    @pytest.mark.asyncio
    @patch('mcwaddams.mixins.word.resolve_office_file_path')
    @patch('mcwaddams.mixins.word.validate_office_file')
    @patch('mcwaddams.mixins.word.detect_format')
    async def test_convert_to_markdown_validation_failure(self, mock_detect, mock_validate, mock_resolve, mixin):
        """A failed validation result surfaces as OfficeFileError with its message."""
        mock_resolve.return_value = "/test.docx"
        mock_validate.return_value = {
            "is_valid": False,
            "errors": ["File is password protected"]
        }

        with pytest.raises(OfficeFileError, match="Invalid file: File is password protected"):
            await mixin.convert_to_markdown("/test.docx")

    @pytest.mark.asyncio
    @patch('mcwaddams.mixins.word.resolve_office_file_path')
    @patch('mcwaddams.mixins.word.validate_office_file')
    @patch('mcwaddams.mixins.word.detect_format')
    async def test_convert_to_markdown_non_word_document(self, mock_detect, mock_validate, mock_resolve, mixin):
        """A valid file detected as a non-Word format is rejected."""
        mock_resolve.return_value = "/test.xlsx"
        mock_validate.return_value = {"is_valid": True, "errors": []}
        mock_detect.return_value = {
            "category": "excel",
            "extension": ".xlsx",
            "format_name": "Excel"
        }

        with pytest.raises(OfficeFileError, match="Markdown conversion currently only supports Word documents"):
            await mixin.convert_to_markdown("/test.xlsx")

    @pytest.mark.asyncio
    @patch('mcwaddams.mixins.word.resolve_office_file_path')
    @patch('mcwaddams.mixins.word.validate_office_file')
    @patch('mcwaddams.mixins.word.detect_format')
    async def test_convert_to_markdown_docx_success(self, mock_detect, mock_validate, mock_resolve, mixin):
        """A mocked DOCX conversion yields the ``markdown`` and ``metadata`` keys."""
        mock_resolve.return_value = "/test.docx"
        mock_validate.return_value = {"is_valid": True, "errors": []}
        mock_detect.return_value = {
            "category": "word",
            "extension": ".docx",
            "format_name": "Word Document"
        }

        # Replace the mixin's internal pipeline steps with canned results.
        with patch.object(mixin, '_analyze_document_size') as mock_analyze, \
                patch.object(mixin, '_get_processing_recommendation') as mock_recommendation, \
                patch.object(mixin, '_convert_docx_to_markdown') as mock_convert:
            mock_analyze.return_value = {
                "estimated_pages": 5,
                "estimated_size": "medium",
                "has_images": True,
                "has_complex_formatting": False
            }
            mock_recommendation.return_value = {
                "recommendation": "proceed",
                "message": "Document size is manageable for full conversion"
            }
            mock_convert.return_value = {
                "content": "# Test Document\n\nThis is test content.",
                "method_used": "python-docx",
                "images": [],
                "processing_notes": []
            }

            result = await mixin.convert_to_markdown("/test.docx")

            # Verify structure - actual implementation uses these keys
            assert "markdown" in result
            assert "metadata" in result

            # Verify content
            assert "# Test Document" in result["markdown"]
            assert result["metadata"]["format"] == "Word Document"
            assert "conversion_time" in result["metadata"]
            assert "conversion_method" in result["metadata"]

    @pytest.mark.asyncio
    @patch('mcwaddams.mixins.word.resolve_office_file_path')
    @patch('mcwaddams.mixins.word.validate_office_file')
    @patch('mcwaddams.mixins.word.detect_format')
    async def test_convert_to_markdown_parameter_handling(self, mock_detect, mock_validate, mock_resolve, mixin):
        """The full set of optional keywords is accepted and the converter runs once."""
        mock_resolve.return_value = "/test.docx"
        mock_validate.return_value = {"is_valid": True, "errors": []}
        mock_detect.return_value = {"category": "word", "extension": ".docx", "format_name": "Word"}

        with patch.object(mixin, '_analyze_document_size') as mock_analyze, \
                patch.object(mixin, '_get_processing_recommendation') as mock_recommendation, \
                patch.object(mixin, '_parse_page_range') as mock_parse_range, \
                patch.object(mixin, '_convert_docx_to_markdown') as mock_convert:
            mock_analyze.return_value = {"estimated_pages": 10}
            mock_recommendation.return_value = {"recommendation": "proceed"}
            mock_parse_range.return_value = [1, 2, 3, 4, 5]
            mock_convert.return_value = {
                "content": "# Test",
                "method_used": "python-docx",
                "images": [],
                "processing_notes": []
            }

            # Call with every optional parameter set to a non-default-looking value.
            await mixin.convert_to_markdown(
                file_path="/test.docx",
                include_images=False,
                image_mode="files",
                max_image_size=512000,
                preserve_structure=False,
                page_range="1-5",
                bookmark_name="Chapter1",
                chapter_name="Introduction",
                summary_only=False,
                output_dir="/output"
            )

            # Only the call count is checked; the forwarded arguments are not inspected.
            # NOTE(review): because bookmark_name is provided, page_numbers is
            # expected to be None (bookmark takes precedence over page_range) --
            # confirm against the converter signature before asserting it here.
            mock_convert.assert_called_once()

    @pytest.mark.asyncio
    @patch('mcwaddams.mixins.word.resolve_office_file_path')
    @patch('mcwaddams.mixins.word.validate_office_file')
    @patch('mcwaddams.mixins.word.detect_format')
    async def test_convert_to_markdown_bookmark_priority(self, mock_detect, mock_validate, mock_resolve, mixin):
        """Supplying both page_range and bookmark_name still converts exactly once."""
        mock_resolve.return_value = "/test.docx"
        mock_validate.return_value = {"is_valid": True, "errors": []}
        mock_detect.return_value = {"category": "word", "extension": ".docx", "format_name": "Word"}

        with patch.object(mixin, '_analyze_document_size') as mock_analyze, \
                patch.object(mixin, '_get_processing_recommendation') as mock_recommendation, \
                patch.object(mixin, '_parse_page_range') as mock_parse_range, \
                patch.object(mixin, '_convert_docx_to_markdown') as mock_convert:
            mock_analyze.return_value = {"estimated_pages": 10}
            mock_recommendation.return_value = {"status": "optimal"}
            mock_convert.return_value = {
                "content": "# Chapter Content",
                "method_used": "python-docx",
                "images": [],
                "processing_notes": []
            }

            # Call with both page_range and bookmark_name
            result = await mixin.convert_to_markdown(
                "/test.docx",
                page_range="1-10",
                bookmark_name="Chapter1"
            )

            # Note: page_range IS parsed (mock_parse_range is called)
            # but when bookmark_name is provided, the page_numbers are
            # set to None to prioritize bookmark extraction
            mock_parse_range.assert_called_once()

            # The converter must have run exactly once and produced content.
            mock_convert.assert_called_once()
            assert "markdown" in result

    @pytest.mark.asyncio
    @patch('mcwaddams.mixins.word.resolve_office_file_path')
    @patch('mcwaddams.mixins.word.validate_office_file')
    @patch('mcwaddams.mixins.word.detect_format')
    async def test_convert_to_markdown_summary_mode(self, mock_detect, mock_validate, mock_resolve, mixin):
        """summary_only=True returns a ``summary`` key and flags it in metadata."""
        mock_resolve.return_value = "/test.docx"
        mock_validate.return_value = {"is_valid": True, "errors": []}
        mock_detect.return_value = {"category": "word", "extension": ".docx", "format_name": "Word"}

        # The conversion method is mocked as well; summary mode still calls into it.
        with patch.object(mixin, '_analyze_document_size') as mock_analyze, \
                patch.object(mixin, '_get_processing_recommendation') as mock_recommendation, \
                patch.object(mixin, '_convert_docx_to_markdown') as mock_convert:
            mock_analyze.return_value = {
                "estimated_pages": 25,
                "estimated_size": "large",
                "has_images": True
            }
            mock_recommendation.return_value = {
                "recommendation": "summary_recommended",
                "message": "Large document - summary mode recommended"
            }
            mock_convert.return_value = {
                "content": "# Summary Document\n\nThis is a summary of the content.",
                "method_used": "python-docx",
                "images": [],
                "table_of_contents": {"note": "Summary mode"}
            }

            result = await mixin.convert_to_markdown(
                "/test.docx",
                summary_only=True
            )

            # Verify that summary information is returned
            assert "metadata" in result
            assert "summary" in result  # Summary mode returns "summary" not "markdown"
            assert result["metadata"]["summary_only"] is True


class TestWordSpecificHelpers:
    """Unit checks for WordMixin's private helper methods."""

    @pytest.fixture
    def mixin(self):
        """Provide a WordMixin already registered against a scratch FastMCP app."""
        scratch_app = FastMCP("Test")
        word_mixin = WordMixin()
        word_mixin.register_all(scratch_app)
        return word_mixin

    def test_parse_page_range_single_page(self, mixin):
        """A lone page number parses to a one-element list."""
        assert mixin._parse_page_range("5") == [5]

    def test_parse_page_range_range(self, mixin):
        """A dashed range expands to every page, both ends included."""
        pages = mixin._parse_page_range("1-5")
        assert pages == [1, 2, 3, 4, 5]

    def test_parse_page_range_complex(self, mixin):
        """Comma-separated singles and ranges are combined in order."""
        pages = mixin._parse_page_range("1,3,5-7,10")
        assert pages == [1, 3, 5, 6, 7, 10]

    def test_parse_page_range_invalid(self, mixin):
        """Unparseable or reversed ranges yield an empty list rather than an error."""
        # "invalid" is not numeric; "10-5" has its end before its start.
        for bad_spec in ("invalid", "10-5"):
            assert mixin._parse_page_range(bad_spec) == []

    def test_get_processing_recommendation(self, mixin):
        """Recommendation status depends on document size, page range and summary flag."""
        # The helper reads 'estimated_content_size' (not 'estimated_size') and
        # its result carries 'status' and 'suggested_workflow' among its keys.
        recommend = mixin._get_processing_recommendation

        def large_document():
            # Build a fresh dict per call so no state is shared between scenarios.
            return {"estimated_pages": 25, "estimated_content_size": "large"}

        # Small document -> optimal.
        small_outcome = recommend(
            {"estimated_pages": 3, "estimated_content_size": "small"}, "", False
        )
        assert small_outcome["status"] == "optimal"

        # Large document with neither page range nor summary -> suboptimal,
        # accompanied by at least one suggested workflow step.
        unbounded_outcome = recommend(large_document(), "", False)
        assert unbounded_outcome["status"] == "suboptimal"
        assert len(unbounded_outcome["suggested_workflow"]) > 0

        # Large document restricted to a page range -> optimal.
        ranged_outcome = recommend(large_document(), "1-5", False)
        assert ranged_outcome["status"] == "optimal"

        # Large document in summary mode -> optimal.
        summary_outcome = recommend(large_document(), "", True)
        assert summary_outcome["status"] == "optimal"


class TestDirectToolAccess:
    """Call registered tools through FastMCP's tool manager instead of the mixin."""

    @staticmethod
    def _convert_tool():
        """Register WordMixin on a new app and return its convert_to_markdown tool."""
        server = FastMCP("Test App")
        WordMixin().register_all(server)
        return server._tool_manager._tools["convert_to_markdown"]

    @pytest.mark.asyncio
    async def test_tool_execution_direct(self):
        """Invoking the tool's function on a missing file raises OfficeFileError."""
        tool = self._convert_tool()

        with pytest.raises(OfficeFileError):
            await tool.fn(file_path="/nonexistent/file.docx")

    @pytest.mark.asyncio
    async def test_tool_parameter_validation_direct(self):
        """An .xlsx path combined with image options is rejected with OfficeFileError."""
        tool = self._convert_tool()

        call_arguments = {
            "file_path": "/test.xlsx",  # Wrong file type
            "include_images": True,
            "image_mode": "base64",
            "preserve_structure": True,
        }

        # NOTE(review): this test does not create /test.xlsx, so the error may
        # come from path resolution rather than format validation -- confirm.
        with pytest.raises(OfficeFileError):
            await tool.fn(**call_arguments)


class TestLegacyWordSupport:
    """Test support for legacy Word documents (.doc)."""

    @pytest.fixture
    def mixin(self):
        """Create a WordMixin registered on a throwaway FastMCP app."""
        app = FastMCP("Test")
        mixin = WordMixin()
        mixin.register_all(app)
        return mixin

    @pytest.mark.asyncio
    @patch('mcwaddams.mixins.word.resolve_office_file_path')
    @patch('mcwaddams.mixins.word.validate_office_file')
    @patch('mcwaddams.mixins.word.detect_format')
    async def test_convert_legacy_doc_to_markdown(self, mock_detect, mock_validate, mock_resolve, mixin):
        """A ``.doc`` file is converted through ``_convert_doc_to_markdown``.

        The ``@patch`` decorators are applied bottom-up, hence the
        ``mock_detect, mock_validate, mock_resolve`` parameter order.
        """
        mock_resolve.return_value = "/test.doc"
        mock_validate.return_value = {"is_valid": True, "errors": []}
        mock_detect.return_value = {
            "category": "word",
            "extension": ".doc",
            "format_name": "Word Document (Legacy)"
        }

        # Replace the mixin's internal steps; the legacy converter is the one under test.
        with patch.object(mixin, '_analyze_document_size') as mock_analyze, \
                patch.object(mixin, '_get_processing_recommendation') as mock_recommendation, \
                patch.object(mixin, '_convert_doc_to_markdown') as mock_convert:
            mock_analyze.return_value = {"estimated_pages": 3}
            mock_recommendation.return_value = {"recommendation": "proceed"}
            mock_convert.return_value = {
                "content": "# Legacy Document\n\nContent from .doc file",
                "method_used": "legacy-parser",
                "images": [],
                "processing_notes": ["Converted from legacy format"]
            }

            result = await mixin.convert_to_markdown("/test.doc")

            # Check key presence first, so a missing key fails as an assertion
            # rather than as a KeyError from the content checks below.
            assert "markdown" in result
            assert "metadata" in result
            assert "conversion_method" in result["metadata"]

            # Verify legacy conversion worked
            assert "# Legacy Document" in result["markdown"]
            assert "legacy-parser" in str(result["metadata"])
            # Note: processing_notes are not in the result, only in internal conversion


class TestPageRangeFiltering:
    """Test page_range handling for convert_to_markdown.

    The DOCX converter itself is mocked in this class, so these tests check
    that the parsed page range reaches the converter; they do not exercise
    the real page-break or paragraph-count filtering.
    """

    @pytest.fixture
    def mixin(self):
        """Create a WordMixin registered on a throwaway FastMCP app."""
        app = FastMCP("Test")
        mixin = WordMixin()
        mixin.register_all(app)
        return mixin

    @pytest.mark.asyncio
    @patch('mcwaddams.mixins.word.resolve_office_file_path')
    @patch('mcwaddams.mixins.word.validate_office_file')
    @patch('mcwaddams.mixins.word.detect_format')
    async def test_page_range_filters_different_content(self, mock_detect, mock_validate, mock_resolve, mixin):
        """Test that different page_range values return different content.

        This is the key regression test for the page_range bug where
        include_current_page was set but never used to filter content.
        The fake converter picks its payload from the ``page_numbers`` it
        receives, so the assertions only pass when convert_to_markdown
        forwards ``[1, 2]`` and ``[10, 11]`` respectively.
        """
        mock_resolve.return_value = "/test.docx"
        mock_validate.return_value = {"is_valid": True, "errors": []}
        mock_detect.return_value = {"category": "word", "extension": ".docx", "format_name": "Word Document"}

        def conversion_payload(content, paragraphs):
            # Shape of a converter result; only content and paragraph count vary.
            return {
                "content": content,
                "method_used": "python-docx-custom",
                "images": [],
                "structure": {"headings": [], "tables": 0, "lists": 0, "paragraphs": paragraphs}
            }

        def fake_convert(*args, **kwargs):
            # page_numbers is read from positional index 5 or, failing that, by keyword.
            page_numbers = args[5] if len(args) > 5 else kwargs.get('page_numbers')
            if page_numbers == [1, 2]:
                return conversion_payload("# Page 1-2 Content\n\nThis is from pages 1 and 2.", 5)
            if page_numbers == [10, 11]:
                return conversion_payload("# Page 10-11 Content\n\nThis is from pages 10 and 11.", 5)
            # Any other value (including None) is treated as a full conversion.
            return conversion_payload("# Full Content", 20)

        with patch.object(mixin, '_analyze_document_size') as mock_analyze, \
                patch.object(mixin, '_get_processing_recommendation') as mock_recommend, \
                patch.object(mixin, '_convert_docx_to_markdown', side_effect=fake_convert):
            mock_analyze.return_value = {"estimated_pages": 10}
            mock_recommend.return_value = {"status": "optimal", "message": "", "suggested_workflow": [], "warnings": []}

            # Test page_range 1-2
            result_1_2 = await mixin.convert_to_markdown(
                file_path="/test.docx",
                page_range="1-2"
            )

            # Test page_range 10-11
            result_10_11 = await mixin.convert_to_markdown(
                file_path="/test.docx",
                page_range="10-11"
            )

            # The content should be different for different page ranges
            assert "Page 1-2" in result_1_2["markdown"]
            assert "Page 10-11" in result_10_11["markdown"]
            assert result_1_2["markdown"] != result_10_11["markdown"]


# Allow running this file directly as a script: hands this file's path to
# pytest with verbose (-v) output.
if __name__ == "__main__":
    pytest.main([__file__, "-v"])