# mcp-office-tools/tests/test_word_mixin.py

"""Focused tests for WordMixin functionality.

This module tests the WordMixin in isolation, focusing on:
- Word-specific tool functionality
- Markdown conversion capabilities
- Chapter and bookmark extraction
- Parameter validation for Word-specific features
"""

import pytest
from unittest.mock import AsyncMock, MagicMock, patch
from pathlib import Path

from fastmcp import FastMCP
# FastMCP testing - using direct tool access

from mcp_office_tools.mixins.word import WordMixin
from mcp_office_tools.utils import OfficeFileError


class TestWordMixinRegistration:
    """Check that constructing a WordMixin registers its tools on the app.

    Both tests inspect the app's ``_tools`` mapping directly.
    """

    def test_mixin_initialization(self):
        """The mixin stores the app it was given and the app ends up with one tool."""
        word_app = FastMCP("Test Word")
        word_mixin = WordMixin(word_app)

        assert word_mixin.app == word_app
        tool_count = len(word_app._tools)
        assert tool_count == 1  # 1 word tool

    def test_tool_names_registered(self):
        """Every expected Word tool name is present among the registered tools."""
        word_app = FastMCP("Test Word")
        WordMixin(word_app)

        registered_tools = set(word_app._tools.keys())
        # Anything left after the set difference is an expected tool that was not registered.
        missing_tools = {"convert_to_markdown"} - registered_tools
        assert not missing_tools


class TestConvertToMarkdown:
    """Test convert_to_markdown tool functionality.

    Most tests replace path resolution, validation and format detection with
    ``@patch`` decorators and the mixin's private helpers with
    ``patch.object``, then call the tool with placeholder paths.

    Stacked ``@patch`` decorators inject their mocks bottom-up, which is why
    every patched test takes ``(mock_detect, mock_validate, mock_resolve)``
    in that order, followed by the ``mixin`` fixture.

    NOTE(review): the decorators patch the helpers in the modules that define
    them (``mcp_office_tools.utils.validation`` / ``file_detection``). If
    WordMixin binds those helpers under another name at import time, the
    patches must target that name instead - confirm against the mixin module.
    """

    @pytest.fixture
    def mixin(self):
        """Create a WordMixin bound to a fresh FastMCP app."""
        app = FastMCP("Test")
        return WordMixin(app)

    @pytest.mark.asyncio
    async def test_convert_to_markdown_nonexistent_file(self, mixin):
        """A path that does not exist raises OfficeFileError (nothing is mocked)."""
        with pytest.raises(OfficeFileError):
            await mixin.convert_to_markdown("/nonexistent/file.docx")

    @pytest.mark.asyncio
    @patch('mcp_office_tools.utils.validation.resolve_office_file_path')
    @patch('mcp_office_tools.utils.validation.validate_office_file')
    @patch('mcp_office_tools.utils.file_detection.detect_format')
    async def test_convert_to_markdown_validation_failure(self, mock_detect, mock_validate, mock_resolve, mixin):
        """A failed validation result is surfaced as OfficeFileError with its message."""
        mock_resolve.return_value = "/test.docx"
        mock_validate.return_value = {
            "is_valid": False,
            "errors": ["File is password protected"]
        }

        with pytest.raises(OfficeFileError, match="Invalid file: File is password protected"):
            await mixin.convert_to_markdown("/test.docx")

    @pytest.mark.asyncio
    @patch('mcp_office_tools.utils.validation.resolve_office_file_path')
    @patch('mcp_office_tools.utils.validation.validate_office_file')
    @patch('mcp_office_tools.utils.file_detection.detect_format')
    async def test_convert_to_markdown_non_word_document(self, mock_detect, mock_validate, mock_resolve, mixin):
        """A valid file detected as a non-Word format is rejected."""
        mock_resolve.return_value = "/test.xlsx"
        mock_validate.return_value = {"is_valid": True, "errors": []}
        mock_detect.return_value = {
            "category": "excel",
            "extension": ".xlsx",
            "format_name": "Excel"
        }

        with pytest.raises(OfficeFileError, match="Markdown conversion currently only supports Word documents"):
            await mixin.convert_to_markdown("/test.xlsx")

    @pytest.mark.asyncio
    @patch('mcp_office_tools.utils.validation.resolve_office_file_path')
    @patch('mcp_office_tools.utils.validation.validate_office_file')
    @patch('mcp_office_tools.utils.file_detection.detect_format')
    async def test_convert_to_markdown_docx_success(self, mock_detect, mock_validate, mock_resolve, mixin):
        """A mocked DOCX conversion returns markdown, metadata and processing info."""
        mock_resolve.return_value = "/test.docx"
        mock_validate.return_value = {"is_valid": True, "errors": []}
        mock_detect.return_value = {
            "category": "word",
            "extension": ".docx",
            "format_name": "Word Document"
        }

        # Replace the mixin's internal helpers; only the orchestration in
        # convert_to_markdown itself runs for real.
        with patch.object(mixin, '_analyze_document_size') as mock_analyze, \
                patch.object(mixin, '_get_processing_recommendation') as mock_recommendation, \
                patch.object(mixin, '_convert_docx_to_markdown') as mock_convert:
            mock_analyze.return_value = {
                "estimated_pages": 5,
                "estimated_size": "medium",
                "has_images": True,
                "has_complex_formatting": False
            }
            mock_recommendation.return_value = {
                "recommendation": "proceed",
                "message": "Document size is manageable for full conversion"
            }
            mock_convert.return_value = {
                "markdown": "# Test Document\n\nThis is test content.",
                "images": [],
                "metadata": {"conversion_method": "python-docx"},
                "processing_notes": []
            }

            result = await mixin.convert_to_markdown("/test.docx")

        # Verify structure
        assert "markdown" in result
        assert "metadata" in result
        assert "processing_info" in result

        # Verify content
        assert "# Test Document" in result["markdown"]
        assert result["metadata"]["format"] == "Word Document"
        assert "conversion_time" in result["metadata"]

    @pytest.mark.asyncio
    @patch('mcp_office_tools.utils.validation.resolve_office_file_path')
    @patch('mcp_office_tools.utils.validation.validate_office_file')
    @patch('mcp_office_tools.utils.file_detection.detect_format')
    async def test_convert_to_markdown_parameter_handling(self, mock_detect, mock_validate, mock_resolve, mixin):
        """Calling with every optional parameter set triggers exactly one conversion."""
        mock_resolve.return_value = "/test.docx"
        mock_validate.return_value = {"is_valid": True, "errors": []}
        mock_detect.return_value = {"category": "word", "extension": ".docx", "format_name": "Word"}

        with patch.object(mixin, '_analyze_document_size') as mock_analyze, \
                patch.object(mixin, '_get_processing_recommendation') as mock_recommendation, \
                patch.object(mixin, '_parse_page_range') as mock_parse_range, \
                patch.object(mixin, '_convert_docx_to_markdown') as mock_convert:
            mock_analyze.return_value = {"estimated_pages": 10}
            mock_recommendation.return_value = {"recommendation": "proceed"}
            mock_parse_range.return_value = [1, 2, 3, 4, 5]
            mock_convert.return_value = {
                "markdown": "# Test",
                "images": [],
                "metadata": {},
                "processing_notes": []
            }

            # Test with specific parameters
            await mixin.convert_to_markdown(
                file_path="/test.docx",
                include_images=False,
                image_mode="files",
                max_image_size=512000,
                preserve_structure=False,
                page_range="1-5",
                bookmark_name="Chapter1",
                chapter_name="Introduction",
                summary_only=False,
                output_dir="/output"
            )

        # Only the call count is checked here. bookmark_name is supplied
        # alongside page_range; the expectation that the bookmark wins is
        # asserted in test_convert_to_markdown_bookmark_priority.
        mock_convert.assert_called_once()

    @pytest.mark.asyncio
    @patch('mcp_office_tools.utils.validation.resolve_office_file_path')
    @patch('mcp_office_tools.utils.validation.validate_office_file')
    @patch('mcp_office_tools.utils.file_detection.detect_format')
    async def test_convert_to_markdown_bookmark_priority(self, mock_detect, mock_validate, mock_resolve, mixin):
        """Test that bookmark extraction takes priority over page ranges."""
        mock_resolve.return_value = "/test.docx"
        mock_validate.return_value = {"is_valid": True, "errors": []}
        mock_detect.return_value = {"category": "word", "extension": ".docx", "format_name": "Word"}

        # The analysis helpers are patched without return values; they yield
        # plain MagicMock objects.
        with patch.object(mixin, '_analyze_document_size'), \
                patch.object(mixin, '_get_processing_recommendation'), \
                patch.object(mixin, '_parse_page_range') as mock_parse_range, \
                patch.object(mixin, '_convert_docx_to_markdown') as mock_convert:
            mock_convert.return_value = {
                "markdown": "# Chapter Content",
                "images": [],
                "metadata": {},
                "processing_notes": []
            }

            # Call with both page_range and bookmark_name
            await mixin.convert_to_markdown(
                "/test.docx",
                page_range="1-10",
                bookmark_name="Chapter1"
            )

        # Verify that page range parsing was NOT called
        # (because bookmark takes priority)
        mock_parse_range.assert_not_called()

    @pytest.mark.asyncio
    @patch('mcp_office_tools.utils.validation.resolve_office_file_path')
    @patch('mcp_office_tools.utils.validation.validate_office_file')
    @patch('mcp_office_tools.utils.file_detection.detect_format')
    async def test_convert_to_markdown_summary_mode(self, mock_detect, mock_validate, mock_resolve, mixin):
        """summary_only=True still returns metadata and processing info."""
        mock_resolve.return_value = "/test.docx"
        mock_validate.return_value = {"is_valid": True, "errors": []}
        mock_detect.return_value = {"category": "word", "extension": ".docx", "format_name": "Word"}

        with patch.object(mixin, '_analyze_document_size') as mock_analyze, \
                patch.object(mixin, '_get_processing_recommendation') as mock_recommendation:
            mock_analyze.return_value = {
                "estimated_pages": 25,
                "estimated_size": "large",
                "has_images": True
            }
            mock_recommendation.return_value = {
                "recommendation": "summary_recommended",
                "message": "Large document - summary mode recommended"
            }

            result = await mixin.convert_to_markdown(
                "/test.docx",
                summary_only=True
            )

        # Verify that summary information is returned
        assert "metadata" in result
        assert "processing_info" in result
        # NOTE(review): _convert_docx_to_markdown is not patched here and
        # nothing asserts whether conversion is skipped in summary mode -
        # confirm the intended behavior before adding such an assertion.


class TestWordSpecificHelpers:
    """Unit tests for WordMixin's private helpers, called directly on the mixin."""

    @pytest.fixture
    def mixin(self):
        """Provide a WordMixin attached to a throwaway FastMCP app."""
        return WordMixin(FastMCP("Test"))

    def test_parse_page_range_single_page(self, mixin):
        """A lone page number parses to a one-element list."""
        assert mixin._parse_page_range("5") == [5]

    def test_parse_page_range_range(self, mixin):
        """A dashed range expands to every page, both ends included."""
        pages = mixin._parse_page_range("1-5")
        assert pages == [1, 2, 3, 4, 5]

    def test_parse_page_range_complex(self, mixin):
        """Comma-separated single pages and ranges can be mixed in one spec."""
        pages = mixin._parse_page_range("1,3,5-7,10")
        assert pages == [1, 3, 5, 6, 7, 10]

    def test_parse_page_range_invalid(self, mixin):
        """Malformed specs raise OfficeFileError."""
        bad_specs = (
            "invalid",
            "10-5",  # End before start
        )
        for spec in bad_specs:
            with pytest.raises(OfficeFileError):
                mixin._parse_page_range(spec)

    def test_get_processing_recommendation(self, mixin):
        """Each (analysis, page range, summary flag) scenario yields the expected recommendation."""
        # Each row carries its own dict literal so no analysis object is
        # shared between calls.
        scenarios = [
            # Small document - proceed normally
            ({"estimated_pages": 3, "estimated_size": "small"}, "", False, "proceed"),
            # Large document without page range - suggest summary
            ({"estimated_pages": 25, "estimated_size": "large"}, "", False, "summary_recommended"),
            # Large document with page range - proceed
            ({"estimated_pages": 25, "estimated_size": "large"}, "1-5", False, "proceed"),
            # Summary mode requested - proceed with summary
            ({"estimated_pages": 25, "estimated_size": "large"}, "", True, "proceed"),
        ]

        for doc_analysis, range_spec, summary_flag, expected in scenarios:
            outcome = mixin._get_processing_recommendation(doc_analysis, range_spec, summary_flag)
            assert outcome["recommendation"] == expected


class TestDirectToolAccess:
    """Call the registered tool object fetched straight from the app's ``_tools`` mapping."""

    @pytest.mark.asyncio
    async def test_tool_execution_direct(self):
        """Invoking the registered tool with a missing file raises OfficeFileError."""
        test_app = FastMCP("Test App")
        WordMixin(test_app)

        # Look the tool up by name instead of going through the mixin instance.
        tool = test_app._tools["convert_to_markdown"]
        with pytest.raises(OfficeFileError):
            await tool(file_path="/nonexistent/file.docx")

    @pytest.mark.asyncio
    async def test_tool_parameter_validation_direct(self):
        """Invoking the tool with extra options on an .xlsx path raises OfficeFileError."""
        test_app = FastMCP("Test App")
        WordMixin(test_app)

        tool = test_app._tools["convert_to_markdown"]
        call_kwargs = {
            "file_path": "/test.xlsx",  # Wrong file type
            "include_images": True,
            "image_mode": "base64",
            "preserve_structure": True,
        }

        # NOTE(review): /test.xlsx is not created by this test, so the error
        # may come from path resolution rather than from the Word-only format
        # check - confirm which branch is actually exercised.
        with pytest.raises(OfficeFileError):
            await tool(**call_kwargs)


class TestLegacyWordSupport:
    """Cover the conversion path for legacy ``.doc`` Word files."""

    @pytest.fixture
    def mixin(self):
        """Provide a WordMixin attached to a throwaway FastMCP app."""
        return WordMixin(FastMCP("Test"))

    @pytest.mark.asyncio
    @patch('mcp_office_tools.utils.validation.resolve_office_file_path')
    @patch('mcp_office_tools.utils.validation.validate_office_file')
    @patch('mcp_office_tools.utils.file_detection.detect_format')
    async def test_convert_legacy_doc_to_markdown(self, mock_detect, mock_validate, mock_resolve, mixin):
        """A file detected as legacy .doc returns the mocked legacy converter's output."""
        # Stacked @patch decorators inject mocks bottom-up: detect, validate, resolve.
        mock_resolve.return_value = "/test.doc"
        mock_validate.return_value = {"is_valid": True, "errors": []}
        mock_detect.return_value = {
            "category": "word",
            "extension": ".doc",
            "format_name": "Word Document (Legacy)"
        }

        legacy_output = {
            "markdown": "# Legacy Document\n\nContent from .doc file",
            "images": [],
            "metadata": {"conversion_method": "legacy-parser"},
            "processing_notes": ["Converted from legacy format"]
        }

        # Stub the mixin's own helpers; note it is _convert_doc_to_markdown
        # (not the docx variant) that is replaced here.
        with patch.object(mixin, '_analyze_document_size') as size_mock, \
                patch.object(mixin, '_get_processing_recommendation') as advice_mock, \
                patch.object(mixin, '_convert_doc_to_markdown') as legacy_convert_mock:
            size_mock.return_value = {"estimated_pages": 3}
            advice_mock.return_value = {"recommendation": "proceed"}
            legacy_convert_mock.return_value = legacy_output

            result = await mixin.convert_to_markdown("/test.doc")

            # Verify legacy conversion worked
            assert "# Legacy Document" in result["markdown"]
            assert "legacy-parser" in str(result["metadata"])
            assert len(result["processing_info"]["processing_notes"]) > 0


if __name__ == "__main__":
    # pytest.main() returns the exit code rather than exiting the process.
    # Raising SystemExit with it makes `python test_word_mixin.py` exit
    # non-zero when a test fails, instead of always exiting 0.
    raise SystemExit(pytest.main([__file__, "-v"]))