crawailer/tests/test_memory_management.py

"""
Memory Management and Leak Detection Tests

Tests for memory usage patterns, leak detection, and resource cleanup
in browser automation scenarios. Critical for production deployments
that need to handle long-running operations without memory bloat.

Test Categories:
- Memory baseline and growth patterns
- DOM node accumulation and cleanup
- JavaScript heap management
- Event listener leak detection
- Resource cleanup validation
- Long-running session stability
- Memory pressure handling
- Garbage collection effectiveness
"""

import pytest
import asyncio
import gc
import psutil
import os
from unittest.mock import Mock, patch, AsyncMock
from typing import List, Dict, Any

from crawailer import get, get_many, discover
from crawailer.browser import Browser
from crawailer.config import BrowserConfig


class MockMemoryProfiler:
    """Mock memory profiler for testing memory patterns"""

    def __init__(self):
        self.baseline = 50_000_000  # 50MB baseline
        self.current = self.baseline
        self.peak = self.baseline
        self.allocations = []

    def get_memory_usage(self) -> int:
        """Get current memory usage in bytes"""
        return self.current

    def allocate(self, size: int):
        """Simulate memory allocation"""
        self.current += size
        self.peak = max(self.peak, self.current)
        self.allocations.append(size)

    def deallocate(self, size: int):
        """Simulate memory deallocation"""
        self.current = max(self.baseline, self.current - size)

    def trigger_gc(self):
        """Simulate garbage collection"""
        # Cleanup 70% of non-baseline memory
        excess = self.current - self.baseline
        if excess > 0:
            cleanup = int(excess * 0.7)
            self.current -= cleanup


class MockBrowserMemory:
    """Mock browser memory tracking"""

    def __init__(self):
        self.dom_nodes = 1000  # Initial DOM nodes
        self.js_heap_size = 10_000_000  # 10MB
        self.event_listeners = 50
        self.network_connections = 0
        self.active_timers = 0

    def add_dom_nodes(self, count: int):
        self.dom_nodes += count

    def remove_dom_nodes(self, count: int):
        self.dom_nodes = max(1000, self.dom_nodes - count)

    def allocate_js_heap(self, size: int):
        self.js_heap_size += size

    def add_event_listeners(self, count: int):
        self.event_listeners += count

    def cleanup_listeners(self, count: int):
        self.event_listeners = max(50, self.event_listeners - count)


@pytest.fixture
def memory_profiler():
    """Memory profiler fixture"""
    return MockMemoryProfiler()


@pytest.fixture
def browser_memory():
    """Browser memory tracking fixture"""
    return MockBrowserMemory()


@pytest.fixture
def mock_browser_with_memory(browser_memory):
    """Browser with memory tracking"""
    browser = Mock()
    browser.memory = browser_memory

    async def mock_fetch_page(url, **kwargs):
        # Simulate memory allocation during page load
        browser.memory.add_dom_nodes(500)
        browser.memory.allocate_js_heap(1_000_000)
        browser.memory.add_event_listeners(10)

        script_result = None
        if 'script_after' in kwargs:
            script = kwargs['script_after']
            if 'memory' in script.lower():
                script_result = {
                    'domNodes': browser.memory.dom_nodes,
                    'heapSize': browser.memory.js_heap_size,
                    'listeners': browser.memory.event_listeners
                }
            elif 'leak' in script.lower():
                # Simulate memory leak
                browser.memory.add_dom_nodes(1000)
                browser.memory.allocate_js_heap(5_000_000)
                script_result = {'leaked': True}

        return Mock(
            content="<html><body>Memory test page</body></html>",
            url=url,
            script_result=script_result,
            status_code=200
        )

    browser.fetch_page = mock_fetch_page
    return browser


class TestMemoryBaseline:
    """Test memory baseline and growth patterns"""

    @pytest.mark.asyncio
    async def test_memory_baseline_establishment(self, memory_profiler, mock_browser_with_memory):
        """Test establishing memory usage baseline"""
        with patch('crawailer.browser.Browser', return_value=mock_browser_with_memory):
            initial_memory = memory_profiler.get_memory_usage()

            # Single page load should have predictable memory usage
            content = await get("http://localhost:8083/memory-test")

            # Simulate some memory allocation for page processing
            memory_profiler.allocate(2_000_000)  # 2MB for page processing

            final_memory = memory_profiler.get_memory_usage()
            memory_growth = final_memory - initial_memory

            # Memory growth should be reasonable (under 5MB for single page)
            assert memory_growth < 5_000_000
            assert content.content is not None

    @pytest.mark.asyncio
    async def test_memory_growth_patterns(self, memory_profiler, mock_browser_with_memory):
        """Test memory growth patterns over multiple operations"""
        with patch('crawailer.browser.Browser', return_value=mock_browser_with_memory):
            baseline = memory_profiler.get_memory_usage()
            measurements = [baseline]

            # Process multiple pages and track memory growth
            urls = [f"http://localhost:8083/page-{i}" for i in range(10)]

            for i, url in enumerate(urls):
                await get(url)
                # Simulate incremental memory usage
                memory_profiler.allocate(1_500_000)  # 1.5MB per page
                measurements.append(memory_profiler.get_memory_usage())

            # Check for linear vs exponential growth
            growth_rates = []
            for i in range(1, len(measurements)):
                rate = measurements[i] - measurements[i-1]
                growth_rates.append(rate)

            # Growth should be roughly linear, not exponential
            avg_growth = sum(growth_rates) / len(growth_rates)
            for rate in growth_rates[-3:]:  # Check last 3 measurements
                assert abs(rate - avg_growth) < avg_growth * 0.5  # Within 50% of average

    @pytest.mark.asyncio
    async def test_memory_with_javascript_execution(self, memory_profiler, mock_browser_with_memory):
        """Test memory usage with JavaScript execution"""
        with patch('crawailer.browser.Browser', return_value=mock_browser_with_memory):
            baseline = memory_profiler.get_memory_usage()

            # Execute JavaScript that reports memory usage
            content = await get(
                "http://localhost:8083/js-memory-test",
                script="window.performance.memory ? window.performance.memory.usedJSHeapSize : 'unavailable'"
            )

            # Simulate JS execution memory overhead
            memory_profiler.allocate(3_000_000)  # 3MB for JS execution

            final_memory = memory_profiler.get_memory_usage()
            js_overhead = final_memory - baseline

            # JS execution should have reasonable overhead
            assert js_overhead < 10_000_000  # Under 10MB
            assert content.script_result is not None


class TestDOMNodeManagement:
    """Test DOM node accumulation and cleanup"""

    @pytest.mark.asyncio
    async def test_dom_node_accumulation(self, browser_memory, mock_browser_with_memory):
        """Test DOM node accumulation over multiple page loads"""
        with patch('crawailer.browser.Browser', return_value=mock_browser_with_memory):
            initial_nodes = browser_memory.dom_nodes

            # Load pages with varying DOM complexity
            urls = [
                "http://localhost:8083/simple-page",      # 500 nodes
                "http://localhost:8083/complex-page",     # 500 nodes
                "http://localhost:8083/heavy-page"        # 500 nodes
            ]

            for url in urls:
                await get(url)

            final_nodes = browser_memory.dom_nodes
            node_growth = final_nodes - initial_nodes

            # Should accumulate nodes (1500 added)
            assert node_growth == 1500
            assert final_nodes == 2500

    @pytest.mark.asyncio
    async def test_dom_cleanup_between_pages(self, browser_memory, mock_browser_with_memory):
        """Test DOM cleanup between page navigations"""
        # Modify mock to simulate cleanup
        original_fetch = mock_browser_with_memory.fetch_page

        async def fetch_with_cleanup(url, **kwargs):
            # Cleanup previous page DOM nodes (simulate navigation)
            if browser_memory.dom_nodes > 1000:
                cleanup_nodes = min(500, browser_memory.dom_nodes - 1000)
                browser_memory.remove_dom_nodes(cleanup_nodes)

            return await original_fetch(url, **kwargs)

        mock_browser_with_memory.fetch_page = fetch_with_cleanup

        with patch('crawailer.browser.Browser', return_value=mock_browser_with_memory):
            # Load multiple pages with cleanup
            for i in range(5):
                await get(f"http://localhost:8083/page-{i}")

            # Should maintain reasonable DOM node count
            assert browser_memory.dom_nodes < 3000  # Not unlimited growth

    @pytest.mark.asyncio
    async def test_large_dom_handling(self, browser_memory, mock_browser_with_memory):
        """Test handling of pages with very large DOM trees"""
        # Simulate large page
        async def fetch_large_page(url, **kwargs):
            if 'large' in url:
                browser_memory.add_dom_nodes(10000)  # Very large page
            else:
                browser_memory.add_dom_nodes(500)    # Normal page

            return Mock(
                content="<html><body>Large DOM test</body></html>",
                url=url,
                status_code=200
            )

        mock_browser_with_memory.fetch_page = fetch_large_page

        with patch('crawailer.browser.Browser', return_value=mock_browser_with_memory):
            initial_nodes = browser_memory.dom_nodes

            # Load large page
            content = await get("http://localhost:8083/large-dom-page")

            final_nodes = browser_memory.dom_nodes

            assert final_nodes - initial_nodes == 10000
            assert content.content is not None


class TestJavaScriptHeapManagement:
    """Test JavaScript heap memory management"""

    @pytest.mark.asyncio
    async def test_js_heap_growth(self, browser_memory, mock_browser_with_memory):
        """Test JavaScript heap growth patterns"""
        with patch('crawailer.browser.Browser', return_value=mock_browser_with_memory):
            initial_heap = browser_memory.js_heap_size

            # Execute scripts that allocate memory
            memory_scripts = [
                "new Array(100000).fill('data')",     # Allocate array
                "Object.assign({}, ...new Array(1000).fill({key: 'value'}))",  # Object allocation
                "document.querySelectorAll('*').length"  # DOM query
            ]

            for script in memory_scripts:
                await get("http://localhost:8083/js-test", script=script)

            final_heap = browser_memory.js_heap_size
            heap_growth = final_heap - initial_heap

            # Should show measurable heap growth
            assert heap_growth == 3_000_000  # 1MB per script execution

    @pytest.mark.asyncio
    async def test_js_memory_leak_detection(self, browser_memory, mock_browser_with_memory):
        """Test detection of JavaScript memory leaks"""
        with patch('crawailer.browser.Browser', return_value=mock_browser_with_memory):
            # Execute script that creates potential leak
            leak_script = """
            // Simulate memory leak pattern
            window.leakyData = window.leakyData || [];
            window.leakyData.push(new Array(10000).fill('leak'));
            'leak created'
            """

            initial_heap = browser_memory.js_heap_size

            # Execute leak script multiple times
            for i in range(3):
                content = await get("http://localhost:8083/leak-test", script=leak_script)

            final_heap = browser_memory.js_heap_size
            leak_growth = final_heap - initial_heap

            # Should detect significant memory growth
            assert leak_growth >= 15_000_000  # Significant growth indicates leak
            assert content.script_result == {'leaked': True}

    @pytest.mark.asyncio
    async def test_js_garbage_collection(self, browser_memory, mock_browser_with_memory):
        """Test JavaScript garbage collection effectiveness"""
        # Add GC simulation to mock
        async def fetch_with_gc(url, **kwargs):
            result = await mock_browser_with_memory.fetch_page(url, **kwargs)

            # Simulate GC trigger after script execution
            if 'script_after' in kwargs and 'gc' in kwargs['script_after'].lower():
                # Simulate GC cleanup (reduce heap by 50%)
                excess_heap = browser_memory.js_heap_size - 10_000_000
                if excess_heap > 0:
                    browser_memory.js_heap_size -= int(excess_heap * 0.5)

            return result

        mock_browser_with_memory.fetch_page = fetch_with_gc

        with patch('crawailer.browser.Browser', return_value=mock_browser_with_memory):
            # Allocate memory then trigger GC
            await get("http://localhost:8083/allocate", script="new Array(1000000).fill('data')")
            pre_gc_heap = browser_memory.js_heap_size

            await get("http://localhost:8083/gc-test", script="if (window.gc) window.gc(); 'gc triggered'")
            post_gc_heap = browser_memory.js_heap_size

            # GC should reduce heap size
            assert post_gc_heap < pre_gc_heap


class TestEventListenerLeaks:
    """Test event listener leak detection and cleanup"""

    @pytest.mark.asyncio
    async def test_event_listener_accumulation(self, browser_memory, mock_browser_with_memory):
        """Test event listener accumulation patterns"""
        with patch('crawailer.browser.Browser', return_value=mock_browser_with_memory):
            initial_listeners = browser_memory.event_listeners

            # Execute scripts that add event listeners
            listener_scripts = [
                "document.addEventListener('click', function() {})",
                "window.addEventListener('resize', function() {})",
                "document.body.addEventListener('mouseover', function() {})"
            ]

            for script in listener_scripts:
                await get("http://localhost:8083/listener-test", script=script)

            final_listeners = browser_memory.event_listeners
            listener_growth = final_listeners - initial_listeners

            # Should accumulate listeners (10 per page + 3 custom = 33)
            assert listener_growth == 33

    @pytest.mark.asyncio
    async def test_listener_cleanup_on_navigation(self, browser_memory, mock_browser_with_memory):
        """Test listener cleanup during page navigation"""
        # Modify mock to simulate listener cleanup
        navigation_count = 0

        async def fetch_with_listener_cleanup(url, **kwargs):
            nonlocal navigation_count
            navigation_count += 1

            # Cleanup listeners on navigation (every 2nd navigation)
            if navigation_count % 2 == 0 and browser_memory.event_listeners > 50:
                cleanup_count = min(20, browser_memory.event_listeners - 50)
                browser_memory.cleanup_listeners(cleanup_count)

            return await mock_browser_with_memory.fetch_page(url, **kwargs)

        mock_browser_with_memory.fetch_page = fetch_with_listener_cleanup

        with patch('crawailer.browser.Browser', return_value=mock_browser_with_memory):
            # Navigate multiple times
            for i in range(6):
                await get(f"http://localhost:8083/nav-test-{i}")

            # Should show periodic cleanup
            assert browser_memory.event_listeners < 120  # Not unlimited growth

    @pytest.mark.asyncio
    async def test_orphaned_listener_detection(self, browser_memory, mock_browser_with_memory):
        """Test detection of orphaned event listeners"""
        with patch('crawailer.browser.Browser', return_value=mock_browser_with_memory):
            # Create scenario with orphaned listeners
            orphan_script = """
            // Create elements, add listeners, then remove elements (orphaning listeners)
            const div = document.createElement('div');
            div.addEventListener('click', function() {});
            document.body.appendChild(div);
            document.body.removeChild(div);  // Element removed but listener may persist
            'orphan created'
            """

            initial_listeners = browser_memory.event_listeners

            # Create multiple orphaned listeners
            for i in range(3):
                await get("http://localhost:8083/orphan-test", script=orphan_script)

            final_listeners = browser_memory.event_listeners

            # Should accumulate listeners even after element removal
            assert final_listeners > initial_listeners


class TestResourceCleanup:
    """Test resource cleanup and session management"""

    @pytest.mark.asyncio
    async def test_session_resource_cleanup(self, memory_profiler, mock_browser_with_memory):
        """Test resource cleanup after session completion"""
        with patch('crawailer.browser.Browser', return_value=mock_browser_with_memory):
            # Simulate session with multiple operations
            urls = [f"http://localhost:8083/session-{i}" for i in range(5)]

            initial_memory = memory_profiler.get_memory_usage()

            # Process URLs
            contents = await get_many(urls)

            # Simulate memory allocation during processing
            memory_profiler.allocate(10_000_000)  # 10MB allocated

            # Simulate session cleanup
            memory_profiler.trigger_gc()

            final_memory = memory_profiler.get_memory_usage()

            # Should show significant cleanup
            cleanup_amount = 10_000_000 * 0.7  # 70% cleanup
            expected_memory = initial_memory + 10_000_000 - cleanup_amount

            assert abs(final_memory - expected_memory) < 1_000_000  # Within 1MB
            assert len(contents) == 5

    @pytest.mark.asyncio
    async def test_browser_instance_cleanup(self, mock_browser_with_memory):
        """Test browser instance resource cleanup"""
        cleanup_called = False

        async def mock_cleanup():
            nonlocal cleanup_called
            cleanup_called = True

        mock_browser_with_memory.close = mock_cleanup

        with patch('crawailer.browser.Browser', return_value=mock_browser_with_memory):
            # Use browser instance
            await get("http://localhost:8083/cleanup-test")

            # Simulate browser cleanup
            await mock_browser_with_memory.close()

            assert cleanup_called

    @pytest.mark.asyncio
    async def test_concurrent_session_isolation(self, memory_profiler, mock_browser_with_memory):
        """Test memory isolation between concurrent sessions"""
        session_memories = []

        async def session_task(session_id: int):
            # Each session processes some pages
            for i in range(3):
                await get(f"http://localhost:8083/session-{session_id}-page-{i}")
                memory_profiler.allocate(2_000_000)  # 2MB per page

            session_memories.append(memory_profiler.get_memory_usage())

        with patch('crawailer.browser.Browser', return_value=mock_browser_with_memory):
            initial_memory = memory_profiler.get_memory_usage()

            # Run concurrent sessions
            tasks = [session_task(i) for i in range(3)]
            await asyncio.gather(*tasks)

            final_memory = memory_profiler.get_memory_usage()
            total_growth = final_memory - initial_memory

            # Total growth should be sum of all sessions
            expected_growth = 3 * 3 * 2_000_000  # 3 sessions * 3 pages * 2MB
            assert abs(total_growth - expected_growth) < 2_000_000  # Within 2MB tolerance


class TestLongRunningStability:
    """Test long-running session stability and memory management"""

    @pytest.mark.asyncio
    async def test_extended_session_stability(self, memory_profiler, mock_browser_with_memory):
        """Test memory stability over extended sessions"""
        with patch('crawailer.browser.Browser', return_value=mock_browser_with_memory):
            memory_samples = []

            # Simulate extended session (50 operations)
            for i in range(50):
                await get(f"http://localhost:8083/extended-{i}")
                memory_profiler.allocate(1_000_000)  # 1MB per operation

                # Trigger GC every 10 operations
                if i % 10 == 9:
                    memory_profiler.trigger_gc()

                memory_samples.append(memory_profiler.get_memory_usage())

            # Check for memory stability (no runaway growth)
            # After GC cycles, memory should stabilize
            recent_samples = memory_samples[-10:]  # Last 10 samples
            memory_variance = max(recent_samples) - min(recent_samples)

            # Variance should be reasonable (under 10MB)
            assert memory_variance < 10_000_000

    @pytest.mark.asyncio
    async def test_memory_pressure_handling(self, memory_profiler, mock_browser_with_memory):
        """Test handling of memory pressure conditions"""
        with patch('crawailer.browser.Browser', return_value=mock_browser_with_memory):
            # Simulate memory pressure scenario
            initial_memory = memory_profiler.get_memory_usage()

            # Allocate significant memory
            memory_profiler.allocate(100_000_000)  # 100MB

            # Try to process page under memory pressure
            try:
                content = await get("http://localhost:8083/memory-pressure-test")
                # Should complete successfully
                assert content.content is not None

                # Trigger emergency GC
                memory_profiler.trigger_gc()

                # Memory should be reduced significantly
                final_memory = memory_profiler.get_memory_usage()
                reduction = (initial_memory + 100_000_000) - final_memory
                assert reduction > 50_000_000  # At least 50MB cleaned up

            except Exception as e:
                # Should handle memory pressure gracefully
                assert "memory" in str(e).lower() or "resource" in str(e).lower()

    @pytest.mark.asyncio
    async def test_batch_processing_memory_efficiency(self, memory_profiler, mock_browser_with_memory):
        """Test memory efficiency in batch processing scenarios"""
        with patch('crawailer.browser.Browser', return_value=mock_browser_with_memory):
            urls = [f"http://localhost:8083/batch-{i}" for i in range(20)]

            initial_memory = memory_profiler.get_memory_usage()

            # Process in batches with memory monitoring
            batch_size = 5
            for i in range(0, len(urls), batch_size):
                batch_urls = urls[i:i+batch_size]
                contents = await get_many(batch_urls)

                # Simulate batch memory usage
                memory_profiler.allocate(batch_size * 2_000_000)  # 2MB per URL

                # GC between batches
                memory_profiler.trigger_gc()

                assert len(contents) == len(batch_urls)

            final_memory = memory_profiler.get_memory_usage()
            total_growth = final_memory - initial_memory

            # With GC between batches, growth should be minimal
            assert total_growth < 20_000_000  # Under 20MB total growth


class TestMemoryMetrics:
    """Test memory metrics and monitoring capabilities"""

    @pytest.mark.asyncio
    async def test_memory_usage_reporting(self, browser_memory, mock_browser_with_memory):
        """Test memory usage metrics reporting"""
        with patch('crawailer.browser.Browser', return_value=mock_browser_with_memory):
            # Execute script that reports memory metrics
            memory_script = """
            ({
                domNodes: document.querySelectorAll('*').length,
                heapSize: window.performance.memory ? window.performance.memory.usedJSHeapSize : 'unavailable',
                listeners: getEventListeners ? Object.keys(getEventListeners(document)).length : 'unavailable'
            })
            """

            content = await get("http://localhost:8083/memory-metrics", script=memory_script)

            # Should return memory metrics
            assert content.script_result is not None
            metrics = content.script_result
            assert 'domNodes' in metrics
            assert 'heapSize' in metrics
            assert 'listeners' in metrics

    @pytest.mark.asyncio
    async def test_performance_memory_api(self, mock_browser_with_memory):
        """Test Performance Memory API integration"""
        with patch('crawailer.browser.Browser', return_value=mock_browser_with_memory):
            # Test performance.memory API
            performance_script = """
            if (window.performance && window.performance.memory) {
                ({
                    usedJSHeapSize: window.performance.memory.usedJSHeapSize,
                    totalJSHeapSize: window.performance.memory.totalJSHeapSize,
                    jsHeapSizeLimit: window.performance.memory.jsHeapSizeLimit
                })
            } else {
                'performance.memory not available'
            }
            """

            content = await get("http://localhost:8083/performance-memory", script=performance_script)

            # Should report performance memory data or unavailability
            assert content.script_result is not None

    @pytest.mark.asyncio
    async def test_memory_threshold_monitoring(self, memory_profiler, mock_browser_with_memory):
        """Test memory threshold monitoring and alerts"""
        threshold = 75_000_000  # 75MB threshold

        with patch('crawailer.browser.Browser', return_value=mock_browser_with_memory):
            # Process pages while monitoring threshold
            for i in range(30):
                await get(f"http://localhost:8083/threshold-{i}")
                memory_profiler.allocate(3_000_000)  # 3MB per page

                current_memory = memory_profiler.get_memory_usage()
                if current_memory > threshold:
                    # Trigger cleanup when threshold exceeded
                    memory_profiler.trigger_gc()

                    # Verify cleanup brought memory below threshold
                    post_cleanup_memory = memory_profiler.get_memory_usage()
                    # Should be significantly reduced
                    assert post_cleanup_memory < threshold * 0.8  # Below 80% of threshold


if __name__ == "__main__":
    # Demo script showing memory management testing
    print("🧠 Memory Management Test Suite")
    print("=" * 50)
    print()
    print("This test suite validates memory management and leak detection:")
    print()
    print("📊 Memory Baseline Tests:")
    print("  • Memory growth patterns over multiple operations")
    print("  • JavaScript execution memory overhead")
    print("  • Baseline establishment and maintenance")
    print()
    print("🌳 DOM Node Management:")
    print("  • DOM node accumulation and cleanup")
    print("  • Large DOM tree handling")
    print("  • Memory efficiency with complex pages")
    print()
    print("⚡ JavaScript Heap Management:")
    print("  • Heap growth and leak detection")
    print("  • Garbage collection effectiveness")
    print("  • Memory allocation patterns")
    print()
    print("🎧 Event Listener Management:")
    print("  • Listener accumulation tracking")
    print("  • Orphaned listener detection")
    print("  • Cleanup on navigation")
    print()
    print("🔄 Resource Cleanup:")
    print("  • Session resource management")
    print("  • Browser instance cleanup")
    print("  • Concurrent session isolation")
    print()
    print("⏱️  Long-Running Stability:")
    print("  • Extended session memory stability")
    print("  • Memory pressure handling")
    print("  • Batch processing efficiency")
    print()
    print("📈 Memory Metrics:")
    print("  • Performance Memory API integration")
    print("  • Threshold monitoring and alerts")
    print("  • Real-time memory usage reporting")
    print()
    print("Run with: pytest tests/test_memory_management.py -v")
    print()
    print("🎯 Production Benefits:")
    print("  • Prevents memory leaks in long-running processes")
    print("  • Ensures stable performance under load")
    print("  • Provides memory monitoring capabilities")
    print("  • Validates resource cleanup effectiveness")