"""
Memory Management and Leak Detection Tests
Tests for memory usage patterns, leak detection, and resource cleanup
in browser automation scenarios. Critical for production deployments
that need to handle long-running operations without memory bloat.
Test Categories:
- Memory baseline and growth patterns
- DOM node accumulation and cleanup
- JavaScript heap management
- Event listener leak detection
- Resource cleanup validation
- Long-running session stability
- Memory pressure handling
- Garbage collection effectiveness
"""
import pytest
import asyncio
import gc
import psutil
import os
from unittest.mock import Mock, patch, AsyncMock
from typing import List, Dict, Any
from crawailer import get, get_many, discover
from crawailer.browser import Browser
from crawailer.config import BrowserConfig
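
# The mocks below keep these tests deterministic, but a real-process reading
# is occasionally useful as a cross-check. A minimal sketch using psutil
# (imported above); not exercised by the tests themselves:
def get_process_rss() -> int:
    """Return the resident set size of the current process in bytes."""
    return psutil.Process(os.getpid()).memory_info().rss
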
class MockMemoryProfiler:
"""Mock memory profiler for testing memory patterns"""
def __init__(self):
self.baseline = 50_000_000 # 50MB baseline
self.current = self.baseline
self.peak = self.baseline
self.allocations = []
def get_memory_usage(self) -> int:
"""Get current memory usage in bytes"""
return self.current
def allocate(self, size: int):
"""Simulate memory allocation"""
self.current += size
self.peak = max(self.peak, self.current)
self.allocations.append(size)
def deallocate(self, size: int):
"""Simulate memory deallocation"""
self.current = max(self.baseline, self.current - size)
def trigger_gc(self):
"""Simulate garbage collection"""
# Cleanup 70% of non-baseline memory
excess = self.current - self.baseline
if excess > 0:
cleanup = int(excess * 0.7)
self.current -= cleanup
class MockBrowserMemory:
"""Mock browser memory tracking"""
def __init__(self):
self.dom_nodes = 1000 # Initial DOM nodes
self.js_heap_size = 10_000_000 # 10MB
self.event_listeners = 50
self.network_connections = 0
self.active_timers = 0
def add_dom_nodes(self, count: int):
self.dom_nodes += count
def remove_dom_nodes(self, count: int):
self.dom_nodes = max(1000, self.dom_nodes - count)
def allocate_js_heap(self, size: int):
self.js_heap_size += size
def add_event_listeners(self, count: int):
self.event_listeners += count
def cleanup_listeners(self, count: int):
self.event_listeners = max(50, self.event_listeners - count)
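
# MockBrowserMemory stands in for real renderer metrics. Against a live
# Chromium page, comparable numbers can be pulled over CDP; a hedged sketch
# (Chromium-only, assumes a Playwright `page` object, unused by these tests):
async def get_chromium_metrics(page) -> dict:
    """Collect renderer memory metrics via the Performance CDP domain."""
    client = await page.context.new_cdp_session(page)
    await client.send("Performance.enable")
    result = await client.send("Performance.getMetrics")
    # Entries include "Nodes", "JSEventListeners", and "JSHeapUsedSize"
    return {m["name"]: m["value"] for m in result["metrics"]}
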
@pytest.fixture
def memory_profiler():
"""Memory profiler fixture"""
return MockMemoryProfiler()
@pytest.fixture
def browser_memory():
"""Browser memory tracking fixture"""
return MockBrowserMemory()
@pytest.fixture
def mock_browser_with_memory(browser_memory):
"""Browser with memory tracking"""
browser = Mock()
browser.memory = browser_memory
async def mock_fetch_page(url, **kwargs):
# Simulate memory allocation during page load
browser.memory.add_dom_nodes(500)
browser.memory.allocate_js_heap(1_000_000)
browser.memory.add_event_listeners(10)
script_result = None
if 'script_after' in kwargs:
script = kwargs['script_after']
if 'memory' in script.lower():
script_result = {
'domNodes': browser.memory.dom_nodes,
'heapSize': browser.memory.js_heap_size,
'listeners': browser.memory.event_listeners
}
elif 'leak' in script.lower():
# Simulate memory leak
browser.memory.add_dom_nodes(1000)
browser.memory.allocate_js_heap(5_000_000)
script_result = {'leaked': True}
return Mock(
content="<html><body>Memory test page</body></html>",
url=url,
script_result=script_result,
status_code=200
)
browser.fetch_page = mock_fetch_page
return browser
class TestMemoryBaseline:
"""Test memory baseline and growth patterns"""
@pytest.mark.asyncio
async def test_memory_baseline_establishment(self, memory_profiler, mock_browser_with_memory):
"""Test establishing memory usage baseline"""
with patch('crawailer.browser.Browser', return_value=mock_browser_with_memory):
initial_memory = memory_profiler.get_memory_usage()
# Single page load should have predictable memory usage
content = await get("http://localhost:8083/memory-test")
# Simulate some memory allocation for page processing
memory_profiler.allocate(2_000_000) # 2MB for page processing
final_memory = memory_profiler.get_memory_usage()
memory_growth = final_memory - initial_memory
# Memory growth should be reasonable (under 5MB for single page)
assert memory_growth < 5_000_000
assert content.content is not None
@pytest.mark.asyncio
async def test_memory_growth_patterns(self, memory_profiler, mock_browser_with_memory):
"""Test memory growth patterns over multiple operations"""
with patch('crawailer.browser.Browser', return_value=mock_browser_with_memory):
baseline = memory_profiler.get_memory_usage()
measurements = [baseline]
# Process multiple pages and track memory growth
urls = [f"http://localhost:8083/page-{i}" for i in range(10)]
for i, url in enumerate(urls):
await get(url)
# Simulate incremental memory usage
memory_profiler.allocate(1_500_000) # 1.5MB per page
measurements.append(memory_profiler.get_memory_usage())
# Check for linear vs exponential growth
growth_rates = []
for i in range(1, len(measurements)):
rate = measurements[i] - measurements[i-1]
growth_rates.append(rate)
# Growth should be roughly linear, not exponential
avg_growth = sum(growth_rates) / len(growth_rates)
for rate in growth_rates[-3:]: # Check last 3 measurements
assert abs(rate - avg_growth) < avg_growth * 0.5 # Within 50% of average
@pytest.mark.asyncio
async def test_memory_with_javascript_execution(self, memory_profiler, mock_browser_with_memory):
"""Test memory usage with JavaScript execution"""
with patch('crawailer.browser.Browser', return_value=mock_browser_with_memory):
baseline = memory_profiler.get_memory_usage()
# Execute JavaScript that reports memory usage
content = await get(
"http://localhost:8083/js-memory-test",
script="window.performance.memory ? window.performance.memory.usedJSHeapSize : 'unavailable'"
)
# Simulate JS execution memory overhead
memory_profiler.allocate(3_000_000) # 3MB for JS execution
final_memory = memory_profiler.get_memory_usage()
js_overhead = final_memory - baseline
# JS execution should have reasonable overhead
assert js_overhead < 10_000_000 # Under 10MB
assert content.script_result is not None
class TestDOMNodeManagement:
"""Test DOM node accumulation and cleanup"""
@pytest.mark.asyncio
async def test_dom_node_accumulation(self, browser_memory, mock_browser_with_memory):
"""Test DOM node accumulation over multiple page loads"""
with patch('crawailer.browser.Browser', return_value=mock_browser_with_memory):
initial_nodes = browser_memory.dom_nodes
# Load pages with varying DOM complexity
urls = [
"http://localhost:8083/simple-page", # 500 nodes
"http://localhost:8083/complex-page", # 500 nodes
"http://localhost:8083/heavy-page" # 500 nodes
]
for url in urls:
await get(url)
final_nodes = browser_memory.dom_nodes
node_growth = final_nodes - initial_nodes
# Should accumulate nodes (1500 added)
assert node_growth == 1500
assert final_nodes == 2500
@pytest.mark.asyncio
async def test_dom_cleanup_between_pages(self, browser_memory, mock_browser_with_memory):
"""Test DOM cleanup between page navigations"""
# Modify mock to simulate cleanup
original_fetch = mock_browser_with_memory.fetch_page
async def fetch_with_cleanup(url, **kwargs):
# Cleanup previous page DOM nodes (simulate navigation)
if browser_memory.dom_nodes > 1000:
cleanup_nodes = min(500, browser_memory.dom_nodes - 1000)
browser_memory.remove_dom_nodes(cleanup_nodes)
return await original_fetch(url, **kwargs)
mock_browser_with_memory.fetch_page = fetch_with_cleanup
with patch('crawailer.browser.Browser', return_value=mock_browser_with_memory):
# Load multiple pages with cleanup
for i in range(5):
await get(f"http://localhost:8083/page-{i}")
# Should maintain reasonable DOM node count
assert browser_memory.dom_nodes < 3000 # Not unlimited growth
@pytest.mark.asyncio
async def test_large_dom_handling(self, browser_memory, mock_browser_with_memory):
"""Test handling of pages with very large DOM trees"""
# Simulate large page
async def fetch_large_page(url, **kwargs):
if 'large' in url:
browser_memory.add_dom_nodes(10000) # Very large page
else:
browser_memory.add_dom_nodes(500) # Normal page
return Mock(
content="<html><body>Large DOM test</body></html>",
url=url,
status_code=200
)
mock_browser_with_memory.fetch_page = fetch_large_page
with patch('crawailer.browser.Browser', return_value=mock_browser_with_memory):
initial_nodes = browser_memory.dom_nodes
# Load large page
content = await get("http://localhost:8083/large-dom-page")
final_nodes = browser_memory.dom_nodes
assert final_nodes - initial_nodes == 10000
assert content.content is not None
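
        # For reference, a live DOM-node count needs no mock at all; with a
        # real Playwright page (sketch, not used by these mocked tests):
        #
        #     node_count = await page.evaluate("document.querySelectorAll('*').length")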
class TestJavaScriptHeapManagement:
"""Test JavaScript heap memory management"""
@pytest.mark.asyncio
async def test_js_heap_growth(self, browser_memory, mock_browser_with_memory):
"""Test JavaScript heap growth patterns"""
with patch('crawailer.browser.Browser', return_value=mock_browser_with_memory):
initial_heap = browser_memory.js_heap_size
# Execute scripts that allocate memory
memory_scripts = [
"new Array(100000).fill('data')", # Allocate array
"Object.assign({}, ...new Array(1000).fill({key: 'value'}))", # Object allocation
"document.querySelectorAll('*').length" # DOM query
]
for script in memory_scripts:
await get("http://localhost:8083/js-test", script=script)
final_heap = browser_memory.js_heap_size
heap_growth = final_heap - initial_heap
# Should show measurable heap growth
assert heap_growth == 3_000_000 # 1MB per script execution
@pytest.mark.asyncio
async def test_js_memory_leak_detection(self, browser_memory, mock_browser_with_memory):
"""Test detection of JavaScript memory leaks"""
with patch('crawailer.browser.Browser', return_value=mock_browser_with_memory):
# Execute script that creates potential leak
leak_script = """
// Simulate memory leak pattern
window.leakyData = window.leakyData || [];
window.leakyData.push(new Array(10000).fill('leak'));
'leak created'
"""
initial_heap = browser_memory.js_heap_size
# Execute leak script multiple times
for i in range(3):
content = await get("http://localhost:8083/leak-test", script=leak_script)
final_heap = browser_memory.js_heap_size
leak_growth = final_heap - initial_heap
# Should detect significant memory growth
assert leak_growth >= 15_000_000 # Significant growth indicates leak
assert content.script_result == {'leaked': True}
@pytest.mark.asyncio
async def test_js_garbage_collection(self, browser_memory, mock_browser_with_memory):
"""Test JavaScript garbage collection effectiveness"""
        # Add GC simulation to the mock (capture the original fetch first so
        # the wrapper doesn't recurse into itself once fetch_page is reassigned)
        original_fetch = mock_browser_with_memory.fetch_page
        async def fetch_with_gc(url, **kwargs):
            result = await original_fetch(url, **kwargs)
            # Simulate GC trigger after script execution
            if 'script_after' in kwargs and 'gc' in kwargs['script_after'].lower():
                # Simulate GC cleanup: reclaim 75% of the heap above the 10MB
                # baseline, which more than offsets the 1MB the page load adds
                excess_heap = browser_memory.js_heap_size - 10_000_000
                if excess_heap > 0:
                    browser_memory.js_heap_size -= int(excess_heap * 0.75)
            return result
mock_browser_with_memory.fetch_page = fetch_with_gc
with patch('crawailer.browser.Browser', return_value=mock_browser_with_memory):
# Allocate memory then trigger GC
await get("http://localhost:8083/allocate", script="new Array(1000000).fill('data')")
pre_gc_heap = browser_memory.js_heap_size
await get("http://localhost:8083/gc-test", script="if (window.gc) window.gc(); 'gc triggered'")
post_gc_heap = browser_memory.js_heap_size
# GC should reduce heap size
assert post_gc_heap < pre_gc_heap
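
        # Note: against real Chromium, `window.gc()` is only defined when V8 is
        # launched with the expose-gc flag. A sketch in raw Playwright (whether
        # crawailer's BrowserConfig forwards launch args is an assumption, so
        # the stock API is shown instead):
        #
        #     browser = await playwright.chromium.launch(
        #         args=["--js-flags=--expose-gc"]
        #     )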
class TestEventListenerLeaks:
"""Test event listener leak detection and cleanup"""
@pytest.mark.asyncio
async def test_event_listener_accumulation(self, browser_memory, mock_browser_with_memory):
"""Test event listener accumulation patterns"""
with patch('crawailer.browser.Browser', return_value=mock_browser_with_memory):
initial_listeners = browser_memory.event_listeners
# Execute scripts that add event listeners
listener_scripts = [
"document.addEventListener('click', function() {})",
"window.addEventListener('resize', function() {})",
"document.body.addEventListener('mouseover', function() {})"
]
for script in listener_scripts:
await get("http://localhost:8083/listener-test", script=script)
final_listeners = browser_memory.event_listeners
listener_growth = final_listeners - initial_listeners
            # The mock adds 10 listeners per page load and does not execute the
            # scripts, so three loads accumulate 30 new listeners
            assert listener_growth == 30
@pytest.mark.asyncio
async def test_listener_cleanup_on_navigation(self, browser_memory, mock_browser_with_memory):
"""Test listener cleanup during page navigation"""
        # Modify mock to simulate listener cleanup (capture the original fetch
        # first so the wrapper doesn't recurse into itself after reassignment)
        original_fetch = mock_browser_with_memory.fetch_page
        navigation_count = 0
        async def fetch_with_listener_cleanup(url, **kwargs):
            nonlocal navigation_count
            navigation_count += 1
            # Cleanup listeners on navigation (every 2nd navigation)
            if navigation_count % 2 == 0 and browser_memory.event_listeners > 50:
                cleanup_count = min(20, browser_memory.event_listeners - 50)
                browser_memory.cleanup_listeners(cleanup_count)
            return await original_fetch(url, **kwargs)
mock_browser_with_memory.fetch_page = fetch_with_listener_cleanup
with patch('crawailer.browser.Browser', return_value=mock_browser_with_memory):
# Navigate multiple times
for i in range(6):
await get(f"http://localhost:8083/nav-test-{i}")
# Should show periodic cleanup
assert browser_memory.event_listeners < 120 # Not unlimited growth
@pytest.mark.asyncio
async def test_orphaned_listener_detection(self, browser_memory, mock_browser_with_memory):
"""Test detection of orphaned event listeners"""
with patch('crawailer.browser.Browser', return_value=mock_browser_with_memory):
# Create scenario with orphaned listeners
orphan_script = """
// Create elements, add listeners, then remove elements (orphaning listeners)
const div = document.createElement('div');
div.addEventListener('click', function() {});
document.body.appendChild(div);
document.body.removeChild(div); // Element removed but listener may persist
'orphan created'
"""
initial_listeners = browser_memory.event_listeners
# Create multiple orphaned listeners
for i in range(3):
await get("http://localhost:8083/orphan-test", script=orphan_script)
final_listeners = browser_memory.event_listeners
# Should accumulate listeners even after element removal
assert final_listeners > initial_listeners
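
        # A live orphan check can pair a WeakRef with an exposed GC (see the
        # expose-gc note above): if the ref survives collection, something
        # still holds the node. Sketch of the page-side script:
        #
        #     const div = document.createElement('div');
        #     div.addEventListener('click', () => {});
        #     window.ref = new WeakRef(div);
        #     // after window.gc(): window.ref.deref() === undefined means the
        #     // node (and its listener) were reclaimed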
class TestResourceCleanup:
"""Test resource cleanup and session management"""
@pytest.mark.asyncio
async def test_session_resource_cleanup(self, memory_profiler, mock_browser_with_memory):
"""Test resource cleanup after session completion"""
with patch('crawailer.browser.Browser', return_value=mock_browser_with_memory):
# Simulate session with multiple operations
urls = [f"http://localhost:8083/session-{i}" for i in range(5)]
initial_memory = memory_profiler.get_memory_usage()
# Process URLs
contents = await get_many(urls)
# Simulate memory allocation during processing
memory_profiler.allocate(10_000_000) # 10MB allocated
# Simulate session cleanup
memory_profiler.trigger_gc()
final_memory = memory_profiler.get_memory_usage()
# Should show significant cleanup
cleanup_amount = 10_000_000 * 0.7 # 70% cleanup
expected_memory = initial_memory + 10_000_000 - cleanup_amount
assert abs(final_memory - expected_memory) < 1_000_000 # Within 1MB
assert len(contents) == 5
@pytest.mark.asyncio
async def test_browser_instance_cleanup(self, mock_browser_with_memory):
"""Test browser instance resource cleanup"""
cleanup_called = False
async def mock_cleanup():
nonlocal cleanup_called
cleanup_called = True
mock_browser_with_memory.close = mock_cleanup
with patch('crawailer.browser.Browser', return_value=mock_browser_with_memory):
# Use browser instance
await get("http://localhost:8083/cleanup-test")
# Simulate browser cleanup
await mock_browser_with_memory.close()
assert cleanup_called
@pytest.mark.asyncio
async def test_concurrent_session_isolation(self, memory_profiler, mock_browser_with_memory):
"""Test memory isolation between concurrent sessions"""
session_memories = []
async def session_task(session_id: int):
# Each session processes some pages
for i in range(3):
await get(f"http://localhost:8083/session-{session_id}-page-{i}")
memory_profiler.allocate(2_000_000) # 2MB per page
session_memories.append(memory_profiler.get_memory_usage())
with patch('crawailer.browser.Browser', return_value=mock_browser_with_memory):
initial_memory = memory_profiler.get_memory_usage()
# Run concurrent sessions
tasks = [session_task(i) for i in range(3)]
await asyncio.gather(*tasks)
final_memory = memory_profiler.get_memory_usage()
total_growth = final_memory - initial_memory
# Total growth should be sum of all sessions
expected_growth = 3 * 3 * 2_000_000 # 3 sessions * 3 pages * 2MB
assert abs(total_growth - expected_growth) < 2_000_000 # Within 2MB tolerance
class TestLongRunningStability:
"""Test long-running session stability and memory management"""
@pytest.mark.asyncio
async def test_extended_session_stability(self, memory_profiler, mock_browser_with_memory):
"""Test memory stability over extended sessions"""
with patch('crawailer.browser.Browser', return_value=mock_browser_with_memory):
memory_samples = []
# Simulate extended session (50 operations)
for i in range(50):
await get(f"http://localhost:8083/extended-{i}")
memory_profiler.allocate(1_000_000) # 1MB per operation
# Trigger GC every 10 operations
if i % 10 == 9:
memory_profiler.trigger_gc()
memory_samples.append(memory_profiler.get_memory_usage())
# Check for memory stability (no runaway growth)
# After GC cycles, memory should stabilize
recent_samples = memory_samples[-10:] # Last 10 samples
memory_variance = max(recent_samples) - min(recent_samples)
# Variance should be reasonable (under 10MB)
assert memory_variance < 10_000_000
@pytest.mark.asyncio
async def test_memory_pressure_handling(self, memory_profiler, mock_browser_with_memory):
"""Test handling of memory pressure conditions"""
with patch('crawailer.browser.Browser', return_value=mock_browser_with_memory):
# Simulate memory pressure scenario
initial_memory = memory_profiler.get_memory_usage()
# Allocate significant memory
memory_profiler.allocate(100_000_000) # 100MB
# Try to process page under memory pressure
try:
content = await get("http://localhost:8083/memory-pressure-test")
# Should complete successfully
assert content.content is not None
# Trigger emergency GC
memory_profiler.trigger_gc()
# Memory should be reduced significantly
final_memory = memory_profiler.get_memory_usage()
reduction = (initial_memory + 100_000_000) - final_memory
assert reduction > 50_000_000 # At least 50MB cleaned up
except Exception as e:
# Should handle memory pressure gracefully
assert "memory" in str(e).lower() or "resource" in str(e).lower()
@pytest.mark.asyncio
async def test_batch_processing_memory_efficiency(self, memory_profiler, mock_browser_with_memory):
"""Test memory efficiency in batch processing scenarios"""
with patch('crawailer.browser.Browser', return_value=mock_browser_with_memory):
urls = [f"http://localhost:8083/batch-{i}" for i in range(20)]
initial_memory = memory_profiler.get_memory_usage()
# Process in batches with memory monitoring
batch_size = 5
for i in range(0, len(urls), batch_size):
batch_urls = urls[i:i+batch_size]
contents = await get_many(batch_urls)
# Simulate batch memory usage
memory_profiler.allocate(batch_size * 2_000_000) # 2MB per URL
# GC between batches
memory_profiler.trigger_gc()
assert len(contents) == len(batch_urls)
final_memory = memory_profiler.get_memory_usage()
total_growth = final_memory - initial_memory
# With GC between batches, growth should be minimal
assert total_growth < 20_000_000 # Under 20MB total growth
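
        # Between real batches, the Python-side analogue of the simulated GC is
        # explicit collection via the `gc` module imported above (sketch):
        #
        #     gc.collect()  # reclaim cycles left over from the finished batch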
class TestMemoryMetrics:
"""Test memory metrics and monitoring capabilities"""
@pytest.mark.asyncio
async def test_memory_usage_reporting(self, browser_memory, mock_browser_with_memory):
"""Test memory usage metrics reporting"""
with patch('crawailer.browser.Browser', return_value=mock_browser_with_memory):
# Execute script that reports memory metrics
memory_script = """
({
domNodes: document.querySelectorAll('*').length,
heapSize: window.performance.memory ? window.performance.memory.usedJSHeapSize : 'unavailable',
listeners: getEventListeners ? Object.keys(getEventListeners(document)).length : 'unavailable'
})
"""
content = await get("http://localhost:8083/memory-metrics", script=memory_script)
# Should return memory metrics
assert content.script_result is not None
metrics = content.script_result
assert 'domNodes' in metrics
assert 'heapSize' in metrics
assert 'listeners' in metrics
@pytest.mark.asyncio
async def test_performance_memory_api(self, mock_browser_with_memory):
"""Test Performance Memory API integration"""
with patch('crawailer.browser.Browser', return_value=mock_browser_with_memory):
# Test performance.memory API
performance_script = """
if (window.performance && window.performance.memory) {
({
usedJSHeapSize: window.performance.memory.usedJSHeapSize,
totalJSHeapSize: window.performance.memory.totalJSHeapSize,
jsHeapSizeLimit: window.performance.memory.jsHeapSizeLimit
})
} else {
'performance.memory not available'
}
"""
content = await get("http://localhost:8083/performance-memory", script=performance_script)
# Should report performance memory data or unavailability
assert content.script_result is not None
@pytest.mark.asyncio
async def test_memory_threshold_monitoring(self, memory_profiler, mock_browser_with_memory):
"""Test memory threshold monitoring and alerts"""
threshold = 75_000_000 # 75MB threshold
with patch('crawailer.browser.Browser', return_value=mock_browser_with_memory):
# Process pages while monitoring threshold
for i in range(30):
await get(f"http://localhost:8083/threshold-{i}")
memory_profiler.allocate(3_000_000) # 3MB per page
current_memory = memory_profiler.get_memory_usage()
if current_memory > threshold:
# Trigger cleanup when threshold exceeded
memory_profiler.trigger_gc()
# Verify cleanup brought memory below threshold
post_cleanup_memory = memory_profiler.get_memory_usage()
# Should be significantly reduced
assert post_cleanup_memory < threshold * 0.8 # Below 80% of threshold
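
# The profiler mock models thresholds in-process. For Python-side allocations,
# the stdlib tracemalloc module is a real equivalent; a minimal sketch:
def snapshot_python_heap() -> tuple:
    """Return (current, peak) bytes traced by tracemalloc, starting it if needed."""
    import tracemalloc
    if not tracemalloc.is_tracing():
        tracemalloc.start()
    return tracemalloc.get_traced_memory()
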
if __name__ == "__main__":
# Demo script showing memory management testing
print("🧠 Memory Management Test Suite")
print("=" * 50)
print()
print("This test suite validates memory management and leak detection:")
print()
print("📊 Memory Baseline Tests:")
print(" • Memory growth patterns over multiple operations")
print(" • JavaScript execution memory overhead")
print(" • Baseline establishment and maintenance")
print()
print("🌳 DOM Node Management:")
print(" • DOM node accumulation and cleanup")
print(" • Large DOM tree handling")
print(" • Memory efficiency with complex pages")
print()
print("⚡ JavaScript Heap Management:")
print(" • Heap growth and leak detection")
print(" • Garbage collection effectiveness")
print(" • Memory allocation patterns")
print()
print("🎧 Event Listener Management:")
print(" • Listener accumulation tracking")
print(" • Orphaned listener detection")
print(" • Cleanup on navigation")
print()
print("🔄 Resource Cleanup:")
print(" • Session resource management")
print(" • Browser instance cleanup")
print(" • Concurrent session isolation")
print()
print("⏱️ Long-Running Stability:")
print(" • Extended session memory stability")
print(" • Memory pressure handling")
print(" • Batch processing efficiency")
print()
print("📈 Memory Metrics:")
print(" • Performance Memory API integration")
print(" • Threshold monitoring and alerts")
print(" • Real-time memory usage reporting")
print()
print("Run with: pytest tests/test_memory_management.py -v")
print()
print("🎯 Production Benefits:")
print(" • Prevents memory leaks in long-running processes")
print(" • Ensures stable performance under load")
print(" • Provides memory monitoring capabilities")
print(" • Validates resource cleanup effectiveness")