
"""
|
|
Basic tests for Crawailer functionality.
|
|
|
|
Simple tests to verify the core components work together.
|
|
"""
|
|
|
|
import pytest
|
|
import asyncio
|
|
from unittest.mock import Mock, AsyncMock
|
|
|
|
from crawailer.content import WebContent, ContentExtractor
|
|
from crawailer.utils import clean_text, detect_content_type, calculate_reading_time
|
|
from crawailer.config import BrowserConfig, CrawlConfig
|
|
|
|
|
|
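
# Run with `pytest -v`, or execute this file directly; the __main__ guard at
# the bottom invokes pytest.main on this file.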


def test_web_content_creation():
    """Test WebContent dataclass creation and properties."""
    content = WebContent(
        url="https://example.com",
        title="Test Article",
        markdown="# Test\n\nThis is a test article.",
        text="Test\n\nThis is a test article.",
        html="<h1>Test</h1><p>This is a test article.</p>"
    )

    assert content.url == "https://example.com"
    assert content.title == "Test Article"
    assert content.word_count == 6  # "Test" + "This is a test article."
    assert content.reading_time == "1 min read"
    assert content.content_hash != ""
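
# word_count, reading_time, and content_hash are not passed to the constructor
# above, so they are presumably properties derived from `text`; splitting on
# whitespace ("Test" plus the five-word sentence) gives the expected 6.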


def test_clean_text():
    """Test text cleaning utility."""
    dirty_text = " Hello world \n\n with spaces "
    clean = clean_text(dirty_text)
    assert clean == "Hello world with spaces"

    # Test aggressive cleaning
    dirty_with_boilerplate = "Read our Cookie Policy and Privacy Policy. Hello world."
    clean_aggressive = clean_text(dirty_with_boilerplate, aggressive=True)
    assert "Cookie Policy" not in clean_aggressive
    assert "Hello world" in clean_aggressive


def test_detect_content_type():
    """Test content type detection."""
    # Product page
    product_html = '<div class="price">$99</div><button class="add-to-cart">Buy</button>'
    assert detect_content_type(product_html) == "product"

    # Article
    article_html = '<article><h1>Title</h1><p>Content</p></article>'
    assert detect_content_type(article_html) == "article"

    # Documentation
    doc_html = '<div>API documentation for developers</div>'
    assert detect_content_type(doc_html, title="API Guide") == "documentation"
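
# detect_content_type appears to classify pages heuristically from markup cues
# (price/cart elements, <article> tags) plus an optional title hint; this is
# inferred from the assertions above rather than from documented behavior.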


def test_reading_time_calculation():
    """Test reading time calculation."""
    short_text = "Hello world"
    assert calculate_reading_time(short_text) == "1 min read"

    long_text = " ".join(["word"] * 400)  # 400 words
    assert calculate_reading_time(long_text) == "2 min read"
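
# The expected values imply a default reading speed of roughly 200 words per
# minute (400 words -> "2 min read", with very short texts rounded up to one
# minute); the exact default is an assumption inferred from these assertions.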


def test_browser_config():
    """Test browser configuration."""
    config = BrowserConfig()
    assert config.headless is True
    assert config.timeout == 30000
    assert config.viewport["width"] == 1920

    # Test custom config
    custom_config = BrowserConfig(headless=False, timeout=15000)
    assert custom_config.headless is False
    assert custom_config.timeout == 15000


def test_crawl_config():
    """Test complete crawl configuration."""
    config = CrawlConfig.default()
    assert config.browser.headless is True
    assert config.extraction.clean_text is True
    assert config.concurrency.max_concurrent == 5
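
# CrawlConfig.default() evidently composes browser, extraction, and concurrency
# settings into a single object, so one config value can be passed around
# instead of three separate ones.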


@pytest.mark.asyncio
async def test_content_extractor():
    """Test content extraction from mock HTML."""
    html = """
    <html>
    <head>
        <title>Test Page</title>
        <meta name="author" content="Test Author">
    </head>
    <body>
        <h1>Main Title</h1>
        <p>This is the main content of the page.</p>
        <a href="https://example.com">External Link</a>
        <a href="/internal">Internal Link</a>
    </body>
    </html>
    """

    page_data = {
        "url": "https://test.com",
        "html": html,
        "status": 200,
        "load_time": 1.5
    }

    extractor = ContentExtractor(
        clean=True,
        extract_links=True,
        extract_metadata=True
    )

    content = await extractor.extract(page_data)

    assert content.url == "https://test.com"
    assert content.title == "Test Page"
    assert "Main Title" in content.text
    assert "main content" in content.text
    assert content.status_code == 200
    assert content.load_time == 1.5
    assert len(content.links) == 2  # Two links found
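
# ContentExtractor.extract consumes a plain dict of already-fetched page data
# (url, html, status, load_time) rather than a live browser page, which is
# what lets the extraction pipeline be tested without launching a browser.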
if __name__ == "__main__":
|
|
# Run tests
|
|
pytest.main([__file__, "-v"]) |