crawailer/tests/test_basic.py
Crawailer Developer 7634f9fc32 Initial commit: JavaScript API enhancement preparation
- Comprehensive test suite (700+ lines) for JS execution in high-level API
- Test coverage analysis and validation infrastructure
- Enhancement proposal and implementation strategy
- Mock HTTP server with realistic JavaScript scenarios
- Parallel implementation strategy using expert agents and git worktrees

Ready for test-driven implementation of JavaScript enhancements.
2025-09-14 21:22:30 -06:00

135 lines
4.0 KiB
Python

"""
Basic tests for Crawailer functionality.
Simple tests to verify the core components work together.
"""
import pytest
import asyncio
from unittest.mock import Mock, AsyncMock
from crawailer.content import WebContent, ContentExtractor
from crawailer.utils import clean_text, detect_content_type, calculate_reading_time
from crawailer.config import BrowserConfig, CrawlConfig
def test_web_content_creation():
    """Build a WebContent from literal fields and check what it exposes.

    Covers the stored ``url`` and ``title`` together with the
    ``word_count``, ``reading_time`` and ``content_hash`` attributes.
    """
    fields = {
        "url": "https://example.com",
        "title": "Test Article",
        "markdown": "# Test\n\nThis is a test article.",
        "text": "Test\n\nThis is a test article.",
        "html": "<h1>Test</h1><p>This is a test article.</p>",
    }
    page = WebContent(**fields)

    # Plain fields come back exactly as supplied.
    assert page.url == fields["url"]
    assert page.title == fields["title"]

    # Presumably counted over the ``text`` field: "Test" plus the five words
    # of "This is a test article." -- confirm against WebContent.
    assert page.word_count == 6
    assert page.reading_time == "1 min read"

    # Only checks that some hash was produced, not its value.
    assert page.content_hash != ""
def test_clean_text():
    """Exercise clean_text in its default mode and with ``aggressive=True``."""
    # Default mode: leading/trailing whitespace and the embedded newlines are
    # expected to collapse into single spaces.
    messy = " Hello world \n\n with spaces "
    assert clean_text(messy) == "Hello world with spaces"

    # Aggressive mode: the cookie/privacy notice must disappear while the
    # actual content survives.
    with_boilerplate = "Read our Cookie Policy and Privacy Policy. Hello world."
    scrubbed = clean_text(with_boilerplate, aggressive=True)
    assert "Cookie Policy" not in scrubbed
    assert "Hello world" in scrubbed
def test_detect_content_type():
    """Check detect_content_type on product, article and documentation samples."""
    product_markup = '<div class="price">$99</div><button class="add-to-cart">Buy</button>'
    article_markup = '<article><h1>Title</h1><p>Content</p></article>'
    docs_markup = '<div>API documentation for developers</div>'

    # Price and add-to-cart markup should be classified as a product page.
    detected = detect_content_type(product_markup)
    assert detected == "product"

    # An <article> wrapper should be classified as an article.
    detected = detect_content_type(article_markup)
    assert detected == "article"

    # The documentation case is the only one that also supplies a title.
    detected = detect_content_type(docs_markup, title="API Guide")
    assert detected == "documentation"
def test_reading_time_calculation():
    """Check the reading-time label for a two-word and a 400-word text."""
    # A couple of words still reports the one-minute label.
    assert calculate_reading_time("Hello world") == "1 min read"

    # 400 space-separated words are expected to report two minutes.
    four_hundred_words = " ".join("word" for _ in range(400))
    assert calculate_reading_time(four_hundred_words) == "2 min read"
def test_browser_config():
    """Check BrowserConfig defaults and that keyword overrides are honoured."""
    # Defaults: headless browser, 30000 timeout, 1920-wide viewport.
    defaults = BrowserConfig()
    assert defaults.headless is True
    assert defaults.timeout == 30000
    assert defaults.viewport["width"] == 1920

    # Explicit keyword arguments replace the corresponding defaults.
    overrides = {"headless": False, "timeout": 15000}
    tuned = BrowserConfig(**overrides)
    assert tuned.headless is False
    assert tuned.timeout == 15000
def test_crawl_config():
    """Check one setting from each section of the default CrawlConfig."""
    defaults = CrawlConfig.default()

    # One representative value from each of browser, extraction, concurrency.
    browser_section = defaults.browser
    assert browser_section.headless is True

    extraction_section = defaults.extraction
    assert extraction_section.clean_text is True

    concurrency_section = defaults.concurrency
    assert concurrency_section.max_concurrent == 5
@pytest.mark.asyncio
async def test_content_extractor():
    """Run ContentExtractor.extract on an in-memory page and check the result.

    The page dict carries a small HTML document with a title, an author meta
    tag, a heading, a paragraph and two links, plus a status code and a load
    time that should be carried over to the extracted content.
    """
    extractor = ContentExtractor(
        clean=True,
        extract_links=True,
        extract_metadata=True,
    )

    # The literal's content is kept flush left so that the string value is
    # exactly the markup below, with no added indentation.
    html = """
<html>
<head>
<title>Test Page</title>
<meta name="author" content="Test Author">
</head>
<body>
<h1>Main Title</h1>
<p>This is the main content of the page.</p>
<a href="https://example.com">External Link</a>
<a href="/internal">Internal Link</a>
</body>
</html>
"""
    page_data = {
        "url": "https://test.com",
        "html": html,
        "status": 200,
        "load_time": 1.5,
    }

    extracted = await extractor.extract(page_data)

    # Values taken straight from the page dict / <title> element.
    assert extracted.url == "https://test.com"
    assert extracted.title == "Test Page"

    # Heading and paragraph text both end up in the extracted text.
    assert "Main Title" in extracted.text
    assert "main content" in extracted.text

    # "status" and "load_time" surface as status_code and load_time.
    assert extracted.status_code == 200
    assert extracted.load_time == 1.5

    # One external and one internal anchor are present in the markup.
    assert len(extracted.links) == 2
if __name__ == "__main__":
    # Allow running this file directly (``python test_basic.py``).
    # pytest.main() returns the session exit code; hand it to the interpreter
    # so a failing run yields a non-zero process status instead of always 0.
    raise SystemExit(pytest.main([__file__, "-v"]))