""" Basic tests for Crawailer functionality. Simple tests to verify the core components work together. """ import pytest import asyncio from unittest.mock import Mock, AsyncMock from crawailer.content import WebContent, ContentExtractor from crawailer.utils import clean_text, detect_content_type, calculate_reading_time from crawailer.config import BrowserConfig, CrawlConfig def test_web_content_creation(): """Test WebContent dataclass creation and properties.""" content = WebContent( url="https://example.com", title="Test Article", markdown="# Test\n\nThis is a test article.", text="Test\n\nThis is a test article.", html="

Test

This is a test article.

" ) assert content.url == "https://example.com" assert content.title == "Test Article" assert content.word_count == 6 # "This is a test article." assert content.reading_time == "1 min read" assert content.content_hash != "" def test_clean_text(): """Test text cleaning utility.""" dirty_text = " Hello world \n\n with spaces " clean = clean_text(dirty_text) assert clean == "Hello world with spaces" # Test aggressive cleaning dirty_with_boilerplate = "Read our Cookie Policy and Privacy Policy. Hello world." clean_aggressive = clean_text(dirty_with_boilerplate, aggressive=True) assert "Cookie Policy" not in clean_aggressive assert "Hello world" in clean_aggressive def test_detect_content_type(): """Test content type detection.""" # Product page product_html = '

def test_detect_content_type():
    """Test content type detection."""
    # Product page
    product_html = '<div class="product"><span class="price">$99</span></div>'
    assert detect_content_type(product_html) == "product"

    # Article
    article_html = '<article><h1>Title</h1><p>Content</p></article>'
    assert detect_content_type(article_html) == "article"

    # Documentation
    doc_html = '<div class="documentation"><p>API documentation for developers</p></div>'
    assert detect_content_type(doc_html, title="API Guide") == "documentation"


def test_reading_time_calculation():
    """Test reading time calculation."""
    short_text = "Hello world"
    assert calculate_reading_time(short_text) == "1 min read"

    long_text = " ".join(["word"] * 400)  # 400 words
    assert calculate_reading_time(long_text) == "2 min read"

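
# A table-driven sketch of the reading-time checks above. The test name is
# illustrative, and it assumes nothing beyond the two input/output pairs
# already asserted in test_reading_time_calculation.
@pytest.mark.parametrize(
    "text,expected",
    [
        ("Hello world", "1 min read"),              # short text still reads as one minute
        (" ".join(["word"] * 400), "2 min read"),   # 400 words, same expectation as above
    ],
)
def test_reading_time_parametrized(text, expected):
    assert calculate_reading_time(text) == expected
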

def test_browser_config():
    """Test browser configuration."""
    config = BrowserConfig()
    assert config.headless is True
    assert config.timeout == 30000
    assert config.viewport["width"] == 1920

    # Test custom config
    custom_config = BrowserConfig(headless=False, timeout=15000)
    assert custom_config.headless is False
    assert custom_config.timeout == 15000


def test_crawl_config():
    """Test complete crawl configuration."""
    config = CrawlConfig.default()
    assert config.browser.headless is True
    assert config.extraction.clean_text is True
    assert config.concurrency.max_concurrent == 5

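
# Note: the async test below relies on the pytest-asyncio plugin, which provides
# @pytest.mark.asyncio; without it, plain pytest does not await coroutine tests
# and skips them with a warning.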

@pytest.mark.asyncio
async def test_content_extractor():
    """Test content extraction from mock HTML."""
    html = """
    <html>
      <head><title>Test Page</title></head>
      <body>
        <h1>Main Title</h1>
        <p>This is the main content of the page.</p>
        <a href="https://external.example.com">External Link</a>
        <a href="/internal">Internal Link</a>
      </body>
    </html>
    """

    page_data = {
        "url": "https://test.com",
        "html": html,
        "status": 200,
        "load_time": 1.5,
    }

    extractor = ContentExtractor(
        clean=True,
        extract_links=True,
        extract_metadata=True,
    )

    content = await extractor.extract(page_data)

    assert content.url == "https://test.com"
    assert content.title == "Test Page"
    assert "Main Title" in content.text
    assert "main content" in content.text
    assert content.status_code == 200
    assert content.load_time == 1.5
    assert len(content.links) == 2  # Two links found


if __name__ == "__main__":
    # Run tests
    pytest.main([__file__, "-v"])