
"""
|
|
Basic usage examples for Crawailer.
|
|
|
|
This demonstrates the main API functions and typical workflows.
|
|
"""
|
|
|
|
import asyncio
|
|
import crawailer as web
|
|
|
|
|
|
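
# The high-level helpers used below (web.get, web.get_many, web.discover)
# are all coroutines, so each example is an async function and the script
# is driven by a single asyncio.run() call at the bottom.

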
async def basic_example():
    """Basic content extraction from a single URL."""
    print("🕷️ Basic Crawailer Example")
    print("=" * 50)

    # Simple content extraction
    print("\n1. Single page extraction:")
    content = await web.get("https://example.com")

    print(f" Title: {content.title}")
    print(f" Word count: {content.word_count}")
    print(f" Reading time: {content.reading_time}")
    print(f" Quality score: {content.quality_score:.1f}/10")
    print(f" Content type: {content.content_type}")

    # Show first 200 characters of markdown
    print("\n Markdown preview:")
    print(f" {content.markdown[:200]}...")


async def batch_example():
    """Batch processing multiple URLs."""
    print("\n2. Batch processing:")

    urls = [
        "https://example.com",
        "https://httpbin.org/html",
        "https://httpbin.org/json",  # This will be different content
    ]

    results = await web.get_many(urls, max_concurrent=3)

    print(f" Processed {len(results)} URLs")
    for i, result in enumerate(results):
        if result:
            print(f" {i+1}. {result.title} ({result.word_count} words)")
        else:
            print(f" {i+1}. Failed to fetch")
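
# Note: the truthiness check above implies that get_many() returns a falsy
# placeholder (e.g. None) for pages that failed rather than raising, so a
# partial batch still yields whatever succeeded.

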
async def discovery_example():
    """Content discovery (placeholder implementation)."""
    print("\n3. Content discovery:")

    try:
        # Note: This is a placeholder implementation
        results = await web.discover("web crawling", max_pages=3)
        print(f" Found {len(results)} relevant sources")

        for result in results:
            print(f" - {result.title}")

    except NotImplementedError:
        print(" Discovery feature coming soon!")


async def context_manager_example():
    """Using the browser as a context manager for more control."""
    print("\n4. Advanced browser control:")

    from crawailer import Browser, BrowserConfig

    config = BrowserConfig(headless=True, timeout=15000)

    async with Browser(config) as browser:
        # Fetch with a custom per-request timeout
        page_data = await browser.fetch_page(
            "https://httpbin.org/delay/1",
            timeout=10,
        )

        print(f" Fetched: {page_data['url']}")
        print(f" Status: {page_data['status']}")
        print(f" Load time: {page_data['load_time']:.2f}s")
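
# Caution: BrowserConfig(timeout=15000) is presumably milliseconds while
# fetch_page(..., timeout=10) reads like seconds; confirm the units in the
# Crawailer docs before reusing these values.

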
async def content_analysis_example():
    """Analyzing extracted content."""
    print("\n5. Content analysis:")

    content = await web.get("https://httpbin.org/html")

    print(f" Content hash: {content.content_hash[:16]}...")
    print(f" Language: {content.language}")
    print(f" Links found: {len(content.links)}")
    print(f" Images found: {len(content.images)}")

    if content.links:
        print(f" First link: {content.links[0]['text']} -> {content.links[0]['url']}")


async def main():
    """Run all examples."""
    try:
        await basic_example()
        await batch_example()
        await discovery_example()
        await context_manager_example()
        await content_analysis_example()

        print("\n✅ All examples completed successfully!")

    except Exception as e:
        print(f"\n❌ Error: {e}")

    finally:
        # Clean up global resources
        await web.cleanup()


if __name__ == "__main__":
|
|
asyncio.run(main()) |