#!/usr/bin/env python3 """ Real-world testing of Crawailer JavaScript API enhancements. Tests various website types to validate production readiness. """ import asyncio import sys import time from datetime import datetime from typing import List, Dict, Any # Add src to path to use our enhanced implementation sys.path.insert(0, 'src') import crawailer as web class RealWorldTester: """Test suite for real-world website crawling with JavaScript enhancement.""" def __init__(self): self.results = [] self.test_start_time = None async def test_static_content_baseline(self): """Test with static content to ensure basic functionality works.""" print("๐Ÿงช Testing Static Content (Baseline)") print("-" * 50) test_cases = [ { "name": "Wikipedia Article", "url": "https://en.wikipedia.org/wiki/Web_scraping", "expected_elements": ["Web scraping", "content", "extraction"], "use_js": False }, { "name": "Example.com", "url": "https://example.com", "expected_elements": ["Example Domain", "information", "examples"], "use_js": False } ] for test in test_cases: await self._run_test_case(test) async def test_dynamic_content_scenarios(self): """Test JavaScript-enhanced content extraction.""" print("\n๐Ÿš€ Testing Dynamic Content with JavaScript") print("-" * 50) test_cases = [ { "name": "GitHub Repository (Dynamic Loading)", "url": "https://github.com/microsoft/playwright", "script": """ // Wait for dynamic content and return repository stats await new Promise(r => setTimeout(r, 2000)); const stars = document.querySelector('[data-view-component="true"] strong')?.innerText || 'unknown'; return {stars: stars, loaded: true}; """, "expected_elements": ["Playwright", "browser", "automation"], "use_js": True }, { "name": "JSONPlaceholder API Demo", "url": "https://jsonplaceholder.typicode.com/", "script": """ // Look for API endpoints and examples const links = Array.from(document.querySelectorAll('a')).map(a => a.href); const codeBlocks = Array.from(document.querySelectorAll('code')).map(c => c.innerText); return { links_found: links.length, code_examples: codeBlocks.length, has_api_info: document.body.innerText.includes('REST API') }; """, "expected_elements": ["REST API", "JSON", "placeholder"], "use_js": True } ] for test in test_cases: await self._run_test_case(test) async def test_spa_and_modern_sites(self): """Test Single Page Applications and modern JavaScript-heavy sites.""" print("\nโšก Testing SPAs and Modern JavaScript Sites") print("-" * 50) test_cases = [ { "name": "React Documentation", "url": "https://react.dev/", "script": """ // Wait for React app to load await new Promise(r => setTimeout(r, 3000)); const title = document.querySelector('h1')?.innerText || 'No title found'; const navItems = document.querySelectorAll('nav a').length; return { page_title: title, navigation_items: navItems, react_loaded: !!window.React || document.body.innerText.includes('React') }; """, "expected_elements": ["React", "JavaScript", "library"], "use_js": True } ] for test in test_cases: await self._run_test_case(test) async def test_batch_processing(self): """Test get_many() with multiple sites and different JavaScript requirements.""" print("\n๐Ÿ“ฆ Testing Batch Processing with Mixed JavaScript") print("-" * 50) urls = [ "https://httpbin.org/html", # Static HTML "https://httpbin.org/json", # JSON endpoint "https://example.com" # Simple static page ] scripts = [ "document.querySelector('h1')?.innerText || 'No H1 found'", # Extract title "JSON.stringify(Object.keys(window).slice(0, 5))", # Get some window properties None # No script for simple page ] start_time = time.time() try: print(f"Processing {len(urls)} URLs with mixed JavaScript requirements...") results = await web.get_many(urls, script=scripts, max_concurrent=3) processing_time = time.time() - start_time print(f"โœ… Batch processing completed in {processing_time:.2f}s") print(f"โœ… Successfully processed {len([r for r in results if r])} out of {len(urls)} URLs") for i, (url, result) in enumerate(zip(urls, results)): if result: script_status = "โœ… JS executed" if result.script_result else "โž– No JS" word_count = result.word_count print(f" {i+1}. {url[:50]:<50} | {word_count:>4} words | {script_status}") if result.script_result: print(f" Script result: {str(result.script_result)[:80]}") else: print(f" {i+1}. {url[:50]:<50} | FAILED") self.results.append({ "test_name": "Batch Processing", "status": "success", "urls_processed": len([r for r in results if r]), "total_urls": len(urls), "processing_time": processing_time, "details": f"Mixed JS/no-JS processing successful" }) except Exception as e: print(f"โŒ Batch processing failed: {e}") self.results.append({ "test_name": "Batch Processing", "status": "failed", "error": str(e) }) async def test_discovery_scenarios(self): """Test discover() function with JavaScript enhancement.""" print("\n๐Ÿ” Testing Discovery with JavaScript Enhancement") print("-" * 50) try: print("Testing discover() function (Note: May be limited implementation)") # Test basic discovery start_time = time.time() results = await web.discover("Python web scraping", max_pages=3) discovery_time = time.time() - start_time print(f"โœ… Discovery completed in {discovery_time:.2f}s") print(f"โœ… Found {len(results)} results") for i, result in enumerate(results[:3]): print(f" {i+1}. {result.title[:60]}") print(f" URL: {result.url}") print(f" Words: {result.word_count}") self.results.append({ "test_name": "Discovery Function", "status": "success", "results_found": len(results), "discovery_time": discovery_time }) except NotImplementedError: print("โ„น๏ธ Discovery function not yet fully implemented (expected)") self.results.append({ "test_name": "Discovery Function", "status": "not_implemented", "note": "Expected - discovery may need search engine integration" }) except Exception as e: print(f"โŒ Discovery test failed: {e}") self.results.append({ "test_name": "Discovery Function", "status": "failed", "error": str(e) }) async def _run_test_case(self, test: Dict[str, Any]): """Run an individual test case.""" print(f"\n๐ŸŒ Testing: {test['name']}") print(f" URL: {test['url']}") start_time = time.time() try: if test['use_js'] and 'script' in test: print(f" JavaScript: {test['script'][:60]}...") content = await web.get( test['url'], script=test['script'], timeout=45 ) else: print(" Mode: Static content extraction") content = await web.get(test['url'], timeout=30) load_time = time.time() - start_time # Analyze results found_elements = sum(1 for element in test['expected_elements'] if element.lower() in content.text.lower()) print(f" โœ… Loaded in {load_time:.2f}s") print(f" โœ… Title: {content.title}") print(f" โœ… Content: {content.word_count} words") print(f" โœ… Expected elements found: {found_elements}/{len(test['expected_elements'])}") if content.script_result: print(f" โœ… JavaScript result: {str(content.script_result)[:100]}") if content.script_error: print(f" โš ๏ธ JavaScript error: {content.script_error}") self.results.append({ "test_name": test['name'], "url": test['url'], "status": "success", "load_time": load_time, "word_count": content.word_count, "elements_found": found_elements, "expected_elements": len(test['expected_elements']), "has_js_result": content.script_result is not None, "has_js_error": content.script_error is not None }) except Exception as e: load_time = time.time() - start_time print(f" โŒ Failed after {load_time:.2f}s: {e}") self.results.append({ "test_name": test['name'], "url": test['url'], "status": "failed", "load_time": load_time, "error": str(e) }) def print_summary(self): """Print comprehensive test results summary.""" print("\n" + "="*80) print("๐ŸŽฏ REAL-WORLD TESTING SUMMARY") print("="*80) total_tests = len(self.results) successful_tests = len([r for r in self.results if r['status'] == 'success']) failed_tests = len([r for r in self.results if r['status'] == 'failed']) not_implemented = len([r for r in self.results if r['status'] == 'not_implemented']) success_rate = (successful_tests / total_tests * 100) if total_tests > 0 else 0 print(f"\n๐Ÿ“Š OVERALL RESULTS:") print(f" Total tests: {total_tests}") print(f" โœ… Successful: {successful_tests}") print(f" โŒ Failed: {failed_tests}") print(f" โ„น๏ธ Not implemented: {not_implemented}") print(f" ๐Ÿ“ˆ Success rate: {success_rate:.1f}%") if successful_tests > 0: successful_results = [r for r in self.results if r['status'] == 'success'] avg_load_time = sum(r.get('load_time', 0) for r in successful_results) / len(successful_results) total_words = sum(r.get('word_count', 0) for r in successful_results) js_enabled_tests = len([r for r in successful_results if r.get('has_js_result', False)]) print(f"\nโšก PERFORMANCE METRICS:") print(f" Average load time: {avg_load_time:.2f}s") print(f" Total content extracted: {total_words:,} words") print(f" JavaScript-enhanced extractions: {js_enabled_tests}") print(f"\n๐Ÿ“‹ DETAILED RESULTS:") for result in self.results: status_icon = "โœ…" if result['status'] == 'success' else "โŒ" if result['status'] == 'failed' else "โ„น๏ธ" print(f" {status_icon} {result['test_name']}") if result['status'] == 'success': load_time = result.get('load_time', 0) words = result.get('word_count', 0) js_indicator = " (JS)" if result.get('has_js_result', False) else "" print(f" {load_time:.2f}s | {words} words{js_indicator}") elif result['status'] == 'failed': print(f" Error: {result.get('error', 'Unknown error')}") print(f"\n๐ŸŽ‰ JavaScript API Enhancement: {'VALIDATED' if success_rate >= 70 else 'NEEDS IMPROVEMENT'}") if success_rate >= 70: print(" The JavaScript API enhancement is working well in real-world scenarios!") else: print(" Some issues detected that may need attention.") async def main(): """Run comprehensive real-world testing.""" print("๐Ÿš€ Crawailer JavaScript API Enhancement - Real-World Testing") print("="*80) print(f"Test started at: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}") print("Testing enhanced JavaScript capabilities with real websites...") tester = RealWorldTester() tester.test_start_time = time.time() try: # Run all test suites await tester.test_static_content_baseline() await tester.test_dynamic_content_scenarios() await tester.test_spa_and_modern_sites() await tester.test_batch_processing() await tester.test_discovery_scenarios() except KeyboardInterrupt: print("\nโš ๏ธ Testing interrupted by user") except Exception as e: print(f"\n๐Ÿ’ฅ Unexpected error during testing: {e}") import traceback traceback.print_exc() finally: total_time = time.time() - tester.test_start_time print(f"\nTotal testing time: {total_time:.2f}s") tester.print_summary() if __name__ == "__main__": print("Note: This requires Playwright to be installed and browser setup complete.") print("Run 'playwright install chromium' if you haven't already.") print() try: asyncio.run(main()) except KeyboardInterrupt: print("\nTesting cancelled by user.") except Exception as e: print(f"Failed to start testing: {e}") print("Make sure Playwright is properly installed and configured.")