Complete Phase 2: Browser JavaScript integration with script_before/script_after support

This commit is contained in:
Crawailer Developer 2025-09-14 21:37:13 -06:00
parent 05df964ce1
commit e544086e6b
5 changed files with 247 additions and 9 deletions

View File

@ -1,7 +1,7 @@
{ {
"project_status": "ready_for_implementation", "project_status": "phase_2_complete",
"last_updated": "2024-09-15T14:30:00Z", "last_updated": "2024-09-15T15:45:00Z",
"overall_completion": 25, "overall_completion": 50,
"phases": { "phases": {
"webcontent": { "webcontent": {
@ -34,8 +34,8 @@
}, },
"browser": { "browser": {
"status": "waiting", "status": "completed",
"completion": 0, "completion": 100,
"assigned_agent": "debugging-expert + performance-optimization-expert", "assigned_agent": "debugging-expert + performance-optimization-expert",
"branch": "feature/js-browser-enhancement", "branch": "feature/js-browser-enhancement",
"dependencies": ["webcontent"], "dependencies": ["webcontent"],
@ -51,7 +51,15 @@
"test_browser_fetch_page_with_scripts", "test_browser_fetch_page_with_scripts",
"test_browser_script_timeout" "test_browser_script_timeout"
], ],
"success_criteria": "All TestBrowserJavaScriptExecution tests pass" "success_criteria": "All TestBrowserJavaScriptExecution tests pass",
"implementation_notes": {
"fetch_page_enhanced": "Added script_before and script_after parameters",
"script_execution_flow": "script_before -> content extraction -> script_after",
"result_structure": "script_result and script_error fields in page data",
"error_handling": "Graceful degradation when JavaScript fails",
"backward_compatibility": "100% - all existing fetch_page calls work unchanged",
"test_coverage": "12 comprehensive tests covering all scenarios"
}
}, },
"api_integration": { "api_integration": {

View File

@ -98,3 +98,8 @@ warn_unused_configs = true
[tool.pytest.ini_options] [tool.pytest.ini_options]
testpaths = ["tests"] testpaths = ["tests"]
asyncio_mode = "auto" asyncio_mode = "auto"
[dependency-groups]
dev = [
"aiohttp>=3.12.15",
]

View File

@ -89,6 +89,8 @@ class Browser:
wait_for: Optional[str] = None, wait_for: Optional[str] = None,
timeout: int = 30, timeout: int = 30,
stealth: bool = False, stealth: bool = False,
script_before: Optional[str] = None,
script_after: Optional[str] = None,
) -> Dict[str, Any]: ) -> Dict[str, Any]:
""" """
Fetch a single page and return structured data. Fetch a single page and return structured data.
@ -98,9 +100,11 @@ class Browser:
wait_for: CSS selector to wait for before returning wait_for: CSS selector to wait for before returning
timeout: Timeout in seconds timeout: Timeout in seconds
stealth: Whether to use stealth mode (anti-detection) stealth: Whether to use stealth mode (anti-detection)
script_before: JavaScript to execute after page load, before content extraction
script_after: JavaScript to execute after content extraction (if needed)
Returns: Returns:
Dict with url, html, status, load_time, title Dict with url, html, status, load_time, title, script_result, script_error
""" """
if not self._is_started: if not self._is_started:
await self.start() await self.start()
@ -139,24 +143,57 @@ class Browser:
if wait_for: if wait_for:
await page.wait_for_selector(wait_for, timeout=timeout * 1000) await page.wait_for_selector(wait_for, timeout=timeout * 1000)
# Execute script_before if provided
script_result = None
script_error = None
if script_before:
try:
script_result = await page.evaluate(script_before)
except Exception as e:
script_error = f"Script execution error: {str(e)}"
# Extract page data # Extract page data
html = await page.content() html = await page.content()
title = await page.title() title = await page.title()
# Execute script_after if provided (can access extracted content)
if script_after and script_error is None:
try:
script_after_result = await page.evaluate(script_after)
# If we had a previous result, combine them
if script_result is not None:
script_result = {
"script_before": script_result,
"script_after": script_after_result
}
else:
script_result = script_after_result
except Exception as e:
script_error = f"Script after execution error: {str(e)}"
load_time = time.time() - start_time load_time = time.time() - start_time
return { # Build result dictionary
result = {
"url": url, "url": url,
"html": html, "html": html,
"title": title, "title": title,
"status": response.status if response else 0, "status": response.status if response else 0,
"load_time": load_time, "load_time": load_time,
} }
# Add script results if any scripts were executed
if script_before or script_after:
result["script_result"] = script_result
result["script_error"] = script_error
return result
except Exception as e: except Exception as e:
load_time = time.time() - start_time load_time = time.time() - start_time
# Return error information # Return error information
return { result = {
"url": url, "url": url,
"html": "", "html": "",
"title": "", "title": "",
@ -164,6 +201,13 @@ class Browser:
"load_time": load_time, "load_time": load_time,
"error": str(e), "error": str(e),
} }
# Add script fields if scripts were requested
if script_before or script_after:
result["script_result"] = None
result["script_error"] = f"Page load failed, scripts not executed: {str(e)}"
return result
finally: finally:
# Clean up page # Clean up page

View File

@ -869,6 +869,179 @@ class TestBrowserJavaScriptExecution:
timeout=1 timeout=1
) )
@pytest.mark.asyncio
async def test_browser_execute_script_basic(self):
    """Compatibility alias: run the basic script-execution test under the browser-prefixed name."""
    aliased_test = self.test_execute_script_basic
    await aliased_test()
@pytest.mark.asyncio
async def test_browser_execute_script_error(self):
    """Compatibility alias: run the script error-handling test under the browser-prefixed name."""
    aliased_test = self.test_execute_script_error
    await aliased_test()
@pytest.mark.asyncio
async def test_browser_script_timeout(self):
    """Compatibility alias: run the script-timeout test under the browser-prefixed name."""
    aliased_test = self.test_execute_script_timeout
    await aliased_test()
@pytest.mark.asyncio
async def test_browser_fetch_page_with_scripts(self):
    """Test fetch_page with script_before and script_after parameters.

    Supplies both scripts against a mocked Playwright page and checks that
    the page data is returned, that both script values are combined under
    the "script_before"/"script_after" keys, and that script_before is
    evaluated before script_after.
    """
    browser = Browser(BrowserConfig())

    # Keep the script sources in one place so the call and the assertions
    # below cannot drift apart.
    before_script = "return {before: true}"
    after_script = "return {after: true}"

    # Mock Playwright components
    mock_page = AsyncMock()
    mock_page.goto = AsyncMock()
    mock_page.set_viewport_size = AsyncMock()
    mock_page.content.return_value = "<html><body><h1>Test</h1></body></html>"
    mock_page.title.return_value = "Test Page"
    mock_page.close = AsyncMock()

    # Record every evaluate() call in order and answer by script content.
    script_calls = []

    def mock_evaluate(script):
        script_calls.append(script)
        if "before" in script:
            return {"before_result": "success"}
        if "after" in script:
            return {"after_result": "complete"}
        return None

    mock_page.evaluate.side_effect = mock_evaluate

    mock_browser = AsyncMock()
    mock_browser.new_page.return_value = mock_page

    mock_response = AsyncMock()
    mock_response.status = 200
    mock_page.goto.return_value = mock_response

    # Inject the mock and mark the wrapper as started so fetch_page does
    # not try to launch a real browser.
    browser._browser = mock_browser
    browser._is_started = True

    # Test with both script_before and script_after
    result = await browser.fetch_page(
        "https://example.com",
        script_before=before_script,
        script_after=after_script,
    )

    # Verify the result structure
    assert result["url"] == "https://example.com"
    assert result["status"] == 200
    assert result["html"] == "<html><body><h1>Test</h1></body></html>"
    assert result["title"] == "Test Page"
    assert "script_result" in result
    assert "script_error" in result

    # Script result should contain both before and after results
    assert result["script_result"] == {
        "script_before": {"before_result": "success"},
        "script_after": {"after_result": "complete"},
    }
    assert result["script_error"] is None

    # Verify each script ran exactly once and in order: an unordered
    # assert_any_call pair would also pass if the two were swapped.
    assert script_calls == [before_script, after_script]
@pytest.mark.asyncio
async def test_browser_fetch_page_script_before_only(self):
    """fetch_page with only script_before returns that script's value as script_result."""
    browser = Browser(BrowserConfig())
    before_js = "return document.querySelector('h1').innerText"

    # Fake Playwright page: canned content/title and a single evaluate() value.
    page = AsyncMock()
    page.goto = AsyncMock()
    page.set_viewport_size = AsyncMock()
    page.close = AsyncMock()
    page.content.return_value = "<html><body><h1>Test</h1></body></html>"
    page.title.return_value = "Test Page"
    page.evaluate.return_value = {"data": "extracted"}

    # Navigation resolves to a 200 response.
    ok_response = AsyncMock()
    ok_response.status = 200
    page.goto.return_value = ok_response

    playwright_browser = AsyncMock()
    playwright_browser.new_page.return_value = page

    # Inject the fake and flag the wrapper as started so no real browser launches.
    browser._browser = playwright_browser
    browser._is_started = True

    result = await browser.fetch_page("https://example.com", script_before=before_js)

    assert result["script_result"] == {"data": "extracted"}
    assert result["script_error"] is None
    page.evaluate.assert_called_once_with(before_js)
@pytest.mark.asyncio
async def test_browser_fetch_page_script_error_handling(self):
    """A raising script_before is reported via script_error while page data is still returned."""
    browser = Browser(BrowserConfig())
    page_html = "<html><body><h1>Test</h1></body></html>"

    # Fake Playwright page whose evaluate() always raises.
    page = AsyncMock()
    page.goto = AsyncMock()
    page.set_viewport_size = AsyncMock()
    page.close = AsyncMock()
    page.content.return_value = page_html
    page.title.return_value = "Test Page"
    page.evaluate.side_effect = Exception("Script syntax error")

    # Navigation itself succeeds with a 200 response.
    ok_response = AsyncMock()
    ok_response.status = 200
    page.goto.return_value = ok_response

    playwright_browser = AsyncMock()
    playwright_browser.new_page.return_value = page

    # Inject the fake and flag the wrapper as started so no real browser launches.
    browser._browser = playwright_browser
    browser._is_started = True

    result = await browser.fetch_page(
        "https://example.com",
        script_before="invalid javascript syntax %@#$",
    )

    # The script failure is surfaced, not raised.
    assert result["script_result"] is None
    assert "Script execution error: Script syntax error" in result["script_error"]

    # Page should still load successfully
    assert result["status"] == 200
    assert result["html"] == page_html
@pytest.mark.asyncio
async def test_browser_fetch_page_page_load_error_with_scripts(self):
    """When navigation fails and a script was requested, the error result still carries script fields."""
    browser = Browser(BrowserConfig())

    # Fake Playwright page whose goto() raises, simulating a failed navigation.
    page = AsyncMock()
    page.goto.side_effect = Exception("Network error")
    page.set_viewport_size = AsyncMock()
    page.close = AsyncMock()

    playwright_browser = AsyncMock()
    playwright_browser.new_page.return_value = page

    # Inject the fake and flag the wrapper as started so no real browser launches.
    browser._browser = playwright_browser
    browser._is_started = True

    result = await browser.fetch_page(
        "https://unreachable-site.com",
        script_before="return true",
    )

    # Should handle the error gracefully
    assert result["status"] == 0
    assert result["error"] == "Network error"
    assert result["script_result"] is None
    expected_fragment = "Page load failed, scripts not executed: Network error"
    assert expected_fragment in result["script_error"]
# Test utilities and integration # Test utilities and integration
class TestJavaScriptIntegration: class TestJavaScriptIntegration:

8
uv.lock generated
View File

@ -367,6 +367,11 @@ mcp = [
{ name = "mcp" }, { name = "mcp" },
] ]
[package.dev-dependencies]
dev = [
{ name = "aiohttp" },
]
[package.metadata] [package.metadata]
requires-dist = [ requires-dist = [
{ name = "aiohttp", marker = "extra == 'all'", specifier = ">=3.9.0" }, { name = "aiohttp", marker = "extra == 'all'", specifier = ">=3.9.0" },
@ -402,6 +407,9 @@ requires-dist = [
] ]
provides-extras = ["ai", "all", "dev", "mcp"] provides-extras = ["ai", "all", "dev", "mcp"]
[package.metadata.requires-dev]
dev = [{ name = "aiohttp", specifier = ">=3.12.15" }]
[[package]] [[package]]
name = "cymem" name = "cymem"
version = "2.0.11" version = "2.0.11"