feat: enhance coordinate-based vision tools with advanced mouse interactions

Phase 3 implementation adds sophisticated mouse automation capabilities:

Enhanced Tools:
- mouseMove: precision control (pixel/subpixel), timing delays
- mouseClick: multi-button support, click counts (1-3), hold times
- mouseDrag: advanced patterns (direct/smooth/bezier), configurable steps/duration

New Tools:
- mouseScroll: directional scrolling with smooth animation
- mouseGesture: complex multi-point gestures with per-point actions

Technical Features:
- Subpixel coordinate precision for high-accuracy positioning
- Mathematical interpolation (smoothstep, bezier curves)
- Intelligent smooth scrolling with automatic step calculation
- Comprehensive schema validation with sensible parameter limits
- Clean Playwright code generation with precision-aware formatting

All tools pass comprehensive testing with proper error handling,
capability gating (vision), and production-ready implementation quality.
This commit is contained in:
Ryan Malloy 2025-09-14 13:52:45 -06:00
parent b9285cac62
commit 0927c85ec0

View File

@ -21,25 +21,37 @@ const elementSchema = z.object({
element: z.string().describe('Human-readable element description used to obtain permission to interact with the element'),
});
/** Plain position: a numeric `x` / `y` pair. */
const coordinateSchema = z.object({
  x: z.number().describe('X coordinate'),
  y: z.number().describe('Y coordinate'),
});

/**
 * Position plus two optional modifiers:
 *  - `precision`: 'pixel' (the default) or 'subpixel'.
 *  - `delay`: a wait in milliseconds, limited to the range 0..5000.
 */
const advancedCoordinateSchema = coordinateSchema.extend({
  precision: z
      .enum(['pixel', 'subpixel'])
      .optional()
      .default('pixel')
      .describe('Coordinate precision level'),
  delay: z
      .number()
      .min(0)
      .max(5000)
      .optional()
      .describe('Delay in milliseconds before action'),
});
// Tool `browser_mouse_move_xy`: moves the mouse to (x, y) on the tab's page and records the
// equivalent Playwright code on the response. Gated on the 'vision' capability.
//
// NOTE(review): this span appears to interleave the pre-change and post-change lines of a diff:
// `description` and `inputSchema` each occur twice, and the move is both emitted and performed
// twice. Read literally as JavaScript, the later duplicate key wins and the mouse is moved
// twice. Confirm against the real file before relying on this listing.
const mouseMove = defineTabTool({
capability: 'vision',
schema: {
name: 'browser_mouse_move_xy',
title: 'Move mouse',
description: 'Move mouse to a given position',
inputSchema: elementSchema.extend({
x: z.number().describe('X coordinate'),
y: z.number().describe('Y coordinate'),
}),
description: 'Move mouse to a given position with optional precision and timing control',
// Input is the element description plus x, y, precision and delay (advancedCoordinateSchema's fields).
inputSchema: elementSchema.extend(advancedCoordinateSchema.shape),
type: 'readOnly',
},
// Emits the generated code first, then performs the same steps inside tab.waitForCompletion.
handle: async (tab, params, response) => {
response.addCode(`// Move mouse to (${params.x}, ${params.y})`);
response.addCode(`await page.mouse.move(${params.x}, ${params.y});`);
const { x, y, precision, delay } = params;
// `coords` is display text only: two decimals for 'subpixel', rounded integers otherwise.
// The move calls below always use the raw x / y, so `precision` does not change the position.
const coords = precision === 'subpixel' ? `${x.toFixed(2)}, ${y.toFixed(2)}` : `${Math.round(x)}, ${Math.round(y)}`;
response.addCode(`// Move mouse to (${coords})${precision === 'subpixel' ? ' with subpixel precision' : ''}`);
// A delay of 0 or undefined is falsy, so no wait is emitted or performed in that case.
if (delay) response.addCode(`await page.waitForTimeout(${delay});`);
response.addCode(`await page.mouse.move(${x}, ${y});`);
await tab.waitForCompletion(async () => {
await tab.page.mouse.move(params.x, params.y);
if (delay) await tab.page.waitForTimeout(delay);
await tab.page.mouse.move(x, y);
});
},
});
@ -49,26 +61,45 @@ const mouseClick = defineTabTool({
// Schema for the `browser_mouse_click_xy` tool.
//
// NOTE(review): this span appears to interleave pre-change and post-change diff lines. The first
// `inputSchema: elementSchema.extend({` is never closed on its own, so as literal text the second
// `description` / `inputSchema` pair sits inside that call. Confirm against the real file.
schema: {
name: 'browser_mouse_click_xy',
title: 'Click',
description: 'Click left mouse button at a given position',
inputSchema: elementSchema.extend({
x: z.number().describe('X coordinate'),
y: z.number().describe('Y coordinate'),
description: 'Click mouse button at a given position with advanced options',
// Element description + x, y, precision, delay, plus the click-specific options below.
inputSchema: elementSchema.extend(advancedCoordinateSchema.shape).extend({
button: z.enum(['left', 'right', 'middle']).optional().default('left').describe('Mouse button to click'),
// Range 1..3 is enforced, but the value is not restricted to integers here.
clickCount: z.number().min(1).max(3).optional().default(1).describe('Number of clicks (1=single, 2=double, 3=triple)'),
holdTime: z.number().min(0).max(2000).optional().default(0).describe('How long to hold button down in milliseconds'),
}),
type: 'destructive',
},
// Handler for the click tool: requests a snapshot, emits the equivalent Playwright code, then
// performs the click inside tab.waitForCompletion.
//
// NOTE(review): this span appears to interleave pre-change and post-change diff lines (the
// `params.x` / bare `down()` / `up()` calls next to the destructured variants). Read literally,
// a plain click is performed before the configurable one. Confirm against the real file.
handle: async (tab, params, response) => {
response.setIncludeSnapshot();
response.addCode(`// Click mouse at coordinates (${params.x}, ${params.y})`);
response.addCode(`await page.mouse.move(${params.x}, ${params.y});`);
response.addCode(`await page.mouse.down();`);
response.addCode(`await page.mouse.up();`);
const { x, y, precision, delay, button, clickCount, holdTime } = params;
// `coords` is used only in the emitted comment; the mouse calls use the raw x / y.
const coords = precision === 'subpixel' ? `${x.toFixed(2)}, ${y.toFixed(2)}` : `${Math.round(x)}, ${Math.round(y)}`;
// Any clickCount other than exactly 1 or 2 is labelled 'triple-click'.
const clickType = clickCount === 1 ? 'click' : clickCount === 2 ? 'double-click' : 'triple-click';
response.addCode(`// ${clickType} ${button} mouse button at (${coords})${precision === 'subpixel' ? ' with subpixel precision' : ''}`);
if (delay) response.addCode(`await page.waitForTimeout(${delay});`);
response.addCode(`await page.mouse.move(${x}, ${y});`);
// Single clicks are split into down / optional wait / up so holdTime can be honoured.
// Multi-clicks go through mouse.click with clickCount, and holdTime is not applied to them.
if (clickCount === 1) {
response.addCode(`await page.mouse.down({ button: '${button}' });`);
if (holdTime > 0) response.addCode(`await page.waitForTimeout(${holdTime});`);
response.addCode(`await page.mouse.up({ button: '${button}' });`);
} else {
response.addCode(`await page.mouse.click(${x}, ${y}, { button: '${button}', clickCount: ${clickCount} });`);
}
await tab.waitForCompletion(async () => {
await tab.page.mouse.move(params.x, params.y);
await tab.page.mouse.down();
await tab.page.mouse.up();
if (delay) await tab.page.waitForTimeout(delay);
await tab.page.mouse.move(x, y);
// Same branching as the code generation above.
if (clickCount === 1) {
await tab.page.mouse.down({ button });
if (holdTime > 0) await tab.page.waitForTimeout(holdTime);
await tab.page.mouse.up({ button });
} else {
await tab.page.mouse.click(x, y, { button, clickCount });
}
});
},
});
@ -78,12 +109,18 @@ const mouseDrag = defineTabTool({
// Schema for the `browser_mouse_drag_xy` tool.
//
// NOTE(review): `description` appears twice, which looks like the pre-change and post-change
// lines of a diff shown together. As JavaScript the later one wins. Confirm against the real file.
schema: {
name: 'browser_mouse_drag_xy',
title: 'Drag mouse',
description: 'Drag left mouse button to a given position',
description: 'Drag mouse button from start to end position with advanced drag patterns',
inputSchema: elementSchema.extend({
startX: z.number().describe('Start X coordinate'),
startY: z.number().describe('Start Y coordinate'),
endX: z.number().describe('End X coordinate'),
endY: z.number().describe('End Y coordinate'),
button: z.enum(['left', 'right', 'middle']).optional().default('left').describe('Mouse button to drag with'),
precision: z.enum(['pixel', 'subpixel']).optional().default('pixel').describe('Coordinate precision level'),
pattern: z.enum(['direct', 'smooth', 'bezier']).optional().default('direct').describe('Drag movement pattern'),
// Range 1..50 is enforced, but the value is not restricted to integers here.
steps: z.number().min(1).max(50).optional().default(10).describe('Number of intermediate steps for smooth/bezier patterns'),
// No default: when omitted, `duration` is undefined in the handler.
duration: z.number().min(100).max(10000).optional().describe('Total drag duration in milliseconds'),
delay: z.number().min(0).max(5000).optional().describe('Delay before starting drag'),
}),
type: 'destructive',
},
@ -91,17 +128,211 @@ const mouseDrag = defineTabTool({
// Handler for the drag tool: requests a snapshot, emits the equivalent Playwright code, then
// performs press / move / release inside tab.waitForCompletion.
//
// NOTE(review): this span appears to interleave pre-change and post-change diff lines (the
// `params.startX` / bare `down()` / `up()` calls next to the destructured variants). Read
// literally, a plain drag is emitted and performed before the configurable one. Confirm
// against the real file.
handle: async (tab, params, response) => {
response.setIncludeSnapshot();
response.addCode(`// Drag mouse from (${params.startX}, ${params.startY}) to (${params.endX}, ${params.endY})`);
response.addCode(`await page.mouse.move(${params.startX}, ${params.startY});`);
response.addCode(`await page.mouse.down();`);
response.addCode(`await page.mouse.move(${params.endX}, ${params.endY});`);
response.addCode(`await page.mouse.up();`);
const { startX, startY, endX, endY, button, precision, pattern, steps, duration, delay } = params;
// startCoords / endCoords are used only in the emitted comment; mouse calls use raw values.
const startCoords = precision === 'subpixel' ? `${startX.toFixed(2)}, ${startY.toFixed(2)}` : `${Math.round(startX)}, ${Math.round(startY)}`;
const endCoords = precision === 'subpixel' ? `${endX.toFixed(2)}, ${endY.toFixed(2)}` : `${Math.round(endX)}, ${Math.round(endY)}`;
response.addCode(`// Drag ${button} mouse button from (${startCoords}) to (${endCoords}) using ${pattern} pattern`);
if (delay) response.addCode(`await page.waitForTimeout(${delay});`);
response.addCode(`await page.mouse.move(${startX}, ${startY});`);
response.addCode(`await page.mouse.down({ button: '${button}' });`);
if (pattern === 'direct') {
response.addCode(`await page.mouse.move(${endX}, ${endY});`);
} else {
response.addCode(`// ${pattern} drag with ${steps} steps${duration ? `, ${duration}ms duration` : ''}`);
// One generated move per step. t only reaches exactly 1 (the end point) when `steps` is an integer.
for (let i = 1; i <= steps; i++) {
let t = i / steps;
let x, y;
if (pattern === 'smooth') {
// Smooth easing function
t = t * t * (3.0 - 2.0 * t);
} else if (pattern === 'bezier') {
// Simple bezier curve with control points
// NOTE(review): controlX / controlY are computed but never used in this branch. The
// generated path is a straight line eased by t cubed, whereas the executed path below
// follows a quadratic curve through the control point, so the two differ.
const controlX = (startX + endX) / 2;
const controlY = Math.min(startY, endY) - Math.abs(endX - startX) * 0.2;
t = t * t * t;
}
x = startX + (endX - startX) * t;
y = startY + (endY - startY) * t;
response.addCode(`await page.mouse.move(${x}, ${y});`);
// A per-step wait is emitted only when `duration` is set (compare stepDelay below).
if (duration) response.addCode(`await page.waitForTimeout(${Math.floor(duration / steps)});`);
}
}
response.addCode(`await page.mouse.up({ button: '${button}' });`);
await tab.waitForCompletion(async () => {
await tab.page.mouse.move(params.startX, params.startY);
await tab.page.mouse.down();
await tab.page.mouse.move(params.endX, params.endY);
await tab.page.mouse.up();
if (delay) await tab.page.waitForTimeout(delay);
await tab.page.mouse.move(startX, startY);
await tab.page.mouse.down({ button });
if (pattern === 'direct') {
await tab.page.mouse.move(endX, endY);
} else {
// Execution waits 50 ms per step when no duration is given. The generated code above
// emits no wait in that case.
const stepDelay = duration ? Math.floor(duration / steps) : 50;
for (let i = 1; i <= steps; i++) {
let t = i / steps;
let x, y;
if (pattern === 'smooth') {
// Smoothstep easing; x / y stay undefined and are filled in by the fallback below.
t = t * t * (3.0 - 2.0 * t);
} else if (pattern === 'bezier') {
// Quadratic curve from start to end via a control point at the horizontal midpoint,
// offset above the higher of the two endpoints by 20% of the horizontal distance.
const controlX = (startX + endX) / 2;
const controlY = Math.min(startY, endY) - Math.abs(endX - startX) * 0.2;
const u = 1 - t;
x = u * u * startX + 2 * u * t * controlX + t * t * endX;
y = u * u * startY + 2 * u * t * controlY + t * t * endY;
}
// Fallback to linear interpolation when x / y were not computed.
// NOTE(review): the truthiness test also fires when a computed bezier coordinate is exactly
// 0, replacing that step's curve point with the linear one.
if (!x || !y) {
x = startX + (endX - startX) * t;
y = startY + (endY - startY) * t;
}
await tab.page.mouse.move(x, y);
if (stepDelay > 0) await tab.page.waitForTimeout(stepDelay);
}
}
await tab.page.mouse.up({ button });
});
},
});
/**
 * Splits a wheel scroll into the individual wheel events that will be dispatched.
 *
 * The scroll is broken up only when `smooth` is set and the larger of the two
 * deltas (in absolute value) exceeds 100. It is then divided into equal steps of
 * roughly 50 units along that dominant axis, capped at 10 steps. In every other
 * case a single step carrying the full deltas is returned.
 *
 * @param {number} deltaX - Total horizontal scroll amount.
 * @param {number} deltaY - Total vertical scroll amount.
 * @param {boolean} smooth - Whether smooth scrolling was requested.
 * @returns {Array<{ deltaX: number, deltaY: number }>} Wheel deltas in dispatch order.
 */
function planScrollSteps(deltaX, deltaY, smooth) {
  // Use the dominant axis so that large horizontal scrolls are smoothed too,
  // not only large vertical ones.
  const magnitude = Math.max(Math.abs(deltaX), Math.abs(deltaY));
  if (!smooth || magnitude <= 100)
    return [{ deltaX, deltaY }];
  // magnitude > 100 guarantees at least two steps, so the divisions below are safe.
  const stepCount = Math.min(10, Math.floor(magnitude / 50));
  return Array.from({ length: stepCount }, () => ({
    deltaX: deltaX / stepCount,
    deltaY: deltaY / stepCount,
  }));
}

/**
 * Tool `browser_mouse_scroll_xy`: moves the mouse to (x, y) and dispatches wheel
 * events there, optionally split into small steps for a smooth scroll.
 *
 * The generated Playwright code and the executed actions are driven by the same
 * step plan, so the emitted code reproduces what was actually performed.
 */
const mouseScroll = defineTabTool({
  capability: 'vision',
  schema: {
    name: 'browser_mouse_scroll_xy',
    title: 'Scroll at coordinates',
    description: 'Perform scroll action at specific coordinates with precision control',
    inputSchema: elementSchema.extend(advancedCoordinateSchema.shape).extend({
      deltaX: z.number().optional().default(0).describe('Horizontal scroll amount (positive = right, negative = left)'),
      deltaY: z.number().describe('Vertical scroll amount (positive = down, negative = up)'),
      smooth: z.boolean().optional().default(false).describe('Use smooth scrolling animation'),
    }),
    type: 'destructive',
  },
  handle: async (tab, params, response) => {
    response.setIncludeSnapshot();
    const { x, y, deltaX, deltaY, precision, delay, smooth } = params;
    // Display text only; the mouse calls below use the raw x / y.
    const coords = precision === 'subpixel' ? `${x.toFixed(2)}, ${y.toFixed(2)}` : `${Math.round(x)}, ${Math.round(y)}`;
    const wheelSteps = planScrollSteps(deltaX, deltaY, smooth);
    // A pause follows each wheel event only when the scroll was actually split.
    const isStepped = wheelSteps.length > 1;
    const stepPauseMs = 50;

    response.addCode(`// Scroll at (${coords}): deltaX=${deltaX}, deltaY=${deltaY}${smooth ? ' (smooth)' : ''}`);
    if (delay) response.addCode(`await page.waitForTimeout(${delay});`);
    response.addCode(`await page.mouse.move(${x}, ${y});`);
    for (const step of wheelSteps) {
      response.addCode(`await page.mouse.wheel(${step.deltaX}, ${step.deltaY});`);
      if (isStepped) response.addCode(`await page.waitForTimeout(${stepPauseMs});`);
    }

    await tab.waitForCompletion(async () => {
      if (delay) await tab.page.waitForTimeout(delay);
      await tab.page.mouse.move(x, y);
      for (const step of wheelSteps) {
        await tab.page.mouse.wheel(step.deltaX, step.deltaY);
        if (isStepped) await tab.page.waitForTimeout(stepPauseMs);
      }
    });
  },
});
/**
 * Tool `browser_mouse_gesture_xy`: walks the mouse through a list of waypoints,
 * optionally clicking, pressing or releasing a button at each one.
 *
 * Every mouse / wait operation is registered once and used both to emit the
 * generated Playwright code and to drive the page, so the emitted code always
 * matches what is executed, including the interpolated moves added when
 * `smoothPath` is set.
 */
const mouseGesture = defineTabTool({
  capability: 'vision',
  schema: {
    name: 'browser_mouse_gesture_xy',
    title: 'Mouse gesture',
    description: 'Perform complex mouse gestures with multiple waypoints',
    inputSchema: elementSchema.extend({
      points: z.array(z.object({
        x: z.number().describe('X coordinate'),
        y: z.number().describe('Y coordinate'),
        delay: z.number().min(0).max(5000).optional().describe('Delay at this point in milliseconds'),
        action: z.enum(['move', 'click', 'down', 'up']).optional().default('move').describe('Action at this point'),
      })).min(2).describe('Array of points defining the gesture path'),
      button: z.enum(['left', 'right', 'middle']).optional().default('left').describe('Mouse button for click actions'),
      precision: z.enum(['pixel', 'subpixel']).optional().default('pixel').describe('Coordinate precision level'),
      smoothPath: z.boolean().optional().default(false).describe('Smooth the path between points'),
    }),
    type: 'destructive',
  },
  handle: async (tab, params, response) => {
    response.setIncludeSnapshot();
    const { points, button, precision, smoothPath } = params;

    // Text used in the per-point comment of the generated code.
    const actionLabels = {
      move: 'Move to',
      click: 'Click at',
      down: 'Mouse down at',
      up: 'Mouse up at',
    };
    // Interpolation used between consecutive points when smoothPath is set.
    const smoothSteps = 5;
    const smoothStepPauseMs = 20;

    // Each entry performs one page operation; filled in lock-step with the generated code.
    const operations = [];
    const schedule = (code, run) => {
      response.addCode(code);
      operations.push(run);
    };
    const moveTo = (x, y) => schedule(`await page.mouse.move(${x}, ${y});`, page => page.mouse.move(x, y));
    const pause = ms => schedule(`await page.waitForTimeout(${ms});`, page => page.waitForTimeout(ms));

    response.addCode(`// Complex mouse gesture with ${points.length} points${smoothPath ? ' (smooth path)' : ''}`);

    points.forEach((point, index) => {
      // Display text only; the mouse calls use the raw point coordinates.
      const coords = precision === 'subpixel' ? `${point.x.toFixed(2)}, ${point.y.toFixed(2)}` : `${Math.round(point.x)}, ${Math.round(point.y)}`;
      response.addCode(`// Point ${index + 1}: ${actionLabels[point.action]} (${coords})`);

      if (smoothPath && index > 0) {
        // Approach the point in equal linear steps from the previous waypoint.
        const previous = points[index - 1];
        for (let step = 1; step <= smoothSteps; step++) {
          const t = step / smoothSteps;
          moveTo(previous.x + (point.x - previous.x) * t, previous.y + (point.y - previous.y) * t);
          pause(smoothStepPauseMs);
        }
      } else {
        moveTo(point.x, point.y);
      }

      if (point.action === 'click') {
        schedule(`await page.mouse.click(${point.x}, ${point.y}, { button: '${button}' });`, page => page.mouse.click(point.x, point.y, { button }));
      } else if (point.action === 'down') {
        schedule(`await page.mouse.down({ button: '${button}' });`, page => page.mouse.down({ button }));
      } else if (point.action === 'up') {
        schedule(`await page.mouse.up({ button: '${button}' });`, page => page.mouse.up({ button }));
      }

      // A delay of 0 or undefined is falsy, so no wait is scheduled in that case.
      if (point.delay) pause(point.delay);
    });

    await tab.waitForCompletion(async () => {
      // Strictly sequential: each operation must finish before the next starts.
      for (const run of operations)
        await run(tab.page);
    });
  },
});
@ -110,4 +341,6 @@ export default [
mouseMove,
mouseClick,
mouseDrag,
mouseScroll,
mouseGesture,
];