From d61aa16fee67ed78d18120bbd9d93e55d1d48f32 Mon Sep 17 00:00:00 2001 From: Pavel Feldman Date: Wed, 16 Jul 2025 16:40:00 -0700 Subject: [PATCH] chore: turn vision into capability (#679) Fixes https://github.com/microsoft/playwright-mcp/issues/420 --- README.md | 351 +++++++++++------------------- config.d.ts | 13 +- src/config.ts | 2 - src/connection.ts | 5 +- src/program.ts | 11 +- src/tools.ts | 34 +-- src/tools/common.ts | 10 +- src/tools/dialogs.ts | 10 +- src/tools/files.ts | 12 +- src/tools/install.ts | 2 +- src/tools/keyboard.ts | 63 +++++- src/tools/{vision.ts => mouse.ts} | 107 ++------- src/tools/navigate.ts | 26 +-- src/tools/screenshot.ts | 2 +- src/tools/snapshot.ts | 51 +---- src/tools/tabs.ts | 30 +-- src/tools/tool.ts | 2 - src/tools/wait.ts | 14 +- tests/capabilities.spec.ts | 63 +++--- tests/evaluate.spec.ts | 51 +++++ tests/fixtures.ts | 6 - tests/pdf.spec.ts | 6 +- utils/update-readme.js | 70 ++---- 23 files changed, 366 insertions(+), 575 deletions(-) rename src/tools/{vision.ts => mouse.ts} (59%) create mode 100644 tests/evaluate.spec.ts diff --git a/README.md b/README.md index 6bcc17e..f5e153b 100644 --- a/README.md +++ b/README.md @@ -193,9 +193,8 @@ Playwright MCP server supports following arguments. They can be provided in the --browser browser or chrome channel to use, possible values: chrome, firefox, webkit, msedge. --browser-agent Use browser agent (experimental). - --caps comma-separated list of capabilities to enable, - possible values: tabs, pdf, history, wait, files, - install. Default is all. + --caps comma-separated list of additional capabilities + to enable, possible values: vision, pdf. --cdp-endpoint CDP endpoint to connect to. --config path to the configuration file. --device device to emulate, for example: "iPhone 15" @@ -227,8 +226,6 @@ Playwright MCP server supports following arguments. They can be provided in the specified, a temporary directory will be created. --viewport-size specify browser viewport size in pixels, for example "1280, 720" - --vision Run server that uses screenshots (Aria snapshots - are used by default) ``` @@ -329,21 +326,14 @@ npx @playwright/mcp@latest --config path/to/config.json host?: string; // Host to bind to (default: localhost) }, - // List of enabled capabilities + // List of additional capabilities capabilities?: Array< - 'core' | // Core browser automation 'tabs' | // Tab management - 'pdf' | // PDF generation - 'history' | // Browser history - 'wait' | // Wait utilities - 'files' | // File handling 'install' | // Browser installation - 'testing' // Testing + 'pdf' | // PDF generation + 'vision' | // Coordinate-based interactions >; - // Enable vision mode (screenshots instead of accessibility snapshots) - vision?: boolean; - // Directory for output files outputDir?: string; @@ -433,42 +423,10 @@ http.createServer(async (req, res) => { ### Tools -The tools are available in two modes: - -1. **Snapshot Mode** (default): Uses accessibility snapshots for better performance and reliability -2. **Vision Mode**: Uses screenshots for visual-based interactions - -To use Vision Mode, add the `--vision` flag when starting the server: - -```js -{ - "mcpServers": { - "playwright": { - "command": "npx", - "args": [ - "@playwright/mcp@latest", - "--vision" - ] - } - } -} -``` - -Vision Mode works best with the computer use models that are able to interact with elements using -X Y coordinate space, based on the provided screenshot. -
-Interactions - - - -- **browser_snapshot** - - Title: Page snapshot - - Description: Capture accessibility snapshot of the current page, this is better than screenshot - - Parameters: None - - Read-only: **true** +Core automation @@ -483,6 +441,22 @@ X Y coordinate space, based on the provided screenshot. +- **browser_close** + - Title: Close browser + - Description: Close the page + - Parameters: None + - Read-only: **true** + + + +- **browser_console_messages** + - Title: Get console messages + - Description: Returns all console messages + - Parameters: None + - Read-only: **true** + + + - **browser_drag** - Title: Drag mouse - Description: Perform drag and drop between two elements @@ -495,60 +469,17 @@ X Y coordinate space, based on the provided screenshot. -- **browser_hover** - - Title: Hover mouse - - Description: Hover over element on page +- **browser_evaluate** + - Title: Evaluate JavaScript + - Description: Evaluate JavaScript expression on page or element - Parameters: - - `element` (string): Human-readable element description used to obtain permission to interact with the element - - `ref` (string): Exact target element reference from the page snapshot - - Read-only: **true** - - - -- **browser_type** - - Title: Type text - - Description: Type text into editable element - - Parameters: - - `element` (string): Human-readable element description used to obtain permission to interact with the element - - `ref` (string): Exact target element reference from the page snapshot - - `text` (string): Text to type into the element - - `submit` (boolean, optional): Whether to submit entered text (press Enter after) - - `slowly` (boolean, optional): Whether to type one character at a time. Useful for triggering key handlers in the page. By default entire text is filled in at once. + - `function` (string): () => { /* code */ } or (element) => { /* code */ } when element is provided + - `element` (string, optional): Human-readable element description used to obtain permission to interact with the element + - `ref` (string, optional): Exact target element reference from the page snapshot - Read-only: **false** -- **browser_select_option** - - Title: Select option - - Description: Select an option in a dropdown - - Parameters: - - `element` (string): Human-readable element description used to obtain permission to interact with the element - - `ref` (string): Exact target element reference from the page snapshot - - `values` (array): Array of values to select in the dropdown. This can be a single value or multiple values. - - Read-only: **false** - - - -- **browser_press_key** - - Title: Press a key - - Description: Press a key on the keyboard - - Parameters: - - `key` (string): Name of the key to press or a character to generate, such as `ArrowLeft` or `a` - - Read-only: **false** - - - -- **browser_wait_for** - - Title: Wait for - - Description: Wait for text to appear or disappear or a specified time to pass - - Parameters: - - `time` (number, optional): The time to wait in seconds - - `text` (string, optional): The text to wait for - - `textGone` (string, optional): The text to wait for to disappear - - Read-only: **true** - - - - **browser_file_upload** - Title: Upload files - Description: Upload one or multiple files @@ -566,10 +497,15 @@ X Y coordinate space, based on the provided screenshot. - `promptText` (string, optional): The text of the prompt in case of a prompt dialog. - Read-only: **false** -
+ -
-Navigation +- **browser_hover** + - Title: Hover mouse + - Description: Hover over element on page + - Parameters: + - `element` (string): Human-readable element description used to obtain permission to interact with the element + - `ref` (string): Exact target element reference from the page snapshot + - Read-only: **true** @@ -596,26 +532,51 @@ X Y coordinate space, based on the provided screenshot. - Parameters: None - Read-only: **true** -
+ -
-Evaluation +- **browser_network_requests** + - Title: List network requests + - Description: Returns all network requests since loading the page + - Parameters: None + - Read-only: **true** -- **browser_evaluate** - - Title: Evaluate JavaScript - - Description: Evaluate JavaScript expression on page or element +- **browser_press_key** + - Title: Press a key + - Description: Press a key on the keyboard - Parameters: - - `function` (string): () => { /* code */ } or (element) => { /* code */ } when element is provided - - `element` (string, optional): Human-readable element description used to obtain permission to interact with the element - - `ref` (string, optional): Exact target element reference from the page snapshot + - `key` (string): Name of the key to press or a character to generate, such as `ArrowLeft` or `a` - Read-only: **false** -
+ -
-Resources +- **browser_resize** + - Title: Resize browser window + - Description: Resize the browser window + - Parameters: + - `width` (number): Width of the browser window + - `height` (number): Height of the browser window + - Read-only: **true** + + + +- **browser_select_option** + - Title: Select option + - Description: Select an option in a dropdown + - Parameters: + - `element` (string): Human-readable element description used to obtain permission to interact with the element + - `ref` (string): Exact target element reference from the page snapshot + - `values` (array): Array of values to select in the dropdown. This can be a single value or multiple values. + - Read-only: **false** + + + +- **browser_snapshot** + - Title: Page snapshot + - Description: Capture accessibility snapshot of the current page, this is better than screenshot + - Parameters: None + - Read-only: **true** @@ -631,64 +592,41 @@ X Y coordinate space, based on the provided screenshot. -- **browser_pdf_save** - - Title: Save as PDF - - Description: Save page as PDF +- **browser_type** + - Title: Type text + - Description: Type text into editable element - Parameters: - - `filename` (string, optional): File name to save the pdf to. Defaults to `page-{timestamp}.pdf` if not specified. - - Read-only: **true** - - - -- **browser_network_requests** - - Title: List network requests - - Description: Returns all network requests since loading the page - - Parameters: None - - Read-only: **true** - - - -- **browser_console_messages** - - Title: Get console messages - - Description: Returns all console messages - - Parameters: None - - Read-only: **true** - -
- -
-Utilities - - - -- **browser_install** - - Title: Install the browser specified in the config - - Description: Install the browser specified in the config. Call this if you get an error about the browser not being installed. - - Parameters: None + - `element` (string): Human-readable element description used to obtain permission to interact with the element + - `ref` (string): Exact target element reference from the page snapshot + - `text` (string): Text to type into the element + - `submit` (boolean, optional): Whether to submit entered text (press Enter after) + - `slowly` (boolean, optional): Whether to type one character at a time. Useful for triggering key handlers in the page. By default entire text is filled in at once. - Read-only: **false** -- **browser_close** - - Title: Close browser - - Description: Close the page - - Parameters: None - - Read-only: **true** - - - -- **browser_resize** - - Title: Resize browser window - - Description: Resize the browser window +- **browser_wait_for** + - Title: Wait for + - Description: Wait for text to appear or disappear or a specified time to pass - Parameters: - - `width` (number): Width of the browser window - - `height` (number): Height of the browser window + - `time` (number, optional): The time to wait in seconds + - `text` (string, optional): The text to wait for + - `textGone` (string, optional): The text to wait for to disappear - Read-only: **true**
-Tabs +Tab management + + + +- **browser_tab_close** + - Title: Close a tab + - Description: Close a tab + - Parameters: + - `index` (number, optional): The index of the tab to close. Closes current tab if not provided. + - Read-only: **false** @@ -716,44 +654,29 @@ X Y coordinate space, based on the provided screenshot. - `index` (number): The index of the tab to select - Read-only: **true** +
+ +
+Browser installation + -- **browser_tab_close** - - Title: Close a tab - - Description: Close a tab - - Parameters: - - `index` (number, optional): The index of the tab to close. Closes current tab if not provided. +- **browser_install** + - Title: Install the browser specified in the config + - Description: Install the browser specified in the config. Call this if you get an error about the browser not being installed. + - Parameters: None - Read-only: **false**
-Vision mode +Coordinate-based (opt-in via --caps=vision) -- **browser_screen_capture** - - Title: Take a screenshot - - Description: Take a screenshot of the current page - - Parameters: None - - Read-only: **true** - - - -- **browser_screen_move_mouse** - - Title: Move mouse - - Description: Move mouse to a given position - - Parameters: - - `element` (string): Human-readable element description used to obtain permission to interact with the element - - `x` (number): X coordinate - - `y` (number): Y coordinate - - Read-only: **true** - - - -- **browser_screen_click** +- **browser_mouse_click_xy** - Title: Click - - Description: Click left mouse button + - Description: Click left mouse button at a given position - Parameters: - `element` (string): Human-readable element description used to obtain permission to interact with the element - `x` (number): X coordinate @@ -762,9 +685,9 @@ X Y coordinate space, based on the provided screenshot. -- **browser_screen_drag** +- **browser_mouse_drag_xy** - Title: Drag mouse - - Description: Drag left mouse button + - Description: Drag left mouse button to a given position - Parameters: - `element` (string): Human-readable element description used to obtain permission to interact with the element - `startX` (number): Start X coordinate @@ -775,52 +698,28 @@ X Y coordinate space, based on the provided screenshot. -- **browser_screen_type** - - Title: Type text - - Description: Type text +- **browser_mouse_move_xy** + - Title: Move mouse + - Description: Move mouse to a given position - Parameters: - - `text` (string): Text to type into the element - - `submit` (boolean, optional): Whether to submit entered text (press Enter after) - - Read-only: **false** - - - -- **browser_press_key** - - Title: Press a key - - Description: Press a key on the keyboard - - Parameters: - - `key` (string): Name of the key to press or a character to generate, such as `ArrowLeft` or `a` - - Read-only: **false** - - - -- **browser_wait_for** - - Title: Wait for - - Description: Wait for text to appear or disappear or a specified time to pass - - Parameters: - - `time` (number, optional): The time to wait in seconds - - `text` (string, optional): The text to wait for - - `textGone` (string, optional): The text to wait for to disappear + - `element` (string): Human-readable element description used to obtain permission to interact with the element + - `x` (number): X coordinate + - `y` (number): Y coordinate - Read-only: **true** - +
-- **browser_file_upload** - - Title: Upload files - - Description: Upload one or multiple files - - Parameters: - - `paths` (array): The absolute paths to the files to upload. Can be a single file or multiple files. - - Read-only: **false** +
+PDF generation (opt-in via --caps=pdf) -- **browser_handle_dialog** - - Title: Handle a dialog - - Description: Handle a dialog +- **browser_pdf_save** + - Title: Save as PDF + - Description: Save page as PDF - Parameters: - - `accept` (boolean): Whether to accept the dialog. - - `promptText` (string, optional): The text of the prompt in case of a prompt dialog. - - Read-only: **false** + - `filename` (string, optional): File name to save the pdf to. Defaults to `page-{timestamp}.pdf` if not specified. + - Read-only: **true**
diff --git a/config.d.ts b/config.d.ts index a935918..c36d5fe 100644 --- a/config.d.ts +++ b/config.d.ts @@ -16,7 +16,7 @@ import type * as playwright from 'playwright'; -export type ToolCapability = 'core' | 'tabs' | 'pdf' | 'history' | 'wait' | 'files' | 'install' | 'testing'; +export type ToolCapability = 'core' | 'core-tabs' | 'core-install' | 'vision' | 'pdf'; export type Config = { /** @@ -85,20 +85,11 @@ export type Config = { /** * List of enabled tool capabilities. Possible values: * - 'core': Core browser automation features. - * - 'tabs': Tab management features. * - 'pdf': PDF generation and manipulation. - * - 'history': Browser history access. - * - 'wait': Wait and timing utilities. - * - 'files': File upload/download support. - * - 'install': Browser installation utilities. + * - 'vision': Coordinate-based interactions. */ capabilities?: ToolCapability[]; - /** - * Run server that uses screenshots (Aria snapshots are used by default). - */ - vision?: boolean; - /** * Whether to save the Playwright trace of the session into the output directory. */ diff --git a/src/config.ts b/src/config.ts index d2cbd67..f9773da 100644 --- a/src/config.ts +++ b/src/config.ts @@ -49,7 +49,6 @@ export type CLIOptions = { userAgent?: string; userDataDir?: string; viewportSize?: string; - vision?: boolean; }; const defaultConfig: FullConfig = { @@ -185,7 +184,6 @@ export async function configFromCLIOptions(cliOptions: CLIOptions): Promise c.trim() as ToolCapability), - vision: !!cliOptions.vision, network: { allowedOrigins: cliOptions.allowedOrigins, blockedOrigins: cliOptions.blockedOrigins, diff --git a/src/connection.ts b/src/connection.ts index a9508bb..1ee4c75 100644 --- a/src/connection.ts +++ b/src/connection.ts @@ -19,7 +19,7 @@ import { CallToolRequestSchema, ListToolsRequestSchema, Tool as McpTool } from ' import { zodToJsonSchema } from 'zod-to-json-schema'; import { Context } from './context.js'; -import { snapshotTools, visionTools } from './tools.js'; +import { allTools } from './tools.js'; import { packageJSON } from './package.js'; import { FullConfig } from './config.js'; @@ -27,8 +27,7 @@ import { FullConfig } from './config.js'; import type { BrowserContextFactory } from './browserContextFactory.js'; export function createConnection(config: FullConfig, browserContextFactory: BrowserContextFactory): Connection { - const allTools = config.vision ? visionTools : snapshotTools; - const tools = allTools.filter(tool => !config.capabilities || tool.capability === 'core' || config.capabilities.includes(tool.capability)); + const tools = allTools.filter(tool => tool.capability.startsWith('core') || config.capabilities?.includes(tool.capability)); const context = new Context(tools, config, browserContextFactory); const server = new McpServer({ name: 'Playwright', version: packageJSON.version }, { capabilities: { diff --git a/src/program.ts b/src/program.ts index 62109fa..c5ea13b 100644 --- a/src/program.ts +++ b/src/program.ts @@ -14,7 +14,7 @@ * limitations under the License. */ -import { program } from 'commander'; +import { program, Option } from 'commander'; // @ts-ignore import { startTraceViewerServer } from 'playwright-core/lib/server'; @@ -31,7 +31,7 @@ program .option('--block-service-workers', 'block service workers') .option('--browser ', 'browser or chrome channel to use, possible values: chrome, firefox, webkit, msedge.') .option('--browser-agent ', 'Use browser agent (experimental).') - .option('--caps ', 'comma-separated list of capabilities to enable, possible values: tabs, pdf, history, wait, files, install. Default is all.') + .option('--caps ', 'comma-separated list of additional capabilities to enable, possible values: vision, pdf.') .option('--cdp-endpoint ', 'CDP endpoint to connect to.') .option('--config ', 'path to the configuration file.') .option('--device ', 'device to emulate, for example: "iPhone 15"') @@ -51,8 +51,13 @@ program .option('--user-agent ', 'specify user agent string') .option('--user-data-dir ', 'path to the user data directory. If not specified, a temporary directory will be created.') .option('--viewport-size ', 'specify browser viewport size in pixels, for example "1280, 720"') - .option('--vision', 'Run server that uses screenshots (Aria snapshots are used by default)') + .addOption(new Option('--vision', 'Legacy option, use --caps=vision instead').hideHelp()) .action(async options => { + if (options.vision) { + // eslint-disable-next-line no-console + console.error('The --vision option is deprecated, use --caps=vision instead'); + options.caps = 'vision'; + } const config = await resolveCLIConfig(options); const httpServer = config.server.port !== undefined ? await startHttpServer(config.server) : undefined; diff --git a/src/tools.ts b/src/tools.ts index 2f20713..9b7c2a3 100644 --- a/src/tools.ts +++ b/src/tools.ts @@ -27,39 +27,25 @@ import pdf from './tools/pdf.js'; import snapshot from './tools/snapshot.js'; import tabs from './tools/tabs.js'; import screenshot from './tools/screenshot.js'; -import vision from './tools/vision.js'; import wait from './tools/wait.js'; +import mouse from './tools/mouse.js'; import type { Tool } from './tools/tool.js'; -export const snapshotTools: Tool[] = [ - ...common(true), +export const allTools: Tool[] = [ + ...common, ...console, - ...dialogs(true), + ...dialogs, ...evaluate, - ...files(true), + ...files, ...install, - ...keyboard(true), - ...navigate(true), + ...keyboard, + ...navigate, ...network, + ...mouse, ...pdf, ...screenshot, ...snapshot, - ...tabs(true), - ...wait(true), -]; - -export const visionTools: Tool[] = [ - ...common(false), - ...console, - ...dialogs(false), - ...files(false), - ...install, - ...keyboard(false), - ...navigate(false), - ...network, - ...pdf, - ...tabs(false), - ...vision, - ...wait(false), + ...tabs, + ...wait, ]; diff --git a/src/tools/common.ts b/src/tools/common.ts index 8a16c35..5a8e064 100644 --- a/src/tools/common.ts +++ b/src/tools/common.ts @@ -15,7 +15,7 @@ */ import { z } from 'zod'; -import { defineTool, type ToolFactory } from './tool.js'; +import { defineTool } from './tool.js'; const close = defineTool({ capability: 'core', @@ -38,7 +38,7 @@ const close = defineTool({ }, }); -const resize: ToolFactory = captureSnapshot => defineTool({ +const resize = defineTool({ capability: 'core', schema: { name: 'browser_resize', @@ -66,13 +66,13 @@ const resize: ToolFactory = captureSnapshot => defineTool({ return { code, action, - captureSnapshot, + captureSnapshot: true, waitForNetwork: true }; }, }); -export default (captureSnapshot: boolean) => [ +export default [ close, - resize(captureSnapshot) + resize ]; diff --git a/src/tools/dialogs.ts b/src/tools/dialogs.ts index 348e461..5eaf905 100644 --- a/src/tools/dialogs.ts +++ b/src/tools/dialogs.ts @@ -15,9 +15,9 @@ */ import { z } from 'zod'; -import { defineTool, type ToolFactory } from './tool.js'; +import { defineTool } from './tool.js'; -const handleDialog: ToolFactory = captureSnapshot => defineTool({ +const handleDialog = defineTool({ capability: 'core', schema: { @@ -49,7 +49,7 @@ const handleDialog: ToolFactory = captureSnapshot => defineTool({ return { code, - captureSnapshot, + captureSnapshot: true, waitForNetwork: false, }; }, @@ -57,6 +57,6 @@ const handleDialog: ToolFactory = captureSnapshot => defineTool({ clearsModalState: 'dialog', }); -export default (captureSnapshot: boolean) => [ - handleDialog(captureSnapshot), +export default [ + handleDialog, ]; diff --git a/src/tools/files.ts b/src/tools/files.ts index 2dc7837..a396cf7 100644 --- a/src/tools/files.ts +++ b/src/tools/files.ts @@ -15,10 +15,10 @@ */ import { z } from 'zod'; -import { defineTool, type ToolFactory } from './tool.js'; +import { defineTool } from './tool.js'; -const uploadFile: ToolFactory = captureSnapshot => defineTool({ - capability: 'files', +const uploadFile = defineTool({ + capability: 'core', schema: { name: 'browser_file_upload', @@ -47,13 +47,13 @@ const uploadFile: ToolFactory = captureSnapshot => defineTool({ return { code, action, - captureSnapshot, + captureSnapshot: true, waitForNetwork: true, }; }, clearsModalState: 'fileChooser', }); -export default (captureSnapshot: boolean) => [ - uploadFile(captureSnapshot), +export default [ + uploadFile, ]; diff --git a/src/tools/install.ts b/src/tools/install.ts index d0d5145..3b45e37 100644 --- a/src/tools/install.ts +++ b/src/tools/install.ts @@ -23,7 +23,7 @@ import { defineTool } from './tool.js'; import { fileURLToPath } from 'node:url'; const install = defineTool({ - capability: 'install', + capability: 'core-install', schema: { name: 'browser_install', title: 'Install the browser specified in the config', diff --git a/src/tools/keyboard.ts b/src/tools/keyboard.ts index 521aab2..1687ddd 100644 --- a/src/tools/keyboard.ts +++ b/src/tools/keyboard.ts @@ -15,9 +15,13 @@ */ import { z } from 'zod'; -import { defineTool, type ToolFactory } from './tool.js'; -const pressKey: ToolFactory = captureSnapshot => defineTool({ +import { defineTool } from './tool.js'; +import { elementSchema } from './snapshot.js'; +import { generateLocator } from './utils.js'; +import * as javascript from '../javascript.js'; + +const pressKey = defineTool({ capability: 'core', schema: { @@ -43,12 +47,61 @@ const pressKey: ToolFactory = captureSnapshot => defineTool({ return { code, action, - captureSnapshot, + captureSnapshot: true, waitForNetwork: true }; }, }); -export default (captureSnapshot: boolean) => [ - pressKey(captureSnapshot), +const typeSchema = elementSchema.extend({ + text: z.string().describe('Text to type into the element'), + submit: z.boolean().optional().describe('Whether to submit entered text (press Enter after)'), + slowly: z.boolean().optional().describe('Whether to type one character at a time. Useful for triggering key handlers in the page. By default entire text is filled in at once.'), +}); + +const type = defineTool({ + capability: 'core', + schema: { + name: 'browser_type', + title: 'Type text', + description: 'Type text into editable element', + inputSchema: typeSchema, + type: 'destructive', + }, + + handle: async (context, params) => { + const snapshot = context.currentTabOrDie().snapshotOrDie(); + const locator = snapshot.refLocator(params); + + const code: string[] = []; + const steps: (() => Promise)[] = []; + + if (params.slowly) { + code.push(`// Press "${params.text}" sequentially into "${params.element}"`); + code.push(`await page.${await generateLocator(locator)}.pressSequentially(${javascript.quote(params.text)});`); + steps.push(() => locator.pressSequentially(params.text)); + } else { + code.push(`// Fill "${params.text}" into "${params.element}"`); + code.push(`await page.${await generateLocator(locator)}.fill(${javascript.quote(params.text)});`); + steps.push(() => locator.fill(params.text)); + } + + if (params.submit) { + code.push(`// Submit text`); + code.push(`await page.${await generateLocator(locator)}.press('Enter');`); + steps.push(() => locator.press('Enter')); + } + + return { + code, + action: () => steps.reduce((acc, step) => acc.then(step), Promise.resolve()), + captureSnapshot: true, + waitForNetwork: true, + }; + }, +}); + +export default [ + pressKey, + type, ]; diff --git a/src/tools/vision.ts b/src/tools/mouse.ts similarity index 59% rename from src/tools/vision.ts rename to src/tools/mouse.ts index a380311..9171eb7 100644 --- a/src/tools/vision.ts +++ b/src/tools/mouse.ts @@ -17,50 +17,14 @@ import { z } from 'zod'; import { defineTool } from './tool.js'; -import * as javascript from '../javascript.js'; - const elementSchema = z.object({ element: z.string().describe('Human-readable element description used to obtain permission to interact with the element'), }); -const screenshot = defineTool({ - capability: 'core', +const mouseMove = defineTool({ + capability: 'vision', schema: { - name: 'browser_screen_capture', - title: 'Take a screenshot', - description: 'Take a screenshot of the current page', - inputSchema: z.object({}), - type: 'readOnly', - }, - - handle: async context => { - const tab = await context.ensureTab(); - const options = { type: 'jpeg' as 'jpeg', quality: 50, scale: 'css' as 'css' }; - - const code = [ - `// Take a screenshot of the current page`, - `await page.screenshot(${javascript.formatObject(options)});`, - ]; - - const action = () => tab.page.screenshot(options).then(buffer => { - return { - content: [{ type: 'image' as 'image', data: buffer.toString('base64'), mimeType: 'image/jpeg' }], - }; - }); - - return { - code, - action, - captureSnapshot: false, - waitForNetwork: false - }; - }, -}); - -const moveMouse = defineTool({ - capability: 'core', - schema: { - name: 'browser_screen_move_mouse', + name: 'browser_mouse_move_xy', title: 'Move mouse', description: 'Move mouse to a given position', inputSchema: elementSchema.extend({ @@ -86,12 +50,12 @@ const moveMouse = defineTool({ }, }); -const click = defineTool({ - capability: 'core', +const mouseClick = defineTool({ + capability: 'vision', schema: { - name: 'browser_screen_click', + name: 'browser_mouse_click_xy', title: 'Click', - description: 'Click left mouse button', + description: 'Click left mouse button at a given position', inputSchema: elementSchema.extend({ x: z.number().describe('X coordinate'), y: z.number().describe('Y coordinate'), @@ -121,12 +85,12 @@ const click = defineTool({ }, }); -const drag = defineTool({ - capability: 'core', +const mouseDrag = defineTool({ + capability: 'vision', schema: { - name: 'browser_screen_drag', + name: 'browser_mouse_drag_xy', title: 'Drag mouse', - description: 'Drag left mouse button', + description: 'Drag left mouse button to a given position', inputSchema: elementSchema.extend({ startX: z.number().describe('Start X coordinate'), startY: z.number().describe('Start Y coordinate'), @@ -163,51 +127,8 @@ const drag = defineTool({ }, }); -const type = defineTool({ - capability: 'core', - schema: { - name: 'browser_screen_type', - title: 'Type text', - description: 'Type text', - inputSchema: z.object({ - text: z.string().describe('Text to type into the element'), - submit: z.boolean().optional().describe('Whether to submit entered text (press Enter after)'), - }), - type: 'destructive', - }, - - handle: async (context, params) => { - const tab = context.currentTabOrDie(); - - const code = [ - `// Type ${params.text}`, - `await page.keyboard.type('${params.text}');`, - ]; - - const action = async () => { - await tab.page.keyboard.type(params.text); - if (params.submit) - await tab.page.keyboard.press('Enter'); - }; - - if (params.submit) { - code.push(`// Submit text`); - code.push(`await page.keyboard.press('Enter');`); - } - - return { - code, - action, - captureSnapshot: false, - waitForNetwork: true, - }; - }, -}); - export default [ - screenshot, - moveMouse, - click, - drag, - type, + mouseMove, + mouseClick, + mouseDrag, ]; diff --git a/src/tools/navigate.ts b/src/tools/navigate.ts index 501576e..a210a13 100644 --- a/src/tools/navigate.ts +++ b/src/tools/navigate.ts @@ -15,9 +15,9 @@ */ import { z } from 'zod'; -import { defineTool, type ToolFactory } from './tool.js'; +import { defineTool } from './tool.js'; -const navigate: ToolFactory = captureSnapshot => defineTool({ +const navigate = defineTool({ capability: 'core', schema: { @@ -41,14 +41,14 @@ const navigate: ToolFactory = captureSnapshot => defineTool({ return { code, - captureSnapshot, + captureSnapshot: true, waitForNetwork: false, }; }, }); -const goBack: ToolFactory = captureSnapshot => defineTool({ - capability: 'history', +const goBack = defineTool({ + capability: 'core', schema: { name: 'browser_navigate_back', title: 'Go back', @@ -67,14 +67,14 @@ const goBack: ToolFactory = captureSnapshot => defineTool({ return { code, - captureSnapshot, + captureSnapshot: true, waitForNetwork: false, }; }, }); -const goForward: ToolFactory = captureSnapshot => defineTool({ - capability: 'history', +const goForward = defineTool({ + capability: 'core', schema: { name: 'browser_navigate_forward', title: 'Go forward', @@ -91,14 +91,14 @@ const goForward: ToolFactory = captureSnapshot => defineTool({ ]; return { code, - captureSnapshot, + captureSnapshot: true, waitForNetwork: false, }; }, }); -export default (captureSnapshot: boolean) => [ - navigate(captureSnapshot), - goBack(captureSnapshot), - goForward(captureSnapshot), +export default [ + navigate, + goBack, + goForward, ]; diff --git a/src/tools/screenshot.ts b/src/tools/screenshot.ts index 439d79a..5e41491 100644 --- a/src/tools/screenshot.ts +++ b/src/tools/screenshot.ts @@ -79,7 +79,7 @@ const screenshot = defineTool({ return { code, action, - captureSnapshot: true, + captureSnapshot: false, waitForNetwork: false, }; } diff --git a/src/tools/snapshot.ts b/src/tools/snapshot.ts index 7d1ef32..8e43c68 100644 --- a/src/tools/snapshot.ts +++ b/src/tools/snapshot.ts @@ -41,7 +41,7 @@ const snapshot = defineTool({ }, }); -const elementSchema = z.object({ +export const elementSchema = z.object({ element: z.string().describe('Human-readable element description used to obtain permission to interact with the element'), ref: z.string().describe('Exact target element reference from the page snapshot'), }); @@ -144,54 +144,6 @@ const hover = defineTool({ }, }); -const typeSchema = elementSchema.extend({ - text: z.string().describe('Text to type into the element'), - submit: z.boolean().optional().describe('Whether to submit entered text (press Enter after)'), - slowly: z.boolean().optional().describe('Whether to type one character at a time. Useful for triggering key handlers in the page. By default entire text is filled in at once.'), -}); - -const type = defineTool({ - capability: 'core', - schema: { - name: 'browser_type', - title: 'Type text', - description: 'Type text into editable element', - inputSchema: typeSchema, - type: 'destructive', - }, - - handle: async (context, params) => { - const snapshot = context.currentTabOrDie().snapshotOrDie(); - const locator = snapshot.refLocator(params); - - const code: string[] = []; - const steps: (() => Promise)[] = []; - - if (params.slowly) { - code.push(`// Press "${params.text}" sequentially into "${params.element}"`); - code.push(`await page.${await generateLocator(locator)}.pressSequentially(${javascript.quote(params.text)});`); - steps.push(() => locator.pressSequentially(params.text)); - } else { - code.push(`// Fill "${params.text}" into "${params.element}"`); - code.push(`await page.${await generateLocator(locator)}.fill(${javascript.quote(params.text)});`); - steps.push(() => locator.fill(params.text)); - } - - if (params.submit) { - code.push(`// Submit text`); - code.push(`await page.${await generateLocator(locator)}.press('Enter');`); - steps.push(() => locator.press('Enter')); - } - - return { - code, - action: () => steps.reduce((acc, step) => acc.then(step), Promise.resolve()), - captureSnapshot: true, - waitForNetwork: true, - }; - }, -}); - const selectOptionSchema = elementSchema.extend({ values: z.array(z.string()).describe('Array of values to select in the dropdown. This can be a single value or multiple values.'), }); @@ -229,6 +181,5 @@ export default [ click, drag, hover, - type, selectOption, ]; diff --git a/src/tools/tabs.ts b/src/tools/tabs.ts index 4133bf1..5256fee 100644 --- a/src/tools/tabs.ts +++ b/src/tools/tabs.ts @@ -15,10 +15,10 @@ */ import { z } from 'zod'; -import { defineTool, type ToolFactory } from './tool.js'; +import { defineTool } from './tool.js'; const listTabs = defineTool({ - capability: 'tabs', + capability: 'core-tabs', schema: { name: 'browser_tab_list', @@ -44,8 +44,8 @@ const listTabs = defineTool({ }, }); -const selectTab: ToolFactory = captureSnapshot => defineTool({ - capability: 'tabs', +const selectTab = defineTool({ + capability: 'core-tabs', schema: { name: 'browser_tab_select', @@ -65,14 +65,14 @@ const selectTab: ToolFactory = captureSnapshot => defineTool({ return { code, - captureSnapshot, + captureSnapshot: true, waitForNetwork: false }; }, }); -const newTab: ToolFactory = captureSnapshot => defineTool({ - capability: 'tabs', +const newTab = defineTool({ + capability: 'core-tabs', schema: { name: 'browser_tab_new', @@ -94,14 +94,14 @@ const newTab: ToolFactory = captureSnapshot => defineTool({ ]; return { code, - captureSnapshot, + captureSnapshot: true, waitForNetwork: false }; }, }); -const closeTab: ToolFactory = captureSnapshot => defineTool({ - capability: 'tabs', +const closeTab = defineTool({ + capability: 'core-tabs', schema: { name: 'browser_tab_close', @@ -120,15 +120,15 @@ const closeTab: ToolFactory = captureSnapshot => defineTool({ ]; return { code, - captureSnapshot, + captureSnapshot: true, waitForNetwork: false }; }, }); -export default (captureSnapshot: boolean) => [ +export default [ listTabs, - newTab(captureSnapshot), - selectTab(captureSnapshot), - closeTab(captureSnapshot), + newTab, + selectTab, + closeTab, ]; diff --git a/src/tools/tool.ts b/src/tools/tool.ts index 4b88c89..2f9f5d0 100644 --- a/src/tools/tool.ts +++ b/src/tools/tool.ts @@ -61,8 +61,6 @@ export type Tool = { handle: (context: Context, params: z.output) => Promise; }; -export type ToolFactory = (snapshot: boolean) => Tool; - export function defineTool(tool: Tool): Tool { return tool; } diff --git a/src/tools/wait.ts b/src/tools/wait.ts index fc8be82..519148d 100644 --- a/src/tools/wait.ts +++ b/src/tools/wait.ts @@ -15,10 +15,10 @@ */ import { z } from 'zod'; -import { defineTool, type ToolFactory } from './tool.js'; +import { defineTool } from './tool.js'; -const wait: ToolFactory = captureSnapshot => defineTool({ - capability: 'wait', +const wait = defineTool({ + capability: 'core', schema: { name: 'browser_wait_for', @@ -40,7 +40,7 @@ const wait: ToolFactory = captureSnapshot => defineTool({ if (params.time) { code.push(`await new Promise(f => setTimeout(f, ${params.time!} * 1000));`); - await new Promise(f => setTimeout(f, Math.min(10000, params.time! * 1000))); + await new Promise(f => setTimeout(f, Math.min(30000, params.time! * 1000))); } const tab = context.currentTabOrDie(); @@ -59,12 +59,12 @@ const wait: ToolFactory = captureSnapshot => defineTool({ return { code, - captureSnapshot, + captureSnapshot: true, waitForNetwork: false, }; }, }); -export default (captureSnapshot: boolean) => [ - wait(captureSnapshot), +export default [ + wait, ]; diff --git a/tests/capabilities.spec.ts b/tests/capabilities.spec.ts index cd7defd..5f33035 100644 --- a/tests/capabilities.spec.ts +++ b/tests/capabilities.spec.ts @@ -34,7 +34,6 @@ test('test snapshot tool list', async ({ client }) => { 'browser_navigate_forward', 'browser_navigate', 'browser_network_requests', - 'browser_pdf_save', 'browser_press_key', 'browser_resize', 'browser_snapshot', @@ -47,45 +46,33 @@ test('test snapshot tool list', async ({ client }) => { ])); }); -test('test vision tool list', async ({ visionClient }) => { - const { tools: visionTools } = await visionClient.listTools(); - expect(new Set(visionTools.map(t => t.name))).toEqual(new Set([ - 'browser_close', - 'browser_console_messages', - 'browser_file_upload', - 'browser_handle_dialog', - 'browser_install', - 'browser_navigate_back', - 'browser_navigate_forward', - 'browser_navigate', - 'browser_network_requests', - 'browser_pdf_save', - 'browser_press_key', - 'browser_resize', - 'browser_screen_capture', - 'browser_screen_click', - 'browser_screen_drag', - 'browser_screen_move_mouse', - 'browser_screen_type', - 'browser_tab_close', - 'browser_tab_list', - 'browser_tab_new', - 'browser_tab_select', - 'browser_wait_for', - ])); -}); - -test('test capabilities', async ({ startClient }) => { +test('test capabilities (pdf)', async ({ startClient }) => { const { client } = await startClient({ - args: ['--caps="core"'], + args: ['--caps=pdf'], }); const { tools } = await client.listTools(); const toolNames = tools.map(t => t.name); - expect(toolNames).not.toContain('browser_file_upload'); - expect(toolNames).not.toContain('browser_pdf_save'); - expect(toolNames).not.toContain('browser_screen_capture'); - expect(toolNames).not.toContain('browser_screen_click'); - expect(toolNames).not.toContain('browser_screen_drag'); - expect(toolNames).not.toContain('browser_screen_move_mouse'); - expect(toolNames).not.toContain('browser_screen_type'); + expect(toolNames).toContain('browser_pdf_save'); +}); + +test('test capabilities (vision)', async ({ startClient }) => { + const { client } = await startClient({ + args: ['--caps=vision'], + }); + const { tools } = await client.listTools(); + const toolNames = tools.map(t => t.name); + expect(toolNames).toContain('browser_mouse_move_xy'); + expect(toolNames).toContain('browser_mouse_click_xy'); + expect(toolNames).toContain('browser_mouse_drag_xy'); +}); + +test('support for legacy --vision option', async ({ startClient }) => { + const { client } = await startClient({ + args: ['--vision'], + }); + const { tools } = await client.listTools(); + const toolNames = tools.map(t => t.name); + expect(toolNames).toContain('browser_mouse_move_xy'); + expect(toolNames).toContain('browser_mouse_click_xy'); + expect(toolNames).toContain('browser_mouse_drag_xy'); }); diff --git a/tests/evaluate.spec.ts b/tests/evaluate.spec.ts new file mode 100644 index 0000000..ceb86d3 --- /dev/null +++ b/tests/evaluate.spec.ts @@ -0,0 +1,51 @@ +/** + * Copyright (c) Microsoft Corporation. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import { test, expect } from './fixtures.js'; + +test('browser_evaluate', async ({ client, server }) => { + expect(await client.callTool({ + name: 'browser_navigate', + arguments: { url: server.HELLO_WORLD }, + })).toContainTextContent(`- Page Title: Title`); + + const result = await client.callTool({ + name: 'browser_evaluate', + arguments: { + function: '() => document.title', + }, + }); + expect(result).toContainTextContent(`"Title"`); +}); + +test('browser_evaluate (element)', async ({ client, server }) => { + server.setContent('/', ` + Hello, world! + `, 'text/html'); + await client.callTool({ + name: 'browser_navigate', + arguments: { url: server.PREFIX }, + }); + + expect(await client.callTool({ + name: 'browser_evaluate', + arguments: { + function: 'element => element.style.backgroundColor', + element: 'body', + ref: 'e1', + }, + })).toContainTextContent(`- Result: "red"`); +}); diff --git a/tests/fixtures.ts b/tests/fixtures.ts index 3668a64..e4f23c4 100644 --- a/tests/fixtures.ts +++ b/tests/fixtures.ts @@ -41,7 +41,6 @@ type CDPServer = { type TestFixtures = { client: Client; - visionClient: Client; startClient: (options?: { clientName?: string, args?: string[], config?: Config }) => Promise<{ client: Client, stderr: () => string }>; wsEndpoint: string; cdpServer: CDPServer; @@ -61,11 +60,6 @@ export const test = baseTest.extend( await use(client); }, - visionClient: async ({ startClient }, use) => { - const { client } = await startClient({ args: ['--vision'] }); - await use(client); - }, - startClient: async ({ mcpHeadless, mcpBrowser, mcpMode }, use, testInfo) => { const userDataDir = mcpMode !== 'docker' ? testInfo.outputPath('user-data-dir') : undefined; const configDir = path.dirname(test.info().config.configFile!); diff --git a/tests/pdf.spec.ts b/tests/pdf.spec.ts index 6032b87..c3cc901 100644 --- a/tests/pdf.spec.ts +++ b/tests/pdf.spec.ts @@ -19,7 +19,7 @@ import fs from 'fs'; import { test, expect } from './fixtures.js'; test('save as pdf unavailable', async ({ startClient, server }) => { - const { client } = await startClient({ args: ['--caps="no-pdf"'] }); + const { client } = await startClient(); await client.callTool({ name: 'browser_navigate', arguments: { url: server.HELLO_WORLD }, @@ -32,7 +32,7 @@ test('save as pdf unavailable', async ({ startClient, server }) => { test('save as pdf', async ({ startClient, mcpBrowser, server }, testInfo) => { const { client } = await startClient({ - config: { outputDir: testInfo.outputPath('output') }, + config: { outputDir: testInfo.outputPath('output'), capabilities: ['pdf'] }, }); test.skip(!!mcpBrowser && !['chromium', 'chrome', 'msedge'].includes(mcpBrowser), 'Save as PDF is only supported in Chromium.'); @@ -52,7 +52,7 @@ test('save as pdf (filename: output.pdf)', async ({ startClient, mcpBrowser, ser const outputDir = testInfo.outputPath('output'); test.skip(!!mcpBrowser && !['chromium', 'chrome', 'msedge'].includes(mcpBrowser), 'Save as PDF is only supported in Chromium.'); const { client } = await startClient({ - config: { outputDir }, + config: { outputDir, capabilities: ['pdf'] }, }); expect(await client.callTool({ diff --git a/utils/update-readme.js b/utils/update-readme.js index 144838d..0bb1b01 100644 --- a/utils/update-readme.js +++ b/utils/update-readme.js @@ -20,60 +20,20 @@ import fs from 'node:fs' import path from 'node:path' import url from 'node:url' import zodToJsonSchema from 'zod-to-json-schema' - -import commonTools from '../lib/tools/common.js'; -import consoleTools from '../lib/tools/console.js'; -import dialogsTools from '../lib/tools/dialogs.js'; -import evaluateTools from '../lib/tools/evaluate.js'; -import filesTools from '../lib/tools/files.js'; -import installTools from '../lib/tools/install.js'; -import keyboardTools from '../lib/tools/keyboard.js'; -import navigateTools from '../lib/tools/navigate.js'; -import networkTools from '../lib/tools/network.js'; -import pdfTools from '../lib/tools/pdf.js'; -import snapshotTools from '../lib/tools/snapshot.js'; -import tabsTools from '../lib/tools/tabs.js'; -import screenshotTools from '../lib/tools/screenshot.js'; -import visionTools from '../lib/tools/vision.js'; -import waitTools from '../lib/tools/wait.js'; import { execSync } from 'node:child_process'; -const categories = { - 'Interactions': [ - ...snapshotTools, - ...keyboardTools(true), - ...waitTools(true), - ...filesTools(true), - ...dialogsTools(true), - ], - 'Navigation': [ - ...navigateTools(true), - ], - 'Evaluation': [ - ...evaluateTools, - ], - 'Resources': [ - ...screenshotTools, - ...pdfTools, - ...networkTools, - ...consoleTools, - ], - 'Utilities': [ - ...installTools, - ...commonTools(true), - ], - 'Tabs': [ - ...tabsTools(true), - ], - 'Vision mode': [ - ...visionTools, - ...keyboardTools(), - ...waitTools(false), - ...filesTools(false), - ...dialogsTools(false), - ], +import { allTools } from '../lib/tools.js'; + +const capabilities = { + 'core': 'Core automation', + 'core-tabs': 'Tab management', + 'core-install': 'Browser installation', + 'vision': 'Coordinate-based (opt-in via --caps=vision)', + 'pdf': 'PDF generation (opt-in via --caps=pdf)', }; +const toolsByCapability = Object.fromEntries(Object.entries(capabilities).map(([capability, title]) => [title, allTools.filter(tool => tool.capability === capability).sort((a, b) => a.schema.name.localeCompare(b.schema.name))])); + // NOTE: Can be removed when we drop Node.js 18 support and changed to import.meta.filename. const __filename = url.fileURLToPath(import.meta.url); @@ -139,14 +99,12 @@ async function updateSection(content, startMarker, endMarker, generatedLines) { async function updateTools(content) { console.log('Loading tool information from compiled modules...'); - const totalTools = Object.values(categories).flat().length; - console.log(`Found ${totalTools} tools`); - const generatedLines = /** @type {string[]} */ ([]); - for (const [category, categoryTools] of Object.entries(categories)) { - generatedLines.push(`
\n${category}`); + for (const [capability, tools] of Object.entries(toolsByCapability)) { + console.log('Updating tools for capability:', capability); + generatedLines.push(`
\n${capability}`); generatedLines.push(''); - for (const tool of categoryTools) + for (const tool of tools) generatedLines.push(...formatToolForReadme(tool.schema)); generatedLines.push(`
`); generatedLines.push('');