From d61aa16fee67ed78d18120bbd9d93e55d1d48f32 Mon Sep 17 00:00:00 2001
From: Pavel Feldman <pavel.feldman@gmail.com>
Date: Wed, 16 Jul 2025 16:40:00 -0700
Subject: [PATCH] chore: turn vision into capability (#679)

Fixes https://github.com/microsoft/playwright-mcp/issues/420
---
 README.md                         | 351 +++++++++++-------------------
 config.d.ts                       |  13 +-
 src/config.ts                     |   2 -
 src/connection.ts                 |   5 +-
 src/program.ts                    |  11 +-
 src/tools.ts                      |  34 +--
 src/tools/common.ts               |  10 +-
 src/tools/dialogs.ts              |  10 +-
 src/tools/files.ts                |  12 +-
 src/tools/install.ts              |   2 +-
 src/tools/keyboard.ts             |  63 +++++-
 src/tools/{vision.ts => mouse.ts} | 107 ++-------
 src/tools/navigate.ts             |  26 +--
 src/tools/screenshot.ts           |   2 +-
 src/tools/snapshot.ts             |  51 +----
 src/tools/tabs.ts                 |  30 +--
 src/tools/tool.ts                 |   2 -
 src/tools/wait.ts                 |  14 +-
 tests/capabilities.spec.ts        |  63 +++---
 tests/evaluate.spec.ts            |  51 +++++
 tests/fixtures.ts                 |   6 -
 tests/pdf.spec.ts                 |   6 +-
 utils/update-readme.js            |  70 ++----
 23 files changed, 366 insertions(+), 575 deletions(-)
 rename src/tools/{vision.ts => mouse.ts} (59%)
 create mode 100644 tests/evaluate.spec.ts
diff --git a/README.md b/README.md
index 6bcc17e..f5e153b 100644
--- a/README.md
+++ b/README.md
@@ -193,9 +193,8 @@ Playwright MCP server supports following arguments. They can be provided in the
   --browser <browser>          browser or chrome channel to use, possible
                                values: chrome, firefox, webkit, msedge.
   --browser-agent <endpoint>   Use browser agent (experimental).
-  --caps <caps>                comma-separated list of capabilities to enable,
-                               possible values: tabs, pdf, history, wait, files,
-                               install. Default is all.
+  --caps <caps>                comma-separated list of additional capabilities
+                               to enable, possible values: vision, pdf.
   --cdp-endpoint <endpoint>    CDP endpoint to connect to.
   --config <path>              path to the configuration file.
   --device <device>            device to emulate, for example: "iPhone 15"
@@ -227,8 +226,6 @@ Playwright MCP server supports following arguments. They can be provided in the
                                specified, a temporary directory will be created.
   --viewport-size <size>       specify browser viewport size in pixels, for
                                example "1280, 720"
-  --vision                     Run server that uses screenshots (Aria snapshots
-                               are used by default)
 ```
 
 <!--- End of options generated section -->
@@ -329,21 +326,14 @@ npx @playwright/mcp@latest --config path/to/config.json
     host?: string;  // Host to bind to (default: localhost)
   },
 
-  // List of enabled capabilities
+  // List of additional capabilities
   capabilities?: Array<
-    'core' |    // Core browser automation
     'tabs' |    // Tab management
-    'pdf' |     // PDF generation
-    'history' | // Browser history
-    'wait' |    // Wait utilities
-    'files' |   // File handling
     'install' | // Browser installation
-    'testing'   // Testing
+    'pdf' |     // PDF generation
+    'vision'    // Coordinate-based interactions
   >;
 
-  // Enable vision mode (screenshots instead of accessibility snapshots)
-  vision?: boolean;
-
   // Directory for output files
   outputDir?: string;
 
@@ -433,42 +423,10 @@ http.createServer(async (req, res) => {
 
 ### Tools
 
-The tools are available in two modes:
-
-1. **Snapshot Mode** (default): Uses accessibility snapshots for better performance and reliability
-2. **Vision Mode**: Uses screenshots for visual-based interactions
-
-To use Vision Mode, add the `--vision` flag when starting the server:
-
-```js
-{
-  "mcpServers": {
-    "playwright": {
-      "command": "npx",
-      "args": [
-        "@playwright/mcp@latest",
-        "--vision"
-      ]
-    }
-  }
-}
-```
-
-Vision Mode works best with the computer use models that are able to interact with elements using
-X Y coordinate space, based on the provided screenshot.
-
 <!--- Tools generated by update-readme.js -->
 
 <details>
-<summary><b>Interactions</b></summary>
-
-<!-- NOTE: This has been generated via update-readme.js -->
-
-- **browser_snapshot**
-  - Title: Page snapshot
-  - Description: Capture accessibility snapshot of the current page, this is better than screenshot
-  - Parameters: None
-  - Read-only: **true**
+<summary><b>Core automation</b></summary>
 
 <!-- NOTE: This has been generated via update-readme.js -->
 
@@ -483,6 +441,22 @@ X Y coordinate space, based on the provided screenshot.
 
 <!-- NOTE: This has been generated via update-readme.js -->
 
+- **browser_close**
+  - Title: Close browser
+  - Description: Close the page
+  - Parameters: None
+  - Read-only: **true**
+
+<!-- NOTE: This has been generated via update-readme.js -->
+
+- **browser_console_messages**
+  - Title: Get console messages
+  - Description: Returns all console messages
+  - Parameters: None
+  - Read-only: **true**
+
+<!-- NOTE: This has been generated via update-readme.js -->
+
 - **browser_drag**
   - Title: Drag mouse
   - Description: Perform drag and drop between two elements
@@ -495,60 +469,17 @@ X Y coordinate space, based on the provided screenshot.
 
 <!-- NOTE: This has been generated via update-readme.js -->
 
-- **browser_hover**
-  - Title: Hover mouse
-  - Description: Hover over element on page
+- **browser_evaluate**
+  - Title: Evaluate JavaScript
+  - Description: Evaluate JavaScript expression on page or element
   - Parameters:
-    - `element` (string): Human-readable element description used to obtain permission to interact with the element
-    - `ref` (string): Exact target element reference from the page snapshot
-  - Read-only: **true**
-
-<!-- NOTE: This has been generated via update-readme.js -->
-
-- **browser_type**
-  - Title: Type text
-  - Description: Type text into editable element
-  - Parameters:
-    - `element` (string): Human-readable element description used to obtain permission to interact with the element
-    - `ref` (string): Exact target element reference from the page snapshot
-    - `text` (string): Text to type into the element
-    - `submit` (boolean, optional): Whether to submit entered text (press Enter after)
-    - `slowly` (boolean, optional): Whether to type one character at a time. Useful for triggering key handlers in the page. By default entire text is filled in at once.
+    - `function` (string): () => { /* code */ } or (element) => { /* code */ } when element is provided
+    - `element` (string, optional): Human-readable element description used to obtain permission to interact with the element
+    - `ref` (string, optional): Exact target element reference from the page snapshot
   - Read-only: **false**
 
 <!-- NOTE: This has been generated via update-readme.js -->
 
-- **browser_select_option**
-  - Title: Select option
-  - Description: Select an option in a dropdown
-  - Parameters:
-    - `element` (string): Human-readable element description used to obtain permission to interact with the element
-    - `ref` (string): Exact target element reference from the page snapshot
-    - `values` (array): Array of values to select in the dropdown. This can be a single value or multiple values.
-  - Read-only: **false**
-
-<!-- NOTE: This has been generated via update-readme.js -->
-
-- **browser_press_key**
-  - Title: Press a key
-  - Description: Press a key on the keyboard
-  - Parameters:
-    - `key` (string): Name of the key to press or a character to generate, such as `ArrowLeft` or `a`
-  - Read-only: **false**
-
-<!-- NOTE: This has been generated via update-readme.js -->
-
-- **browser_wait_for**
-  - Title: Wait for
-  - Description: Wait for text to appear or disappear or a specified time to pass
-  - Parameters:
-    - `time` (number, optional): The time to wait in seconds
-    - `text` (string, optional): The text to wait for
-    - `textGone` (string, optional): The text to wait for to disappear
-  - Read-only: **true**
-
-<!-- NOTE: This has been generated via update-readme.js -->
-
 - **browser_file_upload**
   - Title: Upload files
   - Description: Upload one or multiple files
@@ -566,10 +497,15 @@ X Y coordinate space, based on the provided screenshot.
     - `promptText` (string, optional): The text of the prompt in case of a prompt dialog.
   - Read-only: **false**
 
-</details>
+<!-- NOTE: This has been generated via update-readme.js -->
 
-<details>
-<summary><b>Navigation</b></summary>
+- **browser_hover**
+  - Title: Hover mouse
+  - Description: Hover over element on page
+  - Parameters:
+    - `element` (string): Human-readable element description used to obtain permission to interact with the element
+    - `ref` (string): Exact target element reference from the page snapshot
+  - Read-only: **true**
 
 <!-- NOTE: This has been generated via update-readme.js -->
 
@@ -596,26 +532,51 @@ X Y coordinate space, based on the provided screenshot.
   - Parameters: None
   - Read-only: **true**
 
-</details>
+<!-- NOTE: This has been generated via update-readme.js -->
 
-<details>
-<summary><b>Evaluation</b></summary>
+- **browser_network_requests**
+  - Title: List network requests
+  - Description: Returns all network requests since loading the page
+  - Parameters: None
+  - Read-only: **true**
 
 <!-- NOTE: This has been generated via update-readme.js -->
 
-- **browser_evaluate**
-  - Title: Evaluate JavaScript
-  - Description: Evaluate JavaScript expression on page or element
+- **browser_press_key**
+  - Title: Press a key
+  - Description: Press a key on the keyboard
   - Parameters:
-    - `function` (string): () => { /* code */ } or (element) => { /* code */ } when element is provided
-    - `element` (string, optional): Human-readable element description used to obtain permission to interact with the element
-    - `ref` (string, optional): Exact target element reference from the page snapshot
+    - `key` (string): Name of the key to press or a character to generate, such as `ArrowLeft` or `a`
   - Read-only: **false**
 
-</details>
+<!-- NOTE: This has been generated via update-readme.js -->
 
-<details>
-<summary><b>Resources</b></summary>
+- **browser_resize**
+  - Title: Resize browser window
+  - Description: Resize the browser window
+  - Parameters:
+    - `width` (number): Width of the browser window
+    - `height` (number): Height of the browser window
+  - Read-only: **true**
+
+<!-- NOTE: This has been generated via update-readme.js -->
+
+- **browser_select_option**
+  - Title: Select option
+  - Description: Select an option in a dropdown
+  - Parameters:
+    - `element` (string): Human-readable element description used to obtain permission to interact with the element
+    - `ref` (string): Exact target element reference from the page snapshot
+    - `values` (array): Array of values to select in the dropdown. This can be a single value or multiple values.
+  - Read-only: **false**
+
+<!-- NOTE: This has been generated via update-readme.js -->
+
+- **browser_snapshot**
+  - Title: Page snapshot
+  - Description: Capture accessibility snapshot of the current page, this is better than screenshot
+  - Parameters: None
+  - Read-only: **true**
 
 <!-- NOTE: This has been generated via update-readme.js -->
 
@@ -631,64 +592,41 @@ X Y coordinate space, based on the provided screenshot.
 
 <!-- NOTE: This has been generated via update-readme.js -->
 
-- **browser_pdf_save**
-  - Title: Save as PDF
-  - Description: Save page as PDF
+- **browser_type**
+  - Title: Type text
+  - Description: Type text into editable element
   - Parameters:
-    - `filename` (string, optional): File name to save the pdf to. Defaults to `page-{timestamp}.pdf` if not specified.
-  - Read-only: **true**
-
-<!-- NOTE: This has been generated via update-readme.js -->
-
-- **browser_network_requests**
-  - Title: List network requests
-  - Description: Returns all network requests since loading the page
-  - Parameters: None
-  - Read-only: **true**
-
-<!-- NOTE: This has been generated via update-readme.js -->
-
-- **browser_console_messages**
-  - Title: Get console messages
-  - Description: Returns all console messages
-  - Parameters: None
-  - Read-only: **true**
-
-</details>
-
-<details>
-<summary><b>Utilities</b></summary>
-
-<!-- NOTE: This has been generated via update-readme.js -->
-
-- **browser_install**
-  - Title: Install the browser specified in the config
-  - Description: Install the browser specified in the config. Call this if you get an error about the browser not being installed.
-  - Parameters: None
+    - `element` (string): Human-readable element description used to obtain permission to interact with the element
+    - `ref` (string): Exact target element reference from the page snapshot
+    - `text` (string): Text to type into the element
+    - `submit` (boolean, optional): Whether to submit entered text (press Enter after)
+    - `slowly` (boolean, optional): Whether to type one character at a time. Useful for triggering key handlers in the page. By default entire text is filled in at once.
   - Read-only: **false**
 
 <!-- NOTE: This has been generated via update-readme.js -->
 
-- **browser_close**
-  - Title: Close browser
-  - Description: Close the page
-  - Parameters: None
-  - Read-only: **true**
-
-<!-- NOTE: This has been generated via update-readme.js -->
-
-- **browser_resize**
-  - Title: Resize browser window
-  - Description: Resize the browser window
+- **browser_wait_for**
+  - Title: Wait for
+  - Description: Wait for text to appear or disappear or a specified time to pass
   - Parameters:
-    - `width` (number): Width of the browser window
-    - `height` (number): Height of the browser window
+    - `time` (number, optional): The time to wait in seconds
+    - `text` (string, optional): The text to wait for
+    - `textGone` (string, optional): The text to wait for to disappear
   - Read-only: **true**
 
 </details>
 
 <details>
-<summary><b>Tabs</b></summary>
+<summary><b>Tab management</b></summary>
+
+<!-- NOTE: This has been generated via update-readme.js -->
+
+- **browser_tab_close**
+  - Title: Close a tab
+  - Description: Close a tab
+  - Parameters:
+    - `index` (number, optional): The index of the tab to close. Closes current tab if not provided.
+  - Read-only: **false**
 
 <!-- NOTE: This has been generated via update-readme.js -->
 
@@ -716,44 +654,29 @@ X Y coordinate space, based on the provided screenshot.
     - `index` (number): The index of the tab to select
   - Read-only: **true**
 
+</details>
+
+<details>
+<summary><b>Browser installation</b></summary>
+
 <!-- NOTE: This has been generated via update-readme.js -->
 
-- **browser_tab_close**
-  - Title: Close a tab
-  - Description: Close a tab
-  - Parameters:
-    - `index` (number, optional): The index of the tab to close. Closes current tab if not provided.
+- **browser_install**
+  - Title: Install the browser specified in the config
+  - Description: Install the browser specified in the config. Call this if you get an error about the browser not being installed.
+  - Parameters: None
   - Read-only: **false**
 
 </details>
 
 <details>
-<summary><b>Vision mode</b></summary>
+<summary><b>Coordinate-based (opt-in via --caps=vision)</b></summary>
 
 <!-- NOTE: This has been generated via update-readme.js -->
 
-- **browser_screen_capture**
-  - Title: Take a screenshot
-  - Description: Take a screenshot of the current page
-  - Parameters: None
-  - Read-only: **true**
-
-<!-- NOTE: This has been generated via update-readme.js -->
-
-- **browser_screen_move_mouse**
-  - Title: Move mouse
-  - Description: Move mouse to a given position
-  - Parameters:
-    - `element` (string): Human-readable element description used to obtain permission to interact with the element
-    - `x` (number): X coordinate
-    - `y` (number): Y coordinate
-  - Read-only: **true**
-
-<!-- NOTE: This has been generated via update-readme.js -->
-
-- **browser_screen_click**
+- **browser_mouse_click_xy**
   - Title: Click
-  - Description: Click left mouse button
+  - Description: Click left mouse button at a given position
   - Parameters:
     - `element` (string): Human-readable element description used to obtain permission to interact with the element
     - `x` (number): X coordinate
@@ -762,9 +685,9 @@ X Y coordinate space, based on the provided screenshot.
 
 <!-- NOTE: This has been generated via update-readme.js -->
 
-- **browser_screen_drag**
+- **browser_mouse_drag_xy**
   - Title: Drag mouse
-  - Description: Drag left mouse button
+  - Description: Drag left mouse button to a given position
   - Parameters:
     - `element` (string): Human-readable element description used to obtain permission to interact with the element
     - `startX` (number): Start X coordinate
@@ -775,52 +698,28 @@ X Y coordinate space, based on the provided screenshot.
 
 <!-- NOTE: This has been generated via update-readme.js -->
 
-- **browser_screen_type**
-  - Title: Type text
-  - Description: Type text
+- **browser_mouse_move_xy**
+  - Title: Move mouse
+  - Description: Move mouse to a given position
   - Parameters:
-    - `text` (string): Text to type into the element
-    - `submit` (boolean, optional): Whether to submit entered text (press Enter after)
-  - Read-only: **false**
-
-<!-- NOTE: This has been generated via update-readme.js -->
-
-- **browser_press_key**
-  - Title: Press a key
-  - Description: Press a key on the keyboard
-  - Parameters:
-    - `key` (string): Name of the key to press or a character to generate, such as `ArrowLeft` or `a`
-  - Read-only: **false**
-
-<!-- NOTE: This has been generated via update-readme.js -->
-
-- **browser_wait_for**
-  - Title: Wait for
-  - Description: Wait for text to appear or disappear or a specified time to pass
-  - Parameters:
-    - `time` (number, optional): The time to wait in seconds
-    - `text` (string, optional): The text to wait for
-    - `textGone` (string, optional): The text to wait for to disappear
+    - `element` (string): Human-readable element description used to obtain permission to interact with the element
+    - `x` (number): X coordinate
+    - `y` (number): Y coordinate
   - Read-only: **true**
 
-<!-- NOTE: This has been generated via update-readme.js -->
+</details>
 
-- **browser_file_upload**
-  - Title: Upload files
-  - Description: Upload one or multiple files
-  - Parameters:
-    - `paths` (array): The absolute paths to the files to upload. Can be a single file or multiple files.
-  - Read-only: **false**
+<details>
+<summary><b>PDF generation (opt-in via --caps=pdf)</b></summary>
 
 <!-- NOTE: This has been generated via update-readme.js -->
 
-- **browser_handle_dialog**
-  - Title: Handle a dialog
-  - Description: Handle a dialog
+- **browser_pdf_save**
+  - Title: Save as PDF
+  - Description: Save page as PDF
   - Parameters:
-    - `accept` (boolean): Whether to accept the dialog.
-    - `promptText` (string, optional): The text of the prompt in case of a prompt dialog.
-  - Read-only: **false**
+    - `filename` (string, optional): File name to save the pdf to. Defaults to `page-{timestamp}.pdf` if not specified.
+  - Read-only: **true**
 
 </details>
 
diff --git a/config.d.ts b/config.d.ts
index a935918..c36d5fe 100644
--- a/config.d.ts
+++ b/config.d.ts
@@ -16,7 +16,7 @@
 
 import type * as playwright from 'playwright';
 
-export type ToolCapability = 'core' | 'tabs' | 'pdf' | 'history' | 'wait' | 'files' | 'install' | 'testing';
+export type ToolCapability = 'core' | 'core-tabs' | 'core-install' | 'vision' | 'pdf';
 
 export type Config = {
   /**
@@ -85,20 +85,11 @@ export type Config = {
   /**
    * List of enabled tool capabilities. Possible values:
    *   - 'core': Core browser automation features.
-   *   - 'tabs': Tab management features.
    *   - 'pdf': PDF generation and manipulation.
-   *   - 'history': Browser history access.
-   *   - 'wait': Wait and timing utilities.
-   *   - 'files': File upload/download support.
-   *   - 'install': Browser installation utilities.
+   *   - 'vision': Coordinate-based interactions.
    */
   capabilities?: ToolCapability[];
 
-  /**
-   * Run server that uses screenshots (Aria snapshots are used by default).
-   */
-  vision?: boolean;
-
   /**
    * Whether to save the Playwright trace of the session into the output directory.
    */
diff --git a/src/config.ts b/src/config.ts
index d2cbd67..f9773da 100644
--- a/src/config.ts
+++ b/src/config.ts
@@ -49,7 +49,6 @@ export type CLIOptions = {
   userAgent?: string;
   userDataDir?: string;
   viewportSize?: string;
-  vision?: boolean;
 };
 
 const defaultConfig: FullConfig = {
@@ -185,7 +184,6 @@ export async function configFromCLIOptions(cliOptions: CLIOptions): Promise<Conf
       host: cliOptions.host,
     },
     capabilities: cliOptions.caps?.split(',').map((c: string) => c.trim() as ToolCapability),
-    vision: !!cliOptions.vision,
     network: {
       allowedOrigins: cliOptions.allowedOrigins,
       blockedOrigins: cliOptions.blockedOrigins,
diff --git a/src/connection.ts b/src/connection.ts
index a9508bb..1ee4c75 100644
--- a/src/connection.ts
+++ b/src/connection.ts
@@ -19,7 +19,7 @@ import { CallToolRequestSchema, ListToolsRequestSchema, Tool as McpTool } from '
 import { zodToJsonSchema } from 'zod-to-json-schema';
 
 import { Context } from './context.js';
-import { snapshotTools, visionTools } from './tools.js';
+import { allTools } from './tools.js';
 import { packageJSON } from './package.js';
 
 import { FullConfig } from './config.js';
@@ -27,8 +27,7 @@ import { FullConfig } from './config.js';
 import type { BrowserContextFactory } from './browserContextFactory.js';
 
 export function createConnection(config: FullConfig, browserContextFactory: BrowserContextFactory): Connection {
-  const allTools = config.vision ? visionTools : snapshotTools;
-  const tools = allTools.filter(tool => !config.capabilities || tool.capability === 'core' || config.capabilities.includes(tool.capability));
+  const tools = allTools.filter(tool => tool.capability.startsWith('core') || config.capabilities?.includes(tool.capability));
   const context = new Context(tools, config, browserContextFactory);
   const server = new McpServer({ name: 'Playwright', version: packageJSON.version }, {
     capabilities: {
diff --git a/src/program.ts b/src/program.ts
index 62109fa..c5ea13b 100644
--- a/src/program.ts
+++ b/src/program.ts
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-import { program } from 'commander';
+import { program, Option } from 'commander';
 // @ts-ignore
 import { startTraceViewerServer } from 'playwright-core/lib/server';
 
@@ -31,7 +31,7 @@ program
     .option('--block-service-workers', 'block service workers')
     .option('--browser <browser>', 'browser or chrome channel to use, possible values: chrome, firefox, webkit, msedge.')
     .option('--browser-agent <endpoint>', 'Use browser agent (experimental).')
-    .option('--caps <caps>', 'comma-separated list of capabilities to enable, possible values: tabs, pdf, history, wait, files, install. Default is all.')
+    .option('--caps <caps>', 'comma-separated list of additional capabilities to enable, possible values: vision, pdf.')
     .option('--cdp-endpoint <endpoint>', 'CDP endpoint to connect to.')
     .option('--config <path>', 'path to the configuration file.')
     .option('--device <device>', 'device to emulate, for example: "iPhone 15"')
@@ -51,8 +51,13 @@ program
     .option('--user-agent <ua string>', 'specify user agent string')
     .option('--user-data-dir <path>', 'path to the user data directory. If not specified, a temporary directory will be created.')
     .option('--viewport-size <size>', 'specify browser viewport size in pixels, for example "1280, 720"')
-    .option('--vision', 'Run server that uses screenshots (Aria snapshots are used by default)')
+    .addOption(new Option('--vision', 'Legacy option, use --caps=vision instead').hideHelp())
     .action(async options => {
+      if (options.vision) {
+        // eslint-disable-next-line no-console
+        console.error('The --vision option is deprecated, use --caps=vision instead');
+        options.caps = options.caps ? `${options.caps},vision` : 'vision';  // append: keep any capabilities already requested via --caps
+      }
       const config = await resolveCLIConfig(options);
       const httpServer = config.server.port !== undefined ? await startHttpServer(config.server) : undefined;
 
diff --git a/src/tools.ts b/src/tools.ts
index 2f20713..9b7c2a3 100644
--- a/src/tools.ts
+++ b/src/tools.ts
@@ -27,39 +27,25 @@ import pdf from './tools/pdf.js';
 import snapshot from './tools/snapshot.js';
 import tabs from './tools/tabs.js';
 import screenshot from './tools/screenshot.js';
-import vision from './tools/vision.js';
 import wait from './tools/wait.js';
+import mouse from './tools/mouse.js';
 
 import type { Tool } from './tools/tool.js';
 
-export const snapshotTools: Tool<any>[] = [
-  ...common(true),
+export const allTools: Tool<any>[] = [
+  ...common,
   ...console,
-  ...dialogs(true),
+  ...dialogs,
   ...evaluate,
-  ...files(true),
+  ...files,
   ...install,
-  ...keyboard(true),
-  ...navigate(true),
+  ...keyboard,
+  ...navigate,
   ...network,
+  ...mouse,
   ...pdf,
   ...screenshot,
   ...snapshot,
-  ...tabs(true),
-  ...wait(true),
-];
-
-export const visionTools: Tool<any>[] = [
-  ...common(false),
-  ...console,
-  ...dialogs(false),
-  ...files(false),
-  ...install,
-  ...keyboard(false),
-  ...navigate(false),
-  ...network,
-  ...pdf,
-  ...tabs(false),
-  ...vision,
-  ...wait(false),
+  ...tabs,
+  ...wait,
 ];
diff --git a/src/tools/common.ts b/src/tools/common.ts
index 8a16c35..5a8e064 100644
--- a/src/tools/common.ts
+++ b/src/tools/common.ts
@@ -15,7 +15,7 @@
  */
 
 import { z } from 'zod';
-import { defineTool, type ToolFactory } from './tool.js';
+import { defineTool } from './tool.js';
 
 const close = defineTool({
   capability: 'core',
@@ -38,7 +38,7 @@ const close = defineTool({
   },
 });
 
-const resize: ToolFactory = captureSnapshot => defineTool({
+const resize = defineTool({
   capability: 'core',
   schema: {
     name: 'browser_resize',
@@ -66,13 +66,13 @@ const resize: ToolFactory = captureSnapshot => defineTool({
     return {
       code,
       action,
-      captureSnapshot,
+      captureSnapshot: true,
       waitForNetwork: true
     };
   },
 });
 
-export default (captureSnapshot: boolean) => [
+export default [
   close,
-  resize(captureSnapshot)
+  resize
 ];
diff --git a/src/tools/dialogs.ts b/src/tools/dialogs.ts
index 348e461..5eaf905 100644
--- a/src/tools/dialogs.ts
+++ b/src/tools/dialogs.ts
@@ -15,9 +15,9 @@
  */
 
 import { z } from 'zod';
-import { defineTool, type ToolFactory } from './tool.js';
+import { defineTool } from './tool.js';
 
-const handleDialog: ToolFactory = captureSnapshot => defineTool({
+const handleDialog = defineTool({
   capability: 'core',
 
   schema: {
@@ -49,7 +49,7 @@ const handleDialog: ToolFactory = captureSnapshot => defineTool({
 
     return {
       code,
-      captureSnapshot,
+      captureSnapshot: true,
       waitForNetwork: false,
     };
   },
@@ -57,6 +57,6 @@ const handleDialog: ToolFactory = captureSnapshot => defineTool({
   clearsModalState: 'dialog',
 });
 
-export default (captureSnapshot: boolean) => [
-  handleDialog(captureSnapshot),
+export default [
+  handleDialog,
 ];
diff --git a/src/tools/files.ts b/src/tools/files.ts
index 2dc7837..a396cf7 100644
--- a/src/tools/files.ts
+++ b/src/tools/files.ts
@@ -15,10 +15,10 @@
  */
 
 import { z } from 'zod';
-import { defineTool, type ToolFactory } from './tool.js';
+import { defineTool } from './tool.js';
 
-const uploadFile: ToolFactory = captureSnapshot => defineTool({
-  capability: 'files',
+const uploadFile = defineTool({
+  capability: 'core',
 
   schema: {
     name: 'browser_file_upload',
@@ -47,13 +47,13 @@ const uploadFile: ToolFactory = captureSnapshot => defineTool({
     return {
       code,
       action,
-      captureSnapshot,
+      captureSnapshot: true,
       waitForNetwork: true,
     };
   },
   clearsModalState: 'fileChooser',
 });
 
-export default (captureSnapshot: boolean) => [
-  uploadFile(captureSnapshot),
+export default [
+  uploadFile,
 ];
diff --git a/src/tools/install.ts b/src/tools/install.ts
index d0d5145..3b45e37 100644
--- a/src/tools/install.ts
+++ b/src/tools/install.ts
@@ -23,7 +23,7 @@ import { defineTool } from './tool.js';
 import { fileURLToPath } from 'node:url';
 
 const install = defineTool({
-  capability: 'install',
+  capability: 'core-install',
   schema: {
     name: 'browser_install',
     title: 'Install the browser specified in the config',
diff --git a/src/tools/keyboard.ts b/src/tools/keyboard.ts
index 521aab2..1687ddd 100644
--- a/src/tools/keyboard.ts
+++ b/src/tools/keyboard.ts
@@ -15,9 +15,13 @@
  */
 
 import { z } from 'zod';
-import { defineTool, type ToolFactory } from './tool.js';
 
-const pressKey: ToolFactory = captureSnapshot => defineTool({
+import { defineTool } from './tool.js';
+import { elementSchema } from './snapshot.js';
+import { generateLocator } from './utils.js';
+import * as javascript from '../javascript.js';
+
+const pressKey = defineTool({
   capability: 'core',
 
   schema: {
@@ -43,12 +47,61 @@ const pressKey: ToolFactory = captureSnapshot => defineTool({
     return {
       code,
       action,
-      captureSnapshot,
+      captureSnapshot: true,
       waitForNetwork: true
     };
   },
 });
 
-export default (captureSnapshot: boolean) => [
-  pressKey(captureSnapshot),
+const typeSchema = elementSchema.extend({  // input schema for browser_type: elementSchema fields plus the typing options below
+  text: z.string().describe('Text to type into the element'),
+  submit: z.boolean().optional().describe('Whether to submit entered text (press Enter after)'),
+  slowly: z.boolean().optional().describe('Whether to type one character at a time. Useful for triggering key handlers in the page. By default entire text is filled in at once.'),
+});
+
+const type = defineTool({  // browser_type tool: enters text into the element resolved from the current tab's snapshot
+  capability: 'core',
+  schema: {
+    name: 'browser_type',
+    title: 'Type text',
+    description: 'Type text into editable element',
+    inputSchema: typeSchema,
+    type: 'destructive',
+  },
+
+  handle: async (context, params) => {
+    const snapshot = context.currentTabOrDie().snapshotOrDie();
+    const locator = snapshot.refLocator(params);  // locator for the element that params refers to
+
+    const code: string[] = [];  // generated code lines, returned as `code` in the result
+    const steps: (() => Promise<void>)[] = [];  // deferred operations; nothing runs until `action` is invoked
+
+    if (params.slowly) {  // slowly: use pressSequentially instead of fill
+      code.push(`// Press "${params.text}" sequentially into "${params.element}"`);
+      code.push(`await page.${await generateLocator(locator)}.pressSequentially(${javascript.quote(params.text)});`);
+      steps.push(() => locator.pressSequentially(params.text));
+    } else {  // default: fill the whole text in one call
+      code.push(`// Fill "${params.text}" into "${params.element}"`);
+      code.push(`await page.${await generateLocator(locator)}.fill(${javascript.quote(params.text)});`);
+      steps.push(() => locator.fill(params.text));
+    }
+
+    if (params.submit) {  // optional: press Enter on the same element after the text step
+      code.push(`// Submit text`);
+      code.push(`await page.${await generateLocator(locator)}.press('Enter');`);
+      steps.push(() => locator.press('Enter'));
+    }
+
+    return {
+      code,
+      action: () => steps.reduce((acc, step) => acc.then(step), Promise.resolve()),  // chain steps so each starts only after the previous one resolves
+      captureSnapshot: true,
+      waitForNetwork: true,
+    };
+  },
+});
+
+export default [
+  pressKey,
+  type,
 ];
diff --git a/src/tools/vision.ts b/src/tools/mouse.ts
similarity index 59%
rename from src/tools/vision.ts
rename to src/tools/mouse.ts
index a380311..9171eb7 100644
--- a/src/tools/vision.ts
+++ b/src/tools/mouse.ts
@@ -17,50 +17,14 @@
 import { z } from 'zod';
 import { defineTool } from './tool.js';
 
-import * as javascript from '../javascript.js';
-
 const elementSchema = z.object({
   element: z.string().describe('Human-readable element description used to obtain permission to interact with the element'),
 });
 
-const screenshot = defineTool({
-  capability: 'core',
+const mouseMove = defineTool({
+  capability: 'vision',
   schema: {
-    name: 'browser_screen_capture',
-    title: 'Take a screenshot',
-    description: 'Take a screenshot of the current page',
-    inputSchema: z.object({}),
-    type: 'readOnly',
-  },
-
-  handle: async context => {
-    const tab = await context.ensureTab();
-    const options = { type: 'jpeg' as 'jpeg', quality: 50, scale: 'css' as 'css' };
-
-    const code = [
-      `// Take a screenshot of the current page`,
-      `await page.screenshot(${javascript.formatObject(options)});`,
-    ];
-
-    const action = () => tab.page.screenshot(options).then(buffer => {
-      return {
-        content: [{ type: 'image' as 'image', data: buffer.toString('base64'), mimeType: 'image/jpeg' }],
-      };
-    });
-
-    return {
-      code,
-      action,
-      captureSnapshot: false,
-      waitForNetwork: false
-    };
-  },
-});
-
-const moveMouse = defineTool({
-  capability: 'core',
-  schema: {
-    name: 'browser_screen_move_mouse',
+    name: 'browser_mouse_move_xy',
     title: 'Move mouse',
     description: 'Move mouse to a given position',
     inputSchema: elementSchema.extend({
@@ -86,12 +50,12 @@ const moveMouse = defineTool({
   },
 });
 
-const click = defineTool({
-  capability: 'core',
+const mouseClick = defineTool({
+  capability: 'vision',
   schema: {
-    name: 'browser_screen_click',
+    name: 'browser_mouse_click_xy',
     title: 'Click',
-    description: 'Click left mouse button',
+    description: 'Click left mouse button at a given position',
     inputSchema: elementSchema.extend({
       x: z.number().describe('X coordinate'),
       y: z.number().describe('Y coordinate'),
@@ -121,12 +85,12 @@ const click = defineTool({
   },
 });
 
-const drag = defineTool({
-  capability: 'core',
+const mouseDrag = defineTool({
+  capability: 'vision',
   schema: {
-    name: 'browser_screen_drag',
+    name: 'browser_mouse_drag_xy',
     title: 'Drag mouse',
-    description: 'Drag left mouse button',
+    description: 'Drag left mouse button to a given position',
     inputSchema: elementSchema.extend({
       startX: z.number().describe('Start X coordinate'),
       startY: z.number().describe('Start Y coordinate'),
@@ -163,51 +127,8 @@ const drag = defineTool({
   },
 });
 
-const type = defineTool({
-  capability: 'core',
-  schema: {
-    name: 'browser_screen_type',
-    title: 'Type text',
-    description: 'Type text',
-    inputSchema: z.object({
-      text: z.string().describe('Text to type into the element'),
-      submit: z.boolean().optional().describe('Whether to submit entered text (press Enter after)'),
-    }),
-    type: 'destructive',
-  },
-
-  handle: async (context, params) => {
-    const tab = context.currentTabOrDie();
-
-    const code = [
-      `// Type ${params.text}`,
-      `await page.keyboard.type('${params.text}');`,
-    ];
-
-    const action = async () => {
-      await tab.page.keyboard.type(params.text);
-      if (params.submit)
-        await tab.page.keyboard.press('Enter');
-    };
-
-    if (params.submit) {
-      code.push(`// Submit text`);
-      code.push(`await page.keyboard.press('Enter');`);
-    }
-
-    return {
-      code,
-      action,
-      captureSnapshot: false,
-      waitForNetwork: true,
-    };
-  },
-});
-
 export default [
-  screenshot,
-  moveMouse,
-  click,
-  drag,
-  type,
+  mouseMove,
+  mouseClick,
+  mouseDrag,
 ];
diff --git a/src/tools/navigate.ts b/src/tools/navigate.ts
index 501576e..a210a13 100644
--- a/src/tools/navigate.ts
+++ b/src/tools/navigate.ts
@@ -15,9 +15,9 @@
  */
 
 import { z } from 'zod';
-import { defineTool, type ToolFactory } from './tool.js';
+import { defineTool } from './tool.js';
 
-const navigate: ToolFactory = captureSnapshot => defineTool({
+const navigate = defineTool({
   capability: 'core',
 
   schema: {
@@ -41,14 +41,14 @@ const navigate: ToolFactory = captureSnapshot => defineTool({
 
     return {
       code,
-      captureSnapshot,
+      captureSnapshot: true,
       waitForNetwork: false,
     };
   },
 });
 
-const goBack: ToolFactory = captureSnapshot => defineTool({
-  capability: 'history',
+const goBack = defineTool({
+  capability: 'core',
   schema: {
     name: 'browser_navigate_back',
     title: 'Go back',
@@ -67,14 +67,14 @@ const goBack: ToolFactory = captureSnapshot => defineTool({
 
     return {
       code,
-      captureSnapshot,
+      captureSnapshot: true,
       waitForNetwork: false,
     };
   },
 });
 
-const goForward: ToolFactory = captureSnapshot => defineTool({
-  capability: 'history',
+const goForward = defineTool({
+  capability: 'core',
   schema: {
     name: 'browser_navigate_forward',
     title: 'Go forward',
@@ -91,14 +91,14 @@ const goForward: ToolFactory = captureSnapshot => defineTool({
     ];
     return {
       code,
-      captureSnapshot,
+      captureSnapshot: true,
       waitForNetwork: false,
     };
   },
 });
 
-export default (captureSnapshot: boolean) => [
-  navigate(captureSnapshot),
-  goBack(captureSnapshot),
-  goForward(captureSnapshot),
+export default [
+  navigate,
+  goBack,
+  goForward,
 ];
diff --git a/src/tools/screenshot.ts b/src/tools/screenshot.ts
index 439d79a..5e41491 100644
--- a/src/tools/screenshot.ts
+++ b/src/tools/screenshot.ts
@@ -79,7 +79,7 @@ const screenshot = defineTool({
     return {
       code,
       action,
-      captureSnapshot: true,
+      captureSnapshot: false,
       waitForNetwork: false,
     };
   }
diff --git a/src/tools/snapshot.ts b/src/tools/snapshot.ts
index 7d1ef32..8e43c68 100644
--- a/src/tools/snapshot.ts
+++ b/src/tools/snapshot.ts
@@ -41,7 +41,7 @@ const snapshot = defineTool({
   },
 });
 
-const elementSchema = z.object({
+export const elementSchema = z.object({
   element: z.string().describe('Human-readable element description used to obtain permission to interact with the element'),
   ref: z.string().describe('Exact target element reference from the page snapshot'),
 });
@@ -144,54 +144,6 @@ const hover = defineTool({
   },
 });
 
-const typeSchema = elementSchema.extend({
-  text: z.string().describe('Text to type into the element'),
-  submit: z.boolean().optional().describe('Whether to submit entered text (press Enter after)'),
-  slowly: z.boolean().optional().describe('Whether to type one character at a time. Useful for triggering key handlers in the page. By default entire text is filled in at once.'),
-});
-
-const type = defineTool({
-  capability: 'core',
-  schema: {
-    name: 'browser_type',
-    title: 'Type text',
-    description: 'Type text into editable element',
-    inputSchema: typeSchema,
-    type: 'destructive',
-  },
-
-  handle: async (context, params) => {
-    const snapshot = context.currentTabOrDie().snapshotOrDie();
-    const locator = snapshot.refLocator(params);
-
-    const code: string[] = [];
-    const steps: (() => Promise<void>)[] = [];
-
-    if (params.slowly) {
-      code.push(`// Press "${params.text}" sequentially into "${params.element}"`);
-      code.push(`await page.${await generateLocator(locator)}.pressSequentially(${javascript.quote(params.text)});`);
-      steps.push(() => locator.pressSequentially(params.text));
-    } else {
-      code.push(`// Fill "${params.text}" into "${params.element}"`);
-      code.push(`await page.${await generateLocator(locator)}.fill(${javascript.quote(params.text)});`);
-      steps.push(() => locator.fill(params.text));
-    }
-
-    if (params.submit) {
-      code.push(`// Submit text`);
-      code.push(`await page.${await generateLocator(locator)}.press('Enter');`);
-      steps.push(() => locator.press('Enter'));
-    }
-
-    return {
-      code,
-      action: () => steps.reduce((acc, step) => acc.then(step), Promise.resolve()),
-      captureSnapshot: true,
-      waitForNetwork: true,
-    };
-  },
-});
-
 const selectOptionSchema = elementSchema.extend({
   values: z.array(z.string()).describe('Array of values to select in the dropdown. This can be a single value or multiple values.'),
 });
@@ -229,6 +181,5 @@ export default [
   click,
   drag,
   hover,
-  type,
   selectOption,
 ];
diff --git a/src/tools/tabs.ts b/src/tools/tabs.ts
index 4133bf1..5256fee 100644
--- a/src/tools/tabs.ts
+++ b/src/tools/tabs.ts
@@ -15,10 +15,10 @@
  */
 
 import { z } from 'zod';
-import { defineTool, type ToolFactory } from './tool.js';
+import { defineTool } from './tool.js';
 
 const listTabs = defineTool({
-  capability: 'tabs',
+  capability: 'core-tabs',
 
   schema: {
     name: 'browser_tab_list',
@@ -44,8 +44,8 @@ const listTabs = defineTool({
   },
 });
 
-const selectTab: ToolFactory = captureSnapshot => defineTool({
-  capability: 'tabs',
+const selectTab = defineTool({
+  capability: 'core-tabs',
 
   schema: {
     name: 'browser_tab_select',
@@ -65,14 +65,14 @@ const selectTab: ToolFactory = captureSnapshot => defineTool({
 
     return {
       code,
-      captureSnapshot,
+      captureSnapshot: true,
       waitForNetwork: false
     };
   },
 });
 
-const newTab: ToolFactory = captureSnapshot => defineTool({
-  capability: 'tabs',
+const newTab = defineTool({
+  capability: 'core-tabs',
 
   schema: {
     name: 'browser_tab_new',
@@ -94,14 +94,14 @@ const newTab: ToolFactory = captureSnapshot => defineTool({
     ];
     return {
       code,
-      captureSnapshot,
+      captureSnapshot: true,
       waitForNetwork: false
     };
   },
 });
 
-const closeTab: ToolFactory = captureSnapshot => defineTool({
-  capability: 'tabs',
+const closeTab = defineTool({
+  capability: 'core-tabs',
 
   schema: {
     name: 'browser_tab_close',
@@ -120,15 +120,15 @@ const closeTab: ToolFactory = captureSnapshot => defineTool({
     ];
     return {
       code,
-      captureSnapshot,
+      captureSnapshot: true,
       waitForNetwork: false
     };
   },
 });
 
-export default (captureSnapshot: boolean) => [
+export default [
   listTabs,
-  newTab(captureSnapshot),
-  selectTab(captureSnapshot),
-  closeTab(captureSnapshot),
+  newTab,
+  selectTab,
+  closeTab,
 ];
diff --git a/src/tools/tool.ts b/src/tools/tool.ts
index 4b88c89..2f9f5d0 100644
--- a/src/tools/tool.ts
+++ b/src/tools/tool.ts
@@ -61,8 +61,6 @@ export type Tool<Input extends InputType = InputType> = {
   handle: (context: Context, params: z.output<Input>) => Promise<ToolResult>;
 };
 
-export type ToolFactory = (snapshot: boolean) => Tool<any>;
-
 export function defineTool<Input extends InputType>(tool: Tool<Input>): Tool<Input> {
   return tool;
 }
diff --git a/src/tools/wait.ts b/src/tools/wait.ts
index fc8be82..519148d 100644
--- a/src/tools/wait.ts
+++ b/src/tools/wait.ts
@@ -15,10 +15,10 @@
  */
 
 import { z } from 'zod';
-import { defineTool, type ToolFactory } from './tool.js';
+import { defineTool } from './tool.js';
 
-const wait: ToolFactory = captureSnapshot => defineTool({
-  capability: 'wait',
+const wait = defineTool({
+  capability: 'core',
 
   schema: {
     name: 'browser_wait_for',
@@ -40,7 +40,7 @@ const wait: ToolFactory = captureSnapshot => defineTool({
 
     if (params.time) {
       code.push(`await new Promise(f => setTimeout(f, ${params.time!} * 1000));`);
-      await new Promise(f => setTimeout(f, Math.min(10000, params.time! * 1000)));
+      await new Promise(f => setTimeout(f, Math.min(30000, params.time! * 1000)));
     }
 
     const tab = context.currentTabOrDie();
@@ -59,12 +59,12 @@ const wait: ToolFactory = captureSnapshot => defineTool({
 
     return {
       code,
-      captureSnapshot,
+      captureSnapshot: true,
       waitForNetwork: false,
     };
   },
 });
 
-export default (captureSnapshot: boolean) => [
-  wait(captureSnapshot),
+export default [
+  wait,
 ];
diff --git a/tests/capabilities.spec.ts b/tests/capabilities.spec.ts
index cd7defd..5f33035 100644
--- a/tests/capabilities.spec.ts
+++ b/tests/capabilities.spec.ts
@@ -34,7 +34,6 @@ test('test snapshot tool list', async ({ client }) => {
     'browser_navigate_forward',
     'browser_navigate',
     'browser_network_requests',
-    'browser_pdf_save',
     'browser_press_key',
     'browser_resize',
     'browser_snapshot',
@@ -47,45 +46,33 @@ test('test snapshot tool list', async ({ client }) => {
   ]));
 });
 
-test('test vision tool list', async ({ visionClient }) => {
-  const { tools: visionTools } = await visionClient.listTools();
-  expect(new Set(visionTools.map(t => t.name))).toEqual(new Set([
-    'browser_close',
-    'browser_console_messages',
-    'browser_file_upload',
-    'browser_handle_dialog',
-    'browser_install',
-    'browser_navigate_back',
-    'browser_navigate_forward',
-    'browser_navigate',
-    'browser_network_requests',
-    'browser_pdf_save',
-    'browser_press_key',
-    'browser_resize',
-    'browser_screen_capture',
-    'browser_screen_click',
-    'browser_screen_drag',
-    'browser_screen_move_mouse',
-    'browser_screen_type',
-    'browser_tab_close',
-    'browser_tab_list',
-    'browser_tab_new',
-    'browser_tab_select',
-    'browser_wait_for',
-  ]));
-});
-
-test('test capabilities', async ({ startClient }) => {
+test('test capabilities (pdf)', async ({ startClient }) => {
   const { client } = await startClient({
-    args: ['--caps="core"'],
+    args: ['--caps=pdf'],
   });
   const { tools } = await client.listTools();
   const toolNames = tools.map(t => t.name);
-  expect(toolNames).not.toContain('browser_file_upload');
-  expect(toolNames).not.toContain('browser_pdf_save');
-  expect(toolNames).not.toContain('browser_screen_capture');
-  expect(toolNames).not.toContain('browser_screen_click');
-  expect(toolNames).not.toContain('browser_screen_drag');
-  expect(toolNames).not.toContain('browser_screen_move_mouse');
-  expect(toolNames).not.toContain('browser_screen_type');
+  expect(toolNames).toContain('browser_pdf_save');
+});
+
+test('test capabilities (vision)', async ({ startClient }) => {
+  const { client } = await startClient({
+    args: ['--caps=vision'],
+  });
+  const { tools } = await client.listTools();
+  const toolNames = tools.map(t => t.name);
+  expect(toolNames).toContain('browser_mouse_move_xy');
+  expect(toolNames).toContain('browser_mouse_click_xy');
+  expect(toolNames).toContain('browser_mouse_drag_xy');
+});
+
+test('support for legacy --vision option', async ({ startClient }) => {
+  const { client } = await startClient({
+    args: ['--vision'],
+  });
+  const { tools } = await client.listTools();
+  const toolNames = tools.map(t => t.name);
+  expect(toolNames).toContain('browser_mouse_move_xy');
+  expect(toolNames).toContain('browser_mouse_click_xy');
+  expect(toolNames).toContain('browser_mouse_drag_xy');
 });
diff --git a/tests/evaluate.spec.ts b/tests/evaluate.spec.ts
new file mode 100644
index 0000000..ceb86d3
--- /dev/null
+++ b/tests/evaluate.spec.ts
@@ -0,0 +1,51 @@
+/**
+ * Copyright (c) Microsoft Corporation.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import { test, expect } from './fixtures.js';
+
+test('browser_evaluate', async ({ client, server }) => {
+  expect(await client.callTool({
+    name: 'browser_navigate',
+    arguments: { url: server.HELLO_WORLD },
+  })).toContainTextContent(`- Page Title: Title`);
+
+  const result = await client.callTool({
+    name: 'browser_evaluate',
+    arguments: {
+      function: '() => document.title',
+    },
+  });
+  expect(result).toContainTextContent(`"Title"`);
+});
+
+test('browser_evaluate (element)', async ({ client, server }) => {
+  server.setContent('/', `
+    <body style="background-color: red">Hello, world!</body>
+  `, 'text/html');
+  await client.callTool({
+    name: 'browser_navigate',
+    arguments: { url: server.PREFIX },
+  });
+
+  expect(await client.callTool({
+    name: 'browser_evaluate',
+    arguments: {
+      function: 'element => element.style.backgroundColor',
+      element: 'body',
+      ref: 'e1',
+    },
+  })).toContainTextContent(`- Result: "red"`);
+});
diff --git a/tests/fixtures.ts b/tests/fixtures.ts
index 3668a64..e4f23c4 100644
--- a/tests/fixtures.ts
+++ b/tests/fixtures.ts
@@ -41,7 +41,6 @@ type CDPServer = {
 
 type TestFixtures = {
   client: Client;
-  visionClient: Client;
   startClient: (options?: { clientName?: string, args?: string[], config?: Config }) => Promise<{ client: Client, stderr: () => string }>;
   wsEndpoint: string;
   cdpServer: CDPServer;
@@ -61,11 +60,6 @@ export const test = baseTest.extend<TestFixtures & TestOptions, WorkerFixtures>(
     await use(client);
   },
 
-  visionClient: async ({ startClient }, use) => {
-    const { client } = await startClient({ args: ['--vision'] });
-    await use(client);
-  },
-
   startClient: async ({ mcpHeadless, mcpBrowser, mcpMode }, use, testInfo) => {
     const userDataDir = mcpMode !== 'docker' ? testInfo.outputPath('user-data-dir') : undefined;
     const configDir = path.dirname(test.info().config.configFile!);
diff --git a/tests/pdf.spec.ts b/tests/pdf.spec.ts
index 6032b87..c3cc901 100644
--- a/tests/pdf.spec.ts
+++ b/tests/pdf.spec.ts
@@ -19,7 +19,7 @@ import fs from 'fs';
 import { test, expect } from './fixtures.js';
 
 test('save as pdf unavailable', async ({ startClient, server }) => {
-  const { client } = await startClient({ args: ['--caps="no-pdf"'] });
+  const { client } = await startClient();
   await client.callTool({
     name: 'browser_navigate',
     arguments: { url: server.HELLO_WORLD },
@@ -32,7 +32,7 @@ test('save as pdf unavailable', async ({ startClient, server }) => {
 
 test('save as pdf', async ({ startClient, mcpBrowser, server }, testInfo) => {
   const { client } = await startClient({
-    config: { outputDir: testInfo.outputPath('output') },
+    config: { outputDir: testInfo.outputPath('output'), capabilities: ['pdf'] },
   });
 
   test.skip(!!mcpBrowser && !['chromium', 'chrome', 'msedge'].includes(mcpBrowser), 'Save as PDF is only supported in Chromium.');
@@ -52,7 +52,7 @@ test('save as pdf (filename: output.pdf)', async ({ startClient, mcpBrowser, ser
   const outputDir = testInfo.outputPath('output');
   test.skip(!!mcpBrowser && !['chromium', 'chrome', 'msedge'].includes(mcpBrowser), 'Save as PDF is only supported in Chromium.');
   const { client } = await startClient({
-    config: { outputDir },
+    config: { outputDir, capabilities: ['pdf'] },
   });
 
   expect(await client.callTool({
diff --git a/utils/update-readme.js b/utils/update-readme.js
index 144838d..0bb1b01 100644
--- a/utils/update-readme.js
+++ b/utils/update-readme.js
@@ -20,60 +20,20 @@ import fs from 'node:fs'
 import path from 'node:path'
 import url from 'node:url'
 import zodToJsonSchema from 'zod-to-json-schema'
-
-import commonTools from '../lib/tools/common.js';
-import consoleTools from '../lib/tools/console.js';
-import dialogsTools from '../lib/tools/dialogs.js';
-import evaluateTools from '../lib/tools/evaluate.js';
-import filesTools from '../lib/tools/files.js';
-import installTools from '../lib/tools/install.js';
-import keyboardTools from '../lib/tools/keyboard.js';
-import navigateTools from '../lib/tools/navigate.js';
-import networkTools from '../lib/tools/network.js';
-import pdfTools from '../lib/tools/pdf.js';
-import snapshotTools from '../lib/tools/snapshot.js';
-import tabsTools from '../lib/tools/tabs.js';
-import screenshotTools from '../lib/tools/screenshot.js';
-import visionTools from '../lib/tools/vision.js';
-import waitTools from '../lib/tools/wait.js';
 import { execSync } from 'node:child_process';
 
-const categories = {
-  'Interactions': [
-    ...snapshotTools,
-    ...keyboardTools(true),
-    ...waitTools(true),
-    ...filesTools(true),
-    ...dialogsTools(true),
-  ],
-  'Navigation': [
-    ...navigateTools(true),
-  ],
-  'Evaluation': [
-    ...evaluateTools,
-  ],
-  'Resources': [
-    ...screenshotTools,
-    ...pdfTools,
-    ...networkTools,
-    ...consoleTools,
-  ],
-  'Utilities': [
-    ...installTools,
-    ...commonTools(true),
-  ],
-  'Tabs': [
-    ...tabsTools(true),
-  ],
-  'Vision mode': [
-    ...visionTools,
-    ...keyboardTools(),
-    ...waitTools(false),
-    ...filesTools(false),
-    ...dialogsTools(false),
-  ],
+import { allTools } from '../lib/tools.js';
+
+const capabilities = {
+  'core': 'Core automation',
+  'core-tabs': 'Tab management',
+  'core-install': 'Browser installation',
+  'vision': 'Coordinate-based (opt-in via --caps=vision)',
+  'pdf': 'PDF generation (opt-in via --caps=pdf)',
 };
 
+const toolsByCapability = Object.fromEntries(Object.entries(capabilities).map(([capability, title]) => [title, allTools.filter(tool => tool.capability === capability).sort((a, b) => a.schema.name.localeCompare(b.schema.name))]));
+
 // NOTE: Can be removed when we drop Node.js 18 support and changed to import.meta.filename.
 const __filename = url.fileURLToPath(import.meta.url);
 
@@ -139,14 +99,12 @@ async function updateSection(content, startMarker, endMarker, generatedLines) {
 async function updateTools(content) {
   console.log('Loading tool information from compiled modules...');
 
-  const totalTools = Object.values(categories).flat().length;
-  console.log(`Found ${totalTools} tools`);
-
   const generatedLines = /** @type {string[]} */ ([]);
-  for (const [category, categoryTools] of Object.entries(categories)) {
-    generatedLines.push(`<details>\n<summary><b>${category}</b></summary>`);
+  for (const [capability, tools] of Object.entries(toolsByCapability)) {
+    console.log('Updating tools for capability:', capability);
+    generatedLines.push(`<details>\n<summary><b>${capability}</b></summary>`);
     generatedLines.push('');
-    for (const tool of categoryTools)
+    for (const tool of tools)
       generatedLines.push(...formatToolForReadme(tool.schema));
     generatedLines.push(`</details>`);
     generatedLines.push('');