From a1eee8351e8e39d44e846f0ed3ac6a747388d84e Mon Sep 17 00:00:00 2001 From: Pavel Feldman Date: Mon, 12 May 2025 16:42:47 -0700 Subject: [PATCH] chore: collapse readme (#404) --- README.md | 463 ++++++++++++++++++----------- src/connection.ts | 4 +- src/program.ts | 3 +- src/tools.ts | 11 +- src/tools/common.ts | 49 --- src/tools/screenshot.ts | 90 ++++++ src/tools/snapshot.ts | 71 +---- src/tools/utils.ts | 4 + src/tools/{screen.ts => vision.ts} | 0 src/tools/wait.ts | 70 +++++ utils/update-readme.js | 49 +-- 11 files changed, 491 insertions(+), 323 deletions(-) create mode 100644 src/tools/screenshot.ts rename src/tools/{screen.ts => vision.ts} (100%) create mode 100644 src/tools/wait.ts diff --git a/README.md b/README.md index 4b365ea..ee245ed 100644 --- a/README.md +++ b/README.md @@ -4,25 +4,22 @@ A Model Context Protocol (MCP) server that provides browser automation capabilit ### Key Features -- **Fast and lightweight**: Uses Playwright's accessibility tree, not pixel-based input. -- **LLM-friendly**: No vision models needed, operates purely on structured data. -- **Deterministic tool application**: Avoids ambiguity common with screenshot-based approaches. +- **Fast and lightweight**. Uses Playwright's accessibility tree, not pixel-based input. +- **LLM-friendly**. No vision models needed, operates purely on structured data. +- **Deterministic tool application**. Avoids ambiguity common with screenshot-based approaches. 
-### Use Cases - -- Web navigation and form-filling -- Data extraction from structured content -- Automated testing driven by LLMs -- General-purpose browser interaction for agents +### Requirements +- Node.js 18 or newer +- VS Code, Cursor, Windsurf, Claude Desktop or any other MCP client -[Install in VS Code](https://insiders.vscode.dev/redirect?url=vscode%3Amcp%2Finstall%3F%257B%2522name%2522%253A%2522playwright%2522%252C%2522command%2522%253A%2522npx%2522%252C%2522args%2522%253A%255B%2522%2540playwright%252Fmcp%2540latest%2522%255D%257D) [Install in VS Code Insiders](https://insiders.vscode.dev/redirect?url=vscode-insiders%3Amcp%2Finstall%3F%257B%2522name%2522%253A%2522playwright%2522%252C%2522command%2522%253A%2522npx%2522%252C%2522args%2522%253A%255B%2522%2540playwright%252Fmcp%2540latest%2522%255D%257D) +### Getting started -### Example config +First, install the Playwright MCP server with your client. A typical configuration looks like this: ```js { @@ -37,20 +34,12 @@ node utils/generate-links.js } ``` -### Table of Contents +[Install in VS Code](https://insiders.vscode.dev/redirect?url=vscode%3Amcp%2Finstall%3F%257B%2522name%2522%253A%2522playwright%2522%252C%2522command%2522%253A%2522npx%2522%252C%2522args%2522%253A%255B%2522%2540playwright%252Fmcp%2540latest%2522%255D%257D) [Install in VS Code Insiders](https://insiders.vscode.dev/redirect?url=vscode-insiders%3Amcp%2Finstall%3F%257B%2522name%2522%253A%2522playwright%2522%252C%2522command%2522%253A%2522npx%2522%252C%2522args%2522%253A%255B%2522%2540playwright%252Fmcp%2540latest%2522%255D%257D) -- [Installation in VS Code](#installation-in-vs-code) -- [Command line](#command-line) -- [User profile](#user-profile) -- [Configuration file](#configuration-file) -- [Running on Linux](#running-on-linux) -- [Docker](#docker) -- [Programmatic usage](#programmatic-usage) -- [Tool modes](#tool-modes) -### Installation in VS Code +
Install in VS Code -You can install the Playwright MCP server using the VS Code CLI: +You can also install the Playwright MCP server using the VS Code CLI: ```bash # For VS Code @@ -58,10 +47,68 @@ code --add-mcp '{"name":"playwright","command":"npx","args":["@playwright/mcp@la ``` After installation, the Playwright MCP server will be available for use with your GitHub Copilot agent in VS Code. +
-### Command line +
+Install in Cursor -The Playwright MCP server supports the following command-line options: +Go to `Cursor Settings` -> `MCP` -> `Add new MCP Server`. Name it to your liking, use `command` type with the command `npx @playwright/mcp`. You can also verify config or add command line arguments by clicking `Edit`. + +```js +{ + "mcpServers": { + "playwright": { + "command": "npx", + "args": [ + "@playwright/mcp@latest" + ] + } + } +} +``` +
+ +
+Install in Windsurf + +Follow the Windsurf MCP [documentation](https://docs.windsurf.com/windsurf/cascade/mcp). Use the following configuration: + +```js +{ + "mcpServers": { + "playwright": { + "command": "npx", + "args": [ + "@playwright/mcp@latest" + ] + } + } +} +``` +
+ +
+Install in Claude Desktop + +Follow the MCP install [guide](https://modelcontextprotocol.io/quickstart/user), use the following configuration: + +```js +{ + "mcpServers": { + "playwright": { + "command": "npx", + "args": [ + "@playwright/mcp@latest" + ] + } + } +} +``` +
+ +### Configuration + +The Playwright MCP server supports the following arguments. They can be provided in the JSON configuration above, as a part of the `"args"` list: - `--browser `: Browser or chrome channel to use. Possible values: - `chrome`, `firefox`, `webkit`, `msedge` @@ -96,7 +143,15 @@ All the logged in information will be stored in that profile, you can delete it ### Configuration file -The Playwright MCP server can be configured using a JSON configuration file. Here's the complete configuration format: +The Playwright MCP server can be configured using a JSON configuration file. You can specify the configuration file +using the `--config` command line option: + +```bash +npx @playwright/mcp@latest --config path/to/config.json +``` + +
+Configuration file schema ```typescript { @@ -170,14 +225,9 @@ The Playwright MCP server can be configured using a JSON configuration file. Her noImageResponses?: boolean; } ``` +
-You can specify the configuration file using the `--config` command line option: - -```bash -npx @playwright/mcp@latest --config path/to/config.json -``` - -### Running on Linux +### Standalone MCP server When running headed browser on system w/o display or from worker processes of the IDEs, run the MCP server from environment with the DISPLAY and pass the `--port` flag to enable SSE transport. @@ -198,7 +248,8 @@ And then in MCP client config, set the `url` to the SSE endpoint: } ``` -### Docker +
+Docker **NOTE:** The Docker implementation only supports headless chromium at the moment. @@ -218,8 +269,10 @@ You can build the Docker image yourself. ``` docker build -t mcr.microsoft.com/playwright/mcp . ``` +
-### Programmatic usage +
+Programmatic usage ```js import http from 'http'; @@ -238,8 +291,9 @@ http.createServer(async (req, res) => { // ... }); ``` +
-### Tool modes +### Tools The tools are available in two modes: @@ -265,10 +319,10 @@ To use Vision Mode, add the `--vision` flag when starting the server: Vision Mode works best with the computer use models that are able to interact with elements using X Y coordinate space, based on the provided screenshot. - -### Snapshot-based Interactions +
+Interactions @@ -336,6 +390,80 @@ X Y coordinate space, based on the provided screenshot. +- **browser_press_key** + - Title: Press a key + - Description: Press a key on the keyboard + - Parameters: + - `key` (string): Name of the key to press or a character to generate, such as `ArrowLeft` or `a` + - Read-only: **false** + + + +- **browser_wait_for** + - Title: Wait for + - Description: Wait for text to appear or disappear or a specified time to pass + - Parameters: + - `time` (number, optional): The time to wait in seconds + - `text` (string, optional): The text to wait for + - `textGone` (string, optional): The text to wait for to disappear + - Read-only: **true** + + + +- **browser_file_upload** + - Title: Upload files + - Description: Upload one or multiple files + - Parameters: + - `paths` (array): The absolute paths to the files to upload. Can be a single file or multiple files. + - Read-only: **false** + + + +- **browser_handle_dialog** + - Title: Handle a dialog + - Description: Handle a dialog + - Parameters: + - `accept` (boolean): Whether to accept the dialog. + - `promptText` (string, optional): The text of the prompt in case of a prompt dialog. + - Read-only: **false** + +
+ +
+Navigation + + + +- **browser_navigate** + - Title: Navigate to a URL + - Description: Navigate to a URL + - Parameters: + - `url` (string): The URL to navigate to + - Read-only: **false** + + + +- **browser_navigate_back** + - Title: Go back + - Description: Go back to the previous page + - Parameters: None + - Read-only: **true** + + + +- **browser_navigate_forward** + - Title: Go forward + - Description: Go forward to the next page + - Parameters: None + - Read-only: **true** + +
+ +
+Resources + + + - **browser_take_screenshot** - Title: Take a screenshot - Description: Take a screenshot of the current page. You can't perform actions based on the screenshot, use browser_snapshot for actions. @@ -346,7 +474,122 @@ X Y coordinate space, based on the provided screenshot. - `ref` (string, optional): Exact target element reference from the page snapshot. If not provided, the screenshot will be taken of viewport. If ref is provided, element must be provided too. - Read-only: **true** -### Vision-based Interactions + + +- **browser_pdf_save** + - Title: Save as PDF + - Description: Save page as PDF + - Parameters: + - `filename` (string, optional): File name to save the pdf to. Defaults to `page-{timestamp}.pdf` if not specified. + - Read-only: **true** + + + +- **browser_network_requests** + - Title: List network requests + - Description: Returns all network requests since loading the page + - Parameters: None + - Read-only: **true** + + + +- **browser_console_messages** + - Title: Get console messages + - Description: Returns all console messages + - Parameters: None + - Read-only: **true** + +
+ +
+Utilities + + + +- **browser_install** + - Title: Install the browser specified in the config + - Description: Install the browser specified in the config. Call this if you get an error about the browser not being installed. + - Parameters: None + - Read-only: **false** + + + +- **browser_close** + - Title: Close browser + - Description: Close the page + - Parameters: None + - Read-only: **true** + + + +- **browser_resize** + - Title: Resize browser window + - Description: Resize the browser window + - Parameters: + - `width` (number): Width of the browser window + - `height` (number): Height of the browser window + - Read-only: **true** + +
+ +
+Tabs + + + +- **browser_tab_list** + - Title: List tabs + - Description: List browser tabs + - Parameters: None + - Read-only: **true** + + + +- **browser_tab_new** + - Title: Open a new tab + - Description: Open a new tab + - Parameters: + - `url` (string, optional): The URL to navigate to in the new tab. If not provided, the new tab will be blank. + - Read-only: **true** + + + +- **browser_tab_select** + - Title: Select a tab + - Description: Select a tab by index + - Parameters: + - `index` (number): The index of the tab to select + - Read-only: **true** + + + +- **browser_tab_close** + - Title: Close a tab + - Description: Close a tab + - Parameters: + - `index` (number, optional): The index of the tab to close. Closes current tab if not provided. + - Read-only: **false** + +
+ +
+Testing + + + +- **browser_generate_playwright_test** + - Title: Generate a Playwright test + - Description: Generate a Playwright test for given scenario + - Parameters: + - `name` (string): The name of the test + - `description` (string): The description of the test + - `steps` (array): The steps of the test + - Read-only: **true** + +
+ +
+Vision mode @@ -401,72 +644,6 @@ X Y coordinate space, based on the provided screenshot. - `submit` (boolean, optional): Whether to submit entered text (press Enter after) - Read-only: **false** -### Tab Management - - - -- **browser_tab_list** - - Title: List tabs - - Description: List browser tabs - - Parameters: None - - Read-only: **true** - - - -- **browser_tab_new** - - Title: Open a new tab - - Description: Open a new tab - - Parameters: - - `url` (string, optional): The URL to navigate to in the new tab. If not provided, the new tab will be blank. - - Read-only: **true** - - - -- **browser_tab_select** - - Title: Select a tab - - Description: Select a tab by index - - Parameters: - - `index` (number): The index of the tab to select - - Read-only: **true** - - - -- **browser_tab_close** - - Title: Close a tab - - Description: Close a tab - - Parameters: - - `index` (number, optional): The index of the tab to close. Closes current tab if not provided. - - Read-only: **false** - -### Navigation - - - -- **browser_navigate** - - Title: Navigate to a URL - - Description: Navigate to a URL - - Parameters: - - `url` (string): The URL to navigate to - - Read-only: **false** - - - -- **browser_navigate_back** - - Title: Go back - - Description: Go back to the previous page - - Parameters: None - - Read-only: **true** - - - -- **browser_navigate_forward** - - Title: Go forward - - Description: Go forward to the next page - - Parameters: None - - Read-only: **true** - -### Keyboard - - **browser_press_key** @@ -476,46 +653,6 @@ X Y coordinate space, based on the provided screenshot. 
- `key` (string): Name of the key to press or a character to generate, such as `ArrowLeft` or `a` - Read-only: **false** -### Console - - - -- **browser_console_messages** - - Title: Get console messages - - Description: Returns all console messages - - Parameters: None - - Read-only: **true** - -### Files and Media - - - -- **browser_file_upload** - - Title: Upload files - - Description: Upload one or multiple files - - Parameters: - - `paths` (array): The absolute paths to the files to upload. Can be a single file or multiple files. - - Read-only: **false** - - - -- **browser_pdf_save** - - Title: Save as PDF - - Description: Save page as PDF - - Parameters: - - `filename` (string, optional): File name to save the pdf to. Defaults to `page-{timestamp}.pdf` if not specified. - - Read-only: **true** - -### Utilities - - - -- **browser_close** - - Title: Close browser - - Description: Close the page - - Parameters: None - - Read-only: **true** - - **browser_wait_for** @@ -529,20 +666,11 @@ X Y coordinate space, based on the provided screenshot. -- **browser_resize** - - Title: Resize browser window - - Description: Resize the browser window +- **browser_file_upload** + - Title: Upload files + - Description: Upload one or multiple files - Parameters: - - `width` (number): Width of the browser window - - `height` (number): Height of the browser window - - Read-only: **true** - - - -- **browser_install** - - Title: Install the browser specified in the config - - Description: Install the browser specified in the config. Call this if you get an error about the browser not being installed. - - Parameters: None + - `paths` (array): The absolute paths to the files to upload. Can be a single file or multiple files. - Read-only: **false** @@ -555,25 +683,6 @@ X Y coordinate space, based on the provided screenshot. - `promptText` (string, optional): The text of the prompt in case of a prompt dialog. 
- Read-only: **false** - - -- **browser_network_requests** - - Title: List network requests - - Description: Returns all network requests since loading the page - - Parameters: None - - Read-only: **true** - -### Testing - - - -- **browser_generate_playwright_test** - - Title: Generate a Playwright test - - Description: Generate a Playwright test for given scenario - - Parameters: - - `name` (string): The name of the test - - `description` (string): The description of the test - - `steps` (array): The steps of the test - - Read-only: **true** +
diff --git a/src/connection.ts b/src/connection.ts index 7065387..e29e4ac 100644 --- a/src/connection.ts +++ b/src/connection.ts @@ -19,13 +19,13 @@ import { CallToolRequestSchema, ListToolsRequestSchema, Tool as McpTool } from ' import { zodToJsonSchema } from 'zod-to-json-schema'; import { Context, packageJSON } from './context.js'; -import { snapshotTools, screenshotTools } from './tools.js'; +import { snapshotTools, visionTools } from './tools.js'; import type { Config } from '../config.js'; import type { Transport } from '@modelcontextprotocol/sdk/shared/transport.js'; export async function createConnection(config: Config): Promise { - const allTools = config.vision ? screenshotTools : snapshotTools; + const allTools = config.vision ? visionTools : snapshotTools; const tools = allTools.filter(tool => !config.capabilities || tool.capability === 'core' || config.capabilities.includes(tool.capability)); const context = new Context(tools, config); diff --git a/src/program.ts b/src/program.ts index ddf699c..77fb50a 100644 --- a/src/program.ts +++ b/src/program.ts @@ -31,7 +31,8 @@ program .option('--executable-path ', 'Path to the browser executable.') .option('--headless', 'Run browser in headless mode, headed by default') .option('--device ', 'Device to emulate, for example: "iPhone 15"') - .option('--user-data-dir ', 'Path to the user data directory') + .option('--user-data-dir ', 'Path to the user data directory. If not specified, a temporary directory will be created.') + .option('--in-memory', 'Use in-memory storage for user data directory.') .option('--port ', 'Port to listen on for SSE transport.') .option('--host ', 'Host to bind server to. Default is localhost. Use 0.0.0.0 to bind to all interfaces.') .option('--allowed-origins ', 'Semicolon-separated list of origins to allow the browser to request. 
Default is to allow all.', semicolonSeparatedList) diff --git a/src/tools.ts b/src/tools.ts index 8613d92..bd6db0f 100644 --- a/src/tools.ts +++ b/src/tools.ts @@ -25,8 +25,10 @@ import network from './tools/network.js'; import pdf from './tools/pdf.js'; import snapshot from './tools/snapshot.js'; import tabs from './tools/tabs.js'; -import screen from './tools/screen.js'; +import screenshot from './tools/screenshot.js'; import testing from './tools/testing.js'; +import vision from './tools/vision.js'; +import wait from './tools/wait.js'; import type { Tool } from './tools/tool.js'; @@ -40,12 +42,14 @@ export const snapshotTools: Tool[] = [ ...navigate(true), ...network, ...pdf, + ...screenshot, ...snapshot, ...tabs(true), ...testing, + ...wait(true), ]; -export const screenshotTools: Tool[] = [ +export const visionTools: Tool[] = [ ...common(false), ...console, ...dialogs(false), @@ -55,7 +59,8 @@ export const screenshotTools: Tool[] = [ ...navigate(false), ...network, ...pdf, - ...screen, ...tabs(false), ...testing, + ...vision, + ...wait(false), ]; diff --git a/src/tools/common.ts b/src/tools/common.ts index ca2dab0..d140380 100644 --- a/src/tools/common.ts +++ b/src/tools/common.ts @@ -17,54 +17,6 @@ import { z } from 'zod'; import { defineTool, type ToolFactory } from './tool.js'; -const wait: ToolFactory = captureSnapshot => defineTool({ - capability: 'wait', - - schema: { - name: 'browser_wait_for', - title: 'Wait for', - description: 'Wait for text to appear or disappear or a specified time to pass', - inputSchema: z.object({ - time: z.number().optional().describe('The time to wait in seconds'), - text: z.string().optional().describe('The text to wait for'), - textGone: z.string().optional().describe('The text to wait for to disappear'), - }), - type: 'readOnly', - }, - - handle: async (context, params) => { - if (!params.text && !params.textGone && !params.time) - throw new Error('Either time, text or textGone must be provided'); - - const code: string[] = 
[]; - - if (params.time) { - code.push(`await new Promise(f => setTimeout(f, ${params.time!} * 1000));`); - await new Promise(f => setTimeout(f, Math.min(10000, params.time! * 1000))); - } - - const tab = context.currentTabOrDie(); - const locator = params.text ? tab.page.getByText(params.text).first() : undefined; - const goneLocator = params.textGone ? tab.page.getByText(params.textGone).first() : undefined; - - if (goneLocator) { - code.push(`await page.getByText(${JSON.stringify(params.textGone)}).first().waitFor({ state: 'hidden' });`); - await goneLocator.waitFor({ state: 'hidden' }); - } - - if (locator) { - code.push(`await page.getByText(${JSON.stringify(params.text)}).first().waitFor({ state: 'visible' });`); - await locator.waitFor({ state: 'visible' }); - } - - return { - code, - captureSnapshot, - waitForNetwork: false, - }; - }, -}); - const close = defineTool({ capability: 'core', @@ -122,6 +74,5 @@ const resize: ToolFactory = captureSnapshot => defineTool({ export default (captureSnapshot: boolean) => [ close, - wait(captureSnapshot), resize(captureSnapshot) ]; diff --git a/src/tools/screenshot.ts b/src/tools/screenshot.ts new file mode 100644 index 0000000..db5b7f1 --- /dev/null +++ b/src/tools/screenshot.ts @@ -0,0 +1,90 @@ +/** + * Copyright (c) Microsoft Corporation. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +import { z } from 'zod'; + +import { defineTool } from './tool.js'; +import * as javascript from '../javascript.js'; +import { outputFile } from '../config.js'; +import { generateLocator } from './utils.js'; + +import type * as playwright from 'playwright'; + +const screenshotSchema = z.object({ + raw: z.boolean().optional().describe('Whether to return without compression (in PNG format). Default is false, which returns a JPEG image.'), + filename: z.string().optional().describe('File name to save the screenshot to. Defaults to `page-{timestamp}.{png|jpeg}` if not specified.'), + element: z.string().optional().describe('Human-readable element description used to obtain permission to screenshot the element. If not provided, the screenshot will be taken of viewport. If element is provided, ref must be provided too.'), + ref: z.string().optional().describe('Exact target element reference from the page snapshot. If not provided, the screenshot will be taken of viewport. If ref is provided, element must be provided too.'), +}).refine(data => { + return !!data.element === !!data.ref; +}, { + message: 'Both element and ref must be provided or neither.', + path: ['ref', 'element'] +}); + +const screenshot = defineTool({ + capability: 'core', + schema: { + name: 'browser_take_screenshot', + title: 'Take a screenshot', + description: `Take a screenshot of the current page. You can't perform actions based on the screenshot, use browser_snapshot for actions.`, + inputSchema: screenshotSchema, + type: 'readOnly', + }, + + handle: async (context, params) => { + const tab = context.currentTabOrDie(); + const snapshot = tab.snapshotOrDie(); + const fileType = params.raw ? 'png' : 'jpeg'; + const fileName = await outputFile(context.config, params.filename ?? `page-${new Date().toISOString()}.${fileType}`); + const options: playwright.PageScreenshotOptions = { type: fileType, quality: fileType === 'png' ? 
undefined : 50, scale: 'css', path: fileName }; + const isElementScreenshot = params.element && params.ref; + + const code = [ + `// Screenshot ${isElementScreenshot ? params.element : 'viewport'} and save it as ${fileName}`, + ]; + + const locator = params.ref ? snapshot.refLocator(params.ref) : null; + + if (locator) + code.push(`await page.${await generateLocator(locator)}.screenshot(${javascript.formatObject(options)});`); + else + code.push(`await page.screenshot(${javascript.formatObject(options)});`); + + const includeBase64 = !context.config.noImageResponses; + const action = async () => { + const screenshot = locator ? await locator.screenshot(options) : await tab.page.screenshot(options); + return { + content: includeBase64 ? [{ + type: 'image' as 'image', + data: screenshot.toString('base64'), + mimeType: fileType === 'png' ? 'image/png' : 'image/jpeg', + }] : [] + }; + }; + + return { + code, + action, + captureSnapshot: true, + waitForNetwork: false, + }; + } +}); + +export default [ + screenshot, +]; diff --git a/src/tools/snapshot.ts b/src/tools/snapshot.ts index e6a2d66..576d578 100644 --- a/src/tools/snapshot.ts +++ b/src/tools/snapshot.ts @@ -18,9 +18,7 @@ import { z } from 'zod'; import { defineTool } from './tool.js'; import * as javascript from '../javascript.js'; -import { outputFile } from '../config.js'; - -import type * as playwright from 'playwright'; +import { generateLocator } from './utils.js'; const snapshot = defineTool({ capability: 'core', @@ -218,72 +216,6 @@ const selectOption = defineTool({ }, }); -const screenshotSchema = z.object({ - raw: z.boolean().optional().describe('Whether to return without compression (in PNG format). Default is false, which returns a JPEG image.'), - filename: z.string().optional().describe('File name to save the screenshot to. 
Defaults to `page-{timestamp}.{png|jpeg}` if not specified.'), - element: z.string().optional().describe('Human-readable element description used to obtain permission to screenshot the element. If not provided, the screenshot will be taken of viewport. If element is provided, ref must be provided too.'), - ref: z.string().optional().describe('Exact target element reference from the page snapshot. If not provided, the screenshot will be taken of viewport. If ref is provided, element must be provided too.'), -}).refine(data => { - return !!data.element === !!data.ref; -}, { - message: 'Both element and ref must be provided or neither.', - path: ['ref', 'element'] -}); - -const screenshot = defineTool({ - capability: 'core', - schema: { - name: 'browser_take_screenshot', - title: 'Take a screenshot', - description: `Take a screenshot of the current page. You can't perform actions based on the screenshot, use browser_snapshot for actions.`, - inputSchema: screenshotSchema, - type: 'readOnly', - }, - - handle: async (context, params) => { - const tab = context.currentTabOrDie(); - const snapshot = tab.snapshotOrDie(); - const fileType = params.raw ? 'png' : 'jpeg'; - const fileName = await outputFile(context.config, params.filename ?? `page-${new Date().toISOString()}.${fileType}`); - const options: playwright.PageScreenshotOptions = { type: fileType, quality: fileType === 'png' ? undefined : 50, scale: 'css', path: fileName }; - const isElementScreenshot = params.element && params.ref; - - const code = [ - `// Screenshot ${isElementScreenshot ? params.element : 'viewport'} and save it as ${fileName}`, - ]; - - const locator = params.ref ? 
snapshot.refLocator(params.ref) : null; - - if (locator) - code.push(`await page.${await generateLocator(locator)}.screenshot(${javascript.formatObject(options)});`); - else - code.push(`await page.screenshot(${javascript.formatObject(options)});`); - - const includeBase64 = !context.config.noImageResponses; - const action = async () => { - const screenshot = locator ? await locator.screenshot(options) : await tab.page.screenshot(options); - return { - content: includeBase64 ? [{ - type: 'image' as 'image', - data: screenshot.toString('base64'), - mimeType: fileType === 'png' ? 'image/png' : 'image/jpeg', - }] : [] - }; - }; - - return { - code, - action, - captureSnapshot: true, - waitForNetwork: false, - }; - } -}); - -export async function generateLocator(locator: playwright.Locator): Promise { - return (locator as any)._generateLocatorString(); -} - export default [ snapshot, click, @@ -291,5 +223,4 @@ export default [ hover, type, selectOption, - screenshot, ]; diff --git a/src/tools/utils.ts b/src/tools/utils.ts index 6fd16d4..4a0a2a4 100644 --- a/src/tools/utils.ts +++ b/src/tools/utils.ts @@ -77,3 +77,7 @@ export function sanitizeForFilePath(s: string) { return sanitize(s); return sanitize(s.substring(0, separator)) + '.' + sanitize(s.substring(separator + 1)); } + +export async function generateLocator(locator: playwright.Locator): Promise { + return (locator as any)._generateLocatorString(); +} diff --git a/src/tools/screen.ts b/src/tools/vision.ts similarity index 100% rename from src/tools/screen.ts rename to src/tools/vision.ts diff --git a/src/tools/wait.ts b/src/tools/wait.ts new file mode 100644 index 0000000..fc8be82 --- /dev/null +++ b/src/tools/wait.ts @@ -0,0 +1,70 @@ +/** + * Copyright (c) Microsoft Corporation. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import { z } from 'zod'; +import { defineTool, type ToolFactory } from './tool.js'; + +const wait: ToolFactory = captureSnapshot => defineTool({ + capability: 'wait', + + schema: { + name: 'browser_wait_for', + title: 'Wait for', + description: 'Wait for text to appear or disappear or a specified time to pass', + inputSchema: z.object({ + time: z.number().optional().describe('The time to wait in seconds'), + text: z.string().optional().describe('The text to wait for'), + textGone: z.string().optional().describe('The text to wait for to disappear'), + }), + type: 'readOnly', + }, + + handle: async (context, params) => { + if (!params.text && !params.textGone && !params.time) + throw new Error('Either time, text or textGone must be provided'); + + const code: string[] = []; + + if (params.time) { + code.push(`await new Promise(f => setTimeout(f, ${params.time!} * 1000));`); + await new Promise(f => setTimeout(f, Math.min(10000, params.time! * 1000))); + } + + const tab = context.currentTabOrDie(); + const locator = params.text ? tab.page.getByText(params.text).first() : undefined; + const goneLocator = params.textGone ? 
tab.page.getByText(params.textGone).first() : undefined; + + if (goneLocator) { + code.push(`await page.getByText(${JSON.stringify(params.textGone)}).first().waitFor({ state: 'hidden' });`); + await goneLocator.waitFor({ state: 'hidden' }); + } + + if (locator) { + code.push(`await page.getByText(${JSON.stringify(params.text)}).first().waitFor({ state: 'visible' });`); + await locator.waitFor({ state: 'visible' }); + } + + return { + code, + captureSnapshot, + waitForNetwork: false, + }; + }, +}); + +export default (captureSnapshot: boolean) => [ + wait(captureSnapshot), +]; diff --git a/utils/update-readme.js b/utils/update-readme.js index d0f6cea..fde7864 100644 --- a/utils/update-readme.js +++ b/utils/update-readme.js @@ -32,42 +32,46 @@ import networkTools from '../lib/tools/network.js'; import pdfTools from '../lib/tools/pdf.js'; import snapshotTools from '../lib/tools/snapshot.js'; import tabsTools from '../lib/tools/tabs.js'; -import screenTools from '../lib/tools/screen.js'; +import screenshotTools from '../lib/tools/screenshot.js'; import testTools from '../lib/tools/testing.js'; +import visionTools from '../lib/tools/vision.js'; +import waitTools from '../lib/tools/wait.js'; // Category definitions for tools const categories = { - 'Snapshot-based Interactions': [ + 'Interactions': [ ...snapshotTools, - ], - 'Vision-based Interactions': [ - ...screenTools - ], - 'Tab Management': [ - ...tabsTools(true), + ...keyboardTools(true), + ...waitTools(true), + ...filesTools(true), + ...dialogsTools(true), ], 'Navigation': [ ...navigateTools(true), ], - 'Keyboard': [ - ...keyboardTools(true) - ], - 'Console': [ - ...consoleTools - ], - 'Files and Media': [ - ...filesTools(true), - ...pdfTools + 'Resources': [ + ...screenshotTools, + ...pdfTools, + ...networkTools, + ...consoleTools, ], 'Utilities': [ - ...commonTools(true), ...installTools, - ...dialogsTools(true), - ...networkTools, + ...commonTools(true), + ], + 'Tabs': [ + ...tabsTools(true), ], 'Testing': [ 
...testTools, ], + 'Vision mode': [ + ...visionTools, + ...keyboardTools(), + ...waitTools(false), + ...filesTools(false), + ...dialogsTools(false), + ], }; // NOTE: Can be removed when we drop Node.js 18 support and changed to import.meta.filename. @@ -118,9 +122,12 @@ async function updateReadme() { const generatedLines = /** @type {string[]} */ ([]); for (const [category, categoryTools] of Object.entries(categories)) { - generatedLines.push(`### ${category}\n\n`); + generatedLines.push(`
\n${category}\n\n`); + for (const tool of categoryTools) generatedLines.push(formatToolForReadme(tool.schema)); + + generatedLines.push(`
\n\n`); } const readmePath = path.join(path.dirname(__filename), '..', 'README.md');