diff --git a/README.md b/README.md index 18da004..3919fda 100644 --- a/README.md +++ b/README.md @@ -167,22 +167,7 @@ transport = new SSEServerTransport("/messages", res); server.connect(transport); ``` -### Snapshot Mode - -The Playwright MCP provides a set of tools for browser automation. Here are all available tools: - -- **browser_navigate** - - Description: Navigate to a URL - - Parameters: - - `url` (string): The URL to navigate to - -- **browser_go_back** - - Description: Go back to the previous page - - Parameters: None - -- **browser_go_forward** - - Description: Go forward to the next page - - Parameters: None +### Snapshot-based Interactions - **browser_click** - Description: Perform click on a web page @@ -210,109 +195,121 @@ The Playwright MCP provides a set of tools for browser automation. Here are all - `element` (string): Human-readable element description used to obtain permission to interact with the element - `ref` (string): Exact target element reference from the page snapshot - `text` (string): Text to type into the element - - `submit` (boolean): Whether to submit entered text (press Enter after) + - `submit` (boolean, optional): Whether to submit entered text (press Enter after) + - `slowly` (boolean, optional): Whether to type one character at a time. Useful for triggering key handlers in the page. By default entire text is filled in at once. - **browser_select_option** - - Description: Select option in a dropdown + - Description: Select an option in a dropdown - Parameters: - `element` (string): Human-readable element description used to obtain permission to interact with the element - `ref` (string): Exact target element reference from the page snapshot - - `values` (array): Array of values to select in the dropdown. + - `values` (array): Array of values to select in the dropdown. This can be a single value or multiple values. 
-- **browser_choose_file** - - Description: Choose one or multiple files to upload +- **browser_snapshot** + - Description: Capture accessibility snapshot of the current page, this is better than screenshot + - Parameters: None + +- **browser_take_screenshot** + - Description: Take a screenshot of the current page. You can't perform actions based on the screenshot, use browser_snapshot for actions. - Parameters: - - `paths` (array): The absolute paths to the files to upload. Can be a single file or multiple files. + - `raw` (boolean, optional): Whether to return without compression (in PNG format). Default is false, which returns a JPEG image. + +### Vision-based Interactions + +- **browser_screen_move_mouse** + - Description: Move mouse to a given position + - Parameters: + - `element` (string): Human-readable element description used to obtain permission to interact with the element + - `x` (number): X coordinate + - `y` (number): Y coordinate + +- **browser_screen_capture** + - Description: Take a screenshot of the current page + - Parameters: None + +- **browser_screen_click** + - Description: Click left mouse button + - Parameters: + - `element` (string): Human-readable element description used to obtain permission to interact with the element + - `x` (number): X coordinate + - `y` (number): Y coordinate + +- **browser_screen_drag** + - Description: Drag left mouse button + - Parameters: + - `element` (string): Human-readable element description used to obtain permission to interact with the element + - `startX` (number): Start X coordinate + - `startY` (number): Start Y coordinate + - `endX` (number): End X coordinate + - `endY` (number): End Y coordinate + +- **browser_screen_type** + - Description: Type text + - Parameters: + - `text` (string): Text to type + - `submit` (boolean, optional): Whether to submit entered text (press Enter after) - **browser_press_key** - Description: Press a key on the keyboard - Parameters: - `key` (string): Name of the key to 
press or a character to generate, such as `ArrowLeft` or `a` -- **browser_snapshot** - - Description: Capture accessibility snapshot of the current page (better than screenshot) +### Tab Management + +- **browser_tab_list** + - Description: List browser tabs - Parameters: None -- **browser_save_as_pdf** - - Description: Save page as PDF - - Parameters: None - -- **browser_take_screenshot** - - Description: Capture screenshot of the page +- **browser_tab_new** + - Description: Open a new tab - Parameters: - - `raw` (string): Optionally returns lossless PNG screenshot. JPEG by default. + - `url` (string, optional): The URL to navigate to in the new tab. If not provided, the new tab will be blank. -- **browser_wait** - - Description: Wait for a specified time in seconds +- **browser_tab_select** + - Description: Select a tab by index - Parameters: - - `time` (number): The time to wait in seconds (capped at 10 seconds) + - `index` (number): The index of the tab to select -- **browser_close** - - Description: Close the page - - Parameters: None +- **browser_tab_close** + - Description: Close a tab + - Parameters: + - `index` (number, optional): The index of the tab to close. Closes current tab if not provided. - -### Vision Mode - -Vision Mode provides tools for visual-based interactions using screenshots. 
Here are all available tools: +### Navigation - **browser_navigate** - Description: Navigate to a URL - Parameters: - `url` (string): The URL to navigate to -- **browser_go_back** +- **browser_navigate_back** - Description: Go back to the previous page - Parameters: None -- **browser_go_forward** +- **browser_navigate_forward** - Description: Go forward to the next page - Parameters: None -- **browser_screenshot** - - Description: Capture screenshot of the current page - - Parameters: None - -- **browser_move_mouse** - - Description: Move mouse to specified coordinates - - Parameters: - - `x` (number): X coordinate - - `y` (number): Y coordinate - -- **browser_click** - - Description: Click at specified coordinates - - Parameters: - - `x` (number): X coordinate to click at - - `y` (number): Y coordinate to click at - -- **browser_drag** - - Description: Perform drag and drop operation - - Parameters: - - `startX` (number): Start X coordinate - - `startY` (number): Start Y coordinate - - `endX` (number): End X coordinate - - `endY` (number): End Y coordinate - -- **browser_type** - - Description: Type text at specified coordinates - - Parameters: - - `text` (string): Text to type - - `submit` (boolean): Whether to submit entered text (press Enter after) +### Keyboard - **browser_press_key** - Description: Press a key on the keyboard - Parameters: - `key` (string): Name of the key to press or a character to generate, such as `ArrowLeft` or `a` +### Files and Media + - **browser_choose_file** - Description: Choose one or multiple files to upload - Parameters: - `paths` (array): The absolute paths to the files to upload. Can be a single file or multiple files. -- **browser_save_as_pdf** +- **browser_pdf_save** - Description: Save page as PDF - Parameters: None +### Utilities + - **browser_wait** - Description: Wait for a specified time in seconds - Parameters: @@ -321,3 +318,10 @@ Vision Mode provides tools for visual-based interactions using screenshots. 
Here - **browser_close** - Description: Close the page - Parameters: None + +- **browser_install** + - Description: Install the browser specified in the config. Call this if you get an error about the browser not being installed. + - Parameters: None + +### Vision Mode + diff --git a/src/context.ts b/src/context.ts index 6d5cd31..4c8fc02 100644 --- a/src/context.ts +++ b/src/context.ts @@ -14,9 +14,6 @@ * limitations under the License. */ -import { fork } from 'child_process'; -import path from 'path'; - import * as playwright from 'playwright'; import yaml from 'yaml'; @@ -41,14 +38,14 @@ type RunOptions = { }; export class Context { - private _options: ContextOptions; + readonly options: ContextOptions; private _browser: playwright.Browser | undefined; private _browserContext: playwright.BrowserContext | undefined; private _tabs: Tab[] = []; private _currentTab: Tab | undefined; constructor(options: ContextOptions) { - this._options = options; + this.options = options; } tabs(): Tab[] { @@ -123,25 +120,6 @@ export class Context { } } - async install(): Promise<string> { - const channel = this._options.launchOptions?.channel ?? this._options.browserName ?? 
'chrome'; - const cli = path.join(require.resolve('playwright/package.json'), '..', 'cli.js'); - const child = fork(cli, ['install', channel], { - stdio: 'pipe', - }); - const output: string[] = []; - child.stdout?.on('data', data => output.push(data.toString())); - child.stderr?.on('data', data => output.push(data.toString())); - return new Promise((resolve, reject) => { - child.on('close', code => { - if (code === 0) - resolve(channel); - else - reject(new Error(`Failed to install browser: ${output.join('')}`)); - }); - }); - } - async close() { if (!this._browserContext) return; @@ -161,19 +139,19 @@ export class Context { } private async _createBrowserContext(): Promise<{ browser?: playwright.Browser, browserContext: playwright.BrowserContext }> { - if (this._options.remoteEndpoint) { - const url = new URL(this._options.remoteEndpoint); - if (this._options.browserName) - url.searchParams.set('browser', this._options.browserName); - if (this._options.launchOptions) - url.searchParams.set('launch-options', JSON.stringify(this._options.launchOptions)); - const browser = await playwright[this._options.browserName ?? 'chromium'].connect(String(url)); + if (this.options.remoteEndpoint) { + const url = new URL(this.options.remoteEndpoint); + if (this.options.browserName) + url.searchParams.set('browser', this.options.browserName); + if (this.options.launchOptions) + url.searchParams.set('launch-options', JSON.stringify(this.options.launchOptions)); + const browser = await playwright[this.options.browserName ?? 
'chromium'].connect(String(url)); const browserContext = await browser.newContext(); return { browser, browserContext }; } - if (this._options.cdpEndpoint) { - const browser = await playwright.chromium.connectOverCDP(this._options.cdpEndpoint); + if (this.options.cdpEndpoint) { + const browser = await playwright.chromium.connectOverCDP(this.options.cdpEndpoint); const browserContext = browser.contexts()[0]; return { browser, browserContext }; } @@ -184,8 +162,8 @@ export class Context { private async _launchPersistentContext(): Promise<playwright.BrowserContext> { try { - const browserType = this._options.browserName ? playwright[this._options.browserName] : playwright.chromium; - return await browserType.launchPersistentContext(this._options.userDataDir, this._options.launchOptions); + const browserType = this.options.browserName ? playwright[this.options.browserName] : playwright.chromium; + return await browserType.launchPersistentContext(this.options.userDataDir, this.options.launchOptions); } catch (error: any) { if (error.message.includes('Executable doesn\'t exist')) throw new Error(`Browser specified in your config is not installed. 
Either install it (likely) or change the config.`); diff --git a/src/index.ts b/src/index.ts index 3437b11..0be508f 100644 --- a/src/index.ts +++ b/src/index.ts @@ -15,10 +15,15 @@ */ import { createServerWithTools } from './server'; -import * as snapshot from './tools/snapshot'; -import * as common from './tools/common'; -import * as screenshot from './tools/screenshot'; -import * as tabs from './tools/tabs'; +import common from './tools/common'; +import fileChooser from './tools/fileChooser'; +import install from './tools/install'; +import keyboard from './tools/keyboard'; +import navigate from './tools/navigate'; +import pdf from './tools/pdf'; +import snapshot from './tools/snapshot'; +import tabs from './tools/tabs'; +import screen from './tools/screen'; import { console } from './resources/console'; import type { Tool } from './tools/tool'; @@ -26,46 +31,26 @@ import type { Resource } from './resources/resource'; import type { Server } from '@modelcontextprotocol/sdk/server/index.js'; import type { LaunchOptions } from 'playwright'; -const commonTools: Tool[] = [ - common.wait, - common.pdf, - common.close, - common.install, - tabs.listTabs, - tabs.newTab, -]; - const snapshotTools: Tool[] = [ - common.navigate(true), - snapshot.snapshot, - snapshot.click, - snapshot.hover, - snapshot.type, - snapshot.selectOption, - snapshot.screenshot, - common.goBack(true), - common.goForward(true), - common.chooseFile(true), - common.pressKey(true), - ...commonTools, - tabs.selectTab(true), - tabs.closeTab(true), + ...common, + ...fileChooser(true), + ...install, + ...keyboard(true), + ...navigate(true), + ...pdf, + ...snapshot, + ...tabs(true), ]; const screenshotTools: Tool[] = [ - common.navigate(false), - screenshot.screenshot, - screenshot.moveMouse, - screenshot.click, - screenshot.drag, - screenshot.type, - common.goBack(false), - common.goForward(false), - common.chooseFile(false), - common.pressKey(false), - ...commonTools, - tabs.selectTab(false), - 
tabs.closeTab(false), + ...common, + ...fileChooser(false), + ...install, + ...keyboard(false), + ...navigate(false), + ...pdf, + ...screen, + ...tabs(false), ]; const resources: Resource[] = [ diff --git a/src/tools/common.ts b/src/tools/common.ts index 62acb91..51a42c6 100644 --- a/src/tools/common.ts +++ b/src/tools/common.ts @@ -14,79 +14,16 @@ * limitations under the License. */ -import os from 'os'; -import path from 'path'; - import { z } from 'zod'; import { zodToJsonSchema } from 'zod-to-json-schema'; -import { sanitizeForFilePath } from './utils'; - -import type { ToolFactory, Tool } from './tool'; - -const navigateSchema = z.object({ - url: z.string().describe('The URL to navigate to'), -}); - -export const navigate: ToolFactory = captureSnapshot => ({ - schema: { - name: 'browser_navigate', - description: 'Navigate to a URL', - inputSchema: zodToJsonSchema(navigateSchema), - }, - handle: async (context, params) => { - const validatedParams = navigateSchema.parse(params); - const currentTab = await context.ensureTab(); - return await currentTab.run(async tab => { - await tab.navigate(validatedParams.url); - }, { - status: `Navigated to ${validatedParams.url}`, - captureSnapshot, - }); - }, -}); - -const goBackSchema = z.object({}); - -export const goBack: ToolFactory = snapshot => ({ - schema: { - name: 'browser_go_back', - description: 'Go back to the previous page', - inputSchema: zodToJsonSchema(goBackSchema), - }, - handle: async context => { - return await context.currentTab().runAndWait(async tab => { - await tab.page.goBack(); - }, { - status: 'Navigated back', - captureSnapshot: snapshot, - }); - }, -}); - -const goForwardSchema = z.object({}); - -export const goForward: ToolFactory = snapshot => ({ - schema: { - name: 'browser_go_forward', - description: 'Go forward to the next page', - inputSchema: zodToJsonSchema(goForwardSchema), - }, - handle: async context => { - return await context.currentTab().runAndWait(async tab => { - await 
tab.page.goForward(); - }, { - status: 'Navigated forward', - captureSnapshot: snapshot, - }); - }, -}); +import type { Tool } from './tool'; const waitSchema = z.object({ time: z.number().describe('The time to wait in seconds'), }); -export const wait: Tool = { +const wait: Tool = { schema: { name: 'browser_wait', description: 'Wait for a specified time in seconds', @@ -104,51 +41,9 @@ export const wait: Tool = { }, }; -const pressKeySchema = z.object({ - key: z.string().describe('Name of the key to press or a character to generate, such as `ArrowLeft` or `a`'), -}); - -export const pressKey: (captureSnapshot: boolean) => Tool = captureSnapshot => ({ - schema: { - name: 'browser_press_key', - description: 'Press a key on the keyboard', - inputSchema: zodToJsonSchema(pressKeySchema), - }, - handle: async (context, params) => { - const validatedParams = pressKeySchema.parse(params); - return await context.currentTab().runAndWait(async tab => { - await tab.page.keyboard.press(validatedParams.key); - }, { - status: `Pressed key ${validatedParams.key}`, - captureSnapshot, - }); - }, -}); - -const pdfSchema = z.object({}); - -export const pdf: Tool = { - schema: { - name: 'browser_save_as_pdf', - description: 'Save page as PDF', - inputSchema: zodToJsonSchema(pdfSchema), - }, - handle: async context => { - const tab = context.currentTab(); - const fileName = path.join(os.tmpdir(), sanitizeForFilePath(`page-${new Date().toISOString()}`)) + '.pdf'; - await tab.page.pdf({ path: fileName }); - return { - content: [{ - type: 'text', - text: `Saved as ${fileName}`, - }], - }; - }, -}; - const closeSchema = z.object({}); -export const close: Tool = { +const close: Tool = { schema: { name: 'browser_close', description: 'Close the page', @@ -165,42 +60,7 @@ export const close: Tool = { }, }; -const chooseFileSchema = z.object({ - paths: z.array(z.string()).describe('The absolute paths to the files to upload. 
Can be a single file or multiple files.'), -}); - -export const chooseFile: ToolFactory = captureSnapshot => ({ - schema: { - name: 'browser_choose_file', - description: 'Choose one or multiple files to upload', - inputSchema: zodToJsonSchema(chooseFileSchema), - }, - handle: async (context, params) => { - const validatedParams = chooseFileSchema.parse(params); - const tab = context.currentTab(); - return await tab.runAndWait(async () => { - await tab.submitFileChooser(validatedParams.paths); - }, { - status: `Chose files ${validatedParams.paths.join(', ')}`, - captureSnapshot, - noClearFileChooser: true, - }); - }, -}); - -export const install: Tool = { - schema: { - name: 'browser_install', - description: 'Install the browser specified in the config. Call this if you get an error about the browser not being installed.', - inputSchema: zodToJsonSchema(z.object({})), - }, - handle: async context => { - const channel = await context.install(); - return { - content: [{ - type: 'text', - text: `Browser ${channel} installed`, - }], - }; - }, -}; +export default [ + close, + wait, +]; diff --git a/src/tools/fileChooser.ts b/src/tools/fileChooser.ts new file mode 100644 index 0000000..865f480 --- /dev/null +++ b/src/tools/fileChooser.ts @@ -0,0 +1,47 @@ +/** + * Copyright (c) Microsoft Corporation. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +import { z } from 'zod'; +import { zodToJsonSchema } from 'zod-to-json-schema'; + +import type { ToolFactory } from './tool'; + +const chooseFileSchema = z.object({ + paths: z.array(z.string()).describe('The absolute paths to the files to upload. Can be a single file or multiple files.'), +}); + +const chooseFile: ToolFactory = captureSnapshot => ({ + schema: { + name: 'browser_choose_file', + description: 'Choose one or multiple files to upload', + inputSchema: zodToJsonSchema(chooseFileSchema), + }, + handle: async (context, params) => { + const validatedParams = chooseFileSchema.parse(params); + const tab = context.currentTab(); + return await tab.runAndWait(async () => { + await tab.submitFileChooser(validatedParams.paths); + }, { + status: `Chose files ${validatedParams.paths.join(', ')}`, + captureSnapshot, + noClearFileChooser: true, + }); + }, +}); + +export default (captureSnapshot: boolean) => [ + chooseFile(captureSnapshot), +]; diff --git a/src/tools/install.ts b/src/tools/install.ts new file mode 100644 index 0000000..9998253 --- /dev/null +++ b/src/tools/install.ts @@ -0,0 +1,60 @@ +/** + * Copyright (c) Microsoft Corporation. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +import { fork } from 'child_process'; +import path from 'path'; + +import { z } from 'zod'; +import { zodToJsonSchema } from 'zod-to-json-schema'; + +import type { Tool } from './tool'; + +const install: Tool = { + schema: { + name: 'browser_install', + description: 'Install the browser specified in the config. Call this if you get an error about the browser not being installed.', + inputSchema: zodToJsonSchema(z.object({})), + }, + + handle: async context => { + const channel = context.options.launchOptions?.channel ?? context.options.browserName ?? 'chrome'; + const cli = path.join(require.resolve('playwright/package.json'), '..', 'cli.js'); + const child = fork(cli, ['install', channel], { + stdio: 'pipe', + }); + const output: string[] = []; + child.stdout?.on('data', data => output.push(data.toString())); + child.stderr?.on('data', data => output.push(data.toString())); + await new Promise<void>((resolve, reject) => { + child.on('close', code => { + if (code === 0) + resolve(); + else + reject(new Error(`Failed to install browser: ${output.join('')}`)); + }); + }); + return { + content: [{ + type: 'text', + text: `Browser ${channel} installed`, + }], + }; + }, +}; + +export default [ + install, +]; diff --git a/src/tools/keyboard.ts b/src/tools/keyboard.ts new file mode 100644 index 0000000..c06974e --- /dev/null +++ b/src/tools/keyboard.ts @@ -0,0 +1,45 @@ +/** + * Copyright (c) Microsoft Corporation. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +import { z } from 'zod'; +import zodToJsonSchema from 'zod-to-json-schema'; + +import type { ToolFactory } from './tool'; + +const pressKeySchema = z.object({ + key: z.string().describe('Name of the key to press or a character to generate, such as `ArrowLeft` or `a`'), +}); + +const pressKey: ToolFactory = captureSnapshot => ({ + schema: { + name: 'browser_press_key', + description: 'Press a key on the keyboard', + inputSchema: zodToJsonSchema(pressKeySchema), + }, + handle: async (context, params) => { + const validatedParams = pressKeySchema.parse(params); + return await context.currentTab().runAndWait(async tab => { + await tab.page.keyboard.press(validatedParams.key); + }, { + status: `Pressed key ${validatedParams.key}`, + captureSnapshot, + }); + }, +}); + +export default (captureSnapshot: boolean) => [ + pressKey(captureSnapshot), +]; diff --git a/src/tools/navigate.ts b/src/tools/navigate.ts new file mode 100644 index 0000000..05647a0 --- /dev/null +++ b/src/tools/navigate.ts @@ -0,0 +1,84 @@ +/** + * Copyright (c) Microsoft Corporation. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +import { z } from 'zod'; +import { zodToJsonSchema } from 'zod-to-json-schema'; + +import type { ToolFactory } from './tool'; + +const navigateSchema = z.object({ + url: z.string().describe('The URL to navigate to'), +}); + +const navigate: ToolFactory = captureSnapshot => ({ + schema: { + name: 'browser_navigate', + description: 'Navigate to a URL', + inputSchema: zodToJsonSchema(navigateSchema), + }, + handle: async (context, params) => { + const validatedParams = navigateSchema.parse(params); + const currentTab = await context.ensureTab(); + return await currentTab.run(async tab => { + await tab.navigate(validatedParams.url); + }, { + status: `Navigated to ${validatedParams.url}`, + captureSnapshot, + }); + }, +}); + +const goBackSchema = z.object({}); + +const goBack: ToolFactory = snapshot => ({ + schema: { + name: 'browser_navigate_back', + description: 'Go back to the previous page', + inputSchema: zodToJsonSchema(goBackSchema), + }, + handle: async context => { + return await context.currentTab().runAndWait(async tab => { + await tab.page.goBack(); + }, { + status: 'Navigated back', + captureSnapshot: snapshot, + }); + }, +}); + +const goForwardSchema = z.object({}); + +const goForward: ToolFactory = snapshot => ({ + schema: { + name: 'browser_navigate_forward', + description: 'Go forward to the next page', + inputSchema: zodToJsonSchema(goForwardSchema), + }, + handle: async context => { + return await context.currentTab().runAndWait(async tab => { + await tab.page.goForward(); + }, { + status: 'Navigated forward', + captureSnapshot: snapshot, + }); + }, +}); + +export default (captureSnapshot: boolean) => [ + navigate(captureSnapshot), + goBack(captureSnapshot), + goForward(captureSnapshot), +]; diff --git a/src/tools/pdf.ts b/src/tools/pdf.ts new file mode 100644 index 0000000..76bacd5 --- /dev/null +++ b/src/tools/pdf.ts @@ -0,0 +1,50 @@ +/** + * Copyright (c) Microsoft Corporation. 
+ * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import os from 'os'; +import path from 'path'; + +import { z } from 'zod'; +import { zodToJsonSchema } from 'zod-to-json-schema'; + +import { sanitizeForFilePath } from './utils'; + +import type { Tool } from './tool'; + +const pdfSchema = z.object({}); + +const pdf: Tool = { + schema: { + name: 'browser_pdf_save', + description: 'Save page as PDF', + inputSchema: zodToJsonSchema(pdfSchema), + }, + handle: async context => { + const tab = context.currentTab(); + const fileName = path.join(os.tmpdir(), sanitizeForFilePath(`page-${new Date().toISOString()}`)) + '.pdf'; + await tab.page.pdf({ path: fileName }); + return { + content: [{ + type: 'text', + text: `Saved as ${fileName}`, + }], + }; + }, +}; + +export default [ + pdf, +]; diff --git a/src/tools/screenshot.ts b/src/tools/screen.ts similarity index 91% rename from src/tools/screenshot.ts rename to src/tools/screen.ts index 936cf35..d9622d7 100644 --- a/src/tools/screenshot.ts +++ b/src/tools/screen.ts @@ -19,9 +19,9 @@ import { zodToJsonSchema } from 'zod-to-json-schema'; import type { Tool } from './tool'; -export const screenshot: Tool = { +const screenshot: Tool = { schema: { - name: 'browser_screenshot', + name: 'browser_screen_capture', description: 'Take a screenshot of the current page', inputSchema: zodToJsonSchema(z.object({})), }, @@ -44,9 +44,9 @@ const moveMouseSchema = elementSchema.extend({ y: z.number().describe('Y coordinate'), 
}); -export const moveMouse: Tool = { +const moveMouse: Tool = { schema: { - name: 'browser_move_mouse', + name: 'browser_screen_move_mouse', description: 'Move mouse to a given position', inputSchema: zodToJsonSchema(moveMouseSchema), }, @@ -66,9 +66,9 @@ const clickSchema = elementSchema.extend({ y: z.number().describe('Y coordinate'), }); -export const click: Tool = { +const click: Tool = { schema: { - name: 'browser_click', + name: 'browser_screen_click', description: 'Click left mouse button', inputSchema: zodToJsonSchema(clickSchema), }, @@ -92,9 +92,9 @@ const dragSchema = elementSchema.extend({ endY: z.number().describe('End Y coordinate'), }); -export const drag: Tool = { +const drag: Tool = { schema: { - name: 'browser_drag', + name: 'browser_screen_drag', description: 'Drag left mouse button', inputSchema: zodToJsonSchema(dragSchema), }, @@ -117,9 +117,9 @@ const typeSchema = z.object({ submit: z.boolean().optional().describe('Whether to submit entered text (press Enter after)'), }); -export const type: Tool = { +const type: Tool = { schema: { - name: 'browser_type', + name: 'browser_screen_type', description: 'Type text', inputSchema: zodToJsonSchema(typeSchema), }, @@ -135,3 +135,11 @@ export const type: Tool = { }); }, }; + +export default [ + screenshot, + moveMouse, + click, + drag, + type, +]; diff --git a/src/tools/snapshot.ts b/src/tools/snapshot.ts index c6a87dc..8e1cb47 100644 --- a/src/tools/snapshot.ts +++ b/src/tools/snapshot.ts @@ -20,7 +20,7 @@ import zodToJsonSchema from 'zod-to-json-schema'; import type * as playwright from 'playwright'; import type { Tool } from './tool'; -export const snapshot: Tool = { +const snapshot: Tool = { schema: { name: 'browser_snapshot', description: 'Capture accessibility snapshot of the current page, this is better than screenshot', @@ -37,7 +37,7 @@ const elementSchema = z.object({ ref: z.string().describe('Exact target element reference from the page snapshot'), }); -export const click: Tool = { +const 
click: Tool = { schema: { name: 'browser_click', description: 'Perform click on a web page', @@ -62,7 +62,7 @@ const dragSchema = z.object({ endRef: z.string().describe('Exact target element reference from the page snapshot'), }); -export const drag: Tool = { +const drag: Tool = { schema: { name: 'browser_drag', description: 'Perform drag and drop between two elements', @@ -81,7 +81,7 @@ export const drag: Tool = { }, }; -export const hover: Tool = { +const hover: Tool = { schema: { name: 'browser_hover', description: 'Hover over element on page', @@ -105,7 +105,7 @@ const typeSchema = elementSchema.extend({ slowly: z.boolean().optional().describe('Whether to type one character at a time. Useful for triggering key handlers in the page. By default entire text is filled in at once.'), }); -export const type: Tool = { +const type: Tool = { schema: { name: 'browser_type', description: 'Type text into editable element', @@ -132,7 +132,7 @@ const selectOptionSchema = elementSchema.extend({ values: z.array(z.string()).describe('Array of values to select in the dropdown. This can be a single value or multiple values.'), }); -export const selectOption: Tool = { +const selectOption: Tool = { schema: { name: 'browser_select_option', description: 'Select an option in a dropdown', @@ -154,7 +154,7 @@ const screenshotSchema = z.object({ raw: z.boolean().optional().describe('Whether to return without compression (in PNG format). Default is false, which returns a JPEG image.'), }); -export const screenshot: Tool = { +const screenshot: Tool = { schema: { name: 'browser_take_screenshot', description: `Take a screenshot of the current page. 
You can't perform actions based on the screenshot, use browser_snapshot for actions.`, @@ -171,3 +171,13 @@ export const screenshot: Tool = { }; }, }; + +export default [ + snapshot, + click, + drag, + hover, + type, + selectOption, + screenshot, +]; diff --git a/src/tools/tabs.ts b/src/tools/tabs.ts index 8dfdafd..1add315 100644 --- a/src/tools/tabs.ts +++ b/src/tools/tabs.ts @@ -19,9 +19,9 @@ import { zodToJsonSchema } from 'zod-to-json-schema'; import type { ToolFactory, Tool } from './tool'; -export const listTabs: Tool = { +const listTabs: Tool = { schema: { - name: 'browser_list_tabs', + name: 'browser_tab_list', description: 'List browser tabs', inputSchema: zodToJsonSchema(z.object({})), }, @@ -39,9 +39,9 @@ const selectTabSchema = z.object({ index: z.number().describe('The index of the tab to select'), }); -export const selectTab: ToolFactory = captureSnapshot => ({ +const selectTab: ToolFactory = captureSnapshot => ({ schema: { - name: 'browser_select_tab', + name: 'browser_tab_select', description: 'Select a tab by index', inputSchema: zodToJsonSchema(selectTabSchema), }, @@ -57,9 +57,9 @@ const newTabSchema = z.object({ url: z.string().optional().describe('The URL to navigate to in the new tab. If not provided, the new tab will be blank.'), }); -export const newTab: Tool = { +const newTab: Tool = { schema: { - name: 'browser_new_tab', + name: 'browser_tab_new', description: 'Open a new tab', inputSchema: zodToJsonSchema(newTabSchema), }, @@ -76,9 +76,9 @@ const closeTabSchema = z.object({ index: z.number().optional().describe('The index of the tab to close. 
Closes current tab if not provided.'), }); -export const closeTab: ToolFactory = captureSnapshot => ({ +const closeTab: ToolFactory = captureSnapshot => ({ schema: { - name: 'browser_close_tab', + name: 'browser_tab_close', description: 'Close a tab', inputSchema: zodToJsonSchema(closeTabSchema), }, @@ -96,3 +96,10 @@ export const closeTab: ToolFactory = captureSnapshot => ({ }; }, }); + +export default (captureSnapshot: boolean) => [ + listTabs, + newTab, + selectTab(captureSnapshot), + closeTab(captureSnapshot), +]; diff --git a/tests/basic.spec.ts b/tests/basic.spec.ts index 9457385..d6e8741 100644 --- a/tests/basic.spec.ts +++ b/tests/basic.spec.ts @@ -21,49 +21,50 @@ import { test, expect } from './fixtures'; test('test tool list', async ({ client, visionClient }) => { const { tools } = await client.listTools(); - expect(tools.map(t => t.name)).toEqual([ - 'browser_navigate', - 'browser_snapshot', - 'browser_click', - 'browser_hover', - 'browser_type', - 'browser_select_option', - 'browser_take_screenshot', - 'browser_go_back', - 'browser_go_forward', - 'browser_choose_file', - 'browser_press_key', - 'browser_wait', - 'browser_save_as_pdf', - 'browser_close', - 'browser_install', - 'browser_list_tabs', - 'browser_new_tab', - 'browser_select_tab', - 'browser_close_tab', - ]); - - const { tools: visionTools } = await visionClient.listTools(); - expect(visionTools.map(t => t.name)).toEqual([ - 'browser_navigate', - 'browser_screenshot', - 'browser_move_mouse', + expect(new Set(tools.map(t => t.name))).toEqual(new Set([ 'browser_click', 'browser_drag', + 'browser_hover', + 'browser_select_option', 'browser_type', - 'browser_go_back', - 'browser_go_forward', 'browser_choose_file', - 'browser_press_key', - 'browser_wait', - 'browser_save_as_pdf', 'browser_close', 'browser_install', - 'browser_list_tabs', - 'browser_new_tab', - 'browser_select_tab', - 'browser_close_tab', - ]); + 'browser_navigate_back', + 'browser_navigate_forward', + 'browser_navigate', + 
'browser_pdf_save', + 'browser_press_key', + 'browser_snapshot', + 'browser_tab_close', + 'browser_tab_list', + 'browser_tab_new', + 'browser_tab_select', + 'browser_take_screenshot', + 'browser_wait', + ])); + + const { tools: visionTools } = await visionClient.listTools(); + expect(new Set(visionTools.map(t => t.name))).toEqual(new Set([ + 'browser_choose_file', + 'browser_close', + 'browser_install', + 'browser_navigate_back', + 'browser_navigate_forward', + 'browser_navigate', + 'browser_pdf_save', + 'browser_press_key', + 'browser_screen_capture', + 'browser_screen_click', + 'browser_screen_drag', + 'browser_screen_move_mouse', + 'browser_screen_type', + 'browser_tab_close', + 'browser_tab_list', + 'browser_tab_new', + 'browser_tab_select', + 'browser_wait', + ])); }); test('test resources list', async ({ client }) => { @@ -369,7 +370,7 @@ Navigated to data:text/html,TitleHello, world! ); const response = await client.callTool({ - name: 'browser_save_as_pdf', + name: 'browser_pdf_save', }); expect(response).toHaveTextContent(/^Saved as.*page-[^:]+.pdf$/); }); diff --git a/tests/tabs.spec.ts b/tests/tabs.spec.ts index 4410048..ea1f3f6 100644 --- a/tests/tabs.spec.ts +++ b/tests/tabs.spec.ts @@ -22,7 +22,7 @@ import type { Client } from '@modelcontextprotocol/sdk/client/index.js'; async function createTab(client: Client, title: string, body: string) { return await client.callTool({ - name: 'browser_new_tab', + name: 'browser_tab_new', arguments: { url: `data:text/html,${title}${body}`, }, @@ -62,7 +62,7 @@ test('select tab', async ({ client }) => { await createTab(client, 'Tab one', 'Body one'); await createTab(client, 'Tab two', 'Body two'); expect(await client.callTool({ - name: 'browser_select_tab', + name: 'browser_tab_select', arguments: { index: 2, }, @@ -85,7 +85,7 @@ test('close tab', async ({ client }) => { await createTab(client, 'Tab one', 'Body one'); await createTab(client, 'Tab two', 'Body two'); expect(await client.callTool({ - name: 
'browser_close_tab', + name: 'browser_tab_close', arguments: { index: 3, },