diff --git a/src/context.ts b/src/context.ts index a0fd4d6..77fb725 100644 --- a/src/context.ts +++ b/src/context.ts @@ -18,7 +18,8 @@ import * as playwright from 'playwright'; import yaml from 'yaml'; import { waitForCompletion } from './tools/utils'; -import { ToolResult } from './tools/tool'; + +import type { ModalState, Tool, ToolResult } from './tools/tool'; export type ContextOptions = { browserName?: 'chromium' | 'firefox' | 'webkit'; @@ -33,20 +34,43 @@ type PageOrFrameLocator = playwright.Page | playwright.FrameLocator; type RunOptions = { captureSnapshot?: boolean; waitForCompletion?: boolean; - noClearFileChooser?: boolean; }; export class Context { + readonly tools: Tool[]; readonly options: ContextOptions; private _browser: playwright.Browser | undefined; private _browserContext: playwright.BrowserContext | undefined; private _tabs: Tab[] = []; private _currentTab: Tab | undefined; + private _modalStates: (ModalState & { tab: Tab })[] = []; - constructor(options: ContextOptions) { + constructor(tools: Tool[], options: ContextOptions) { + this.tools = tools; this.options = options; } + modalStates(): ModalState[] { + return this._modalStates; + } + + setModalState(modalState: ModalState, inTab: Tab) { + this._modalStates.push({ ...modalState, tab: inTab }); + } + + clearModalState(modalState: ModalState) { + this._modalStates = this._modalStates.filter(state => state !== modalState); + } + + modalStatesMarkdown(): string[] { + const result: string[] = ['### Modal state']; + for (const state of this._modalStates) { + const tool = this.tools.find(tool => tool.clearsModalState === state.type); + result.push(`- [${state.description}]: can be handled by the "${tool?.schema.name}" tool`); + } + return result; + } + tabs(): Tab[] { return this._tabs; } @@ -104,6 +128,7 @@ export class Context { } private _onPageClosed(tab: Tab) { + this._modalStates = this._modalStates.filter(state => state.tab !== tab); const index = this._tabs.indexOf(tab); if (index === -1) return; @@ -188,7 +213,6 @@ class Tab { readonly context: Context; readonly page: playwright.Page; private _console: playwright.ConsoleMessage[] = []; - private _fileChooser: playwright.FileChooser | undefined; private _snapshot: PageSnapshot | undefined; private _onPageClose: (tab: Tab) => void; @@ -202,13 +226,18 @@ class Tab { this._console.length = 0; }); page.on('close', () => this._onClose()); - page.on('filechooser', chooser => this._fileChooser = chooser); + page.on('filechooser', chooser => { + this.context.setModalState({ + type: 'fileChooser', + description: 'File chooser', + fileChooser: chooser, + }, this); + }); page.setDefaultNavigationTimeout(60000); page.setDefaultTimeout(5000); } private _onClose() { - this._fileChooser = undefined; this._console.length = 0; this._onPageClose(this); } @@ -222,8 +251,6 @@ class Tab { async run(callback: (tab: Tab) => Promise, options?: RunOptions): Promise { let runResult: RunResult | undefined; try { - if (!options?.noClearFileChooser) - this._fileChooser = undefined; if (options?.waitForCompletion) runResult = await waitForCompletion(this.page, () => callback(this)) ?? undefined; else @@ -240,13 +267,23 @@ ${runResult.code.join('\n')} \`\`\` `); + if (this.context.modalStates().length) { + result.push(...this.context.modalStatesMarkdown()); + return { + content: [{ + type: 'text', + text: result.join('\n'), + }], + }; + } + if (this.context.tabs().length > 1) result.push(await this.context.listTabs(), ''); if (this._snapshot) { if (this.context.tabs().length > 1) result.push('### Current tab'); - result.push(this._snapshot.text({ hasFileChooser: !!this._fileChooser })); + result.push(this._snapshot.text()); } const images = runResult.images?.map(image => { @@ -289,13 +326,6 @@ ${runResult.code.join('\n')} async console(): Promise { return this._console; } - - async submitFileChooser(paths: string[]) { - if (!this._fileChooser) - throw new Error('No file chooser visible'); - await this._fileChooser.setFiles(paths); - this._fileChooser = undefined; - } } class PageSnapshot { @@ -311,14 +341,8 @@ class PageSnapshot { return snapshot; } - text(options: { hasFileChooser: boolean }): string { - const results: string[] = []; - if (options.hasFileChooser) { - results.push('- There is a file chooser visible that requires browser_file_upload to be called'); - results.push(''); - } - results.push(this._text); - return results.join('\n'); + text(): string { + return this._text; } private async _build(page: playwright.Page) { diff --git a/src/server.ts b/src/server.ts index 8659102..2d81c8b 100644 --- a/src/server.ts +++ b/src/server.ts @@ -32,7 +32,7 @@ type Options = ContextOptions & { export function createServerWithTools(options: Options): Server { const { name, version, tools, resources } = options; - const context = new Context(options); + const context = new Context(tools, options); const server = new Server({ name, version }, { capabilities: { tools: {}, @@ -57,9 +57,21 @@ export function createServerWithTools(options: Options): Server { }; } + const modalStates = context.modalStates().map(state => state.type); + if ((tool.clearsModalState && !modalStates.includes(tool.clearsModalState)) || + (!tool.clearsModalState && modalStates.length)) { + const text = [ + `Tool "${request.params.name}" does not handle the modal state.`, + ...context.modalStatesMarkdown(), + ].join('\n'); + return { + content: [{ type: 'text', text }], + isError: true, + }; + } + try { - const result = await tool.handle(context, request.params.arguments); - return result; + return await tool.handle(context, request.params.arguments); } catch (error) { return { content: [{ type: 'text', text: String(error) }], diff --git a/src/tools/files.ts b/src/tools/files.ts index fc1af6d..13c5b6c 100644 --- a/src/tools/files.ts +++ b/src/tools/files.ts @@ -34,16 +34,20 @@ const uploadFile: ToolFactory = captureSnapshot => ({ const validatedParams = uploadFileSchema.parse(params); const tab = context.currentTab(); return await tab.runAndWait(async () => { - await tab.submitFileChooser(validatedParams.paths); + const modalState = context.modalStates().find(state => state.type === 'fileChooser'); + if (!modalState) + throw new Error('No file chooser visible'); + await modalState.fileChooser.setFiles(validatedParams.paths); + context.clearModalState(modalState); const code = [ `// [ diff --git a/src/tools/tool.ts b/src/tools/tool.ts index 877e0ba..c37d150 100644 --- a/src/tools/tool.ts +++ b/src/tools/tool.ts @@ -17,7 +17,7 @@ import type { ImageContent, TextContent } from '@modelcontextprotocol/sdk/types'; import type { JsonSchema7Type } from 'zod-to-json-schema'; import type { Context } from '../context'; - +import type * as playwright from 'playwright'; export type ToolCapability = 'core' | 'tabs' | 'pdf' | 'history' | 'wait' | 'files' | 'install'; export type ToolSchema = { @@ -26,6 +26,14 @@ export type ToolSchema = { inputSchema: JsonSchema7Type; }; +export type FileUploadModalState = { + type: 'fileChooser'; + description: string; + fileChooser: playwright.FileChooser; +}; + +export type ModalState = FileUploadModalState; + export type ToolResult = { content: (ImageContent | TextContent)[]; isError?: boolean; @@ -34,6 +42,7 @@ export type ToolResult = { export type Tool = { capability: ToolCapability; schema: ToolSchema; + clearsModalState?: ModalState['type']; handle: (context: Context, params?: Record) => Promise; }; diff --git a/tests/basic.spec.ts b/tests/core.spec.ts similarity index 77% rename from tests/basic.spec.ts rename to tests/core.spec.ts index 34f09f3..ed47e70 100644 --- a/tests/basic.spec.ts +++ b/tests/core.spec.ts @@ -14,7 +14,6 @@ * limitations under the License. */ -import fs from 'fs/promises'; import { test, expect } from './fixtures'; test('browser_navigate', async ({ client }) => { @@ -138,63 +137,6 @@ await page.getByRole('listbox').selectOption(['bar', 'baz']); `); }); -test('browser_file_upload', async ({ client }) => { - expect(await client.callTool({ - name: 'browser_navigate', - arguments: { - url: 'data:text/html,Title', - }, - })).toContainTextContent('- textbox [ref=s1e3]'); - - expect(await client.callTool({ - name: 'browser_click', - arguments: { - element: 'Textbox', - ref: 's1e3', - }, - })).toContainTextContent('There is a file chooser visible that requires browser_file_upload to be called'); - - const filePath = test.info().outputPath('test.txt'); - await fs.writeFile(filePath, 'Hello, world!'); - - { - const response = await client.callTool({ - name: 'browser_file_upload', - arguments: { - paths: [filePath], - }, - }); - - expect(response).not.toContainTextContent('There is a file chooser visible that requires browser_file_upload to be called'); - expect(response).toContainTextContent('textbox [ref=s3e3]: C:\\fakepath\\test.txt'); - } - - { - const response = await client.callTool({ - name: 'browser_click', - arguments: { - element: 'Textbox', - ref: 's3e3', - }, - }); - - expect(response).toContainTextContent('There is a file chooser visible that requires browser_file_upload to be called'); - expect(response).toContainTextContent('button "Button" [ref=s4e4]'); - } - - { - const response = await client.callTool({ - name: 'browser_click', - arguments: { - element: 'Button', - ref: 's4e4', - }, - }); - - expect(response, 'not submitting browser_file_upload dismisses file chooser').not.toContainTextContent('There is a file chooser visible that requires browser_file_upload to be called'); - } -}); - test('browser_type', async ({ client }) => { await client.callTool({ name: 'browser_navigate', diff --git a/tests/files.spec.ts b/tests/files.spec.ts new file mode 100644 index 0000000..b7a16b1 --- /dev/null +++ b/tests/files.spec.ts @@ -0,0 +1,77 @@ +/** + * Copyright (c) Microsoft Corporation. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import { test, expect } from './fixtures'; +import fs from 'fs/promises'; + +test('browser_file_upload', async ({ client }) => { + expect(await client.callTool({ + name: 'browser_navigate', + arguments: { + url: 'data:text/html,Title', + }, + })).toContainTextContent('- textbox [ref=s1e3]'); + + expect(await client.callTool({ + name: 'browser_click', + arguments: { + element: 'Textbox', + ref: 's1e3', + }, + })).toContainTextContent(`### Modal state +- [File chooser]: can be handled by the "browser_file_upload" tool`); + + const filePath = test.info().outputPath('test.txt'); + await fs.writeFile(filePath, 'Hello, world!'); + + { + const response = await client.callTool({ + name: 'browser_file_upload', + arguments: { + paths: [filePath], + }, + }); + + expect(response).not.toContainTextContent('### Modal state'); + expect(response).toContainTextContent('textbox [ref=s3e3]: C:\\fakepath\\test.txt'); + } + + { + const response = await client.callTool({ + name: 'browser_click', + arguments: { + element: 'Textbox', + ref: 's3e3', + }, + }); + + expect(response).toContainTextContent('- [File chooser]: can be handled by the \"browser_file_upload\" tool'); + } + + { + const response = await client.callTool({ + name: 'browser_click', + arguments: { + element: 'Button', + ref: 's4e4', + }, + }); + + expect(response).toContainTextContent(`Tool "browser_click" does not handle the modal state. +### Modal state +- [File chooser]: can be handled by the "browser_file_upload" tool`); + } +});