From ecfa10448b5dfc19415b48e1d1e6266710ed5120 Mon Sep 17 00:00:00 2001 From: Pavel Feldman Date: Thu, 24 Jul 2025 16:22:03 -0700 Subject: [PATCH] chore: extract loop tools into a separate folder (#755) --- src/loop/loop.ts | 13 +++--- src/loop/loopClaude.ts | 41 ++++++++++++------- src/loop/loopOpenAI.ts | 42 +++++++++++-------- src/loop/onetool.ts | 85 --------------------------------------- src/loopTools/context.ts | 65 ++++++++++++++++++++++++++++++ src/loopTools/main.ts | 63 +++++++++++++++++++++++++++++ src/loopTools/perform.ts | 36 +++++++++++++++++ src/loopTools/snapshot.ts | 32 +++++++++++++++ src/loopTools/tool.ts | 29 +++++++++++++ src/program.ts | 6 +++ src/tools/tool.ts | 39 +++--------------- 11 files changed, 296 insertions(+), 155 deletions(-) delete mode 100644 src/loop/onetool.ts create mode 100644 src/loopTools/context.ts create mode 100644 src/loopTools/main.ts create mode 100644 src/loopTools/perform.ts create mode 100644 src/loopTools/snapshot.ts create mode 100644 src/loopTools/tool.ts diff --git a/src/loop/loop.ts b/src/loop/loop.ts index f925bc6..5546ff3 100644 --- a/src/loop/loop.ts +++ b/src/loop/loop.ts @@ -41,15 +41,16 @@ export type LLMConversation = { }; export interface LLMDelegate { - createConversation(task: string, tools: Tool[]): LLMConversation; + createConversation(task: string, tools: Tool[], oneShot: boolean): LLMConversation; makeApiCall(conversation: LLMConversation): Promise; addToolResults(conversation: LLMConversation, results: Array<{ toolCallId: string; content: string; isError?: boolean }>): void; checkDoneToolCall(toolCall: LLMToolCall): string | null; } -export async function runTask(delegate: LLMDelegate, client: Client, task: string): Promise { +export async function runTask(delegate: LLMDelegate, client: Client, task: string, oneShot: boolean = false): Promise { const { tools } = await client.listTools(); - const conversation = delegate.createConversation(task, tools); + const taskContent = oneShot ? `Perform following task: ${task}.` : `Perform following task: ${task}. Once the task is complete, call the "done" tool.`; + const conversation = delegate.createConversation(taskContent, tools, oneShot); for (let iteration = 0; iteration < 5; ++iteration) { debug('history')('Making API call for iteration', iteration); @@ -99,8 +100,10 @@ export async function runTask(delegate: LLMDelegate, client: Client, task: strin } } - // Add tool results to conversation - delegate.addToolResults(conversation, toolResults); + if (oneShot) + return toolResults.map(result => result.content).join('\n'); + else + delegate.addToolResults(conversation, toolResults); } throw new Error('Failed to perform step, max attempts reached'); diff --git a/src/loop/loopClaude.ts b/src/loop/loopClaude.ts index c05e972..2fd3ed2 100644 --- a/src/loop/loopClaude.ts +++ b/src/loop/loopClaude.ts @@ -14,38 +14,48 @@ * limitations under the License. */ -import Anthropic from '@anthropic-ai/sdk'; +import type Anthropic from '@anthropic-ai/sdk'; import type { LLMDelegate, LLMConversation, LLMToolCall, LLMTool } from './loop.js'; import type { Tool } from '@modelcontextprotocol/sdk/types.js'; const model = 'claude-sonnet-4-20250514'; export class ClaudeDelegate implements LLMDelegate { - private anthropic = new Anthropic(); + private _anthropic: Anthropic | undefined; - createConversation(task: string, tools: Tool[]): LLMConversation { + async anthropic(): Promise { + if (!this._anthropic) { + const anthropic = await import('@anthropic-ai/sdk'); + this._anthropic = new anthropic.Anthropic(); + } + return this._anthropic; + } + + createConversation(task: string, tools: Tool[], oneShot: boolean): LLMConversation { const llmTools: LLMTool[] = tools.map(tool => ({ name: tool.name, description: tool.description || '', inputSchema: tool.inputSchema, })); - // Add the "done" tool - llmTools.push({ - name: 'done', - description: 'Call this tool when the task is complete.', - inputSchema: { - type: 'object', - properties: { - result: { type: 'string', description: 'The result of the task.' }, + if (!oneShot) { + llmTools.push({ + name: 'done', + description: 'Call this tool when the task is complete.', + inputSchema: { + type: 'object', + properties: { + result: { type: 'string', description: 'The result of the task.' }, + }, + required: ['result'], }, - }, - }); + }); + } return { messages: [{ role: 'user', - content: `Perform following task: ${task}. Once the task is complete, call the "done" tool.` + content: task }], tools: llmTools, }; @@ -119,7 +129,8 @@ export class ClaudeDelegate implements LLMDelegate { input_schema: tool.inputSchema, })); - const response = await this.anthropic.messages.create({ + const anthropic = await this.anthropic(); + const response = await anthropic.messages.create({ model, max_tokens: 10000, messages: claudeMessages, diff --git a/src/loop/loopOpenAI.ts b/src/loop/loopOpenAI.ts index 59f1011..224b19d 100644 --- a/src/loop/loopOpenAI.ts +++ b/src/loop/loopOpenAI.ts @@ -14,39 +14,48 @@ * limitations under the License. */ -import OpenAI from 'openai'; +import type OpenAI from 'openai'; import type { LLMDelegate, LLMConversation, LLMToolCall, LLMTool } from './loop.js'; import type { Tool } from '@modelcontextprotocol/sdk/types.js'; const model = 'gpt-4.1'; export class OpenAIDelegate implements LLMDelegate { - private openai = new OpenAI(); + private _openai: OpenAI | undefined; - createConversation(task: string, tools: Tool[]): LLMConversation { + async openai(): Promise { + if (!this._openai) { + const oai = await import('openai'); + this._openai = new oai.OpenAI(); + } + return this._openai; + } + + createConversation(task: string, tools: Tool[], oneShot: boolean): LLMConversation { const genericTools: LLMTool[] = tools.map(tool => ({ name: tool.name, description: tool.description || '', inputSchema: tool.inputSchema, })); - // Add the "done" tool - genericTools.push({ - name: 'done', - description: 'Call this tool when the task is complete.', - inputSchema: { - type: 'object', - properties: { - result: { type: 'string', description: 'The result of the task.' }, + if (!oneShot) { + genericTools.push({ + name: 'done', + description: 'Call this tool when the task is complete.', + inputSchema: { + type: 'object', + properties: { + result: { type: 'string', description: 'The result of the task.' }, + }, + required: ['result'], }, - required: ['result'], - }, - }); + }); + } return { messages: [{ role: 'user', - content: `Peform following task: ${task}. Once the task is complete, call the "done" tool.` + content: task }], tools: genericTools, }; @@ -108,7 +117,8 @@ export class OpenAIDelegate implements LLMDelegate { }, })); - const response = await this.openai.chat.completions.create({ + const openai = await this.openai(); + const response = await openai.chat.completions.create({ model, messages: openaiMessages, tools: openaiTools, diff --git a/src/loop/onetool.ts b/src/loop/onetool.ts deleted file mode 100644 index 748de21..0000000 --- a/src/loop/onetool.ts +++ /dev/null @@ -1,85 +0,0 @@ -/** - * Copyright (c) Microsoft Corporation. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -import { Client } from '@modelcontextprotocol/sdk/client/index.js'; -import dotenv from 'dotenv'; -import { z } from 'zod'; - -import { contextFactory } from '../browserContextFactory.js'; -import { BrowserServerBackend } from '../browserServerBackend.js'; -import { Context } from '../context.js'; -import { logUnhandledError } from '../log.js'; -import { InProcessTransport } from '../mcp/inProcessTransport.js'; -import * as mcpServer from '../mcp/server.js'; -import * as mcpTransport from '../mcp/transport.js'; -import { packageJSON } from '../package.js'; -import { runTask } from './loop.js'; -import { OpenAIDelegate } from './loopOpenAI.js'; - -import type { FullConfig } from '../config.js'; -import type { ServerBackend } from '../mcp/server.js'; - -const oneToolSchema: mcpServer.ToolSchema = { - name: 'browser', - title: 'Perform a task with the browser', - description: 'Perform a task with the browser. It can click, type, export, capture screenshot, drag, hover, select options, etc.', - inputSchema: z.object({ - task: z.string().describe('The task to perform with the browser'), - }), - type: 'readOnly', -}; - -export async function runOneTool(config: FullConfig) { - dotenv.config(); - const serverBackendFactory = () => new OneToolServerBackend(config); - await mcpTransport.start(serverBackendFactory, config.server); -} - -class OneToolServerBackend implements ServerBackend { - readonly name = 'Playwright'; - readonly version = packageJSON.version; - private _innerClient: Client | undefined; - private _config: FullConfig; - - constructor(config: FullConfig) { - this._config = config; - } - - async initialize() { - const client = new Client({ name: 'Playwright Proxy', version: '1.0.0' }); - const browserContextFactory = contextFactory(this._config.browser); - const server = mcpServer.createServer(new BrowserServerBackend(this._config, browserContextFactory)); - await client.connect(new InProcessTransport(server)); - await client.ping(); - this._innerClient = client; - } - - tools(): mcpServer.ToolSchema[] { - return [oneToolSchema]; - } - - async callTool(schema: mcpServer.ToolSchema, parsedArguments: any): Promise { - const delegate = new OpenAIDelegate(); - const result = await runTask(delegate, this._innerClient!, parsedArguments.task as string); - return { - content: [{ type: 'text', text: result }], - }; - } - - serverClosed() { - void Context.disposeAll().catch(logUnhandledError); - } -} diff --git a/src/loopTools/context.ts b/src/loopTools/context.ts new file mode 100644 index 0000000..6f54e73 --- /dev/null +++ b/src/loopTools/context.ts @@ -0,0 +1,65 @@ +/** + * Copyright (c) Microsoft Corporation. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import { Client } from '@modelcontextprotocol/sdk/client/index.js'; +import { contextFactory } from '../browserContextFactory.js'; +import { BrowserServerBackend } from '../browserServerBackend.js'; +import { Context as BrowserContext } from '../context.js'; +import { runTask } from '../loop/loop.js'; +import { OpenAIDelegate } from '../loop/loopOpenAI.js'; +import { ClaudeDelegate } from '../loop/loopClaude.js'; +import { InProcessTransport } from '../mcp/inProcessTransport.js'; +import * as mcpServer from '../mcp/server.js'; + +import type { LLMDelegate } from '../loop/loop.js'; +import type { FullConfig } from '../config.js'; + +export class Context { + readonly config: FullConfig; + private _client: Client; + private _delegate: LLMDelegate; + + constructor(config: FullConfig, client: Client) { + this.config = config; + this._client = client; + if (process.env.OPENAI_API_KEY) + this._delegate = new OpenAIDelegate(); + else if (process.env.ANTHROPIC_API_KEY) + this._delegate = new ClaudeDelegate(); + else + throw new Error('No LLM API key found. Please set OPENAI_API_KEY or ANTHROPIC_API_KEY environment variable.'); + } + + static async create(config: FullConfig) { + const client = new Client({ name: 'Playwright Proxy', version: '1.0.0' }); + const browserContextFactory = contextFactory(config.browser); + const server = mcpServer.createServer(new BrowserServerBackend(config, browserContextFactory)); + await client.connect(new InProcessTransport(server)); + await client.ping(); + return new Context(config, client); + } + + async runTask(task: string, oneShot: boolean = false): Promise { + const result = await runTask(this._delegate, this._client!, task, oneShot); + return { + content: [{ type: 'text', text: result }], + }; + } + + async close() { + await BrowserContext.disposeAll(); + } +} diff --git a/src/loopTools/main.ts b/src/loopTools/main.ts new file mode 100644 index 0000000..ded5b88 --- /dev/null +++ b/src/loopTools/main.ts @@ -0,0 +1,63 @@ +/** + * Copyright (c) Microsoft Corporation. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import dotenv from 'dotenv'; + +import * as mcpServer from '../mcp/server.js'; +import * as mcpTransport from '../mcp/transport.js'; +import { packageJSON } from '../package.js'; +import { Context } from './context.js'; +import { perform } from './perform.js'; +import { snapshot } from './snapshot.js'; + +import type { FullConfig } from '../config.js'; +import type { ServerBackend } from '../mcp/server.js'; +import type { Tool } from './tool.js'; + +export async function runLoopTools(config: FullConfig) { + dotenv.config(); + const serverBackendFactory = () => new LoopToolsServerBackend(config); + await mcpTransport.start(serverBackendFactory, config.server); +} + +class LoopToolsServerBackend implements ServerBackend { + readonly name = 'Playwright'; + readonly version = packageJSON.version; + private _config: FullConfig; + private _context: Context | undefined; + private _tools: Tool[] = [perform, snapshot]; + + constructor(config: FullConfig) { + this._config = config; + } + + async initialize() { + this._context = await Context.create(this._config); + } + + tools(): mcpServer.ToolSchema[] { + return this._tools.map(tool => tool.schema); + } + + async callTool(schema: mcpServer.ToolSchema, parsedArguments: any): Promise { + const tool = this._tools.find(tool => tool.schema.name === schema.name)!; + return await tool.handle(this._context!, parsedArguments); + } + + serverClosed() { + void this._context!.close(); + } +} diff --git a/src/loopTools/perform.ts b/src/loopTools/perform.ts new file mode 100644 index 0000000..29df7b4 --- /dev/null +++ b/src/loopTools/perform.ts @@ -0,0 +1,36 @@ +/** + * Copyright (c) Microsoft Corporation. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import { z } from 'zod'; +import { defineTool } from './tool.js'; + +const performSchema = z.object({ + task: z.string().describe('The task to perform with the browser'), +}); + +export const perform = defineTool({ + schema: { + name: 'browser_perform', + title: 'Perform a task with the browser', + description: 'Perform a task with the browser. It can click, type, export, capture screenshot, drag, hover, select options, etc.', + inputSchema: performSchema, + type: 'destructive', + }, + + handle: async (context, params) => { + return await context.runTask(params.task); + }, +}); diff --git a/src/loopTools/snapshot.ts b/src/loopTools/snapshot.ts new file mode 100644 index 0000000..1b0f227 --- /dev/null +++ b/src/loopTools/snapshot.ts @@ -0,0 +1,32 @@ +/** + * Copyright (c) Microsoft Corporation. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import { z } from 'zod'; +import { defineTool } from './tool.js'; + +export const snapshot = defineTool({ + schema: { + name: 'browser_snapshot', + title: 'Take a snapshot of the browser', + description: 'Take a snapshot of the browser to read what is on the page.', + inputSchema: z.object({}), + type: 'readOnly', + }, + + handle: async (context, params) => { + return await context.runTask('Capture browser snapshot', true); + }, +}); diff --git a/src/loopTools/tool.ts b/src/loopTools/tool.ts new file mode 100644 index 0000000..5399b08 --- /dev/null +++ b/src/loopTools/tool.ts @@ -0,0 +1,29 @@ +/** + * Copyright (c) Microsoft Corporation. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import type { z } from 'zod'; +import type * as mcpServer from '../mcp/server.js'; +import type { Context } from './context.js'; + + +export type Tool = { + schema: mcpServer.ToolSchema; + handle: (context: Context, params: z.output) => Promise; +}; + +export function defineTool(tool: Tool): Tool { + return tool; +} diff --git a/src/program.ts b/src/program.ts index e9f4bc8..508e977 100644 --- a/src/program.ts +++ b/src/program.ts @@ -25,6 +25,7 @@ import { runWithExtension } from './extension/main.js'; import { BrowserServerBackend } from './browserServerBackend.js'; import { Context } from './context.js'; import { contextFactory } from './browserContextFactory.js'; +import { runLoopTools } from './loopTools/main.js'; program .version('Version ' + packageJSON.version) @@ -55,6 +56,7 @@ program .option('--user-data-dir ', 'path to the user data directory. If not specified, a temporary directory will be created.') .option('--viewport-size ', 'specify browser viewport size in pixels, for example "1280, 720"') .addOption(new Option('--extension', 'Connect to a running browser instance (Edge/Chrome only). Requires the "Playwright MCP Bridge" browser extension to be installed.').hideHelp()) + .addOption(new Option('--loop-tools', 'Run loop tools').hideHelp()) .addOption(new Option('--vision', 'Legacy option, use --caps=vision instead').hideHelp()) .action(async options => { const abortController = setupExitWatchdog(); @@ -70,6 +72,10 @@ program await runWithExtension(config, abortController); return; } + if (options.loopTools) { + await runLoopTools(config); + return; + } const browserContextFactory = contextFactory(config.browser); const serverBackendFactory = () => new BrowserServerBackend(config, browserContextFactory); diff --git a/src/tools/tool.ts b/src/tools/tool.ts index 8f2a738..aa0628b 100644 --- a/src/tools/tool.ts +++ b/src/tools/tool.ts @@ -20,16 +20,7 @@ import type * as playwright from 'playwright'; import type { ToolCapability } from '../../config.js'; import type { Tab } from '../tab.js'; import type { Response } from '../response.js'; - -export type ToolSchema = { - name: string; - title: string; - description: string; - inputSchema: Input; - type: 'readOnly' | 'destructive'; -}; - -type InputType = z.Schema; +import type { ToolSchema } from '../mcp/server.js'; export type FileUploadModalState = { type: 'fileChooser'; @@ -45,44 +36,24 @@ export type DialogModalState = { export type ModalState = FileUploadModalState | DialogModalState; -export type SnapshotContent = { - type: 'snapshot'; - snapshot: string; -}; - -export type TextContent = { - type: 'text'; - text: string; -}; - -export type ImageContent = { - type: 'image'; - image: string; -}; - -export type CodeContent = { - type: 'code'; - code: string[]; -}; - -export type Tool = { +export type Tool = { capability: ToolCapability; schema: ToolSchema; handle: (context: Context, params: z.output, response: Response) => Promise; }; -export function defineTool(tool: Tool): Tool { +export function defineTool(tool: Tool): Tool { return tool; } -export type TabTool = { +export type TabTool = { capability: ToolCapability; schema: ToolSchema; clearsModalState?: ModalState['type']; handle: (tab: Tab, params: z.output, response: Response) => Promise; }; -export function defineTabTool(tool: TabTool): Tool { +export function defineTabTool(tool: TabTool): Tool { return { ...tool, handle: async (context, params, response) => {