chore: extract loop tools into a separate folder (#755)

2025-07-26 08:32:26 +08:00 · 2025-07-24 16:22:03 -07:00 · 2025-07-24 16:22:03 -07:00 · ecfa10448b
commit ecfa10448b
parent e153ac3b7c
11 changed files with 296 additions and 155 deletions
--- a/src/loop/loop.ts
+++ b/src/loop/loop.ts
@ -41,15 +41,16 @@ export type LLMConversation = {
 };

+/**
+ * Provider-specific adapter that `runTask` uses to drive an LLM conversation.
+ */
 export interface LLMDelegate {
-  createConversation(task: string, tools: Tool[]): LLMConversation;
+  // Builds the initial conversation from the task text and the available tools.
+  // `oneShot` is forwarded from `runTask`; the delegates in this change only
+  // register the extra "done" tool when it is false.
+  createConversation(task: string, tools: Tool[], oneShot: boolean): LLMConversation;
+  // Sends the conversation to the model and resolves to the tool calls it requested.
  makeApiCall(conversation: LLMConversation): Promise<LLMToolCall[]>;
+  // Records tool results on the conversation; `runTask` calls this only when not in one-shot mode.
  addToolResults(conversation: LLMConversation, results: Array<{ toolCallId: string; content: string; isError?: boolean }>): void;
+  // NOTE(review): presumably returns the final result when the call is the "done" tool and null otherwise - confirm against the implementations.
  checkDoneToolCall(toolCall: LLMToolCall): string | null;
 }

-export async function runTask(delegate: LLMDelegate, client: Client, task: string): Promise<string> {
+export async function runTask(delegate: LLMDelegate, client: Client, task: string, oneShot: boolean = false): Promise<string> {
  const { tools } = await client.listTools();
-  const conversation = delegate.createConversation(task, tools);
+  const taskContent = oneShot ? `Perform following task: ${task}.` : `Perform following task: ${task}. Once the task is complete, call the "done" tool.`;
+  const conversation = delegate.createConversation(taskContent, tools, oneShot);

  for (let iteration = 0; iteration < 5; ++iteration) {
    debug('history')('Making API call for iteration', iteration);
@ -99,8 +100,10 @@ export async function runTask(delegate: LLMDelegate, client: Client, task: strin
      }
    }

-    // Add tool results to conversation
-    delegate.addToolResults(conversation, toolResults);
+    if (oneShot)
+      return toolResults.map(result => result.content).join('\n');
+    else
+      delegate.addToolResults(conversation, toolResults);
  }

  throw new Error('Failed to perform step, max attempts reached');
--- a/src/loop/loopClaude.ts
+++ b/src/loop/loopClaude.ts
@ -14,38 +14,48 @@
 * limitations under the License.
 */

-import Anthropic from '@anthropic-ai/sdk';
+import type Anthropic from '@anthropic-ai/sdk';
 import type { LLMDelegate, LLMConversation, LLMToolCall, LLMTool } from './loop.js';
 import type { Tool } from '@modelcontextprotocol/sdk/types.js';

 const model = 'claude-sonnet-4-20250514';

 export class ClaudeDelegate implements LLMDelegate {
-  private anthropic = new Anthropic();
+  private _anthropic: Anthropic | undefined;

-  createConversation(task: string, tools: Tool[]): LLMConversation {
+  /**
+   * Returns the Anthropic client, creating and caching it on first use.
+   * The SDK module is loaded through a dynamic import, so it is only
+   * resolved once a client is actually requested.
+   */
+  async anthropic(): Promise<Anthropic> {
+    if (this._anthropic)
+      return this._anthropic;
+    const { Anthropic: AnthropicClient } = await import('@anthropic-ai/sdk');
+    const client = new AnthropicClient();
+    this._anthropic = client;
+    return client;
+  }
+
+  createConversation(task: string, tools: Tool[], oneShot: boolean): LLMConversation {
    const llmTools: LLMTool[] = tools.map(tool => ({
      name: tool.name,
      description: tool.description || '',
      inputSchema: tool.inputSchema,
    }));

-    // Add the "done" tool
-    llmTools.push({
-      name: 'done',
-      description: 'Call this tool when the task is complete.',
-      inputSchema: {
-        type: 'object',
-        properties: {
-          result: { type: 'string', description: 'The result of the task.' },
+    if (!oneShot) {
+      llmTools.push({
+        name: 'done',
+        description: 'Call this tool when the task is complete.',
+        inputSchema: {
+          type: 'object',
+          properties: {
+            result: { type: 'string', description: 'The result of the task.' },
+          },
+          required: ['result'],
        },
-      },
-    });
+      });
+    }

    return {
      messages: [{
        role: 'user',
-        content: `Perform following task: ${task}. Once the task is complete, call the "done" tool.`
+        content: task
      }],
      tools: llmTools,
    };
@ -119,7 +129,8 @@ export class ClaudeDelegate implements LLMDelegate {
      input_schema: tool.inputSchema,
    }));

-    const response = await this.anthropic.messages.create({
+    const anthropic = await this.anthropic();
+    const response = await anthropic.messages.create({
      model,
      max_tokens: 10000,
      messages: claudeMessages,
--- a/src/loop/loopOpenAI.ts
+++ b/src/loop/loopOpenAI.ts
@ -14,39 +14,48 @@
 * limitations under the License.
 */

-import OpenAI from 'openai';
+import type OpenAI from 'openai';
 import type { LLMDelegate, LLMConversation, LLMToolCall, LLMTool } from './loop.js';
 import type { Tool } from '@modelcontextprotocol/sdk/types.js';

 const model = 'gpt-4.1';

 export class OpenAIDelegate implements LLMDelegate {
-  private openai = new OpenAI();
+  private _openai: OpenAI | undefined;

-  createConversation(task: string, tools: Tool[]): LLMConversation {
+  /**
+   * Returns the OpenAI client, creating and caching it on first use.
+   * The SDK module is loaded through a dynamic import, so it is only
+   * resolved once a client is actually requested.
+   */
+  async openai(): Promise<OpenAI> {
+    if (this._openai)
+      return this._openai;
+    const { OpenAI: OpenAIClient } = await import('openai');
+    const client = new OpenAIClient();
+    this._openai = client;
+    return client;
+  }
+
+  createConversation(task: string, tools: Tool[], oneShot: boolean): LLMConversation {
    const genericTools: LLMTool[] = tools.map(tool => ({
      name: tool.name,
      description: tool.description || '',
      inputSchema: tool.inputSchema,
    }));

-    // Add the "done" tool
-    genericTools.push({
-      name: 'done',
-      description: 'Call this tool when the task is complete.',
-      inputSchema: {
-        type: 'object',
-        properties: {
-          result: { type: 'string', description: 'The result of the task.' },
+    if (!oneShot) {
+      genericTools.push({
+        name: 'done',
+        description: 'Call this tool when the task is complete.',
+        inputSchema: {
+          type: 'object',
+          properties: {
+            result: { type: 'string', description: 'The result of the task.' },
+          },
+          required: ['result'],
        },
-        required: ['result'],
-      },
-    });
+      });
+    }

    return {
      messages: [{
        role: 'user',
-        content: `Peform following task: ${task}. Once the task is complete, call the "done" tool.`
+        content: task
      }],
      tools: genericTools,
    };
@ -108,7 +117,8 @@ export class OpenAIDelegate implements LLMDelegate {
      },
    }));

-    const response = await this.openai.chat.completions.create({
+    const openai = await this.openai();
+    const response = await openai.chat.completions.create({
      model,
      messages: openaiMessages,
      tools: openaiTools,
--- a/src/loop/onetool.ts
+++ b/src/loop/onetool.ts
@ -1,85 +0,0 @@
-/**
- * Copyright (c) Microsoft Corporation.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-import { Client } from '@modelcontextprotocol/sdk/client/index.js';
-import dotenv from 'dotenv';
-import { z } from 'zod';
-
-import { contextFactory } from '../browserContextFactory.js';
-import { BrowserServerBackend } from '../browserServerBackend.js';
-import { Context } from '../context.js';
-import { logUnhandledError } from '../log.js';
-import { InProcessTransport } from '../mcp/inProcessTransport.js';
-import * as mcpServer from '../mcp/server.js';
-import * as mcpTransport from '../mcp/transport.js';
-import { packageJSON } from '../package.js';
-import { runTask } from './loop.js';
-import { OpenAIDelegate } from './loopOpenAI.js';
-
-import type { FullConfig } from '../config.js';
-import type { ServerBackend } from '../mcp/server.js';
-
-const oneToolSchema: mcpServer.ToolSchema<any> = {
-  name: 'browser',
-  title: 'Perform a task with the browser',
-  description: 'Perform a task with the browser. It can click, type, export, capture screenshot, drag, hover, select options, etc.',
-  inputSchema: z.object({
-    task: z.string().describe('The task to perform with the browser'),
-  }),
-  type: 'readOnly',
-};
-
-export async function runOneTool(config: FullConfig) {
-  dotenv.config();
-  const serverBackendFactory = () => new OneToolServerBackend(config);
-  await mcpTransport.start(serverBackendFactory, config.server);
-}
-
-class OneToolServerBackend implements ServerBackend {
-  readonly name = 'Playwright';
-  readonly version = packageJSON.version;
-  private _innerClient: Client | undefined;
-  private _config: FullConfig;
-
-  constructor(config: FullConfig) {
-    this._config = config;
-  }
-
-  async initialize() {
-    const client = new Client({ name: 'Playwright Proxy', version: '1.0.0' });
-    const browserContextFactory = contextFactory(this._config.browser);
-    const server = mcpServer.createServer(new BrowserServerBackend(this._config, browserContextFactory));
-    await client.connect(new InProcessTransport(server));
-    await client.ping();
-    this._innerClient = client;
-  }
-
-  tools(): mcpServer.ToolSchema<any>[] {
-    return [oneToolSchema];
-  }
-
-  async callTool(schema: mcpServer.ToolSchema<any>, parsedArguments: any): Promise<mcpServer.ToolResponse> {
-    const delegate = new OpenAIDelegate();
-    const result = await runTask(delegate, this._innerClient!, parsedArguments.task as string);
-    return {
-      content: [{ type: 'text', text: result }],
-    };
-  }
-
-  serverClosed() {
-    void Context.disposeAll().catch(logUnhandledError);
-  }
-}
--- a/src/loopTools/context.ts
+++ b/src/loopTools/context.ts
@ -0,0 +1,65 @@
+/**
+ * Copyright (c) Microsoft Corporation.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import { Client } from '@modelcontextprotocol/sdk/client/index.js';
+import { contextFactory } from '../browserContextFactory.js';
+import { BrowserServerBackend } from '../browserServerBackend.js';
+import { Context as BrowserContext } from '../context.js';
+import { runTask } from '../loop/loop.js';
+import { OpenAIDelegate } from '../loop/loopOpenAI.js';
+import { ClaudeDelegate } from '../loop/loopClaude.js';
+import { InProcessTransport } from '../mcp/inProcessTransport.js';
+import * as mcpServer from '../mcp/server.js';
+
+import type { LLMDelegate } from '../loop/loop.js';
+import type { FullConfig } from '../config.js';
+
+/**
+ * Shared state for the loop tools: an MCP client connected in-process to a
+ * browser server backend, plus the LLM delegate that drives that client.
+ */
+export class Context {
+  readonly config: FullConfig;
+  private _client: Client;
+  private _delegate: LLMDelegate;
+
+  /**
+   * @param config Resolved server configuration.
+   * @param client MCP client used to list and call the browser tools.
+   * @throws Error when neither OPENAI_API_KEY nor ANTHROPIC_API_KEY is set.
+   */
+  constructor(config: FullConfig, client: Client) {
+    this.config = config;
+    this._client = client;
+    // OpenAI takes precedence when both keys are present.
+    if (process.env.OPENAI_API_KEY)
+      this._delegate = new OpenAIDelegate();
+    else if (process.env.ANTHROPIC_API_KEY)
+      this._delegate = new ClaudeDelegate();
+    else
+      throw new Error('No LLM API key found. Please set OPENAI_API_KEY or ANTHROPIC_API_KEY environment variable.');
+  }
+
+  /**
+   * Creates a browser server backend, connects a client to it over an
+   * in-process transport, verifies the connection with a ping and wraps the
+   * result in a Context.
+   *
+   * @throws Rethrows any connection or construction failure after closing the client.
+   */
+  static async create(config: FullConfig): Promise<Context> {
+    const client = new Client({ name: 'Playwright Proxy', version: '1.0.0' });
+    try {
+      const browserContextFactory = contextFactory(config.browser);
+      const server = mcpServer.createServer(new BrowserServerBackend(config, browserContextFactory));
+      await client.connect(new InProcessTransport(server));
+      await client.ping();
+      return new Context(config, client);
+    } catch (error) {
+      // Do not leave a connected client behind when setup fails (for example
+      // when no API key is configured). Cleanup is best-effort: the original
+      // failure is the one worth reporting.
+      await client.close().catch(() => {});
+      throw error;
+    }
+  }
+
+  /**
+   * Runs `task` through the LLM loop and wraps the textual result in a tool response.
+   *
+   * @param task Natural-language description of what to do.
+   * @param oneShot Forwarded to the loop; in one-shot mode the loop returns the
+   *   collected tool results instead of feeding them back to the model.
+   */
+  async runTask(task: string, oneShot: boolean = false): Promise<mcpServer.ToolResponse> {
+    const result = await runTask(this._delegate, this._client, task, oneShot);
+    return {
+      content: [{ type: 'text', text: result }],
+    };
+  }
+
+  /** Disposes all browser contexts. */
+  async close() {
+    await BrowserContext.disposeAll();
+  }
+}
--- a/src/loopTools/main.ts
+++ b/src/loopTools/main.ts
@ -0,0 +1,63 @@
+/**
+ * Copyright (c) Microsoft Corporation.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import dotenv from 'dotenv';
+
+import { logUnhandledError } from '../log.js';
+import * as mcpServer from '../mcp/server.js';
+import * as mcpTransport from '../mcp/transport.js';
+import { packageJSON } from '../package.js';
+import { Context } from './context.js';
+import { perform } from './perform.js';
+import { snapshot } from './snapshot.js';
+
+import type { FullConfig } from '../config.js';
+import type { ServerBackend } from '../mcp/server.js';
+import type { Tool } from './tool.js';
+
+/**
+ * Entry point for the loop-tools mode: loads environment variables via dotenv
+ * and starts the MCP transport with a factory that creates one
+ * LoopToolsServerBackend per invocation.
+ */
+export async function runLoopTools(config: FullConfig) {
+  dotenv.config();
+  await mcpTransport.start(() => new LoopToolsServerBackend(config), config.server);
+}
+
+/**
+ * MCP server backend that exposes the loop tools (`perform`, `snapshot`).
+ * Each tool call is delegated to the tool's handler together with the shared
+ * loop-tools Context created in `initialize()`.
+ */
+class LoopToolsServerBackend implements ServerBackend {
+  readonly name = 'Playwright';
+  readonly version = packageJSON.version;
+  private _config: FullConfig;
+  private _context: Context | undefined;
+  private _tools: Tool<any>[] = [perform, snapshot];
+
+  constructor(config: FullConfig) {
+    this._config = config;
+  }
+
+  /** Creates the shared Context; must complete before tools can be called. */
+  async initialize() {
+    this._context = await Context.create(this._config);
+  }
+
+  /** Returns the schemas of all registered loop tools. */
+  tools(): mcpServer.ToolSchema<any>[] {
+    return this._tools.map(tool => tool.schema);
+  }
+
+  /**
+   * Dispatches a call to the tool whose schema name matches `schema.name`.
+   *
+   * @throws Error when the tool is not registered or the backend was not initialized.
+   */
+  async callTool(schema: mcpServer.ToolSchema<any>, parsedArguments: any): Promise<mcpServer.ToolResponse> {
+    const tool = this._tools.find(tool => tool.schema.name === schema.name);
+    if (!tool)
+      throw new Error(`Unknown tool: ${schema.name}`);
+    if (!this._context)
+      throw new Error('Loop tools backend is not initialized');
+    return await tool.handle(this._context, parsedArguments);
+  }
+
+  /**
+   * Releases the Context when the server goes away. The context may be missing
+   * if initialization never completed, and close failures are logged rather
+   * than left as unhandled rejections.
+   */
+  serverClosed() {
+    void this._context?.close().catch(logUnhandledError);
+  }
+}
--- a/src/loopTools/perform.ts
+++ b/src/loopTools/perform.ts
@ -0,0 +1,36 @@
+/**
+ * Copyright (c) Microsoft Corporation.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import { z } from 'zod';
+import { defineTool } from './tool.js';
+
+/**
+ * Loop tool that hands a free-form task description to the LLM loop.
+ * It runs in the default (non one-shot) mode of `Context.runTask`.
+ */
+export const perform = defineTool({
+  schema: {
+    name: 'browser_perform',
+    title: 'Perform a task with the browser',
+    description: 'Perform a task with the browser. It can click, type, export, capture screenshot, drag, hover, select options, etc.',
+    inputSchema: z.object({
+      task: z.string().describe('The task to perform with the browser'),
+    }),
+    type: 'destructive',
+  },
+
+  handle: async (context, { task }) => context.runTask(task),
+});
--- a/src/loopTools/snapshot.ts
+++ b/src/loopTools/snapshot.ts
@ -0,0 +1,32 @@
+/**
+ * Copyright (c) Microsoft Corporation.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import { z } from 'zod';
+import { defineTool } from './tool.js';
+
+/**
+ * Loop tool that asks the LLM loop to capture a browser snapshot.
+ * It takes no input and runs the fixed task in one-shot mode.
+ */
+export const snapshot = defineTool({
+  schema: {
+    name: 'browser_snapshot',
+    title: 'Take a snapshot of the browser',
+    description: 'Take a snapshot of the browser to read what is on the page.',
+    inputSchema: z.object({}),
+    type: 'readOnly',
+  },
+
+  // The (empty) parsed input is not needed, so the handler only takes the context.
+  handle: async context => context.runTask('Capture browser snapshot', true),
+});
--- a/src/loopTools/tool.ts
+++ b/src/loopTools/tool.ts
@ -0,0 +1,29 @@
+/**
+ * Copyright (c) Microsoft Corporation.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import type { z } from 'zod';
+import type * as mcpServer from '../mcp/server.js';
+import type { Context } from './context.js';
+
+
+/**
+ * A loop tool: an MCP tool schema paired with the handler that executes it.
+ * The handler receives the loop-tools Context and the value produced by the
+ * schema's input type, and resolves to the tool response.
+ */
+export type Tool<Input extends z.Schema = z.Schema> = {
+  schema: mcpServer.ToolSchema<Input>;
+  handle: (context: Context, params: z.output<Input>) => Promise<mcpServer.ToolResponse>;
+};
+
+/**
+ * Identity helper: returns `tool` unchanged. Calling it lets TypeScript infer
+ * `Input` from the tool's schema so the handler's `params` are typed accordingly.
+ */
+export function defineTool<Input extends z.Schema>(tool: Tool<Input>): Tool<Input> {
+  return tool;
+}
--- a/src/program.ts
+++ b/src/program.ts
@ -25,6 +25,7 @@ import { runWithExtension } from './extension/main.js';
 import { BrowserServerBackend } from './browserServerBackend.js';
 import { Context } from './context.js';
 import { contextFactory } from './browserContextFactory.js';
+import { runLoopTools } from './loopTools/main.js';

 program
    .version('Version ' + packageJSON.version)
@ -55,6 +56,7 @@ program
    .option('--user-data-dir <path>', 'path to the user data directory. If not specified, a temporary directory will be created.')
    .option('--viewport-size <size>', 'specify browser viewport size in pixels, for example "1280, 720"')
    .addOption(new Option('--extension', 'Connect to a running browser instance (Edge/Chrome only). Requires the "Playwright MCP Bridge" browser extension to be installed.').hideHelp())
+    .addOption(new Option('--loop-tools', 'Run loop tools').hideHelp())
    .addOption(new Option('--vision', 'Legacy option, use --caps=vision instead').hideHelp())
    .action(async options => {
      const abortController = setupExitWatchdog();
@ -70,6 +72,10 @@ program
        await runWithExtension(config, abortController);
        return;
      }
+      if (options.loopTools) {
+        await runLoopTools(config);
+        return;
+      }

      const browserContextFactory = contextFactory(config.browser);
      const serverBackendFactory = () => new BrowserServerBackend(config, browserContextFactory);
--- a/src/tools/tool.ts
+++ b/src/tools/tool.ts
@ -20,16 +20,7 @@ import type * as playwright from 'playwright';
 import type { ToolCapability } from '../../config.js';
 import type { Tab } from '../tab.js';
 import type { Response } from '../response.js';
-
-export type ToolSchema<Input extends InputType> = {
-  name: string;
-  title: string;
-  description: string;
-  inputSchema: Input;
-  type: 'readOnly' | 'destructive';
-};
-
-type InputType = z.Schema;
+import type { ToolSchema } from '../mcp/server.js';

 export type FileUploadModalState = {
  type: 'fileChooser';
@ -45,44 +36,24 @@ export type DialogModalState = {

 export type ModalState = FileUploadModalState | DialogModalState;

-export type SnapshotContent = {
-  type: 'snapshot';
-  snapshot: string;
-};
-
-export type TextContent = {
-  type: 'text';
-  text: string;
-};
-
-export type ImageContent = {
-  type: 'image';
-  image: string;
-};
-
-export type CodeContent = {
-  type: 'code';
-  code: string[];
-};
-
-export type Tool<Input extends InputType = InputType> = {
+/**
+ * A browser tool: its capability, its schema and the handler that runs it.
+ * The handler receives the Context, the value produced by the schema's input
+ * type and the Response object, and resolves without a value.
+ */
+export type Tool<Input extends z.Schema = z.Schema> = {
  capability: ToolCapability;
  schema: ToolSchema<Input>;
  handle: (context: Context, params: z.output<Input>, response: Response) => Promise<void>;
 };

-export function defineTool<Input extends InputType>(tool: Tool<Input>): Tool<Input> {
+/**
+ * Identity helper: returns `tool` unchanged while letting TypeScript infer
+ * `Input` from the tool's schema.
+ */
+export function defineTool<Input extends z.Schema>(tool: Tool<Input>): Tool<Input> {
  return tool;
 }

-export type TabTool<Input extends InputType = InputType> = {
+/**
+ * Variant of Tool whose handler receives a Tab instead of the Context.
+ * `clearsModalState` optionally names a modal state type associated with the tool.
+ */
+export type TabTool<Input extends z.Schema = z.Schema> = {
  capability: ToolCapability;
  schema: ToolSchema<Input>;
  clearsModalState?: ModalState['type'];
  handle: (tab: Tab, params: z.output<Input>, response: Response) => Promise<void>;
 };

-export function defineTabTool<Input extends InputType>(tool: TabTool<Input>): Tool<Input> {
+export function defineTabTool<Input extends z.Schema>(tool: TabTool<Input>): Tool<Input> {
  return {
    ...tool,
    handle: async (context, params, response) => {