diff --git a/.gitignore b/.gitignore index 1089cef..50c114b 100644 --- a/.gitignore +++ b/.gitignore @@ -3,6 +3,7 @@ node_modules/ test-results/ playwright-report/ .vscode/mcp.json - .idea .DS_Store +.env +sessions/ diff --git a/package-lock.json b/package-lock.json index 68b3aad..78dd9a7 100644 --- a/package-lock.json +++ b/package-lock.json @@ -22,6 +22,7 @@ "mcp-server-playwright": "cli.js" }, "devDependencies": { + "@anthropic-ai/sdk": "^0.57.0", "@eslint/eslintrc": "^3.2.0", "@eslint/js": "^9.19.0", "@playwright/test": "1.55.0-alpha-1752701791000", @@ -33,15 +34,27 @@ "@typescript-eslint/eslint-plugin": "^8.26.1", "@typescript-eslint/parser": "^8.26.1", "@typescript-eslint/utils": "^8.26.1", + "dotenv": "^17.2.0", "eslint": "^9.19.0", "eslint-plugin-import": "^2.31.0", "eslint-plugin-notice": "^1.0.0", + "openai": "^5.10.2", "typescript": "^5.8.2" }, "engines": { "node": ">=18" } }, + "node_modules/@anthropic-ai/sdk": { + "version": "0.57.0", + "resolved": "https://registry.npmjs.org/@anthropic-ai/sdk/-/sdk-0.57.0.tgz", + "integrity": "sha512-z5LMy0MWu0+w2hflUgj4RlJr1R+0BxKXL7ldXTO8FasU8fu599STghO+QKwId2dAD0d464aHtU+ChWuRHw4FNw==", + "dev": true, + "license": "MIT", + "bin": { + "anthropic-ai-sdk": "bin/cli" + } + }, "node_modules/@eslint-community/eslint-utils": { "version": "4.5.1", "resolved": "https://registry.npmjs.org/@eslint-community/eslint-utils/-/eslint-utils-4.5.1.tgz", @@ -72,9 +85,9 @@ } }, "node_modules/@eslint/config-array": { - "version": "0.19.2", - "resolved": "https://registry.npmjs.org/@eslint/config-array/-/config-array-0.19.2.tgz", - "integrity": "sha512-GNKqxfHG2ySmJOBSHg7LxeUx4xpuCoFjacmlCoYWEbaPXLwvfIjixRI12xCQZeULksQb23uiA8F40w5TojpV7w==", + "version": "0.21.0", + "resolved": "https://registry.npmjs.org/@eslint/config-array/-/config-array-0.21.0.tgz", + "integrity": "sha512-ENIdc4iLu0d93HeYirvKmrzshzofPw6VkZRKQGe9Nv46ZnWUzcF1xV01dcvEg/1wXUR61OmmlSfyeyO7EvjLxQ==", "dev": true, "license": "Apache-2.0", "dependencies": { @@ -87,9 +100,9 
@@ } }, "node_modules/@eslint/config-helpers": { - "version": "0.1.0", - "resolved": "https://registry.npmjs.org/@eslint/config-helpers/-/config-helpers-0.1.0.tgz", - "integrity": "sha512-kLrdPDJE1ckPo94kmPPf9Hfd0DU0Jw6oKYrhe+pwSC0iTUInmTa+w6fw8sGgcfkFJGNdWOUeOaDM4quW4a7OkA==", + "version": "0.3.0", + "resolved": "https://registry.npmjs.org/@eslint/config-helpers/-/config-helpers-0.3.0.tgz", + "integrity": "sha512-ViuymvFmcJi04qdZeDc2whTHryouGcDlaxPqarTD0ZE10ISpxGUVZGZDx4w01upyIynL3iu6IXH2bS1NhclQMw==", "dev": true, "license": "Apache-2.0", "engines": { @@ -97,9 +110,9 @@ } }, "node_modules/@eslint/core": { - "version": "0.12.0", - "resolved": "https://registry.npmjs.org/@eslint/core/-/core-0.12.0.tgz", - "integrity": "sha512-cmrR6pytBuSMTaBweKoGMwu3EiHiEC+DoyupPmlZ0HxBJBtIxwe+j/E4XPIKNx+Q74c8lXKPwYawBf5glsTkHg==", + "version": "0.15.1", + "resolved": "https://registry.npmjs.org/@eslint/core/-/core-0.15.1.tgz", + "integrity": "sha512-bkOp+iumZCCbt1K1CmWf0R9pM5yKpDv+ZXtvSyQpudrI9kuFLp+bM2WOPXImuD/ceQuaa8f5pj93Y7zyECIGNA==", "dev": true, "license": "Apache-2.0", "dependencies": { @@ -110,9 +123,9 @@ } }, "node_modules/@eslint/eslintrc": { - "version": "3.3.0", - "resolved": "https://registry.npmjs.org/@eslint/eslintrc/-/eslintrc-3.3.0.tgz", - "integrity": "sha512-yaVPAiNAalnCZedKLdR21GOGILMLKPyqSLWaAjQFvYA2i/ciDi8ArYVr69Anohb6cH2Ukhqti4aFnYyPm8wdwQ==", + "version": "3.3.1", + "resolved": "https://registry.npmjs.org/@eslint/eslintrc/-/eslintrc-3.3.1.tgz", + "integrity": "sha512-gtF186CXhIl1p4pJNGZw8Yc6RlshoePRvE0X91oPGb3vZ8pM3qOS9W9NGPat9LziaBV7XrJWGylNQXkGcnM3IQ==", "dev": true, "license": "MIT", "dependencies": { @@ -134,13 +147,16 @@ } }, "node_modules/@eslint/js": { - "version": "9.22.0", - "resolved": "https://registry.npmjs.org/@eslint/js/-/js-9.22.0.tgz", - "integrity": "sha512-vLFajx9o8d1/oL2ZkpMYbkLv8nDB6yaIwFNt7nI4+I80U/z03SxmfOMsLbvWr3p7C+Wnoh//aOu2pQW8cS0HCQ==", + "version": "9.31.0", + "resolved": "https://registry.npmjs.org/@eslint/js/-/js-9.31.0.tgz", + 
"integrity": "sha512-LOm5OVt7D4qiKCqoiPbA7LWmI+tbw1VbTUowBcUMgQSuM6poJufkFkYDcQpo5KfgD39TnNySV26QjOh7VFpSyw==", "dev": true, "license": "MIT", "engines": { "node": "^18.18.0 || ^20.9.0 || >=21.1.0" + }, + "funding": { + "url": "https://eslint.org/donate" } }, "node_modules/@eslint/object-schema": { @@ -154,13 +170,13 @@ } }, "node_modules/@eslint/plugin-kit": { - "version": "0.2.7", - "resolved": "https://registry.npmjs.org/@eslint/plugin-kit/-/plugin-kit-0.2.7.tgz", - "integrity": "sha512-JubJ5B2pJ4k4yGxaNLdbjrnk9d/iDz6/q8wOilpIowd6PJPgaxCuHBnBszq7Ce2TyMrywm5r4PnKm6V3iiZF+g==", + "version": "0.3.4", + "resolved": "https://registry.npmjs.org/@eslint/plugin-kit/-/plugin-kit-0.3.4.tgz", + "integrity": "sha512-Ul5l+lHEcw3L5+k8POx6r74mxEYKG5kOb6Xpy2gCRW6zweT6TEhAf8vhxGgjhqrd/VO/Dirhsb+1hNpD1ue9hw==", "dev": true, "license": "Apache-2.0", "dependencies": { - "@eslint/core": "^0.12.0", + "@eslint/core": "^0.15.1", "levn": "^0.4.1" }, "engines": { @@ -595,9 +611,9 @@ } }, "node_modules/@typescript-eslint/typescript-estree/node_modules/brace-expansion": { - "version": "2.0.1", - "resolved": "https://registry.npmjs.org/brace-expansion/-/brace-expansion-2.0.1.tgz", - "integrity": "sha512-XnAIvQ8eM+kC6aULx6wuQiwVsnzsi9d3WxzV3FpWTGA19F621kwdbsAcFKXgKUHZWsy+mY6iL1sHTxWEFCytDA==", + "version": "2.0.2", + "resolved": "https://registry.npmjs.org/brace-expansion/-/brace-expansion-2.0.2.tgz", + "integrity": "sha512-Jt0vHyM+jmUBqojB7E1NIYadt0vI0Qxjxd2TErW94wDz+E2LAm5vKMXXwg6ZZBTHPuUlDgQHKXvjGBdfcF1ZDQ==", "dev": true, "license": "MIT", "dependencies": { @@ -689,9 +705,9 @@ } }, "node_modules/acorn": { - "version": "8.14.1", - "resolved": "https://registry.npmjs.org/acorn/-/acorn-8.14.1.tgz", - "integrity": "sha512-OvQ/2pUDKmgfCg++xsTX1wGxfTaszcHVcTctW4UJB4hibJx2HXxxO5UmVgyjMa+ZDsiaf5wWLXYpRWMmBI0QHg==", + "version": "8.15.0", + "resolved": "https://registry.npmjs.org/acorn/-/acorn-8.15.0.tgz", + "integrity": 
"sha512-NZyJarBfL7nWwIq+FDL6Zp/yHEhePMNnnJ0y3qfieCrmNvYct8uvtiV41UvlSe6apAfk0fY1FbWx+NwfmpvtTg==", "dev": true, "license": "MIT", "bin": { @@ -924,9 +940,9 @@ } }, "node_modules/brace-expansion": { - "version": "1.1.11", - "resolved": "https://registry.npmjs.org/brace-expansion/-/brace-expansion-1.1.11.tgz", - "integrity": "sha512-iCuPHDFgrHX7H2vEI/5xpz07zSHB00TpugqhmYtVmMO6518mCuRMoOYFldEBl0g187ufozdaHgWKcYFb61qGiA==", + "version": "1.1.12", + "resolved": "https://registry.npmjs.org/brace-expansion/-/brace-expansion-1.1.12.tgz", + "integrity": "sha512-9T9UjW3r0UW5c1Q7GTwllptXwhvYmEzFhzMfZ9H7FQWt+uZePjZPjBP/W1ZEyZ1twGWom5/56TF4lPcqjnDHcg==", "dev": true, "license": "MIT", "dependencies": { @@ -1269,6 +1285,19 @@ "node": ">=0.10.0" } }, + "node_modules/dotenv": { + "version": "17.2.0", + "resolved": "https://registry.npmjs.org/dotenv/-/dotenv-17.2.0.tgz", + "integrity": "sha512-Q4sgBT60gzd0BB0lSyYD3xM4YxrXA9y4uBDof1JNYGzOXrQdQ6yX+7XIAqoFOGQFOTK1D3Hts5OllpxMDZFONQ==", + "dev": true, + "license": "BSD-2-Clause", + "engines": { + "node": ">=12" + }, + "funding": { + "url": "https://dotenvx.com" + } + }, "node_modules/dunder-proto": { "version": "1.0.1", "resolved": "https://registry.npmjs.org/dunder-proto/-/dunder-proto-1.0.1.tgz", @@ -1461,20 +1490,20 @@ } }, "node_modules/eslint": { - "version": "9.22.0", - "resolved": "https://registry.npmjs.org/eslint/-/eslint-9.22.0.tgz", - "integrity": "sha512-9V/QURhsRN40xuHXWjV64yvrzMjcz7ZyNoF2jJFmy9j/SLk0u1OLSZgXi28MrXjymnjEGSR80WCdab3RGMDveQ==", + "version": "9.31.0", + "resolved": "https://registry.npmjs.org/eslint/-/eslint-9.31.0.tgz", + "integrity": "sha512-QldCVh/ztyKJJZLr4jXNUByx3gR+TDYZCRXEktiZoUR3PGy4qCmSbkxcIle8GEwGpb5JBZazlaJ/CxLidXdEbQ==", "dev": true, "license": "MIT", "dependencies": { "@eslint-community/eslint-utils": "^4.2.0", "@eslint-community/regexpp": "^4.12.1", - "@eslint/config-array": "^0.19.2", - "@eslint/config-helpers": "^0.1.0", - "@eslint/core": "^0.12.0", - "@eslint/eslintrc": "^3.3.0", - 
"@eslint/js": "9.22.0", - "@eslint/plugin-kit": "^0.2.7", + "@eslint/config-array": "^0.21.0", + "@eslint/config-helpers": "^0.3.0", + "@eslint/core": "^0.15.0", + "@eslint/eslintrc": "^3.3.1", + "@eslint/js": "9.31.0", + "@eslint/plugin-kit": "^0.3.1", "@humanfs/node": "^0.16.6", "@humanwhocodes/module-importer": "^1.0.1", "@humanwhocodes/retry": "^0.4.2", @@ -1485,9 +1514,9 @@ "cross-spawn": "^7.0.6", "debug": "^4.3.2", "escape-string-regexp": "^4.0.0", - "eslint-scope": "^8.3.0", - "eslint-visitor-keys": "^4.2.0", - "espree": "^10.3.0", + "eslint-scope": "^8.4.0", + "eslint-visitor-keys": "^4.2.1", + "espree": "^10.4.0", "esquery": "^1.5.0", "esutils": "^2.0.2", "fast-deep-equal": "^3.1.3", @@ -1641,9 +1670,9 @@ } }, "node_modules/eslint-scope": { - "version": "8.3.0", - "resolved": "https://registry.npmjs.org/eslint-scope/-/eslint-scope-8.3.0.tgz", - "integrity": "sha512-pUNxi75F8MJ/GdeKtVLSbYg4ZI34J6C0C7sbL4YOp2exGwen7ZsuBqKzUhXd0qMQ362yET3z+uPwKeg/0C2XCQ==", + "version": "8.4.0", + "resolved": "https://registry.npmjs.org/eslint-scope/-/eslint-scope-8.4.0.tgz", + "integrity": "sha512-sNXOfKCn74rt8RICKMvJS7XKV/Xk9kA7DyJr8mJik3S7Cwgy3qlkkmyS2uQB3jiJg6VNdZd/pDBJu0nvG2NlTg==", "dev": true, "license": "BSD-2-Clause", "dependencies": { @@ -1671,9 +1700,9 @@ } }, "node_modules/eslint/node_modules/eslint-visitor-keys": { - "version": "4.2.0", - "resolved": "https://registry.npmjs.org/eslint-visitor-keys/-/eslint-visitor-keys-4.2.0.tgz", - "integrity": "sha512-UyLnSehNt62FFhSwjZlHmeokpRK59rcz29j+F1/aDgbkbRTk7wIc9XzdoasMUbRNKDM0qQt/+BJ4BrpFeABemw==", + "version": "4.2.1", + "resolved": "https://registry.npmjs.org/eslint-visitor-keys/-/eslint-visitor-keys-4.2.1.tgz", + "integrity": "sha512-Uhdk5sfqcee/9H/rCOJikYz67o0a2Tw2hGRPOG2Y1R2dg7brRe1uG0yaNQDHu+TO/uQPF/5eCapvYSmHUjt7JQ==", "dev": true, "license": "Apache-2.0", "engines": { @@ -1684,15 +1713,15 @@ } }, "node_modules/espree": { - "version": "10.3.0", - "resolved": 
"https://registry.npmjs.org/espree/-/espree-10.3.0.tgz", - "integrity": "sha512-0QYC8b24HWY8zjRnDTL6RiHfDbAWn63qb4LMj1Z4b076A4une81+z03Kg7l7mn/48PUTqoLptSXez8oknU8Clg==", + "version": "10.4.0", + "resolved": "https://registry.npmjs.org/espree/-/espree-10.4.0.tgz", + "integrity": "sha512-j6PAQ2uUr79PZhBjP5C5fhl8e39FmRnOjsD5lGnWrFU8i2G776tBK7+nP8KuQUTTyAZUwfQqXAgrVH5MbH9CYQ==", "dev": true, "license": "BSD-2-Clause", "dependencies": { - "acorn": "^8.14.0", + "acorn": "^8.15.0", "acorn-jsx": "^5.3.2", - "eslint-visitor-keys": "^4.2.0" + "eslint-visitor-keys": "^4.2.1" }, "engines": { "node": "^18.18.0 || ^20.9.0 || >=21.1.0" @@ -1702,9 +1731,9 @@ } }, "node_modules/espree/node_modules/eslint-visitor-keys": { - "version": "4.2.0", - "resolved": "https://registry.npmjs.org/eslint-visitor-keys/-/eslint-visitor-keys-4.2.0.tgz", - "integrity": "sha512-UyLnSehNt62FFhSwjZlHmeokpRK59rcz29j+F1/aDgbkbRTk7wIc9XzdoasMUbRNKDM0qQt/+BJ4BrpFeABemw==", + "version": "4.2.1", + "resolved": "https://registry.npmjs.org/eslint-visitor-keys/-/eslint-visitor-keys-4.2.1.tgz", + "integrity": "sha512-Uhdk5sfqcee/9H/rCOJikYz67o0a2Tw2hGRPOG2Y1R2dg7brRe1uG0yaNQDHu+TO/uQPF/5eCapvYSmHUjt7JQ==", "dev": true, "license": "Apache-2.0", "engines": { @@ -3151,6 +3180,28 @@ "wrappy": "1" } }, + "node_modules/openai": { + "version": "5.10.2", + "resolved": "https://registry.npmjs.org/openai/-/openai-5.10.2.tgz", + "integrity": "sha512-n+vi74LzHtvlKcDPn9aApgELGiu5CwhaLG40zxLTlFQdoSJCLACORIPC2uVQ3JEYAbqapM+XyRKFy2Thej7bIw==", + "dev": true, + "license": "Apache-2.0", + "bin": { + "openai": "bin/cli" + }, + "peerDependencies": { + "ws": "^8.18.0", + "zod": "^3.23.8" + }, + "peerDependenciesMeta": { + "ws": { + "optional": true + }, + "zod": { + "optional": true + } + } + }, "node_modules/optionator": { "version": "0.9.4", "resolved": "https://registry.npmjs.org/optionator/-/optionator-0.9.4.tgz", diff --git a/package.json b/package.json index f65e417..5b47a4a 100644 --- a/package.json +++ b/package.json @@ 
-49,6 +49,7 @@ "zod-to-json-schema": "^3.24.4" }, "devDependencies": { + "@anthropic-ai/sdk": "^0.57.0", "@eslint/eslintrc": "^3.2.0", "@eslint/js": "^9.19.0", "@playwright/test": "1.55.0-alpha-1752701791000", @@ -60,9 +61,11 @@ "@typescript-eslint/eslint-plugin": "^8.26.1", "@typescript-eslint/parser": "^8.26.1", "@typescript-eslint/utils": "^8.26.1", + "dotenv": "^17.2.0", "eslint": "^9.19.0", "eslint-plugin-import": "^2.31.0", "eslint-plugin-notice": "^1.0.0", + "openai": "^5.10.2", "typescript": "^5.8.2" }, "bin": { diff --git a/src/connection.ts b/src/connection.ts index 85147a0..69fd434 100644 --- a/src/connection.ts +++ b/src/connection.ts @@ -17,7 +17,6 @@ import { Server as McpServer } from '@modelcontextprotocol/sdk/server/index.js'; import { CallToolRequestSchema, ListToolsRequestSchema, Tool as McpTool } from '@modelcontextprotocol/sdk/types.js'; import { zodToJsonSchema } from 'zod-to-json-schema'; - import { Context } from './context.js'; import { Response } from './response.js'; import { allTools } from './tools.js'; diff --git a/src/eval/loopClaude.ts b/src/eval/loopClaude.ts new file mode 100644 index 0000000..77bd233 --- /dev/null +++ b/src/eval/loopClaude.ts @@ -0,0 +1,119 @@ +/** + * Copyright (c) Microsoft Corporation. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
+
+import Anthropic from '@anthropic-ai/sdk';
+import debug from 'debug';
+
+import type { Tool, ImageContent, TextContent } from '@modelcontextprotocol/sdk/types.js';
+import type { Client } from '@modelcontextprotocol/sdk/client/index.js';
+
+const model = 'claude-sonnet-4-20250514';
+
+/**
+ * Drives a Claude tool-use loop against an MCP client until the task finishes.
+ *
+ * The loop ends when the model replies without tool calls or calls "done".
+ *
+ * @param client MCP client whose tools are listed, offered to the model and called.
+ * @param task Task description inserted into the initial user prompt.
+ * @returns The model's joined text blocks, or the "done" input as indented JSON.
+ * @throws Error if the model is still requesting tools after five rounds.
+ */
+export async function runTask(client: Client, task: string): Promise<string> {
+  const anthropic = new Anthropic();
+  const { tools } = await client.listTools();
+  const claudeTools = tools.map(tool => asClaudeDeclaration(tool));
+  const messages: Anthropic.Messages.MessageParam[] = [
+    { role: 'user', content: `Perform following task: ${task}.` },
+  ];
+
+  for (let iteration = 0; iteration < 5; ++iteration) {
+    debug('history')(messages);
+
+    const response = await anthropic.messages.create({
+      model,
+      max_tokens: 10000,
+      messages,
+      tools: claudeTools,
+    });
+
+    const content = response.content;
+    const toolUseBlocks = content.filter(block => block.type === 'tool_use');
+    const textBlocks = content.filter(block => block.type === 'text');
+
+    // Record the assistant turn before the tool results that answer it.
+    messages.push({ role: 'assistant', content });
+
+    if (toolUseBlocks.length === 0)
+      return textBlocks.map(block => block.text).join('\n');
+
+    const toolResults: Anthropic.Messages.ToolResultBlockParam[] = [];
+
+    for (const toolUse of toolUseBlocks) {
+      if (toolUse.name === 'done')
+        return JSON.stringify(toolUse.input, null, 2);
+
+      try {
+        debug('tool')(toolUse.name, toolUse.input);
+        const response = await client.callTool({
+          name: toolUse.name,
+          arguments: toolUse.input as Record<string, unknown>,
+        });
+        const responseContent = (response.content || []) as (TextContent | ImageContent)[];
+        debug('tool')(responseContent);
+        const text = responseContent.filter(part => part.type === 'text').map(part => part.text).join('\n');
+
+        toolResults.push({
+          type: 'tool_result',
+          tool_use_id: toolUse.id,
+          content: text,
+        });
+      } catch (error) {
+        debug('tool')(error);
+        toolResults.push({
+          type: 'tool_result',
+          tool_use_id: toolUse.id,
+          content: `Error while executing tool "${toolUse.name}": ${error instanceof Error ? error.message : String(error)}\n\nPlease try to recover and complete the task.`,
+          is_error: true,
+        });
+        // Report the rest of this turn's tool calls as skipped instead of running them.
+        for (const remainingToolUse of toolUseBlocks.slice(toolUseBlocks.indexOf(toolUse) + 1)) {
+          toolResults.push({
+            type: 'tool_result',
+            tool_use_id: remainingToolUse.id,
+            content: `This tool call is skipped due to previous error.`,
+            is_error: true,
+          });
+        }
+        break;
+      }
+    }
+
+    // Return the tool results to the model as the next user message.
+    messages.push({ role: 'user', content: toolResults });
+  }
+
+  throw new Error('Failed to perform step, max attempts reached');
+}
+
+/** Converts an MCP tool description into the declaration shape passed to Claude. */
+function asClaudeDeclaration(tool: Tool): Anthropic.Messages.Tool {
+  return {
+    name: tool.name,
+    description: tool.description,
+    input_schema: tool.inputSchema,
+  };
+}
diff --git a/src/eval/loopOpenAI.ts b/src/eval/loopOpenAI.ts
new file mode 100644
index 0000000..4408b53
--- /dev/null
+++ b/src/eval/loopOpenAI.ts
@@ -0,0 +1,105 @@
+/**
+ * Copyright (c) Microsoft Corporation.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import OpenAI from 'openai';
+import debug from 'debug';
+
+import type { Tool, ImageContent, TextContent } from '@modelcontextprotocol/sdk/types.js';
+import type { Client } from '@modelcontextprotocol/sdk/client/index.js';
+
+const model = 'gpt-4.1';
+
+/**
+ * Drives an OpenAI tool-calling loop against an MCP client until the task finishes.
+ *
+ * The loop ends when the model replies without tool calls or calls "done".
+ *
+ * @param client MCP client whose tools are listed, offered to the model and called.
+ * @param task Task description inserted into the initial user prompt.
+ * @returns The model's text reply ('' if it has none), or the JSON-encoded "done" arguments string.
+ * @throws Error if the model is still requesting tools after five rounds.
+ */
+export async function runTask(client: Client, task: string): Promise<string> {
+  const openai = new OpenAI();
+  const messages: OpenAI.Chat.Completions.ChatCompletionMessageParam[] = [
+    { role: 'user', content: `Perform following task: ${task}. Once the task is complete, call the "done" tool.` },
+  ];
+
+  const { tools } = await client.listTools();
+  // `tools` is fetched once, so its declarations are built once instead of on every round.
+  const openAITools = tools.map(tool => asOpenAIDeclaration(tool));
+
+  for (let iteration = 0; iteration < 5; ++iteration) {
+    debug('history')(messages);
+
+    const response = await openai.chat.completions.create({
+      model,
+      messages,
+      tools: openAITools,
+      tool_choice: 'auto'
+    });
+
+    const message = response.choices[0].message;
+    // Return the reply text as-is; JSON.stringify would wrap it in quotes.
+    if (!message.tool_calls?.length)
+      return message.content ?? '';
+
+    messages.push({ role: 'assistant', tool_calls: message.tool_calls });
+
+    for (const toolCall of message.tool_calls) {
+      const functionCall = toolCall.function;
+      // NOTE(review): `arguments` is already JSON text (it is parsed below), so this encodes it twice.
+      if (functionCall.name === 'done')
+        return JSON.stringify(functionCall.arguments, null, 2);
+
+      try {
+        debug('tool')(functionCall.name, functionCall.arguments);
+        const response = await client.callTool({
+          name: functionCall.name,
+          arguments: JSON.parse(functionCall.arguments)
+        });
+        const content = (response.content || []) as (TextContent | ImageContent)[];
+        debug('tool')(content);
+        const text = content.filter(part => part.type === 'text').map(part => part.text).join('\n');
+        messages.push({ role: 'tool', tool_call_id: toolCall.id, content: text });
+      } catch (error) {
+        debug('tool')(error);
+        messages.push({
+          role: 'tool',
+          tool_call_id: toolCall.id,
+          content: `Error while executing tool "${functionCall.name}": ${error instanceof Error ? error.message : String(error)}\n\nPlease try to recover and complete the task.`,
+        });
+        // Report the rest of this turn's tool calls as skipped instead of running them.
+        for (const ignoredToolCall of message.tool_calls.slice(message.tool_calls.indexOf(toolCall) + 1))
+          messages.push({ role: 'tool', tool_call_id: ignoredToolCall.id, content: `This tool call is skipped due to previous error.` });
+        break;
+      }
+    }
+  }
+  throw new Error('Failed to perform step, max attempts reached');
+}
+
+/** Converts an MCP tool description into an OpenAI function-tool declaration. */
+function asOpenAIDeclaration(tool: Tool): OpenAI.Chat.Completions.ChatCompletionTool {
+  return {
+    type: 'function',
+    function: {
+      name: tool.name,
+      description: tool.description,
+      parameters: tool.inputSchema,
+    },
+  };
+}
diff --git a/src/eval/main.ts b/src/eval/main.ts
new file mode 100644
index 0000000..4def87e
--- /dev/null
+++ b/src/eval/main.ts
@@ -0,0 +1,68 @@
+/**
+ * Copyright (c) Microsoft Corporation.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* eslint-disable no-console */
+
+import path from 'path';
+import url from 'url';
+import dotenv from 'dotenv';
+
+import { StdioClientTransport } from '@modelcontextprotocol/sdk/client/stdio.js';
+import { Client } from '@modelcontextprotocol/sdk/client/index.js';
+import { program } from 'commander';
+import { runTask as runTaskOpenAI } from './loopOpenAI.js';
+import { runTask as runTaskClaude } from './loopClaude.js';
+
+dotenv.config();
+
+const __filename = url.fileURLToPath(import.meta.url);
+
+/**
+ * Connects an MCP client to `node cli.js` over stdio, runs every task with the
+ * given loop and prints the last result. The client is closed even if a task throws.
+ * @param runTask Agent loop that performs one task against the connected client.
+ */
+async function run(runTask: (client: Client, task: string) => Promise<string>) {
+  const transport = new StdioClientTransport({
+    command: 'node',
+    args: [
+      path.resolve(__filename, '../../../cli.js'),
+      '--save-session',
+      '--output-dir', path.resolve(__filename, '../../../sessions')
+    ],
+    stderr: 'inherit',
+    env: process.env as Record<string, string>,
+  });
+
+  const client = new Client({ name: 'test', version: '1.0.0' });
+  await client.connect(transport);
+  try {
+    await client.ping();
+    let lastResult: string | undefined;
+    for (const task of tasks)
+      lastResult = await runTask(client, task);
+    console.log(lastResult);
+  } finally {
+    await client.close();
+  }
+}
+
+const tasks = ['Open https://playwright.dev/'];
+
+program
+    .option('--model <model>', 'model to use')
+    .action(async options => run(options.model === 'claude' ? runTaskClaude : runTaskOpenAI));
+void program.parseAsync(process.argv);
diff --git a/src/package.ts b/src/package.ts
index a6c7019..e599f68 100644
--- a/src/package.ts
+++ b/src/package.ts
@@ -14,9 +14,9 @@
  * limitations under the License.
  */
 
-import fs from 'node:fs';
-import url from 'node:url';
-import path from 'node:path';
+import fs from 'fs';
+import path from 'path';
+import url from 'url';
 
 const __filename = url.fileURLToPath(import.meta.url);
 export const packageJSON = JSON.parse(fs.readFileSync(path.join(path.dirname(__filename), '..', 'package.json'), 'utf8'));