[{"data":1,"prerenderedAt":-1},["ShallowReactive",2],{"similar-SalesforceAIResearch--MCP-Universe":3,"tool-SalesforceAIResearch--MCP-Universe":62},[4,18,26,36,46,54],{"id":5,"name":6,"github_repo":7,"description_zh":8,"stars":9,"difficulty_score":10,"last_commit_at":11,"category_tags":12,"status":17},4358,"openclaw","openclaw\u002Fopenclaw","OpenClaw 是一款专为个人打造的本地化 AI 助手，旨在让你在自己的设备上拥有完全可控的智能伙伴。它打破了传统 AI 助手局限于特定网页或应用的束缚，能够直接接入你日常使用的各类通讯渠道，包括微信、WhatsApp、Telegram、Discord、iMessage 等数十种平台。无论你在哪个聊天软件中发送消息，OpenClaw 都能即时响应，甚至支持在 macOS、iOS 和 Android 设备上进行语音交互，并提供实时的画布渲染功能供你操控。\n\n这款工具主要解决了用户对数据隐私、响应速度以及“始终在线”体验的需求。通过将 AI 部署在本地，用户无需依赖云端服务即可享受快速、私密的智能辅助，真正实现了“你的数据，你做主”。其独特的技术亮点在于强大的网关架构，将控制平面与核心助手分离，确保跨平台通信的流畅性与扩展性。\n\nOpenClaw 非常适合希望构建个性化工作流的技术爱好者、开发者，以及注重隐私保护且不愿被单一生态绑定的普通用户。只要具备基础的终端操作能力（支持 macOS、Linux 及 Windows WSL2），即可通过简单的命令行引导完成部署。如果你渴望拥有一个懂你",349277,3,"2026-04-06T06:32:30",[13,14,15,16],"Agent","开发框架","图像","数据工具","ready",{"id":19,"name":20,"github_repo":21,"description_zh":22,"stars":23,"difficulty_score":10,"last_commit_at":24,"category_tags":25,"status":17},3808,"stable-diffusion-webui","AUTOMATIC1111\u002Fstable-diffusion-webui","stable-diffusion-webui 是一个基于 Gradio 构建的网页版操作界面，旨在让用户能够轻松地在本地运行和使用强大的 Stable Diffusion 图像生成模型。它解决了原始模型依赖命令行、操作门槛高且功能分散的痛点，将复杂的 AI 绘图流程整合进一个直观易用的图形化平台。\n\n无论是希望快速上手的普通创作者、需要精细控制画面细节的设计师，还是想要深入探索模型潜力的开发者与研究人员，都能从中获益。其核心亮点在于极高的功能丰富度：不仅支持文生图、图生图、局部重绘（Inpainting）和外绘（Outpainting）等基础模式，还独创了注意力机制调整、提示词矩阵、负向提示词以及“高清修复”等高级功能。此外，它内置了 GFPGAN 和 CodeFormer 等人脸修复工具，支持多种神经网络放大算法，并允许用户通过插件系统无限扩展能力。即使是显存有限的设备，stable-diffusion-webui 也提供了相应的优化选项，让高质量的 AI 艺术创作变得触手可及。",162132,"2026-04-05T11:01:52",[14,15,13],{"id":27,"name":28,"github_repo":29,"description_zh":30,"stars":31,"difficulty_score":32,"last_commit_at":33,"category_tags":34,"status":17},1381,"everything-claude-code","affaan-m\u002Feverything-claude-code","everything-claude-code 是一套专为 AI 编程助手（如 Claude Code、Codex、Cursor 等）打造的高性能优化系统。它不仅仅是一组配置文件，而是一个经过长期实战打磨的完整框架，旨在解决 AI 
代理在实际开发中面临的效率低下、记忆丢失、安全隐患及缺乏持续学习能力等核心痛点。\n\n通过引入技能模块化、直觉增强、记忆持久化机制以及内置的安全扫描功能，everything-claude-code 能显著提升 AI 在复杂任务中的表现，帮助开发者构建更稳定、更智能的生产级 AI 代理。其独特的“研究优先”开发理念和针对 Token 消耗的优化策略，使得模型响应更快、成本更低，同时有效防御潜在的攻击向量。\n\n这套工具特别适合软件开发者、AI 研究人员以及希望深度定制 AI 工作流的技术团队使用。无论您是在构建大型代码库，还是需要 AI 协助进行安全审计与自动化测试，everything-claude-code 都能提供强大的底层支持。作为一个曾荣获 Anthropic 黑客大奖的开源项目，它融合了多语言支持与丰富的实战钩子（hooks），让 AI 真正成长为懂上",158594,2,"2026-04-16T23:34:05",[14,13,35],"语言模型",{"id":37,"name":38,"github_repo":39,"description_zh":40,"stars":41,"difficulty_score":42,"last_commit_at":43,"category_tags":44,"status":17},8272,"opencode","anomalyco\u002Fopencode","OpenCode 是一款开源的 AI 编程助手（Coding Agent），旨在像一位智能搭档一样融入您的开发流程。它不仅仅是一个代码补全插件，而是一个能够理解项目上下文、自主规划任务并执行复杂编码操作的智能体。无论是生成全新功能、重构现有代码，还是排查难以定位的 Bug，OpenCode 都能通过自然语言交互高效完成，显著减少开发者在重复性劳动和上下文切换上的时间消耗。\n\n这款工具专为软件开发者、工程师及技术研究人员设计，特别适合希望利用大模型能力来提升编码效率、加速原型开发或处理遗留代码维护的专业人群。其核心亮点在于完全开源的架构，这意味着用户可以审查代码逻辑、自定义行为策略，甚至私有化部署以保障数据安全，彻底打破了传统闭源 AI 助手的“黑盒”限制。\n\n在技术体验上，OpenCode 提供了灵活的终端界面（Terminal UI）和正在测试中的桌面应用程序，支持 macOS、Windows 及 Linux 全平台。它兼容多种包管理工具，安装便捷，并能无缝集成到现有的开发环境中。无论您是追求极致控制权的资深极客，还是渴望提升产出的独立开发者，OpenCode 都提供了一个透明、可信",144296,1,"2026-04-16T14:50:03",[13,45],"插件",{"id":47,"name":48,"github_repo":49,"description_zh":50,"stars":51,"difficulty_score":32,"last_commit_at":52,"category_tags":53,"status":17},2271,"ComfyUI","Comfy-Org\u002FComfyUI","ComfyUI 是一款功能强大且高度模块化的视觉 AI 引擎，专为设计和执行复杂的 Stable Diffusion 图像生成流程而打造。它摒弃了传统的代码编写模式，采用直观的节点式流程图界面，让用户通过连接不同的功能模块即可构建个性化的生成管线。\n\n这一设计巧妙解决了高级 AI 绘图工作流配置复杂、灵活性不足的痛点。用户无需具备编程背景，也能自由组合模型、调整参数并实时预览效果，轻松实现从基础文生图到多步骤高清修复等各类复杂任务。ComfyUI 拥有极佳的兼容性，不仅支持 Windows、macOS 和 Linux 全平台，还广泛适配 NVIDIA、AMD、Intel 及苹果 Silicon 等多种硬件架构，并率先支持 SDXL、Flux、SD3 等前沿模型。\n\n无论是希望深入探索算法潜力的研究人员和开发者，还是追求极致创作自由度的设计师与资深 AI 绘画爱好者，ComfyUI 
都能提供强大的支持。其独特的模块化架构允许社区不断扩展新功能，使其成为当前最灵活、生态最丰富的开源扩散模型工具之一，帮助用户将创意高效转化为现实。",108322,"2026-04-10T11:39:34",[14,15,13],{"id":55,"name":56,"github_repo":57,"description_zh":58,"stars":59,"difficulty_score":32,"last_commit_at":60,"category_tags":61,"status":17},6121,"gemini-cli","google-gemini\u002Fgemini-cli","gemini-cli 是一款由谷歌推出的开源 AI 命令行工具，它将强大的 Gemini 大模型能力直接集成到用户的终端环境中。对于习惯在命令行工作的开发者而言，它提供了一条从输入提示词到获取模型响应的最短路径，无需切换窗口即可享受智能辅助。\n\n这款工具主要解决了开发过程中频繁上下文切换的痛点，让用户能在熟悉的终端界面内直接完成代码理解、生成、调试以及自动化运维任务。无论是查询大型代码库、根据草图生成应用，还是执行复杂的 Git 操作，gemini-cli 都能通过自然语言指令高效处理。\n\n它特别适合广大软件工程师、DevOps 人员及技术研究人员使用。其核心亮点包括支持高达 100 万 token 的超长上下文窗口，具备出色的逻辑推理能力；内置 Google 搜索、文件操作及 Shell 命令执行等实用工具；更独特的是，它支持 MCP（模型上下文协议），允许用户灵活扩展自定义集成，连接如图像生成等外部能力。此外，个人谷歌账号即可享受免费的额度支持，且项目基于 Apache 2.0 协议完全开源，是提升终端工作效率的理想助手。",100752,"2026-04-10T01:20:03",[45,13,15,14],{"id":63,"github_repo":64,"name":65,"description_en":66,"description_zh":67,"ai_summary_zh":67,"readme_en":68,"readme_zh":69,"quickstart_zh":70,"use_case_zh":71,"hero_image_url":72,"owner_login":73,"owner_name":74,"owner_avatar_url":75,"owner_bio":76,"owner_company":77,"owner_location":77,"owner_email":78,"owner_twitter":77,"owner_website":77,"owner_url":79,"languages":80,"stars":113,"forks":114,"last_commit_at":115,"license":116,"difficulty_score":32,"env_os":117,"env_gpu":118,"env_ram":118,"env_deps":119,"category_tags":128,"github_topics":77,"view_count":32,"oss_zip_url":77,"oss_zip_packed_at":77,"status":17,"created_at":130,"updated_at":131,"faqs":132,"releases":173},8302,"SalesforceAIResearch\u002FMCP-Universe","MCP-Universe","MCP-Universe is a comprehensive framework designed for RL training, benchmarking, and developing AI agents for general tool-use.","MCP-Universe 是一个专为构建、优化和评估基于模型上下文协议（MCP）的 AI 智能体而设计的综合框架。它致力于解决当前大模型在真实场景中应用工具时的痛点，填补了现有基准测试过于简单、缺乏实际交互环境的空白。通过连接真实的 MCP 服务器，MCP-Universe 能够在包含长程推理、复杂陌生工具空间以及动态实时数据的真实环境中，对智能体进行严谨的性能评测。\n\n该平台特别适合 AI 研究人员、开发者以及需要验证智能体落地能力的团队使用。它不仅提供了一套行业领先的基准测试套件（如最新的 
MCPMark），帮助量化智能体在处理多步骤任务时的表现，还内置了多个生产级开发工具。其中独特的技术亮点包括\"MCP+\"模块，它能通过精准的上下文管理将大模型的 Token 成本降低高达 75%，同时不牺牲输出质量；此外，其支持的“深度研究智能体”可通过并行调用工具显著提升执行效率。无论是希望复现前沿研究成果，还是旨在打造能高效操作真实世界工具的 AI 应用，MCP-Universe 都提供了一个从实验到部署的完整生态系统。","# \u003Cimg src=\"https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002FSalesforceAIResearch_MCP-Universe_readme_0a29aa6fe7fc.png\" alt=\"MCP-Universe\" width=\"23\" height=\"23\"> MCP-Universe\n\n[![Paper](https:\u002F\u002Fimg.shields.io\u002Fbadge\u002FPaper-arXiv:2508.14704-B31B1B?style=for-the-badge&logo=arxiv&logoColor=white)](https:\u002F\u002Farxiv.org\u002Fabs\u002F2508.14704)\n[![Website](https:\u002F\u002Fimg.shields.io\u002Fbadge\u002FWebsite-Live-4285F4?style=for-the-badge&logo=googlechrome&logoColor=white)](https:\u002F\u002Fmcp-universe.github.io\u002F)\n[![Leaderboard](https:\u002F\u002Fimg.shields.io\u002Fbadge\u002FLeaderboard-Results-FF6B35?style=for-the-badge&logo=chartdotjs&logoColor=white)](https:\u002F\u002Fmcp-universe.github.io\u002F#results)\n[![Discord](https:\u002F\u002Fimg.shields.io\u002Fbadge\u002FDiscord-Join_Community-5865F2?style=for-the-badge&logo=discord&logoColor=white)](https:\u002F\u002Fdiscord.gg\u002Ft9tU77GF)\n\n### 🎉 Latest Updates\n\n> **📊 [MCPMark Evaluation](#mcpmark-benchmark)** - MCP-Universe now supports evaluating the MCPMark tasks\n>\n> **🚀 [MCP+](#mcp-precision-context-management-for-mcp-agents)** - Agentic wrapper on MCP clients which reduce token costs by up to 75% \n>\n> **🔬 [Deep Research Agent](#deep-research-agent-wide--deep-wd-research)** - Scale the Width of Deep Research Agents with parallel tool calling, improving performance and efficiency\n\n\u003C\u002Fdiv>\n\n---\n\n## What is MCP-Universe?\n\nMCP-Universe is a comprehensive ecosystem for building, optimizing, and evaluating AI agents that interact with the Model Context Protocol (MCP). 
Beyond our industry-leading benchmark for real-world MCP server interactions, MCP-Universe provides production-ready tools for agent development including specialized research agents ([**Deep Research Agent**](#deep-research-agent-wide--deep-wd-research)), intelligent context management ([**MCP+**](#mcp-precision-context-management-for-mcp-agents)), and sophisticated orchestration workflows.\n\n\u003Cdiv align=\"center\">\n\n![MCP-Universe Introduction](https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002FSalesforceAIResearch_MCP-Universe_readme_ced03018c0f9.png)\n\n\u003C\u002Fdiv>\n\n**Benchmarking:** Unlike existing benchmarks that rely on overly simplistic tasks, MCP-Universe addresses critical gaps by evaluating LLMs in **real-world scenarios** through interaction with actual MCP servers, capturing real application challenges such as:\n\n- 🎯 **Long-horizon reasoning** across multi-step tasks\n- 🔧 **Large, unfamiliar tool spaces** with diverse MCP servers\n- 🌍 **Real-world data sources** and live environments\n- ⚡ **Dynamic evaluation** with time-sensitive ground truth\n\n\n## Table of Contents\n\n- [What's New](#whats-new)\n- [Architecture Overview](#architecture-overview)\n- [Getting Started](#getting-started)\n    - [Prerequisites](#prerequisites)\n    - [Installation](#installation)\n    - [Quick Test](#quick-test)\n- [Evaluating LLMs and Agents](#evaluating-llms-and-agents)\n    - [Prerequisites](#prerequisites-1)\n    - [Environment Configuration](#environment-configuration)\n    - [Benchmark Configuration](#benchmark-configuration)\n    - [Execution](#execution)\n    - [Save the running log](#save-the-running-log)\n    - [Save the benchmark result to a report](#save-the-benchmark-result-to-a-report)\n    - [Visualize the agent running information](#visualize-the-agent-running-information)\n- [Creating Custom Benchmarks](#creating-custom-benchmarks)\n    - [Task definition](#task-definition)\n    - [Benchmark definition](#benchmark-definition)\n- 
[Citation](#citation)\n\n## What's New\n\n### MCPMark Benchmark\n\n**📊 Evaluate MCP Agents with MCPMark**\n\nMCP-Universe now supports evaluating the **MCPMark** benchmark, enabling comprehensive testing and benchmarking of MCP agents. You can run MCPMark evaluations directly within the MCP-Universe framework to assess agent performance on MCP tasks.\n\n**📚 Resources:**\n- [How to run MCPMark](mcpuniverse\u002Fbenchmark\u002Fconfigs\u002Fmcpmark\u002FREADME.md#running-mcpmark-tasks)\n- [Evaluation Scores](mcpuniverse\u002Fbenchmark\u002Fconfigs\u002Fmcpmark\u002FREADME.md#benchmark-results-alignment)\n\n---\n\n### MCP+: Precision Context Management for MCP Agents\n\n**🚀 Reduce LLM Token Costs by up to 75% Without Sacrificing Quality**\n\nMCP tools often return large, verbose outputs that waste your LLM's context window and cost money. **MCP+** wraps your MCP clients with intelligent post-processing that extracts only the relevant information before it reaches your LLM.\n\n#### ✨ Key Features\n\n- **💰 Massive Cost Reduction**: 50-75% token savings on tool outputs\n- **⚡ Zero Code Changes**: Drop-in replacement for standard MCP clients\n\n\n**📚 [Learn More at mcp-plus.github.io →](https:\u002F\u002Fmcp-plus.github.io)**\n\n\u003C\u002Fdiv>\n\n---\n\n### Deep Research Agent: Wide & Deep (W&D) Research\n\n**🔬 Scale Research Width with Parallel Tool Calls**\n\n**Feb 11, 2026** — We introduce **Wide & Deep (W&D) research agents** that scale *width* by making more parallel tool calls per turn. This approach improves accuracy on BrowseComp, HLE, and GAIA benchmarks while reducing turns, API cost, and wall-clock time. 
Our W&D agent with GPT-5-medium reaches **62.2%** on BrowseComp, outperforming GPT-5-high deep research (54.9%).\n\n**📚 Resources:**\n- [Paper](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2602.07359)\n- [Website](https:\u002F\u002Fxqlin98.github.io\u002Fwide-deep-research-agent\u002F)\n- [Code](mcpuniverse\u002Fbenchmark\u002Fconfigs\u002Fdeepresearch\u002FREADME.md)\n\n---\n\n## Architecture Overview\n\nThe MCPUniverse architecture consists of the following key components:\n\n- **Agents** (`mcpuniverse\u002Fagent\u002F`): Base implementations for different agent types\n- **Workflows** (`mcpuniverse\u002Fworkflows\u002F`): Orchestration and coordination layer\n- **MCP Servers** (`mcpuniverse\u002Fmcp\u002F`): Protocol management and external service integration\n- **LLM Integration** (`mcpuniverse\u002Fllm\u002F`): Multi-provider language model support\n- **Benchmarking** (`mcpuniverse\u002Fbenchmark\u002F`): Evaluation and testing framework\n- **Dashboard** (`mcpuniverse\u002Fdashboard\u002F`): Visualization and monitoring interface\n\nThe diagram below illustrates the high-level view:\n\n```\n┌─────────────────────────────────────────────────────────────────┐\n│                      Application Layer                          │\n├─────────────────────────────────────────────────────────────────┤\n│  Dashboard  │    Web API      │   Python Lib   │   Benchmarks   │\n│   (Gradio)  │   (FastAPI)     │                │                │\n└─────────────┬─────────────────┬────────────────┬────────────────┘\n              │                 │                │\n┌─────────────▼─────────────────▼────────────────▼────────────────┐\n│                      Orchestration Layer                        │\n├─────────────────────────────────────────────────────────────────┤\n│           Workflows           │        Benchmark Runner         │\n│    (Chain, Router, etc.)      
│      (Evaluation Engine)        │\n└─────────────┬─────────────────┬────────────────┬────────────────┘\n              │                 │                │\n┌─────────────▼─────────────────▼────────────────▼────────────────┐\n│                        Agent Layer                              │\n├─────────────────────────────────────────────────────────────────┤\n│  BasicAgent │   ReActAgent    │  FunctionCall  │     Other      │\n│             │                 │     Agent      │     Agents     │\n└─────────────┬─────────────────┬────────────────┬────────────────┘\n              │                 │                │\n┌─────────────▼─────────────────▼────────────────▼────────────────┐\n│                      Foundation Layer                           │\n├─────────────────────────────────────────────────────────────────┤\n│   MCP Manager   │   LLM Manager   │  Memory Systems │  Tracers  │\n│   (Servers &    │   (Multi-Model  │   (RAM, Redis)  │ (Logging) │\n│    Clients)     │    Support)     │                 │           │\n└─────────────────┴─────────────────┴─────────────────┴───────────┘\n```\n\nMore information can be found [here](https:\u002F\u002Fgithub.com\u002FSalesforceAIResearch\u002FMCP-Universe\u002Fblob\u002Fmain\u002Fdocs).\n\n## Getting Started\n\nWe follow\nthe [feature branch workflow](https:\u002F\u002Fwww.atlassian.com\u002Fgit\u002Ftutorials\u002Fcomparing-workflows\u002Ffeature-branch-workflow)\nin this repo for its simplicity. To ensure code quality, [PyLint](https:\u002F\u002Fpylint.readthedocs.io\u002Fen\u002Flatest\u002F)\nis integrated into our CI to enforce Python coding standards.\n\n### Prerequisites\n\n* **Python**: Requires version 3.10 or higher.\n* **Docker**: Used for running Dockerized MCP servers.\n* **PostgreSQL** (optional): Used for database storage and persistence.\n* **Redis** (optional): Used for caching and memory management.\n\n### Installation\n\n1. 
**Clone the repository**\n   ```bash\n   git clone https:\u002F\u002Fgithub.com\u002FSalesforceAIResearch\u002FMCP-Universe.git\n   cd MCP-Universe\n   ```\n\n2. **Create and activate virtual environment**\n   ```bash\n   python3 -m venv venv\n   source venv\u002Fbin\u002Factivate\n   ```\n\n3. **Install dependencies**\n   ```bash\n   pip install -r requirements.txt\n   pip install -r dev-requirements.txt\n   ```\n\n4. **Platform-specific requirements**\n\n   **Linux:**\n   ```bash\n   sudo apt-get install libpq-dev\n   ```\n\n   **macOS:**\n   ```bash\n   brew install postgresql\n   ```\n\n5. **Configure pre-commit hooks**\n   ```bash\n   pre-commit install\n   ```\n\n6. **Environment configuration**\n   ```bash\n   cp .env.example .env\n   # Edit .env with your API keys and configuration\n   ```\n\n### Quick Test\n\nTo run benchmarks, you first need to set environment variables:\n\n1. Copy the `.env.example` file to a new file named `.env`.\n2. In the `.env` file, set the required API keys for various services used by the agents,\n   such as `OPENAI_API_KEY` and `GOOGLE_MAPS_API_KEY`.\n\nTo execute a benchmark programmatically:\n\n```python\nfrom mcpuniverse.tracer.collectors import MemoryCollector  # You can also use SQLiteCollector\nfrom mcpuniverse.benchmark.runner import BenchmarkRunner\n\nasync def test():\n    trace_collector = MemoryCollector()\n    # Choose a benchmark config file under the folder \"mcpuniverse\u002Fbenchmark\u002Fconfigs\"\n    benchmark = BenchmarkRunner(\"dummy\u002Fbenchmark_1.yaml\")\n    # Run the specified benchmark\n    results = await benchmark.run(trace_collector=trace_collector)\n    # Get traces\n    trace_id = results[0].task_trace_ids[\"dummy\u002Ftasks\u002Fweather_1.json\"]\n    trace_records = trace_collector.get(trace_id)\n```\n\n## Evaluating LLMs and Agents\n\nThis section provides comprehensive instructions for evaluating LLMs and AI agents using the MCP-Universe benchmark suite. 
The framework supports evaluation across multiple domains including web search, location navigation, browser automation, financial analysis, repository management, and 3D design.\n\n### Prerequisites\n\nBefore running benchmark evaluations, ensure you have completed the [Getting Started](#getting-started) section and have the following:\n\n- Python: Version 3.10 or higher\n- Docker: Installed and available in your environment\n- All required dependencies installed via `pip install -r requirements.txt`\n- Active virtual environment\n- Appropriate API access for the services you intend to evaluate\n\n### Environment Configuration\n\n#### 1. Initial Setup\n\nCopy the environment template and configure your API credentials:\n\n```bash\ncp .env.example .env\n```\n\n#### 2. API Keys and Configuration\n\nConfigure the following environment variables in your `.env` file. The required keys depend on which benchmark domains you plan to evaluate:\n\n##### Core LLM Providers\n\n| Environment Variable | Provider | Description | Required For |\n|---------------------|----------|-------------|--------------|\n| `OPENAI_API_KEY` | OpenAI | API key for GPT models (gpt-5, etc.) 
| All domains |\n| `ANTHROPIC_API_KEY` | Anthropic | API key for Claude models | All domains |\n| `GEMINI_API_KEY` | Google | API key for Gemini models | All domains |\n\n> **Note**: You only need to configure the API key for the LLM provider you intend to use in your evaluation.\n\n##### Domain-Specific Services\n\n| Environment Variable | Service | Description | Setup Instructions |\n|---------------------|---------|-------------|-------------------|\n| `SERP_API_KEY` | SerpAPI | Web search API for search benchmark evaluation | [Get API key](https:\u002F\u002Fserpapi.com\u002F) |\n| `GOOGLE_MAPS_API_KEY` | Google Maps | Geolocation and mapping services | [Setup Guide](https:\u002F\u002Fconsole.cloud.google.com\u002Fgoogle\u002Fmaps-apis\u002Fcredentials) |\n| `GITHUB_PERSONAL_ACCESS_TOKEN` | GitHub | Personal access token for repository operations | [Token Setup](https:\u002F\u002Fdocs.github.com\u002Fen\u002Fauthentication\u002Fkeeping-your-account-and-data-secure\u002Fmanaging-your-personal-access-tokens) |\n| `GITHUB_PERSONAL_ACCOUNT_NAME` | GitHub | Your GitHub username | N\u002FA |\n| `NOTION_API_KEY` | Notion | Integration token for Notion workspace access | [Integration Setup](https:\u002F\u002Fdevelopers.notion.com\u002Fdocs\u002Fauthorization#obtaining-a-token) |\n| `NOTION_ROOT_PAGE` | Notion | Root page ID for your Notion workspace | See configuration example below |\n\n##### System Paths\n\n| Environment Variable | Description | Example |\n|---------------------|-------------|---------|\n| `BLENDER_APP_PATH` | Full path to Blender executable (we used v4.4.0) | `\u002FApplications\u002FBlender.app\u002FContents\u002FMacOS\u002FBlender` |\n| `MCPUniverse_DIR` | Absolute path to your MCP-Universe repository | `\u002FUsers\u002Fusername\u002FMCP-Universe` |\n\n##### Configuration Examples\n\n**Notion Root Page ID:**\nIf your Notion page URL 
is:\n```\nhttps:\u002F\u002Fwww.notion.so\u002Fyour_workspace\u002FMCP-Evaluation-1dd6d96e12345678901234567eaf9eff\n```\nSet `NOTION_ROOT_PAGE=MCP-Evaluation-1dd6d96e12345678901234567eaf9eff`\n\n**Blender Installation:**\n1. Download Blender v4.4.0 from [blender.org](https:\u002F\u002Fwww.blender.org\u002F)\n2. Install our modified Blender MCP server following the [installation guide](docs\u002Fblender-setup.md)\n3. Set the path to the Blender executable\n\n##### ⚠️ Security Recommendations\n\n> **🔒 IMPORTANT SECURITY NOTICE**\n> \n> Please read and follow these security guidelines carefully before running benchmarks:\n\n- **🚨 GitHub Integration**: **CRITICAL** - We strongly recommend using a dedicated test GitHub account for benchmark evaluation. The AI agent will perform real operations on GitHub repositories, which could potentially modify or damage your personal repositories.\n\n- **🔐 API Key Management**: \n  - Store API keys securely and never commit them to version control\n  - Use environment variables or secure key management systems\n  - Regularly rotate your API keys for enhanced security\n\n- **🛡️ Access Permissions**: \n  - Grant minimal necessary permissions for each service integration\n  - Review and limit API key scopes to only required operations\n  - Monitor API usage and set appropriate rate limits\n\n- **⚡ Blender Operations**: The 3D design benchmarks will execute Blender commands that may modify or create files on your system. Ensure you have adequate backups and run in an isolated environment if necessary.\n\n### Benchmark Configuration\n\n#### Domain-Specific Configuration Files\n\nEach benchmark domain has a dedicated YAML configuration file located in `mcpuniverse\u002Fbenchmark\u002Fconfigs\u002Ftest\u002F`. 
To evaluate your LLM\u002Fagent, modify the appropriate configuration file:\n\n| Domain | Configuration File | Description |\n|--------|-------------------|-------------|\n| Web Search | `web_search.yaml` | Search engine and information retrieval tasks |\n| Location Navigation | `location_navigation.yaml` | Geographic and mapping-related queries |\n| Browser Automation | `browser_automation.yaml` | Web interaction and automation scenarios |\n| Financial Analysis | `financial_analysis.yaml` | Market data analysis and financial computations |\n| Repository Management | `repository_management.yaml` | Git operations and code repository tasks |\n| 3D Design | `3d_design.yaml` | Blender-based 3D modeling and design tasks |\n\n#### LLM Model Configuration\n\nIn each configuration file, update the LLM specification to match your target model:\n\n```yaml\nkind: llm\nspec:\n  name: llm-1\n  type: openai  # or anthropic, google, etc.\n  config:\n    model_name: gpt-4o  # Replace with your target model\n```\n\n### Execution\n\n#### Running Individual Benchmarks\n\nExecute specific domain benchmarks using the following commands:\n\n```bash\n# Set Python path and run individual benchmarks\nexport PYTHONPATH=.\n\n# Location Navigation\npython tests\u002Fbenchmark\u002Fmcpuniverse\u002Ftest_benchmark_location_navigation.py\n\n# Browser Automation  \npython tests\u002Fbenchmark\u002Fmcpuniverse\u002Ftest_benchmark_browser_automation.py\n\n# Financial Analysis\npython tests\u002Fbenchmark\u002Fmcpuniverse\u002Ftest_benchmark_financial_analysis.py\n\n# Repository Management\npython tests\u002Fbenchmark\u002Fmcpuniverse\u002Ftest_benchmark_repository_management.py\n\n# Web Search\npython tests\u002Fbenchmark\u002Fmcpuniverse\u002Ftest_benchmark_web_search.py\n\n# 3D Design\npython tests\u002Fbenchmark\u002Fmcpuniverse\u002Ftest_benchmark_3d_design.py\n```\n\n#### Batch Execution\n\nFor comprehensive evaluation across all domains:\n\n```bash\n#!\u002Fbin\u002Fbash\nexport 
PYTHONPATH=.\n\ndomains=(\"location_navigation\" \"browser_automation\" \"financial_analysis\" \n         \"repository_management\" \"web_search\" \"3d_design\")\n\nfor domain in \"${domains[@]}\"; do\n    echo \"Running benchmark: $domain\"\n    python \"tests\u002Fbenchmark\u002Fmcpuniverse\u002Ftest_benchmark_${domain}.py\"\n    echo \"Completed: $domain\"\ndone\n```\n\n### Save the running log\n\nIf you want to save the running log, you can pass the `trace_collector` to the benchmark run function:\n\n```python\nfrom mcpuniverse.tracer.collectors import FileCollector\n\ntrace_collector = FileCollector(log_file=\"log\u002Flocation_navigation.log\")\nbenchmark_results = await benchmark.run(trace_collector=trace_collector)\n```\n\n### Save the benchmark result to a report \n\nIf you want to save a report of the benchmark result, you can use `BenchmarkReport` to dump a report:\n\n```python\nfrom mcpuniverse.benchmark.report import BenchmarkReport\n\nreport = BenchmarkReport(benchmark, trace_collector=trace_collector)\nreport.dump()\n```\n\n### Visualize the agent running information\n\nTo run the benchmark with intermediate results and see real-time progress, pass `callbacks=get_vprint_callbacks()` to the run function:\n\n```python\nfrom mcpuniverse.callbacks.handlers.vprint import get_vprint_callbacks\n\nbenchmark_results = await benchmark.run(\n    trace_collector=trace_collector, \n    callbacks=get_vprint_callbacks()\n)\n```\n\nThis will print out the intermediate results as the benchmark runs.\n\n\nFor further details, refer to the in-code documentation or existing configuration samples in the repository.\n\n## Creating Custom Benchmarks\n\nA benchmark is defined by three main configuration elements: the task definition,\nagent\u002Fworkflow definition, and the benchmark configuration itself. 
Below is an example\nusing a simple \"weather forecasting\" task.\n\n### Task definition\n\nThe task definition is provided in JSON format, for example:\n\n```json\n{\n  \"category\": \"general\",\n  \"question\": \"What's the weather in San Francisco now?\",\n  \"mcp_servers\": [\n    {\n      \"name\": \"weather\"\n    }\n  ],\n  \"output_format\": {\n    \"city\": \"\u003CCity>\",\n    \"weather\": \"\u003CWeather forecast results>\"\n  },\n  \"evaluators\": [\n    {\n      \"func\": \"json -> get(city)\",\n      \"op\": \"=\",\n      \"value\": \"San Francisco\"\n    }\n  ]\n}\n```\n\nField descriptions:\n\n1. **category**: The task category, e.g., \"general\", \"google-maps\", etc. You can set any value for this property.\n2. **question**: The main question you want to ask in this task. This is treated as a user message.\n3. **mcp_servers**: A list of MCP servers that are supported in this framework.\n4. **output_format**: The desired output format of agent responses.\n5. **evaluators**: A list of tests to evaluate. For each test\u002Fevaluator, it has three attributes: \"func\" indicates\n   how to extract values from the agent response, \"op\" is the comparison operator, and \"value\" is the ground-truth\n   value.\n   It will evaluate **op(func(...), value, op_args...)**. \"op\" can be \"=\", \"\u003C\", \">\" or other customized operators.\n\nIn \"evaluators\", you need to write a rule (\"func\" attribute) showing how to extract values for testing. In the example\nabove, \"json -> get(city)\" will first do JSON decoding and then extract the value of key \"city\". There are several\npredefined funcs in this repo:\n\n1. **json**: Perform JSON decoding.\n2. **get**: Get the value of a key.\n3. **len**: Get the length of a list.\n4. **foreach**: Do a FOR-EACH loop.\n\nFor example, let's define\n\n```python\ndata = {\"x\": [{\"y\": [1]}, {\"y\": [1, 1]}, {\"y\": [1, 2, 3, 4]}]}\n```\n\nThen `get(x) -> foreach -> get(y) -> len` will do the following:\n\n1. 
Get the value of \"x\": `[{\"y\": [1]}, {\"y\": [1, 1]}, {\"y\": [1, 2, 3, 4]}]`.\n2. Do a foreach loop and get the value of \"y\": `[[1], [1, 1], [1, 2, 3, 4]]`.\n3. Get the length of each list: `[1, 2, 4]`.\n\nIf these predefined functions are not enough, you can implement custom ones.\nFor more details, please check\nthis [doc](https:\u002F\u002Fgithub.com\u002FSalesforceAIResearch\u002FMCP-Universe\u002Fblob\u002Fmain\u002Fdocs\u002Fcustom-evaluators-guide.md).\n\n### Benchmark definition\n\nDefine agent(s) and benchmark in a YAML file. Here’s a simple weather forecast benchmark:\n\n```yaml\nkind: llm\nspec:\n  name: llm-1\n  type: openai\n  config:\n    model_name: gpt-4o\n\n---\nkind: agent\nspec:\n  name: ReAct-agent\n  type: react\n  config:\n    llm: llm-1\n    instruction: You are an agent for weather forecasting.\n    servers:\n      - name: weather\n\n---\nkind: benchmark\nspec:\n  description: Test the agent for weather forecasting\n  agent: ReAct-agent\n  tasks:\n    - dummy\u002Ftasks\u002Fweather.json\n```\n\nThe benchmark definition mainly contains two parts: the agent definition and the benchmark configuration. The benchmark configuration is simple—you just need to specify the agent to use (by the defined agent name) and a list of tasks to evaluate. Each task entry is the task config file\npath. It can be a full file path or a partial file path. If it is a partial file path (like \"dummy\u002Ftasks\u002Fweather.json\"),\nit should be put in the\nfolder [mcpuniverse\u002Fbenchmark\u002Fconfigs](https:\u002F\u002Fgithub.com\u002FSalesforceAIResearch\u002FMCP-Universe\u002Ftree\u002Fmain\u002Fmcpuniverse\u002Fbenchmark\u002Fconfigs)\nin this repo.\n\nThis framework offers a flexible way to define both simple agents (such as ReAct) and more complex, multi-step agent\nworkflows.\n\n1. **Specify LLMs:** Begin by declaring the large language models (LLMs) you want the agents to use. 
Each LLM component\n   must be assigned a unique name (e.g., `\"llm-1\"`). These names serve as identifiers that the framework uses to connect\n   the different components together.\n2. **Define an agent:** Next, define an agent by providing its name and selecting an agent class. Agent classes are\n   available in\n   the [mcpuniverse.agent](https:\u002F\u002Fgithub.com\u002FSalesforceAIResearch\u002FMCP-Universe\u002Ftree\u002Fmain\u002Fmcpuniverse\u002Fagent) package.\n   Commonly used classes include `\"basic\"`, `\"function-call\"`, and `\"react\"`. Within the agent specification (\n   `spec.config`), you must also indicate which LLM instance the agent should use by setting the `\"llm\"` field.\n3. **Create complex workflows:** Beyond simple agents, the framework supports the definition of sophisticated,\n   orchestrated workflows where multiple agents interact or collaborate to solve more complex tasks.\n\nFor example:\n\n```yaml\nkind: llm\nspec:\n  name: llm-1\n  type: openai\n  config:\n    model_name: gpt-4o\n\n---\nkind: agent\nspec:\n  name: basic-agent\n  type: basic\n  config:\n    llm: llm-1\n    instruction: Return the latitude and the longitude of a place.\n\n---\nkind: agent\nspec:\n  name: function-call-agent\n  type: function-call\n  config:\n    llm: llm-1\n    instruction: You are an agent for weather forecast. 
Please return the weather today at the given latitude and longitude.\n    servers:\n      - name: weather\n\n---\nkind: workflow\nspec:\n  name: orchestrator-workflow\n  type: orchestrator\n  config:\n    llm: llm-1\n    agents:\n      - basic-agent\n      - function-call-agent\n\n---\nkind: benchmark\nspec:\n  description: Test the agent for weather forecasting\n  agent: orchestrator-workflow\n  tasks:\n    - dummy\u002Ftasks\u002Fweather.json\n```\n\n## Citation\n\nIf you use MCP-Universe in your research, please cite our paper:\n\n```bibtex\n@misc{mcpuniverse,\n  title={MCP-Universe: Benchmarking Large Language Models with Real-World Model Context Protocol Servers},\n  author={Ziyang Luo and Zhiqi Shen and Wenzhuo Yang and Zirui Zhao and Prathyusha Jwalapuram and Amrita Saha and Doyen Sahoo and Silvio Savarese and Caiming Xiong and Junnan Li},\n  year={2025},\n  eprint={2508.14704},\n  archivePrefix={arXiv},\n  primaryClass={cs.AI},\n  url={https:\u002F\u002Farxiv.org\u002Fabs\u002F2508.14704}, \n}\n```\n","# \u003Cimg src=\"https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002FSalesforceAIResearch_MCP-Universe_readme_0a29aa6fe7fc.png\" alt=\"MCP-Universe\" width=\"23\" height=\"23\"> MCP-Universe\n\n[![Paper](https:\u002F\u002Fimg.shields.io\u002Fbadge\u002FPaper-arXiv:2508.14704-B31B1B?style=for-the-badge&logo=arxiv&logoColor=white)](https:\u002F\u002Farxiv.org\u002Fabs\u002F2508.14704)\n[![Website](https:\u002F\u002Fimg.shields.io\u002Fbadge\u002FWebsite-Live-4285F4?style=for-the-badge&logo=googlechrome&logoColor=white)](https:\u002F\u002Fmcp-universe.github.io\u002F)\n[![Leaderboard](https:\u002F\u002Fimg.shields.io\u002Fbadge\u002FLeaderboard-Results-FF6B35?style=for-the-badge&logo=chartdotjs&logoColor=white)](https:\u002F\u002Fmcp-universe.github.io\u002F#results)\n[![Discord](https:\u002F\u002Fimg.shields.io\u002Fbadge\u002FDiscord-Join_Community-5865F2?style=for-the-badge&logo=discord&logoColor=white)](https:\u002F\u002Fdiscord.gg\u002Ft9tU77GF)\n\n### 🎉 
最新动态\n\n> **📊 [MCPMark 评估](#mcpmark-benchmark)** - MCP-Universe 现在支持对 MCPMark 任务进行评估\n>\n> **🚀 [MCP+](#mcp-precision-context-management-for-mcp-agents)** - 基于 MCP 客户端的代理式封装，可将 token 成本降低多达 75%\n>\n> **🔬 [深度研究代理](#deep-research-agent-wide--deep-wd-research)** - 通过并行工具调用扩展深度研究代理的宽度，从而提升性能和效率\n\n\u003C\u002Fdiv>\n\n---\n\n## 什么是 MCP-Universe？\n\nMCP-Universe 是一个全面的生态系统，用于构建、优化和评估与模型上下文协议（MCP）交互的 AI 代理。除了我们行业领先的现实世界 MCP 服务器交互基准测试之外，MCP-Universe 还提供了生产就绪的代理开发工具，包括专门的研究代理（**Deep Research Agent**）、智能上下文管理（**MCP+**）以及复杂的编排工作流。\n\n\u003Cdiv align=\"center\">\n\n![MCP-Universe 介绍](https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002FSalesforceAIResearch_MCP-Universe_readme_ced03018c0f9.png)\n\n\u003C\u002Fdiv>\n\n**基准测试：** 与依赖过于简单任务的现有基准不同，MCP-Universe 通过与实际的 MCP 服务器交互，在**真实场景**中评估大语言模型，填补了关键空白，捕捉到真实的应用挑战，例如：\n\n- 🎯 多步骤任务中的**长时序推理**\n- 🔧 包含多种 MCP 服务器的**大型、陌生工具空间**\n- 🌍 **真实世界的数据源**和实时环境\n- ⚡ 具有时间敏感性真值的**动态评估**\n\n## 目录\n\n- [最新动态](#whats-new)\n- [架构概览](#architecture-overview)\n- [开始使用](#getting-started)\n    - [先决条件](#prerequisites)\n    - [安装](#installation)\n    - [快速测试](#quick-test)\n- [评估 LLM 和代理](#evaluating-llms-and-agents)\n    - [先决条件](#prerequisites-1)\n    - [环境配置](#environment-configuration)\n    - [基准配置](#benchmark-configuration)\n    - [执行](#execution)\n    - [保存运行日志](#save-the-running-log)\n    - [将基准结果保存为报告](#save-the-benchmark-result-to-a-report)\n    - [可视化代理运行信息](#visualize-the-agent-running-information)\n- [创建自定义基准](#creating-custom-benchmarks)\n    - [任务定义](#task-definition)\n    - [基准定义](#benchmark-definition)\n- [引用](#citation)\n\n## 最新动态\n\n### MCPMark 基准测试\n\n**📊 使用 MCPMark 评估 MCP 代理**\n\nMCP-Universe 现在支持评估 **MCPMark** 基准，能够对 MCP 代理进行全面的测试和基准评估。您可以在 MCP-Universe 框架内直接运行 MCPMark 评估，以衡量代理在 MCP 任务上的表现。\n\n**📚 资源：**\n- [如何运行 MCPMark](mcpuniverse\u002Fbenchmark\u002Fconfigs\u002Fmcpmark\u002FREADME.md#running-mcpmark-tasks)\n- [评估分数](mcpuniverse\u002Fbenchmark\u002Fconfigs\u002Fmcpmark\u002FREADME.md#benchmark-results-alignment)\n\n---\n\n### MCP+: 针对 
MCP 代理的精准上下文管理\n\n**🚀 在不牺牲质量的前提下，将 LLM 的 token 成本降低多达 75%**\n\nMCP 工具通常会返回大量冗长的输出，这不仅浪费了 LLM 的上下文窗口，还会增加成本。**MCP+** 通过智能后处理包装您的 MCP 客户端，在数据到达 LLM 之前仅提取相关信息。\n\n#### ✨ 主要特性\n\n- **💰 巨大的成本节约**：工具输出的 token 节省可达 50-75%\n- **⚡ 无需代码更改**：可直接替换标准 MCP 客户端\n\n\n**📚 [了解更多，请访问 mcp-plus.github.io →](https:\u002F\u002Fmcp-plus.github.io)**\n\n\u003C\u002Fdiv>\n\n---\n\n### 深度研究代理：宽深（W&D）研究\n\n**🔬 通过并行工具调用扩展研究宽度**\n\n**2026年2月11日** — 我们推出了**宽深（W&D）研究代理**，它通过每轮更多的并行工具调用来扩展“宽度”。这种方法在 BrowseComp、HLE 和 GAIA 基准上提高了准确性，同时减少了轮次、API 费用和实际耗时。我们的 W&D 代理配合 GPT-5-medium 在 BrowseComp 上达到了 **62.2%**，优于 GPT-5-high 深度研究（54.9%）。\n\n**📚 资源：**\n- [论文](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2602.07359)\n- [网站](https:\u002F\u002Fxqlin98.github.io\u002Fwide-deep-research-agent\u002F)\n- [代码](mcpuniverse\u002Fbenchmark\u002Fconfigs\u002Fdeepresearch\u002FREADME.md)\n\n---\n\n## 架构概述\n\nMCPUniverse 架构由以下关键组件组成：\n\n- **智能体**（`mcpuniverse\u002Fagent\u002F`）：不同智能体类型的基类实现\n- **工作流**（`mcpuniverse\u002Fworkflows\u002F`）：编排与协调层\n- **MCP 服务器**（`mcpuniverse\u002Fmcp\u002F`）：协议管理和外部服务集成\n- **大模型集成**（`mcpuniverse\u002Fllm\u002F`）：多提供商语言模型支持\n- **基准测试**（`mcpuniverse\u002Fbenchmark\u002F`）：评估与测试框架\n- **仪表盘**（`mcpuniverse\u002Fdashboard\u002F`）：可视化与监控界面\n\n下图展示了高层视图：\n\n```\n┌─────────────────────────────────────────────────────────────────┐\n│                      应用层                          │\n├─────────────────────────────────────────────────────────────────┤\n│  仪表盘  │    Web API      │   Python 库   │   基准测试   │\n│   (Gradio)  │   (FastAPI)     │                │                │\n└─────────────┬─────────────────┬────────────────┬────────────────┘\n              │                 │                │\n┌─────────────▼─────────────────▼────────────────▼────────────────┐\n│                      编排层                        │\n├─────────────────────────────────────────────────────────────────┤\n│           工作流           │        基准测试运行器         │\n│    (链、路由器等)      │      (评估引擎)        
│\n└─────────────┬─────────────────┬────────────────┬────────────────┘\n              │                 │                │\n┌─────────────▼─────────────────▼────────────────▼────────────────┐\n│                        智能体层                              │\n├─────────────────────────────────────────────────────────────────┤\n│  BasicAgent │   ReActAgent    │  FunctionCall  │     其他      │\n│             │                 │     智能体      │     智能体     │\n└─────────────┬─────────────────┬────────────────┬────────────────┘\n              │                 │                │\n┌─────────────▼─────────────────▼────────────────▼────────────────┐\n│                      基础设施层                           │\n├─────────────────────────────────────────────────────────────────┤\n│   MCP 管理器   │   LLM 管理器   │  内存系统 │  追踪器  │\n│   (服务器与    │   (多模型  │   (RAM, Redis)  │ (日志记录) │\n│    客户端)     │    支持)     │                 │           │\n└─────────────────┴─────────────────┴─────────────────┴───────────┘\n```\n\n更多信息请参见 [这里](https:\u002F\u002Fgithub.com\u002FSalesforceAIResearch\u002FMCP-Universe\u002Fblob\u002Fmain\u002Fdocs)。\n\n## 快速入门\n\n为了简化流程，我们在该仓库中遵循 [特性分支工作流](https:\u002F\u002Fwww.atlassian.com\u002Fgit\u002Ftutorials\u002Fcomparing-workflows\u002Ffeature-branch-workflow)。为确保代码质量，我们已将 [PyLint](https:\u002F\u002Fpylint.readthedocs.io\u002Fen\u002Flatest\u002F) 集成到 CI 中，以强制执行 Python 编码规范。\n\n### 先决条件\n\n* **Python**：需版本 3.10 或更高。\n* **Docker**：用于运行 Docker 化的 MCP 服务器。\n* **PostgreSQL**（可选）：用于数据库存储和持久化。\n* **Redis**（可选）：用于缓存和内存管理。\n\n### 安装步骤\n\n1. **克隆仓库**\n   ```bash\n   git clone https:\u002F\u002Fgithub.com\u002FSalesforceAIResearch\u002FMCP-Universe.git\n   cd MCP-Universe\n   ```\n\n2. **创建并激活虚拟环境**\n   ```bash\n   python3 -m venv venv\n   source venv\u002Fbin\u002Factivate\n   ```\n\n3. **安装依赖**\n   ```bash\n   pip install -r requirements.txt\n   pip install -r dev-requirements.txt\n   ```\n\n4. 
**平台特定要求**\n\n   **Linux:**\n   ```bash\n   sudo apt-get install libpq-dev\n   ```\n\n   **macOS:**\n   ```bash\n   brew install postgresql\n   ```\n\n5. **配置 pre-commit 钩子**\n   ```bash\n   pre-commit install\n   ```\n\n6. **环境配置**\n   ```bash\n   cp .env.example .env\n   # 根据您的 API 密钥和配置编辑 .env 文件\n   ```\n\n### 快速测试\n\n要运行基准测试，您需要先设置环境变量：\n\n1. 将 `.env.example` 文件复制为名为 `.env` 的新文件。\n2. 在 `.env` 文件中，设置智能体使用的各种服务所需的 API 密钥，例如 `OPENAI_API_KEY` 和 `GOOGLE_MAPS_API_KEY`。\n\n以编程方式执行基准测试的示例代码如下：\n\n```python\nfrom mcpuniverse.tracer.collectors import MemoryCollector  # 您也可以使用 SQLiteCollector\nfrom mcpuniverse.benchmark.runner import BenchmarkRunner\n\nasync def test():\n    trace_collector = MemoryCollector()\n    # 选择 \"mcpuniverse\u002Fbenchmark\u002Fconfigs\" 文件夹下的基准配置文件\n    benchmark = BenchmarkRunner(\"dummy\u002Fbenchmark_1.yaml\")\n    # 运行指定的基准测试\n    results = await benchmark.run(trace_collector=trace_collector)\n    # 获取追踪记录\n    trace_id = results[0].task_trace_ids[\"dummy\u002Ftasks\u002Fweather_1.json\"]\n    trace_records = trace_collector.get(trace_id)\n```\n\n## 评估大模型与智能体\n\n本节提供了使用 MCP-Universe 基准套件评估大模型和 AI 智能体的完整说明。该框架支持跨多个领域的评估，包括网络搜索、位置导航、浏览器自动化、财务分析、代码库管理以及 3D 设计等。\n\n### 先决条件\n\n在运行基准测试之前，请确保已完成 [快速入门](#getting-started) 部分，并具备以下条件：\n\n- Python：版本 3.10 或更高\n- Docker：已安装且可在环境中使用\n- 已通过 `pip install -r requirements.txt` 安装所有必要依赖\n- 激活的虚拟环境\n- 对您计划评估的服务拥有适当的 API 访问权限\n\n### 环境配置\n\n#### 1. 初始设置\n\n复制环境模板并配置您的 API 凭证：\n\n```bash\ncp .env.example .env\n```\n\n#### 2. 
API 密钥与配置\n\n在您的 `.env` 文件中配置以下环境变量。所需的密钥取决于您计划评估的基准测试领域：\n\n##### 核心大模型提供商\n\n| 环境变量 | 提供商 | 描述 | 必需项 |\n|---------------------|----------|-------------|--------------|\n| `OPENAI_API_KEY` | OpenAI | GPT 模型（gpt-5 等）的 API 密钥 | 所有领域 |\n| `ANTHROPIC_API_KEY` | Anthropic | Claude 模型的 API 密钥 | 所有领域 |\n| `GEMINI_API_KEY` | Google | Gemini 模型的 API 密钥 | 所有领域 |\n\n> **注意**：您只需为打算在评估中使用的 LLM 提供商配置相应的 API 密钥。\n\n##### 领域特定服务\n\n| 环境变量 | 服务 | 描述 | 设置说明 |\n|---------------------|---------|-------------|-------------------|\n| `SERP_API_KEY` | SerpAPI | 用于搜索基准测试的网络搜索 API | [获取 API 密钥](https:\u002F\u002Fserpapi.com\u002F) |\n| `GOOGLE_MAPS_API_KEY` | Google 地图 | 地理定位和地图服务 | [设置指南](https:\u002F\u002Fconsole.cloud.google.com\u002Fgoogle\u002Fmaps-apis\u002Fcredentials) |\n| `GITHUB_PERSONAL_ACCESS_TOKEN` | GitHub | 用于仓库操作的个人访问令牌 | [令牌设置](https:\u002F\u002Fdocs.github.com\u002Fen\u002Fauthentication\u002Fkeeping-your-account-and-data-secure\u002Fmanaging-your-personal-access-tokens) |\n| `GITHUB_PERSONAL_ACCOUNT_NAME` | GitHub | 您的 GitHub 用户名 | 不适用 |\n| `NOTION_API_KEY` | Notion | 用于访问 Notion 工作区的集成令牌 | [集成设置](https:\u002F\u002Fdevelopers.notion.com\u002Fdocs\u002Fauthorization#obtaining-a-token) |\n| `NOTION_ROOT_PAGE` | Notion | 您的 Notion 工作区的根页面 ID | 参见下方的配置示例 |\n\n##### 系统路径\n\n| 环境变量 | 描述 | 示例 |\n|---------------------|-------------|---------|\n| `BLENDER_APP_PATH` | Blender 可执行文件的完整路径（我们使用 v4.4.0） | `\u002FApplications\u002FBlender.app\u002FContents\u002FMacOS\u002FBlender` |\n| `MCPUniverse_DIR` | 您的 MCP-Universe 仓库的绝对路径 | `\u002FUsers\u002Fusername\u002FMCP-Universe` |\n\n##### 配置示例\n\n**Notion 根页面 ID：**\n如果您的 Notion 页面 URL 是：\n```\nhttps:\u002F\u002Fwww.notion.so\u002Fyour_workspace\u002FMCP-Evaluation-1dd6d96e12345678901234567eaf9eff\n```\n则设置 `NOTION_ROOT_PAGE=MCP-Evaluation-1dd6d96e12345678901234567eaf9eff`\n\n**Blender 安装：**\n1. 从 [blender.org](https:\u002F\u002Fwww.blender.org\u002F) 下载 Blender v4.4.0。\n2. 
按照 [安装指南](docs\u002Fblender-setup.md) 安装我们修改后的 Blender MCP 服务器。\n3. 设置 Blender 可执行文件的路径。\n\n##### ⚠️ 安全建议\n\n> **🔒 重要安全提示**\n> \n> 请在运行基准测试前仔细阅读并遵循以下安全指南：\n\n- **🚨 GitHub 集成**：**至关重要** - 我们强烈建议您使用一个专门的测试 GitHub 账户来进行基准测试。AI 代理将在 GitHub 仓库中执行实际操作，这可能会修改或损坏您的个人仓库。\n\n- **🔐 API 密钥管理**：\n  - 将 API 密钥安全存储，切勿将其提交到版本控制系统中。\n  - 使用环境变量或安全的密钥管理系统。\n  - 定期轮换您的 API 密钥以增强安全性。\n\n- **🛡️ 访问权限**：\n  - 为每个服务集成授予最小必要的权限。\n  - 审查并限制 API 密钥的作用范围，仅允许执行必要操作。\n  - 监控 API 使用情况，并设置适当的速率限制。\n\n- **⚡ Blender 操作**：3D 设计基准测试将执行可能修改或创建您系统上文件的 Blender 命令。请确保已做好充分备份，必要时在隔离环境中运行。\n\n### 基准测试配置\n\n#### 领域特定配置文件\n\n每个基准测试领域都有一个专用的 YAML 配置文件，位于 `mcpuniverse\u002Fbenchmark\u002Fconfigs\u002Ftest\u002F` 中。要评估您的 LLM\u002F代理，请修改相应的配置文件：\n\n| 领域 | 配置文件 | 描述 |\n|--------|-------------------|-------------|\n| 网络搜索 | `web_search.yaml` | 搜索引擎和信息检索任务 |\n| 位置导航 | `location_navigation.yaml` | 地理和地图相关查询 |\n| 浏览器自动化 | `browser_automation.yaml` | 网页交互和自动化场景 |\n| 财务分析 | `financial_analysis.yaml` | 市场数据分析和财务计算 |\n| 仓库管理 | `repository_management.yaml` | Git 操作和代码仓库任务 |\n| 3D 设计 | `3d_design.yaml` | 基于 Blender 的 3D 建模和设计任务 |\n\n#### LLM 模型配置\n\n在每个配置文件中，更新 LLM 规格以匹配您的目标模型：\n\n```yaml\nkind: llm\nspec:\n  name: llm-1\n  type: openai  # 或 anthropic、google 等\n  config:\n    model_name: gpt-4o  # 替换为您目标模型\n```\n\n### 执行\n\n#### 运行单个基准测试\n\n使用以下命令执行特定领域的基准测试：\n\n```bash\n# 设置 Python 路径并运行单个基准测试\nexport PYTHONPATH=.\n\n# 位置导航\npython tests\u002Fbenchmark\u002Fmcpuniverse\u002Ftest_benchmark_location_navigation.py\n\n# 浏览器自动化  \npython tests\u002Fbenchmark\u002Fmcpuniverse\u002Ftest_benchmark_browser_automation.py\n\n# 财务分析\npython tests\u002Fbenchmark\u002Fmcpuniverse\u002Ftest_benchmark_financial_analysis.py\n\n# 仓库管理\npython tests\u002Fbenchmark\u002Fmcpuniverse\u002Ftest_benchmark_repository_management.py\n\n# 网络搜索\npython tests\u002Fbenchmark\u002Fmcpuniverse\u002Ftest_benchmark_web_search.py\n\n# 3D 设计\npython tests\u002Fbenchmark\u002Fmcpuniverse\u002Ftest_benchmark_3d_design.py\n```\n\n#### 
批量执行\n\n要对所有领域进行全面评估：\n\n```bash\n#!\u002Fbin\u002Fbash\nexport PYTHONPATH=.\n\ndomains=(\"location_navigation\" \"browser_automation\" \"financial_analysis\" \n         \"repository_management\" \"web_search\" \"3d_design\")\n\nfor domain in \"${domains[@]}\"; do\n    echo \"正在运行基准测试：$domain\"\n    python \"tests\u002Fbenchmark\u002Fmcpuniverse\u002Ftest_benchmark_${domain}.py\"\n    echo \"已完成：$domain\"\ndone\n```\n\n### 保存运行日志\n\n如果您想保存运行日志，可以将 `trace_collector` 传递给基准测试运行函数：\n\n```python\nfrom mcpuniverse.tracer.collectors import FileCollector\n\ntrace_collector = FileCollector(log_file=\"log\u002Flocation_navigation.log\")\nbenchmark_results = await benchmark.run(trace_collector=trace_collector)\n```\n\n### 将基准测试结果保存为报告\n\n如果您想保存基准测试结果报告，可以使用 `BenchmarkReport` 来导出报告：\n\n```python\nfrom mcpuniverse.benchmark.report import BenchmarkReport\n\nreport = BenchmarkReport(benchmark, trace_collector=trace_collector)\nreport.dump()\n```\n\n### 可视化代理运行信息\n\n要运行基准测试并查看中间结果和实时进度，可以将 `callbacks=get_vprint_callbacks()` 传递给运行函数：\n\n```python\nfrom mcpuniverse.callbacks.handlers.vprint import get_vprint_callbacks\n\nbenchmark_results = await benchmark.run(\n    trace_collector=trace_collector, \n    callbacks=get_vprint_callbacks()\n)\n```\n\n这将在基准测试运行时打印出中间结果。\n\n\n有关更多详细信息，请参阅代码中的文档或仓库中现有的配置示例。\n\n## 创建自定义基准测试\n\n一个基准测试由三个主要的配置元素定义：任务定义、代理\u002F工作流定义以及基准测试本身的配置。下面是一个使用简单“天气预报”任务的示例。\n\n### 任务定义\n\n任务定义以 JSON 格式提供，例如：\n\n```json\n{\n  \"category\": \"general\",\n  \"question\": \"旧金山现在的天气如何？\",\n  \"mcp_servers\": [\n    {\n      \"name\": \"weather\"\n    }\n  ],\n  \"output_format\": {\n    \"city\": \"\u003C城市>\",\n    \"weather\": \"\u003C天气预报结果>\"\n  },\n  \"evaluators\": [\n    {\n      \"func\": \"json -> get(city)\",\n      \"op\": \"=\",\n      \"value\": \"San Francisco\"\n    }\n  ]\n}\n```\n\n字段说明：\n\n1. **category**：任务类别，例如“general”、“google-maps”等。您可以为该属性设置任何值。\n2. **question**：您希望在此任务中提出的主要问题。这被视为用户消息。\n3. **mcp_servers**：此框架支持的 MCP 服务器列表。\n4. 
**output_format**：代理响应的期望输出格式。\n5. **evaluators**：用于评估的测试列表。对于每个测试\u002F评估器，它有三个属性：“func”表示如何从代理响应中提取值，“op”是比较运算符，“value”是真实值。\n   它将评估 **op(func(...), value, op_args...)**。“op”可以是“=”，“\u003C”，“>”或其他自定义运算符。\n\n在“evaluators”中，您需要编写一条规则（“func”属性），说明如何提取用于测试的值。在上面的示例中，“json -> get(city)”会先进行 JSON 解码，然后提取键“city”的值。此仓库中预定义了几种函数：\n\n1. **json**：执行 JSON 解码。\n2. **get**：获取某个键的值。\n3. **len**：获取列表的长度。\n4. **foreach**：执行 FOR-EACH 循环。\n\n例如，假设我们定义了如下数据：\n\n```python\ndata = {\"x\": [{\"y\": [1]}, {\"y\": [1, 1]}, {\"y\": [1, 2, 3, 4]}]}\n```\n\n那么 `get(x) -> foreach -> get(y) -> len` 将执行以下操作：\n\n1. 获取“x”的值：`[{\"y\": [1]}, {\"y\": [1, 1]}, {\"y\": [1, 2, 3, 4]}]`。\n2. 执行 foreach 循环，获取“y”的值：`[[1], [1, 1], [1, 2, 3, 4]]`。\n3. 获取每个列表的长度：`[1, 2, 4]`。\n\n如果这些预定义的函数不够用，您可以实现自定义函数。有关更多详细信息，请参阅\n此 [文档](https:\u002F\u002Fgithub.com\u002FSalesforceAIResearch\u002FMCP-Universe\u002Fblob\u002Fmain\u002Fdocs\u002Fcustom-evaluators-guide.md)。\n\n### 基准测试定义\n\n在 YAML 文件中定义代理和基准测试。以下是一个简单的天气预报基准测试：\n\n```yaml\nkind: llm\nspec:\n  name: llm-1\n  type: openai\n  config:\n    model_name: gpt-4o\n\n---\nkind: agent\nspec:\n  name: ReAct-agent\n  type: react\n  config:\n    llm: llm-1\n    instruction: 您是一名天气预报代理。\n    servers:\n      - name: weather\n\n---\nkind: benchmark\nspec:\n  description: 测试天气预报代理\n  agent: ReAct-agent\n  tasks:\n    - dummy\u002Ftasks\u002Fweather.json\n```\n\n基准测试定义主要包括两部分：代理定义和基准测试配置。基准测试配置很简单——您只需指定要使用的代理（通过已定义的代理名称）以及要评估的任务列表。每个任务条目都是任务配置文件的路径。它可以是完整文件路径，也可以是相对路径。如果是相对路径（如“dummy\u002Ftasks\u002Fweather.json”），则应将其放置在本仓库的\n[mcpuniverse\u002Fbenchmark\u002Fconfigs](https:\u002F\u002Fgithub.com\u002FSalesforceAIResearch\u002FMCP-Universe\u002Ftree\u002Fmain\u002Fmcpuniverse\u002Fbenchmark\u002Fconfigs)\n文件夹中。\n\n该框架提供了一种灵活的方式来定义简单代理（如 ReAct）以及更复杂的多步骤代理工作流。\n\n1. **指定 LLM**：首先声明您希望代理使用的大型语言模型（LLM）。每个 LLM 组件必须被分配一个唯一的名称（例如“llm-1”）。这些名称作为标识符，框架使用它们来连接不同的组件。\n2. 
**定义代理**：接下来，通过提供代理的名称并选择代理类来定义代理。代理类可在\n[mcpuniverse.agent](https:\u002F\u002Fgithub.com\u002FSalesforceAIResearch\u002FMCP-Universe\u002Ftree\u002Fmain\u002Fmcpuniverse\u002Fagent) 包中找到。\n常用类包括“basic”、“function-call”和“react”。在代理规范（`spec.config`）中，您还必须通过设置“llm”字段来指示代理应使用哪个 LLM 实例。\n3. **创建复杂的工作流**：除了简单代理外，该框架还支持定义复杂的编排型工作流，其中多个代理相互协作以解决更复杂的任务。\n\n例如：\n\n```yaml\nkind: llm\nspec:\n  name: llm-1\n  type: openai\n  config:\n    model_name: gpt-4o\n\n---\nkind: agent\nspec:\n  name: basic-agent\n  type: basic\n  config:\n    llm: llm-1\n    instruction: 返回某个地点的纬度和经度。\n\n---\nkind: agent\nspec:\n  name: function-call-agent\n  type: function-call\n  config:\n    llm: llm-1\n    instruction: 您是一名天气预报代理。请根据给定的纬度和经度返回今天的天气情况。\n    servers:\n      - name: weather\n\n---\nkind: workflow\nspec:\n  name: orchestrator-workflow\n  type: orchestrator\n  config:\n    llm: llm-1\n    agents:\n      - basic-agent\n      - function-call-agent\n\n---\nkind: benchmark\nspec:\n  description: 测试天气预报代理\n  agent: orchestrator-workflow\n  tasks:\n    - dummy\u002Ftasks\u002Fweather.json\n```\n\n## 引用\n\n如果您在研究中使用 MCP-Universe，请引用我们的论文：\n\n```bibtex\n@misc{mcpuniverse,\n  title={MCP-Universe: Benchmarking Large Language Models with Real-World Model Context Protocol Servers},\n  author={Ziyang Luo and Zhiqi Shen and Wenzhuo Yang and Zirui Zhao and Prathyusha Jwalapuram and Amrita Saha and Doyen Sahoo and Silvio Savarese and Caiming Xiong and Junnan Li},\n  year={2025},\n  eprint={2508.14704},\n  archivePrefix={arXiv},\n  primaryClass={cs.AI},\n  url={https:\u002F\u002Farxiv.org\u002Fabs\u002F2508.14704}, \n}\n```","# MCP-Universe 快速上手指南\n\nMCP-Universe 是一个用于构建、优化和评估基于模型上下文协议（MCP）的 AI Agent 的综合生态系统。它提供了业界领先的基准测试工具，支持在真实场景中与实际的 MCP 服务器交互，涵盖长程推理、复杂工具空间及动态数据源等挑战。\n\n## 环境准备\n\n在开始之前，请确保您的开发环境满足以下要求：\n\n*   **操作系统**：Linux 或 macOS（Windows 用户建议使用 WSL2）\n*   **Python**：版本 3.10 或更高\n*   **Docker**：必须安装并正在运行，用于启动容器化的 MCP 服务器\n*   **可选依赖**：\n    *   **PostgreSQL**：用于数据持久化存储\n    *   **Redis**：用于缓存和内存管理\n\n**系统依赖安装命令：**\n\n*   **Linux (Ubuntu\u002FDebian):**\n    ```bash\n 
   sudo apt-get install libpq-dev\n    ```\n*   **macOS:**\n    ```bash\n    brew install postgresql\n    ```\n\n## 安装步骤\n\n按照以下步骤克隆仓库并配置运行环境：\n\n1.  **克隆仓库**\n    ```bash\n    git clone https:\u002F\u002Fgithub.com\u002FSalesforceAIResearch\u002FMCP-Universe.git\n    cd MCP-Universe\n    ```\n\n2.  **创建并激活虚拟环境**\n    ```bash\n    python3 -m venv venv\n    source venv\u002Fbin\u002Factivate\n    ```\n    > **提示**：国内用户若下载依赖较慢，可先配置 pip 国内镜像源（如阿里云或清华源）：\n    > `pip config set global.index-url https:\u002F\u002Fmirrors.aliyun.com\u002Fpypi\u002Fsimple\u002F`\n\n3.  **安装依赖包**\n    ```bash\n    pip install -r requirements.txt\n    pip install -r dev-requirements.txt\n    ```\n\n4.  **配置代码提交钩子（可选但推荐）**\n    ```bash\n    pre-commit install\n    ```\n\n5.  **配置环境变量**\n    复制示例配置文件并根据需要编辑：\n    ```bash\n    cp .env.example .env\n    ```\n    *编辑 `.env` 文件，填入您的 API Keys（如 `OPENAI_API_KEY`, `GOOGLE_MAPS_API_KEY` 等），具体取决于您要运行的测试任务。*\n\n## 基本使用\n\n完成安装后，您可以通过编写简单的 Python 脚本来运行基准测试。以下是一个最小化的运行示例：\n\n```python\nfrom mcpuniverse.tracer.collectors import MemoryCollector  # 也可以使用 SQLiteCollector\nfrom mcpuniverse.benchmark.runner import BenchmarkRunner\n\nasync def test():\n    # 初始化追踪收集器\n    trace_collector = MemoryCollector()\n    \n    # 选择配置文件 (位于 mcpuniverse\u002Fbenchmark\u002Fconfigs 目录下)\n    benchmark = BenchmarkRunner(\"dummy\u002Fbenchmark_1.yaml\")\n    \n    # 运行指定的基准测试\n    results = await benchmark.run(trace_collector=trace_collector)\n    \n    # 获取追踪记录\n    trace_id = results[0].task_trace_ids[\"dummy\u002Ftasks\u002Fweather_1.json\"]\n    trace_records = trace_collector.get(trace_id)\n    \n    # 此处可添加打印或分析结果的代码\n    print(f\"Trace records: {trace_records}\")\n\n# 注意：在实际运行前，请确保已正确设置异步入口点\n# 例如在脚本末尾添加:\n# import asyncio\n# asyncio.run(test())\n```\n\n**运行说明：**\n1.  确保 Docker 服务已启动，因为基准测试通常需要拉取并运行特定的 MCP 服务器容器。\n2.  确认 `.env` 文件中已配置好对应任务所需的 API Key。\n3.  
运行上述脚本即可开始评估 Agent 在特定任务上的表现。","某金融科技团队正在开发一个能自动连接内部数据库、实时新闻源和交易 API 的复杂投资分析 Agent，以辅助分析师进行多步骤的市场调研与决策。\n\n### 没有 MCP-Universe 时\n- **评估脱离实际**：团队只能使用简化的静态数据集测试 Agent，无法验证其在连接真实 MCP 服务器处理动态金融数据时的长程推理能力。\n- **上下文成本高昂**：Agent 调用工具返回的冗长原始数据直接填入上下文，导致 LLM Token 消耗巨大，单次深度调研成本难以承受。\n- **研发效率低下**：缺乏统一的框架来编排并行工具调用，开发“广度优先”的深度研究功能需从零构建，耗时且容易出错。\n- **基准缺失**：没有行业标准基准（如 MCPMark）对标，难以量化 Agent 在陌生工具空间中的真实性能差距。\n\n### 使用 MCP-Universe 后\n- **真实场景验证**：利用内置的 MCPMark 基准和真实服务器交互环境，团队直接在动态金融场景中评估 Agent，精准捕捉长任务链中的推理断点。\n- **成本大幅降低**：集成 MCP+ 模块进行精确上下文管理，自动过滤冗余输出，在不牺牲分析质量的前提下将 Token 成本降低了 75%。\n- **高效并行扩展**：借助原生的深度研究 Agent（Deep Research Agent）架构，轻松实现多工具并行调用，显著提升了市场情报收集的宽度与效率。\n- **可视化迭代**：通过框架提供的运行日志可视化和标准化报告，团队能快速定位故障并量化性能提升，加速产品迭代周期。\n\nMCP-Universe 通过提供真实的基准测试、极致的成本控制及高效的编排能力，让复杂工具型 AI Agent 从实验室原型快速走向生产级应用。","https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002FSalesforceAIResearch_MCP-Universe_0a29aa6f.png","SalesforceAIResearch","Salesforce AI Research","https:\u002F\u002Foss.gittoolsai.com\u002Favatars\u002FSalesforceAIResearch_6ff2d82a.png","Open Source projects released by Salesforce AI Research",null,"ospo@salesforce.com","https:\u002F\u002Fgithub.com\u002FSalesforceAIResearch",[81,85,89,92,96,100,104,107,110],{"name":82,"color":83,"percentage":84},"Python","#3572A5",97.4,{"name":86,"color":87,"percentage":88},"PLpgSQL","#336790",0.9,{"name":90,"color":91,"percentage":88},"Jupyter Notebook","#DA5B0B",{"name":93,"color":94,"percentage":95},"Shell","#89e051",0.4,{"name":97,"color":98,"percentage":99},"Jinja","#a52a22",0.3,{"name":101,"color":102,"percentage":103},"Makefile","#427819",0,{"name":105,"color":106,"percentage":103},"Dockerfile","#384d54",{"name":108,"color":109,"percentage":103},"CSS","#663399",{"name":111,"color":112,"percentage":103},"HTML","#e34c26",579,81,"2026-04-15T20:33:51","Apache-2.0","Linux, macOS","未说明",{"notes":120,"python":121,"dependencies":122},"需要安装 Docker 以运行容器化的 MCP 服务器。可选配置 PostgreSQL 用于数据持久化，Redis 用于缓存和内存管理。Linux 用户需安装 libpq-dev，macOS 用户需通过 brew 安装 
postgresql。需配置 .env 文件并填入相关 API 密钥（如 OpenAI, Google Maps 等）方可运行基准测试。","3.10+",[123,124,125,126,127],"libpq-dev (Linux)","postgresql (macOS)","docker","redis (可选)","postgresql (可选)",[14,13,129],"其他","2026-03-27T02:49:30.150509","2026-04-17T09:53:28.864701",[133,138,143,148,153,158,163,168],{"id":134,"question_zh":135,"answer_zh":136,"source_url":137},37164,"在构建 ReAct 架构的系统提示词（System Prompt）时，是否需要包含所有所需 MCP 工具的完整 Schema？","是的，这是正确的。在系统提示词构建阶段，应该包含任务所需的所有 MCP 的完整工具 Schema（包括名称、参数等）。不过需要注意的是，如果工具数量过多（例如浏览器自动化任务涉及上百个工具），直接全部放入提示词可能会导致模型难以选择或超出上下文限制，未来可能会探索更优化的工具选择策略。","https:\u002F\u002Fgithub.com\u002FSalesforceAIResearch\u002FMCP-Universe\u002Fissues\u002F8",{"id":139,"question_zh":140,"answer_zh":141,"source_url":142},37165,"运行基准测试时遇到 'ValueError: The command must be a valid string' 错误怎么办？","该错误通常发生在 MCP 服务器初始化阶段。请检查以下两点：\n1. 确认 `mcpuniverse\u002Fmcp\u002Fconfigs\u002Fserver_list.json` 文件中 Playwright MCP 服务器的配置是否正确。\n2. Playwright MCP 服务器依赖 `npx`，请确保您的环境中已安装并可用 `npx`（可以通过安装 Node.js 获得）。许多用户在安装 npx 后解决了此问题。","https:\u002F\u002Fgithub.com\u002FSalesforceAIResearch\u002FMCP-Universe\u002Fissues\u002F32",{"id":144,"question_zh":145,"answer_zh":146,"source_url":147},37166,"排行榜上的分数是如何计算的？为什么有些模型的得分看起来不符合直觉？","每个任务 JSON 文件中的 `evaluators` 字段是一个列表，列表中的每个字典代表一条计算规则。以 3d_design 任务为例，规则包括 `blender.check_file` 和 `blender.check_file_content`。评分逻辑是累加的：只要生成了文件即可获得部分分数（如 0.5），如果生成的文件内容也符合规则则获得满分（1.0）。因此，得分差异反映了模型生成有效文件和合规内容的综合能力，而非仅仅是否生成了文件。","https:\u002F\u002Fgithub.com\u002FSalesforceAIResearch\u002FMCP-Universe\u002Fissues\u002F47",{"id":149,"question_zh":150,"answer_zh":151,"source_url":152},37167,"对于需要保留推理过程（thinking\u002Freasoning）的模型（如 GLM-4.5），在使用 Function Calling 模式时如何处理消息历史？","目前为了与 OpenAI 风格的闭源模型保持一致（它们不暴露推理内容），默认实现仅追加 `tool_calls` 到消息历史中，不包含推理痕迹。如果您使用的模型（如 GLM-4.5）依赖上一步的推理内容进行下一步生成，您可以自行设计新的消息格式，修改代码将完整的 `message_obj` 追加到历史中（即使用 `message.append(message_obj)` 而非仅追加 
tool_calls）。如果有新的评测结果，也可以提交以更新排行榜。","https:\u002F\u002Fgithub.com\u002FSalesforceAIResearch\u002FMCP-Universe\u002Fissues\u002F29",{"id":154,"question_zh":155,"answer_zh":156,"source_url":157},37168,"运行报告生成时出现异常，提示日志目录（Logdir）不存在，如何解决？","这是因为日志目录不会自动创建。您可以在 `mcpuniverse\u002Fbenchmark\u002Freport.py` 文件的第 15 行附近添加以下代码来自动创建目录：\n```python\nREPORT_FOLDER.mkdir(parents=True, exist_ok=True)\n```\n这将确保在序列化结果之前目录已存在，避免新用户在首次运行时遇到异常。该修复已合并到主分支。","https:\u002F\u002Fgithub.com\u002FSalesforceAIResearch\u002FMCP-Universe\u002Fissues\u002F16",{"id":159,"question_zh":160,"answer_zh":161,"source_url":162},37169,"示例代码中的 `task_trace_ids` 键名报错找不到文件，正确的键名是什么？","示例代码中存在一个键名错误。请将获取 trace_id 的代码行：\n`trace_id = results[0].task_trace_ids[\"dummy\u002Ftasks\u002Fweather.json\"]`\n修改为：\n`trace_id = results[0].task_trace_ids[\"dummy\u002Ftasks\u002Fweather_1.json\"]`\n注意文件名后缀的变化（增加了 `_1`）。该文档错误已在 README 中修复。","https:\u002F\u002Fgithub.com\u002FSalesforceAIResearch\u002FMCP-Universe\u002Fissues\u002F15",{"id":164,"question_zh":165,"answer_zh":166,"source_url":167},37170,"代码库中在哪里可以找到论文中提到的 'Cursor Agent' 的实现？","目前代码库中尚未包含 Cursor Agent 的具体实现。维护者表示将在短期内发布该部分的代码，请关注仓库的后续更新。","https:\u002F\u002Fgithub.com\u002FSalesforceAIResearch\u002FMCP-Universe\u002Fissues\u002F7",{"id":169,"question_zh":170,"answer_zh":171,"source_url":172},37171,"为什么 `mcpuniverse\u002Fllm\u002F__init__.py` 中没有列出 Gemini 模型？是不支持还是遗漏了？","这主要是之前的遗漏，并非 Gemini 不兼容。维护者已确认会将 Gemini 添加到 `mcpuniverse\u002Fllm\u002F__init__.py` 中以支持该模型。","https:\u002F\u002Fgithub.com\u002FSalesforceAIResearch\u002FMCP-Universe\u002Fissues\u002F6",[174,179,184,189,194,199,204,209,214],{"id":175,"version":176,"summary_zh":177,"released_at":178},297698,"v1.1.3","## 变更内容\n* 添加 Salesforce 网关支持及 MCP+ 的智能配置更新，由 @pjwalapuram 在 https:\u002F\u002Fgithub.com\u002FSalesforceAIResearch\u002FMCP-Universe\u002Fpull\u002F56 中完成。\n\n\n**完整变更日志**: 
https:\u002F\u002Fgithub.com\u002FSalesforceAIResearch\u002FMCP-Universe\u002Fcompare\u002Fv1.1.2...v1.1.3","2026-03-25T05:32:23",{"id":180,"version":181,"summary_zh":182,"released_at":183},297699,"v1.1.2","## 变更内容\n* 添加支持头部的 HTTP\u002FSSE 传输，并修复资源 URI 验证，由 @pjwalapuram 在 https:\u002F\u002Fgithub.com\u002FSalesforceAIResearch\u002FMCP-Universe\u002Fpull\u002F55 中完成。\n\n\n**完整变更日志**: https:\u002F\u002Fgithub.com\u002FSalesforceAIResearch\u002FMCP-Universe\u002Fcompare\u002Fv1.1.1...v1.1.2","2026-03-10T08:54:49",{"id":185,"version":186,"summary_zh":187,"released_at":188},297700,"v1.1.1","修复依赖问题","2026-03-03T02:51:03",{"id":190,"version":191,"summary_zh":192,"released_at":193},297701,"v1.1.0","## 变更内容\n* @likaixin2000 在 https:\u002F\u002Fgithub.com\u002FSalesforceAIResearch\u002FMCP-Universe\u002Fpull\u002F38 中添加了 Blender 插件文件，以简化基准测试的设置。\n* @likaixin2000 在 https:\u002F\u002Fgithub.com\u002FSalesforceAIResearch\u002FMCP-Universe\u002Fpull\u002F39 中添加了用于 Blender 和 VNC 服务的安装脚本。\n* @viczxchen 在 https:\u002F\u002Fgithub.com\u002FSalesforceAIResearch\u002FMCP-Universe\u002Fpull\u002F36 中将 MCPMark 迁移到 MCP-Universe 框架。\n* @xqlin98 在 https:\u002F\u002Fgithub.com\u002FSalesforceAIResearch\u002FMCP-Universe\u002Fpull\u002F52 中更新了主 README 文件中的新闻。\n* @pjwalapuram 在 https:\u002F\u002Fgithub.com\u002FSalesforceAIResearch\u002FMCP-Universe\u002Fpull\u002F54 中新增了 MCP+：面向 MCP 代理的精准上下文管理。\n\n## 新贡献者\n* @likaixin2000 在 https:\u002F\u002Fgithub.com\u002FSalesforceAIResearch\u002FMCP-Universe\u002Fpull\u002F38 中完成了首次贡献。\n* @viczxchen 在 https:\u002F\u002Fgithub.com\u002FSalesforceAIResearch\u002FMCP-Universe\u002Fpull\u002F36 中完成了首次贡献。\n* @xqlin98 在 https:\u002F\u002Fgithub.com\u002FSalesforceAIResearch\u002FMCP-Universe\u002Fpull\u002F52 中完成了首次贡献。\n* @pjwalapuram 在 https:\u002F\u002Fgithub.com\u002FSalesforceAIResearch\u002FMCP-Universe\u002Fpull\u002F54 中完成了首次贡献。\n\n**完整变更日志**: 
https:\u002F\u002Fgithub.com\u002FSalesforceAIResearch\u002FMCP-Universe\u002Fcompare\u002Fv1.0.4...v1.1.0","2026-02-26T07:24:21",{"id":195,"version":196,"summary_zh":197,"released_at":198},297702,"v1.0.4","修复 bugs","2025-10-16T09:09:14",{"id":200,"version":201,"summary_zh":202,"released_at":203},297703,"v1.0.3","实现了基于 Celery 工作进程和消息队列的分布式任务执行框架，用于异步运行 AI 代理。","2025-10-07T08:22:37",{"id":205,"version":206,"summary_zh":207,"released_at":208},297704,"v1.0.2","支持 OpenAI SDK\n修复一些小问题","2025-09-18T04:22:10",{"id":210,"version":211,"summary_zh":212,"released_at":213},297705,"v1.0.1","添加新的函数调用代理\n优化大模型 API 调用","2025-09-05T08:02:48",{"id":215,"version":216,"summary_zh":217,"released_at":218},297706,"v1.0.0","第一次发布","2025-09-04T03:21:11"]