[{"data":1,"prerenderedAt":-1},["ShallowReactive",2],{"similar-microsoft--WindowsAgentArena":3,"tool-microsoft--WindowsAgentArena":62},[4,18,26,36,46,54],{"id":5,"name":6,"github_repo":7,"description_zh":8,"stars":9,"difficulty_score":10,"last_commit_at":11,"category_tags":12,"status":17},4358,"openclaw","openclaw\u002Fopenclaw","OpenClaw 是一款专为个人打造的本地化 AI 助手，旨在让你在自己的设备上拥有完全可控的智能伙伴。它打破了传统 AI 助手局限于特定网页或应用的束缚，能够直接接入你日常使用的各类通讯渠道，包括微信、WhatsApp、Telegram、Discord、iMessage 等数十种平台。无论你在哪个聊天软件中发送消息，OpenClaw 都能即时响应，甚至支持在 macOS、iOS 和 Android 设备上进行语音交互，并提供实时的画布渲染功能供你操控。\n\n这款工具主要解决了用户对数据隐私、响应速度以及“始终在线”体验的需求。通过将 AI 部署在本地，用户无需依赖云端服务即可享受快速、私密的智能辅助，真正实现了“你的数据，你做主”。其独特的技术亮点在于强大的网关架构，将控制平面与核心助手分离，确保跨平台通信的流畅性与扩展性。\n\nOpenClaw 非常适合希望构建个性化工作流的技术爱好者、开发者，以及注重隐私保护且不愿被单一生态绑定的普通用户。只要具备基础的终端操作能力（支持 macOS、Linux 及 Windows WSL2），即可通过简单的命令行引导完成部署。如果你渴望拥有一个懂你",349277,3,"2026-04-06T06:32:30",[13,14,15,16],"Agent","开发框架","图像","数据工具","ready",{"id":19,"name":20,"github_repo":21,"description_zh":22,"stars":23,"difficulty_score":10,"last_commit_at":24,"category_tags":25,"status":17},3808,"stable-diffusion-webui","AUTOMATIC1111\u002Fstable-diffusion-webui","stable-diffusion-webui 是一个基于 Gradio 构建的网页版操作界面，旨在让用户能够轻松地在本地运行和使用强大的 Stable Diffusion 图像生成模型。它解决了原始模型依赖命令行、操作门槛高且功能分散的痛点，将复杂的 AI 绘图流程整合进一个直观易用的图形化平台。\n\n无论是希望快速上手的普通创作者、需要精细控制画面细节的设计师，还是想要深入探索模型潜力的开发者与研究人员，都能从中获益。其核心亮点在于极高的功能丰富度：不仅支持文生图、图生图、局部重绘（Inpainting）和外绘（Outpainting）等基础模式，还独创了注意力机制调整、提示词矩阵、负向提示词以及“高清修复”等高级功能。此外，它内置了 GFPGAN 和 CodeFormer 等人脸修复工具，支持多种神经网络放大算法，并允许用户通过插件系统无限扩展能力。即使是显存有限的设备，stable-diffusion-webui 也提供了相应的优化选项，让高质量的 AI 艺术创作变得触手可及。",162132,"2026-04-05T11:01:52",[14,15,13],{"id":27,"name":28,"github_repo":29,"description_zh":30,"stars":31,"difficulty_score":32,"last_commit_at":33,"category_tags":34,"status":17},1381,"everything-claude-code","affaan-m\u002Feverything-claude-code","everything-claude-code 是一套专为 AI 编程助手（如 Claude Code、Codex、Cursor 等）打造的高性能优化系统。它不仅仅是一组配置文件，而是一个经过长期实战打磨的完整框架，旨在解决 AI 
代理在实际开发中面临的效率低下、记忆丢失、安全隐患及缺乏持续学习能力等核心痛点。\n\n通过引入技能模块化、直觉增强、记忆持久化机制以及内置的安全扫描功能，everything-claude-code 能显著提升 AI 在复杂任务中的表现，帮助开发者构建更稳定、更智能的生产级 AI 代理。其独特的“研究优先”开发理念和针对 Token 消耗的优化策略，使得模型响应更快、成本更低，同时有效防御潜在的攻击向量。\n\n这套工具特别适合软件开发者、AI 研究人员以及希望深度定制 AI 工作流的技术团队使用。无论您是在构建大型代码库，还是需要 AI 协助进行安全审计与自动化测试，everything-claude-code 都能提供强大的底层支持。作为一个曾荣获 Anthropic 黑客大奖的开源项目，它融合了多语言支持与丰富的实战钩子（hooks），让 AI 真正成长为懂上",159636,2,"2026-04-17T23:33:34",[14,13,35],"语言模型",{"id":37,"name":38,"github_repo":39,"description_zh":40,"stars":41,"difficulty_score":42,"last_commit_at":43,"category_tags":44,"status":17},8272,"opencode","anomalyco\u002Fopencode","OpenCode 是一款开源的 AI 编程助手（Coding Agent），旨在像一位智能搭档一样融入您的开发流程。它不仅仅是一个代码补全插件，而是一个能够理解项目上下文、自主规划任务并执行复杂编码操作的智能体。无论是生成全新功能、重构现有代码，还是排查难以定位的 Bug，OpenCode 都能通过自然语言交互高效完成，显著减少开发者在重复性劳动和上下文切换上的时间消耗。\n\n这款工具专为软件开发者、工程师及技术研究人员设计，特别适合希望利用大模型能力来提升编码效率、加速原型开发或处理遗留代码维护的专业人群。其核心亮点在于完全开源的架构，这意味着用户可以审查代码逻辑、自定义行为策略，甚至私有化部署以保障数据安全，彻底打破了传统闭源 AI 助手的“黑盒”限制。\n\n在技术体验上，OpenCode 提供了灵活的终端界面（Terminal UI）和正在测试中的桌面应用程序，支持 macOS、Windows 及 Linux 全平台。它兼容多种包管理工具，安装便捷，并能无缝集成到现有的开发环境中。无论您是追求极致控制权的资深极客，还是渴望提升产出的独立开发者，OpenCode 都提供了一个透明、可信",144296,1,"2026-04-16T14:50:03",[13,45],"插件",{"id":47,"name":48,"github_repo":49,"description_zh":50,"stars":51,"difficulty_score":32,"last_commit_at":52,"category_tags":53,"status":17},2271,"ComfyUI","Comfy-Org\u002FComfyUI","ComfyUI 是一款功能强大且高度模块化的视觉 AI 引擎，专为设计和执行复杂的 Stable Diffusion 图像生成流程而打造。它摒弃了传统的代码编写模式，采用直观的节点式流程图界面，让用户通过连接不同的功能模块即可构建个性化的生成管线。\n\n这一设计巧妙解决了高级 AI 绘图工作流配置复杂、灵活性不足的痛点。用户无需具备编程背景，也能自由组合模型、调整参数并实时预览效果，轻松实现从基础文生图到多步骤高清修复等各类复杂任务。ComfyUI 拥有极佳的兼容性，不仅支持 Windows、macOS 和 Linux 全平台，还广泛适配 NVIDIA、AMD、Intel 及苹果 Silicon 等多种硬件架构，并率先支持 SDXL、Flux、SD3 等前沿模型。\n\n无论是希望深入探索算法潜力的研究人员和开发者，还是追求极致创作自由度的设计师与资深 AI 绘画爱好者，ComfyUI 
都能提供强大的支持。其独特的模块化架构允许社区不断扩展新功能，使其成为当前最灵活、生态最丰富的开源扩散模型工具之一，帮助用户将创意高效转化为现实。",108322,"2026-04-10T11:39:34",[14,15,13],{"id":55,"name":56,"github_repo":57,"description_zh":58,"stars":59,"difficulty_score":32,"last_commit_at":60,"category_tags":61,"status":17},6121,"gemini-cli","google-gemini\u002Fgemini-cli","gemini-cli 是一款由谷歌推出的开源 AI 命令行工具，它将强大的 Gemini 大模型能力直接集成到用户的终端环境中。对于习惯在命令行工作的开发者而言，它提供了一条从输入提示词到获取模型响应的最短路径，无需切换窗口即可享受智能辅助。\n\n这款工具主要解决了开发过程中频繁上下文切换的痛点，让用户能在熟悉的终端界面内直接完成代码理解、生成、调试以及自动化运维任务。无论是查询大型代码库、根据草图生成应用，还是执行复杂的 Git 操作，gemini-cli 都能通过自然语言指令高效处理。\n\n它特别适合广大软件工程师、DevOps 人员及技术研究人员使用。其核心亮点包括支持高达 100 万 token 的超长上下文窗口，具备出色的逻辑推理能力；内置 Google 搜索、文件操作及 Shell 命令执行等实用工具；更独特的是，它支持 MCP（模型上下文协议），允许用户灵活扩展自定义集成，连接如图像生成等外部能力。此外，个人谷歌账号即可享受免费的额度支持，且项目基于 Apache 2.0 协议完全开源，是提升终端工作效率的理想助手。",100752,"2026-04-10T01:20:03",[45,13,15,14],{"id":63,"github_repo":64,"name":65,"description_en":66,"description_zh":67,"ai_summary_zh":67,"readme_en":68,"readme_zh":69,"quickstart_zh":70,"use_case_zh":71,"hero_image_url":72,"owner_login":73,"owner_name":74,"owner_avatar_url":75,"owner_bio":76,"owner_company":77,"owner_location":77,"owner_email":78,"owner_twitter":79,"owner_website":80,"owner_url":81,"languages":82,"stars":111,"forks":112,"last_commit_at":113,"license":114,"difficulty_score":115,"env_os":116,"env_gpu":117,"env_ram":118,"env_deps":119,"category_tags":126,"github_topics":128,"view_count":32,"oss_zip_url":77,"oss_zip_packed_at":77,"status":17,"created_at":138,"updated_at":139,"faqs":140,"releases":171},8938,"microsoft\u002FWindowsAgentArena","WindowsAgentArena","Windows Agent Arena (WAA) 🪟 is a scalable OS platform for testing and benchmarking of multi-modal AI agents.","Windows Agent Arena 是一个专为测试和评估多模态 AI 智能体而设计的可扩展 Windows 操作系统平台。它主要解决了当前 AI 研究缺乏真实、可复现的桌面环境难题，让开发者能够在接近真实的 Windows 系统中验证智能体处理复杂任务的能力，而非仅停留在理论或简化模拟阶段。\n\n该平台特别适合人工智能研究人员、算法工程师以及致力于开发桌面自动化助手的开发者使用。其核心亮点在于强大的规模化部署能力：依托 Azure ML 云基础设施，Windows Agent Arena 
支持并行运行数百个智能体，能在几分钟内完成大量任务的基准测试，将原本需要数天的评估工作大幅提速。此外，平台近期还更新了“困难模式”，要求智能体自主初始化任务环境（如自行查找并打开所需软件），从而更严格地考验其独立操作与规划能力。结合微软开源的顶尖屏幕理解模型 Omniparser，Windows Agent Arena 为构建和评估能像人类一样操作电脑的智能体提供了高效、专业的实验场。","\r\n\u003Cdiv align=\"center\">\r\n    \r\n![Banner](https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Fmicrosoft_WindowsAgentArena_readme_0b596ef712e4.png)\r\n[![Website](https:\u002F\u002Fimg.shields.io\u002Fbadge\u002FWebsite-red)](https:\u002F\u002Fmicrosoft.github.io\u002FWindowsAgentArena)\r\n[![arXiv](https:\u002F\u002Fimg.shields.io\u002Fbadge\u002FPaper-green)](https:\u002F\u002Farxiv.org\u002Fabs\u002F2409.08264)\r\n[![License](https:\u002F\u002Fimg.shields.io\u002Fbadge\u002FLicense-MIT-yellow.svg)](https:\u002F\u002Fopensource.org\u002Flicenses\u002FMIT)\r\n[![PRs](https:\u002F\u002Fimg.shields.io\u002Fbadge\u002FAI-Podcast-blue.svg?logo=data:image\u002Fsvg%2bxml;base64,PHN2ZyBmaWxsPSIjZmZmZmZmIiB2aWV3Qm94PSIwIDAgMjQgMjQiIHhtbG5zPSJodHRwOi8vd3d3LnczLm9yZy8yMDAwL3N2ZyI+PGcgaWQ9IlNWR1JlcG9fYmdDYXJyaWVyIiBzdHJva2Utd2lkdGg9IjAiPjwvZz48ZyBpZD0iU1ZHUmVwb190cmFjZXJDYXJyaWVyIiBzdHJva2UtbGluZWNhcD0icm91bmQiIHN0cm9rZS1saW5lam9pbj0icm91bmQiPjwvZz48ZyBpZD0iU1ZHUmVwb19pY29uQ2FycmllciI+PHBhdGggZD0iTTEzLDRWMjBhMSwxLDAsMCwxLTIsMFY0YTEsMSwwLDAsMSwyLDBaTTgsNUExLDEsMCwwLDAsNyw2VjE4YTEsMSwwLDAsMCwyLDBWNkExLDEsMCwwLDAsOCw1Wk00LDdBMSwxLDAsMCwwLDMsOHY4YTEsMSwwLDAsMCwyLDBWOEExLDEsMCwwLDAsNCw3Wk0xNiw1YTEsMSwwLDAsMC0xLDFWMThhMSwxLDAsMCwwLDIsMFY2QTEsMSwwLDAsMCwxNiw1Wm00LDJhMSwxLDAsMCwwLTEsMXY4YTEsMSwwLDAsMCwyLDBWOEExLDEsMCwwLDAsMjAsN1oiPjwvcGF0aD48L2c+PC9zdmc+)](https:\u002F\u002Fmicrosoft.github.io\u002FWindowsAgentArena\u002Fstatic\u002Ffiles\u002Fwaa_podcast.wav)\r\n\r\n\u003C\u002Fdiv>\r\n\r\n**Windows Agent Arena (WAA) 🪟** is a scalable Windows AI agent platform for testing and benchmarking multi-modal, desktop AI agents. 
WAA provides researchers and developers with a reproducible and realistic Windows OS environment for AI research, where agentic AI workflows can be tested across a diverse range of tasks.\r\n\r\nWAA supports the deployment of agents **at scale** using the Azure ML cloud infrastructure, allowing for the parallel running of multiple agents and delivering quick benchmark results for hundreds of tasks in minutes, not days.\r\n\r\n\u003Cdiv align=\"center\">\r\n    \u003Cvideo src=\"https:\u002F\u002Fgithub.com\u002Fuser-attachments\u002Fassets\u002Fe0a8d88d-d28a-493d-b74f-2455f36c21f1\" alt=\"waa_intro\">\r\n\u003C\u002Fdiv>\r\n\r\n## 📢 Updates\r\n- 2024-11-10: We added a new difficulty mode for Windows Agent Arena! You can try the new harder difficulty mode by changing the default `diff_lvl=\"normal\"` to `diff_lvl=\"hard\"` in `src\u002Fwin-arena-container\u002Fstart_client.sh`. Under the harder difficulty, in many tasks, agents must also learn to initialize\u002Fset up the task themselves (e.g., finding and opening the right program\u002Fapplication for the task) rather than have the task \"set up\" for them by the task config.\r\n- 2024-10-30: We released the code for our Navi agent with Omniparser! For the top performing mode in the paper, run `.\u002Frun-local.sh --som-origin mixed-omni --gpu-enabled true`\r\n- 2024-10-23: Microsoft open-sourced [Omniparser](https:\u002F\u002Fgithub.com\u002Fmicrosoft\u002FOmniParser), the current top performing screen understanding model in our benchmark.\r\n- 2024-09-13: We released our [paper](https:\u002F\u002Farxiv.org\u002Fabs\u002F2409.08264), [code](https:\u002F\u002Fgithub.com\u002Fmicrosoft\u002FWindowsAgentArena), [project page](https:\u002F\u002Fmicrosoft.github.io\u002FWindowsAgentArena), and [blog post](https:\u002F\u002Fwww.microsoft.com\u002Fapplied-sciences\u002Fprojects\u002Fwindows-agent-arena). 
Check it out!\r\n\r\n## 📚 Citation\r\nOur technical report paper can be found [here](https:\u002F\u002Farxiv.org\u002Fabs\u002F2409.08264).\r\nIf you find this environment useful, please consider citing our work:\r\n```\r\n@article{bonatti2024windows,\r\nauthor = { Bonatti, Rogerio and Zhao, Dan and Bonacci, Francesco and Dupont, Dillon and Abdali, Sara and Li, Yinheng and Wagle, Justin and Koishida, Kazuhito and Bucker, Arthur and Jang, Lawrence and Hui, Zack},\r\ntitle = {Windows Agent Arena: Evaluating Multi-Modal OS Agents at Scale},\r\ninstitution = {Microsoft},\r\nyear = {2024},\r\nmonth = {September}, \r\n}\r\n```\r\n\r\n## ☝️ Pre-requisites:\r\n\r\n\u003Cdiv align=\"center\">\r\n    \u003Cimg src=\"https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Fmicrosoft_WindowsAgentArena_readme_d1bb235904ea.png\" alt=\"main\" height=\"200\"\u002F>\r\n\u003C\u002Fdiv>\r\n\r\n- Docker daemon installed and running. On Windows, we recommend using [Docker with WSL 2](https:\u002F\u002Fdocs.docker.com\u002Fdesktop\u002Fwsl\u002F).\r\n- An [OpenAI](https:\u002F\u002Fplatform.openai.com\u002Fdocs\u002Fintroduction) or [Azure OpenAI](https:\u002F\u002Fazure.microsoft.com\u002Fen-us\u002Fproducts\u002Fai-services\u002Fopenai-service) API Key.\r\n- Python 3.9 - we recommend using [Conda](https:\u002F\u002Fdocs.conda.io\u002Fprojects\u002Fconda\u002Fen\u002Flatest\u002Fuser-guide\u002Fgetting-started.html) and creating an ad hoc python environment for running the scripts. For creating a new environment run `conda create -n winarena python=3.9`.\r\n\r\nClone the repository and install dependencies:\r\n```bash\r\ngit clone https:\u002F\u002Fgithub.com\u002Fmicrosoft\u002FWindowsAgentArena.git\r\ncd WindowsAgentArena\r\n# Install the required dependencies in your python environment\r\n# conda activate winarena\r\npip install -r requirements.txt\r\n```\r\n\r\n## 💻 Local deployment (WSL or Linux)\r\n\r\n\r\n### 1. 
Configuration file\r\nCreate a new `config.json` at the root of the project with the necessary keys (from OpenAI or Azure endpoints):\r\n\r\n```json\r\n{\r\n    \"OPENAI_API_KEY\": \"\u003COPENAI_API_KEY>\", \u002F\u002F if you are using OpenAI endpoint\r\n    \"AZURE_API_KEY\": \"\u003CAZURE_API_KEY>\",  \u002F\u002F if you are using Azure endpoint\r\n    \"AZURE_ENDPOINT\": \"https:\u002F\u002Fyourendpoint.openai.azure.com\u002F\", \u002F\u002F if you are using Azure endpoint\r\n}\r\n```\r\n\r\n### 2. Prepare the Windows Arena Docker Image\r\n\r\n#### 2.1 Pull the WinArena-Base Image from Docker Hub\r\n\r\nTo get started, pull the base image from Docker Hub:\r\n\r\n```bash\r\ndocker pull windowsarena\u002Fwinarena-base:latest\r\n```\r\n\r\nThis image includes all the necessary dependencies (such as packages and models) required to run the code in the `src` directory.\r\n\r\n#### 2.2 Build the WinArena Image Locally\r\n\r\nNext, build the WinArena image locally:\r\n\r\n```bash\r\ncd scripts\r\n.\u002Fbuild-container-image.sh\r\n\r\n# If there are any changes in 'Dockerfile-WinArena-Base', use the --build-base-image flag to build also the base image locally\r\n# .\u002Fbuild-container-image.sh --build-base-image true\r\n\r\n# For other build options:\r\n# .\u002Fbuild-container-image.sh --help\r\n```\r\n\r\nThis will create the `windowsarena\u002Fwinarena:latest` image with the latest code from the `src` directory.\r\n\r\n### 3. Prepare the Windows 11 VM\r\n\r\n\u003Cdiv align=\"center\">\r\n    \u003Cvideo src=\"https:\u002F\u002Fgithub.com\u002Fuser-attachments\u002Fassets\u002F6d55b9b5-3242-49af-be20-64f2086108b9\" height=\"500\" alt=\"local_prepare_golden_image\">\r\n\u003C\u002Fdiv>\r\n\r\n#### 3.1 Download Windows 11 Evaluation .iso file:\r\n1. 
Visit [Microsoft Evaluation Center](https:\u002F\u002Finfo.microsoft.com\u002Fww-landing-windows-11-enterprise.html), accept the Terms of Service, and download a **Windows 11 Enterprise Evaluation (90-day trial, English, United States)** ISO file [~6GB]\r\n2. After downloading, rename the file to `setup.iso` and copy it to the directory `WindowsAgentArena\u002Fsrc\u002Fwin-arena-container\u002Fvm\u002Fimage`\r\n\r\n#### 3.2 Automatic Setup of the Windows 11 golden image:\r\nBefore running the arena, you need to prepare a new WAA snapshot (also referred to as the WAA golden image). This 30GB snapshot represents a fully functional Windows 11 VM with all the programs needed to run the benchmark. This VM additionally hosts a Python server which receives and executes agent commands. To learn more about the components at play, see our [local](\u002Fimg\u002Farchitecture-local.png) and [cloud](\u002Fimg\u002Farchitecture-azure.png) components diagrams.\r\n\r\nTo prepare the golden snapshot, run **once**:\r\n```bash\r\ncd .\u002Fscripts\r\n.\u002Frun-local.sh --prepare-image true\r\n```\r\nYou can monitor progress at `http:\u002F\u002Flocalhost:8006`. The preparation process is fully automated and will take ~20 minutes.\r\n\r\n**Please do not interfere with the VM while it is being prepared. 
It will automatically shut down when the provisioning process is complete.**\r\n\r\n\u003Cdiv align=\"center\">\r\n    \u003Cimg src=\"https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Fmicrosoft_WindowsAgentArena_readme_b1ca0770dad6.png\" alt=\"local_prepare_screen_unattend\" height=\"500\"\u002F>\r\n\u003C\u002Fdiv>\r\n\r\n\u003Cdiv align=\"center\">\r\n    \u003Cimg src=\"https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Fmicrosoft_WindowsAgentArena_readme_fda2c263c130.png\" alt=\"local_prepare_screen_setup\" height=\"500\"\u002F>\r\n\u003C\u002Fdiv>\r\n\r\nAt the end, you should expect the Docker container named `winarena` to gracefully terminate as shown from the below logs.\r\n\r\n\u003Cdiv align=\"center\">\r\n    \u003Cimg src=\"https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Fmicrosoft_WindowsAgentArena_readme_3f2f2699d268.png\" alt=\"local_prepare_logs_successful\" height=\"200\"\u002F>\r\n\u003C\u002Fdiv>\r\n\r\n\u003Cbr\u002F>\r\n\r\nYou will find the 30GB WAA golden image in `WindowsAgentArena\u002Fsrc\u002Fwin-arena-container\u002Fvm\u002Fstorage`, consisting of the following files:\r\n\r\n\u003Cdiv align=\"center\">\r\n    \u003Cimg src=\"https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Fmicrosoft_WindowsAgentArena_readme_09ff78330f70.png\" alt=\"run_local_prepare_storage_successful\" height=\"200\"\u002F>\r\n\u003C\u002Fdiv>\r\n\r\n\u003Cbr\u002F>\r\n\r\n##### Additional Notes\r\n- During development, if you want to include any changes made in the `src\u002Fwin-arena-container` directory in the WAA golden image, please ensure to specify the flag `--skip-build false` to the `run-local.sh` script (default to true). 
This will ensure that a new container image is built instead of using the prebuilt `windowsarena\u002Fwinarena:latest` image.\r\n- If you have previously run an installation process and want to do it again from scratch, make sure to delete the content of `storage`.\r\n- We recommend copying this `storage` folder to a safe location outside of the repository in case you or the agent accidentally corrupt the VM at some point and you want to avoid a fresh setup.\r\n- Depending on your docker settings, you might have to run the above command with `sudo`.\r\n- Running on WSL2? If you encounter the error `\u002Fbin\u002Fbash: bad interpreter: No such file or directory`, we recommend converting the bash scripts from DOS\u002FWindows format to Unix format:\r\n```bash\r\ncd .\u002Fscripts\r\nfind . -maxdepth 1 -type f -exec dos2unix {} +\r\n```\r\n\r\n### 4. Deploying the agent in the arena\r\n\r\n#### 4.1 Running the base benchmark\r\n\r\nYou're now ready to launch the evaluation. To run the baseline agent on all benchmark tasks, do:\r\n\r\n```bash\r\ncd scripts\r\n.\u002Frun-local.sh\r\n# For client\u002Fagent options:\r\n# .\u002Frun-local.sh --help\r\n```\r\n\r\nOpen http:\u002F\u002Flocalhost:8006 to see the Windows VM with the agent running. 
If you have a beefy PC, you can instead run the strongest agent configuration in our paper by doing:\r\n```bash\r\n.\u002Frun-local.sh --gpu-enabled true --som-origin mixed-omni --a11y-backend uia\r\n```\r\n\r\nAt the end of the run you can display the results using the command:\r\n```bash\r\ncd src\u002Fwin-arena-container\u002Fclient\r\npython show_results.py --result_dir \u003Cpath_to_results_folder>\r\n```\r\n\r\n#### Available Configurations\r\n\r\nBelow is a comparison of various combinations of hyperparameters used by the Navi agent in our study, which can be overridden by specifying `--som-origin \u003Csom_origin> --a11y-backend \u003Ca11y_backend>` when running the `run-local.sh` script:\r\n\r\n| Command | Description | Notes |\r\n|---------|-------------|--------|\r\n| `.\u002Frun-local.sh --som-origin mixed-omni --a11y-backend uia` | Combines Omniparser with accessibility tree information | ⭐**Recommended for best results** |\r\n| `.\u002Frun-local.sh --som-origin omni` | Uses Omniparser for screen understanding | |\r\n| `.\u002Frun-local.sh --som-origin oss` | Uses webparse, groundingdino, and OCR (TesseractOCR) | 🌲Baseline |\r\n| `.\u002Frun-local.sh --som-origin a11y --a11y-backend uia` | Uses slower, more accurate accessibility tree | |\r\n| `.\u002Frun-local.sh --som-origin a11y --a11y-backend win32` | Uses faster, less accurate accessibility tree | 🐇Fastest |\r\n| `.\u002Frun-local.sh --som-origin mixed-oss --a11y-backend uia` | Combines oss detections with accessibility tree |  |\r\n\r\n- `--som-origin` determines how the Navi agent detects screen elements\r\n- `--a11y-backend` specifies the Accessibility backend type (when using `a11y` or mixed modes)\r\n\r\n#### 4.2 Local development tips\r\n\r\nAt first sight it might seem challenging to develop\u002Fdebug code running inside the docker container. However, we provide a few tips to make this process easier. 
Check the [Development-Tips Doc](.\u002Fdocs\u002FDevelopment-Tips.md) for more details such as:\r\n- How to attach a VSCode window (with debugger) to the running container\r\n- How to change the agent and Windows server code from your local machine and see the changes reflected in real time in the container\r\n\r\n## 🌐 Azure Deployment -> Parallelizing the benchmark \r\n\r\nWe offer a seamless way to run the Windows Agent Arena on Azure ML Compute VMs. This option will significantly reduce the time needed to test your agent in all benchmark tasks from hours\u002Fdays to minutes.\r\n\r\n### 1. Set up the Azure resource group:\r\n\r\n- If you don't already have an Azure subscription, you can start a [free trial](https:\u002F\u002Fazure.microsoft.com\u002Fen-us\u002Ffree\u002F). Take note of the subscription id, we will use it as `AZURE_SUBSCRIPTION_ID` in Section 3.\r\n- In the [Azure portal](https:\u002F\u002Fportal.azure.com\u002F), create a new resource group (e.g. `agents`) in the region of your choice. Take note of the resource group name, we will use it as `AZURE_ML_RESOURCE_GROUP` in Section 3.\r\n- Within this resource group, create an Azure Machine Learning resource (e.g. name it `agents_ml`). Take note of the ML workspace name, we will use it as `AZURE_ML_WORKSPACE_NAME` in Section 3. During the creation wizard, make sure to check the boxes for automatically creating new:\r\n    - Storage Account. **Note:** Take note of the Storage Account name, we will use it to upload the golden image in Section 2.\r\n    - Key vault.\r\n    - Application Insights.\r\n    - [optional] Container Registry. 
You can use the Azure Container Registry to privately store your custom docker images without the need to push them to the public Docker Hub.\r\n\r\n\u003Cdiv align=\"center\">\r\n    \u003Cimg src=\"https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Fmicrosoft_WindowsAgentArena_readme_2ffdf80e3081.png\" alt=\"azure_create_ml\" height=\"300\"\u002F>\r\n\u003C\u002Fdiv>\r\n\r\n- Once creation is complete, navigate to the [Azure Machine Learning portal](https:\u002F\u002Fml.azure.com\u002Fhome) and click on your workspace (`agents`)\r\n\r\n\u003Cdiv align=\"center\">\r\n    \u003Cimg src=\"https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Fmicrosoft_WindowsAgentArena_readme_f64af0dd9d57.png\" alt=\"azure_ml_portal\" height=\"200\"\u002F>\r\n\u003C\u002Fdiv>\r\n\r\n- In the workspace, navigate to the `Notebooks` tab. In your user-assigned folder (as shown in the figure below), create a new bash (.sh) file named `compute-instance-startup.sh`. Copy the content of `scripts\u002Fazure_files\u002Fcompute-instance-startup.sh` into this file and save it. This script will be used every time a new VM is launched in Azure to apply some base configurations. Take note of the path where you save the file (in the form of `Users\u002F\u003CYOUR_USER>\u002Fcompute-instance-startup.sh`), we will use it to run the script in Section 3.\r\n\r\n\u003Cdiv align=\"center\">\r\n    \u003Cimg src=\"https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Fmicrosoft_WindowsAgentArena_readme_83b36eb99195.png\" alt=\"azure_notebook\" height=\"200\"\u002F>\r\n\u003C\u002Fdiv>\r\n\r\n- [Optional] You might want to ask for more compute quota for your region depending on your needs. You can do so by navigating to the [Azure Quota page](https:\u002F\u002Fml.azure.com\u002Fquota\u002F). As a reference, we currently use the `Standard_D8_v3` VM size for our benchmarking, which falls under the `Standard Dv3 Family Cluster Dedicated vCPUs` category. Each VM uses 8 cores. 
Make sure the machine type you use supports [nested virtualization](https:\u002F\u002Flearn.microsoft.com\u002Fen-us\u002Fanswers\u002Fquestions\u002F813416\u002Fhow-do-i-know-what-size-azure-vm-supports-nested-v).\r\n\r\n\u003Cdiv align=\"center\">\r\n    \u003Cimg src=\"https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Fmicrosoft_WindowsAgentArena_readme_10a94c3b6b73.png\" alt=\"azure_quota\" height=\"200\"\u002F>\r\n\u003C\u002Fdiv>\r\n\r\n\r\n### 2. Uploading Windows 11 and Docker images to Azure\r\n\r\n- Upload the Windows 11 storage folder to the Blob container associated with your default datastore. By default, the Azure ML Workspace's underlying data is backed by a Storage Account through one or more ML datastores. The default datastore, named `workspaceblobstore`, is created during the workspace setup and linked to a Blob container under the Azure Storage Account. You can review the association between the datastores and containers by visiting [Azure ML Datastore](https:\u002F\u002Fml.azure.com\u002Fdata\u002Fdatastore). Once found, you can then upload the storage folder in different ways:\r\n    - Download the [Azure Storage Explorer](https:\u002F\u002Fazure.microsoft.com\u002Fen-us\u002Ffeatures\u002Fstorage-explorer\u002F) program, log in, and select the blob container. Upload the `WindowsAgentArena\u002Fsrc\u002Fwin-arena-container\u002Fvm\u002Fstorage` folder from your local machine after running the local setup steps.\u003Cdiv align=\"center\">\u003Cimg src=\"https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Fmicrosoft_WindowsAgentArena_readme_40bdf7fa6ce8.png\" alt=\"azure_blobstore\" height=\"100\"\u002F>\u003C\u002Fdiv>\r\n    - Alternatively, you can use the Azure CLI to upload the folder. To install the CLI, follow the steps provided [here](https:\u002F\u002Flearn.microsoft.com\u002Fen-us\u002Fcli\u002Fazure\u002Finstall-azure-cli). 
Once installed, you can use the following command:\r\n        ```bash\r\n        az login --use-device-code # Only needed if prompted\r\n        az storage blob upload-batch --account-name \u003CSTORAGE_ACCOUNT_NAME> --destination \u003CCONTAINER_NAME> --source \u003CLOCAL_FOLDER>\r\n        # For a list of parameters check: https:\u002F\u002Fdocs.microsoft.com\u002Fen-us\u002Fcli\u002Fazure\u002Fstorage\u002Fblob?view=azure-cli-latest\r\n        ```\r\n    - Alternatively, use the [Azure portal](https:\u002F\u002Fportal.azure.com\u002F) interface to upload the folder. Navigate to the storage account, click on `Storage browser->Blob containers`, select your container, and upload the folder. This option is not recommended for large files as connections might get unstable.\r\n    \r\n- [Optional] If you are not using the default `windowsarena\u002Fwinarena:latest` image, you can upload your custom image to the Azure Container Registry. You can do so by following the [Azure Container Registry documentation](https:\u002F\u002Fdocs.microsoft.com\u002Fen-us\u002Fazure\u002Fcontainer-registry\u002Fcontainer-registry-get-started-portal)\r\n    ```bash\r\n    az login --use-device-code\r\n    # potentially needed if commands below don't work: az acr login --name \u003CACR_NAME>\r\n    docker login # you will be prompted to enter your ACR credentials (username + password which can be found in the Azure portal)\r\n    docker tag \u003CIMAGE_NAME> \u003CACR_NAME>.azurecr.io\u002F\u003CIMAGE_NAME>:\u003CTAG>\r\n    docker push \u003CACR_NAME>.azurecr.io\u002F\u003CIMAGE_NAME>:\u003CTAG>\r\n    ```\r\n\r\n### 3. Environment configurations and deployment\r\n\r\n- Add the additional keys to the `config.json` file at the root of the project:\r\n```json\r\n{\r\n    ... 
\u002F\u002F Your previous configs\r\n\r\n    \"AZURE_SUBSCRIPTION_ID\": \"\u003CYOUR_AZURE_SUBSCRIPTION_ID>\", \r\n    \"AZURE_ML_RESOURCE_GROUP\": \"\u003CYOUR_AZURE_ML_RESOURCE_GROUP>\",\r\n    \"AZURE_ML_WORKSPACE_NAME\": \"\u003CYOUR_AZURE_ML_WORKSPACE_NAME>\"\r\n}\r\n```\r\n\r\n- Create a new file named `experiments.json` to specify any parameters needed for each experiment run, including the agent to deploy and the underlying LLM model to use. You can find a reference `experiments.json` consisting of multiple experiments to run at [`scripts\u002Fexperiments.json`](scripts\\experiments.json):\r\n```json\r\n{\r\n  \"experiment_1\": {\r\n    \"ci_startup_script_path\": \"Users\u002F\u003CYOUR_USER>\u002Fcompute-instance-startup.sh\", \u002F\u002F As seen in Section 1\r\n    \"agent\": \"navi\",\r\n    \"datastore_input_path\": \"storage\",\r\n    \"docker_img_name\": \"windowsarena\u002Fwinarena:latest\",\r\n    \"exp_name\": \"experiment_1\",\r\n    \"num_workers\": 4,\r\n    \"use_managed_identity\": false,\r\n    \"json_name\": \"evaluation_examples_windows\u002Ftest_all.json\",\r\n    \"model_name\": \"gpt-4-1106-vision-preview\",\r\n    \"som_origin\": \"oss\", \u002F\u002F or a11y, or mixed-oss\r\n    \"a11y_backend\": \"win32\" \u002F\u002F or uia\r\n  }\r\n  \u002F\u002F ...\r\n}\r\n```\r\n- (Optional) You can also generate the `experiments.json` by using both the `--experiments_json` and `--update_json` parameters of `run_azure.py`, the above JSON is equivalent to the following command:\r\n```bash\r\ncd scripts\r\npython run_azure.py --experiments_json \"experiments.json\" --update_json --exp_name \"experiment_1\" --ci_startup_script_path \"Users\u002F\u003CYOUR_USER>\u002Fcompute-instance-startup.sh\" --agent \"navi\" --json_name \"evaluation_examples_windows\u002Ftest_all.json\" --num_workers 4 --som_origin oss --a11y_backend win32\r\n```\r\n\r\n- Deploy the agent on Azure ML Compute by running:\r\n```bash\r\naz login --use-device-code # 
https:\u002F\u002Flearn.microsoft.com\u002Fen-us\u002Fcli\u002Fazure\u002Finstall-azure-cli\r\n# If multiple tenants or subscriptions, make sure to select the right ones with:\r\n# az login --use-device-code --tenant \"\u003CYOUR_AZURE_AD_TENANT_ID>\"\r\n# az account set --subscription \"\u003CYOUR_AZURE_SUBSCRIPTION_ID>\"\r\n\r\n# Make sure you have installed the python requirements in your conda environment\r\n# conda activate winarena\r\n# pip install -r requirements.txt\r\n\r\n# From your activated conda environment:\r\ncd scripts\r\npython run_azure.py --experiments_json \"experiments.json\"\r\n```\r\n\r\nFor any unfinished experiments in `experiments.json`, the script will: \r\n1. Create `\u003Cnum_workers>` Azure Compute Instance VMs.\r\n2. Run one ML Training Job named `\u003Cexp_name>` per VM.\r\n3. Dispose of the VMs once the jobs are completed.\r\n\r\nThe logs from the run will be saved in an `agent_outputs` folder in the same blob container where you uploaded the Windows 11 image. You can download the `agent_outputs` folder to your local machine and run the `show_azure.py` script to see the results from every experiment as a markdown table.\r\n\r\n```bash\r\ncd scripts\r\npython show_azure.py --json_config \"experiments.json\" --result_dir \u003Cpath_to_downloaded_agent_outputs_folder>\r\n```\r\n\r\n## 🤖 BYOA: Bring Your Own Agent\r\nWant to test your own agents in Windows Agent Arena? You can use our default agent as a template and create your own folder under `src\u002Fwin-arena-container\u002Fclient\u002Fmm_agents`. You just need to make sure that your `agent.py` file features `predict()` and `reset()` functions. For more information on agent development check out the [BYOA Doc](.\u002Fdocs\u002FDevelop-Agent.md).\r\n\r\n## 👩‍💻 Open-source contributions\r\n\r\nWe welcome contributions to the Windows Agent Arena project. 
In particular, we welcome:\r\n- New open-sourced agents to be added to the benchmark\r\n- New tasks to be added to our existing categories, or new categories altogether\r\n\r\nIf you are interested in contributing, please check out our [Task Development Guidelines](.\u002Fdocs\u002FDevelop-Tasks.md).\r\n\r\n## ❓ FAQ\r\n### What are approximate running times and costs for the benchmark?\r\n| Component | Cost | Time |\r\n|----------|----------|----------|\r\n| Azure Standard_D8_v3 VM    | ~$8 ($0.38\u002Fh * 40 * 0.5h)   |    |\r\n| GPT-4V    | $100   |  ~35min with 40 VMs  |\r\n| GPT-4o    | $100   | ~35min with 40 VMs   |\r\n| GPT-4o-mini    | $15   | ~30min with 40 VMs   |\r\n\r\n\r\n### How can I customize resource allocation for local runs?\r\n\r\nBy default, the `run-local.sh` script attempts to create a QEMU VM with 8 GB of RAM and 8 CPU cores. If your system has limited resources, you can override these defaults by specifying the desired RAM and CPU allocation:\r\n\r\n```bash\r\n.\u002Frun-local.sh --ram-size 4G --cpu-cores 4\r\n```\r\n\r\n### How can I toggle support for KVM acceleration?\r\n\r\nIf your system does not support [KVM acceleration](https:\u002F\u002Fgithub.com\u002Fdockur\u002Fwindows?tab=readme-ov-file#how-do-i-verify-if-my-system-supports-kvm), you can disable it by specifying the `--use-kvm false` flag:\r\n\r\n```bash\r\n.\u002Frun-local.sh --use-kvm false\r\n```\r\n\r\nNote that running the benchmark locally without KVM acceleration is not recommended due to performance issues. In this case, we recommend preparing the golden image for later running the benchmark on Azure. 
\r\n\r\n\r\n## 👏 Acknowledgements\r\n\r\n- [OS World](https:\u002F\u002Fgithub.com\u002Fxlang-ai\u002FOSWorld) for the original benchmark task framework.\r\n- [Dockur](https:\u002F\u002Fgithub.com\u002Fdockur\u002Fwindows) for the Docker infrastructure underlying WAA.\r\n- [GroundingDINO](https:\u002F\u002Fgithub.com\u002FIDEA-Research\u002FGroundingDINO) for the object detection module in our Navi Agent.\r\n- [NotebookLM](https:\u002F\u002Fnotebooklm.google.com) for our AI-generated podcast.\r\n\r\n## 🤝 Contributing\r\n\r\nThis project welcomes contributions and suggestions.  Most contributions require you to agree to a\r\nContributor License Agreement (CLA) declaring that you have the right to, and actually do, grant us\r\nthe rights to use your contribution. For details, visit https:\u002F\u002Fcla.opensource.microsoft.com.\r\n\r\nWhen you submit a pull request, a CLA bot will automatically determine whether you need to provide\r\na CLA and decorate the PR appropriately (e.g., status check, comment). Simply follow the instructions\r\nprovided by the bot. You will only need to do this once across all repos using our CLA.\r\n\r\nThis project has adopted the [Microsoft Open Source Code of Conduct](https:\u002F\u002Fopensource.microsoft.com\u002Fcodeofconduct\u002F).\r\nFor more information see the [Code of Conduct FAQ](https:\u002F\u002Fopensource.microsoft.com\u002Fcodeofconduct\u002Ffaq\u002F) or\r\ncontact [opencode@microsoft.com](mailto:opencode@microsoft.com) with any additional questions or comments.\r\n\r\n## 🛡️ Trademarks\r\n\r\nThis project may contain trademarks or logos for projects, products, or services. 
Authorized use of Microsoft trademarks or logos is subject to and must follow [Microsoft's Trademark & Brand Guidelines](https:\u002F\u002Fwww.microsoft.com\u002Fen-us\u002Flegal\u002Fintellectualproperty\u002Ftrademarks\u002Fusage\u002Fgeneral).\r\nUse of Microsoft trademarks or logos in modified versions of this project must not cause confusion or imply Microsoft sponsorship.\r\nAny use of third-party trademarks or logos are subject to those third-party's policies.\r\n","\u003Cdiv align=\"center\">\r\n    \r\n![Banner](https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Fmicrosoft_WindowsAgentArena_readme_0b596ef712e4.png)\r\n[![网站](https:\u002F\u002Fimg.shields.io\u002Fbadge\u002FWebsite-red)](https:\u002F\u002Fmicrosoft.github.io\u002FWindowsAgentArena)\r\n[![arXiv](https:\u002F\u002Fimg.shields.io\u002Fbadge\u002FPaper-green)](https:\u002F\u002Farxiv.org\u002Fabs\u002F2409.08264)\r\n[![许可证](https:\u002F\u002Fimg.shields.io\u002Fbadge\u002FLicense-MIT-yellow.svg)](https:\u002F\u002Fopensource.org\u002Flicenses\u002FMIT)\r\n[![PRs](https:\u002F\u002Fimg.shields.io\u002Fbadge\u002FAI-Podcast-blue.svg?logo=data:image\u002Fsvg%2bxml;base64,PHN2ZyBmaWxsPSIjZmZmZmZmIiB2aWV3Qm94PSIwIDAgMjQgMjQiIHhtbG5zPSJodHRwOi8vd3d3LnczLm9yZy8yMDAwL3N2ZyI+PGcgaWQ9IlNWR1JlcG9fYmdDYXJyaWVyIiBzdHJva2Utd2lkdGg9IjAiPjwvZz48ZyBpZD0iU1ZHUmVwb190cmFjZXJDYXJyaWVyIiBzdHJva2UtbGluZWNhcD0icm91bmQiIHN0cm9rZS1saW5lam9pbj0icm91bmQiPjwvZz48ZyBpZD0iU1ZHUmVwb19pY29uQ2FycmllciI+PHBhdGggZD0iTTEzLDRWMjBhMSwxLDAsMCwxLTIsMFY0YTEsMSwwLDAsMCwxLDIsMFY2QTEsMSwwLDAsMCwyLDBWOEExLDEsMCwwLDAsMCwyLDBWOEExLDEsMCwwLDAsMCwyLDBWOEExLDEsMCwwLDAsMCwyLDBWOEExLDEsMCwwLDAsMCwyLDBWOEExLDEsMCwwLDAsMCwyLDBWOEExLDEsMCwwLDAsMCwyLDBWOEExLDEsMCwwLDAsMCwyLDBWOEExLDEsMCwwLDAsMCwyLDBWOEExLDEsMCwwLDAsMCwyLDBWOEExLDEsMCwwLDAsMCwyLDBWOEExLDEsMCwwLDAsMCwyLDBWOEExLDEsMCwwLDAsMCwyLDBWOEExLDEsMCwwLDAsMCwyLDBWOEExLDEsMCwwLDAsMCwyLDBWOEExLDEsMCwwLDAsMCwyLDBWOEExLDEsMCwwLDAsMCwyLDBWOEExLDEsMCwwLDAsMCwyLDBWOEExLDEsMCwwLDAsMCwyLDBWOEExLDEsMCwwL
DAsMCwyLDBWOEExLDEsMCwwLDAsMCwyLDBWOEExLDEsMCwwLDAsMCwyLDBWOEExL......\u003Cdiv align=\"center\">\r\n    \r\n![Banner](https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Fmicrosoft_WindowsAgentArena_readme_0b596ef712e4.png)\r\n[![网站](https:\u002F\u002Fimg.shields.io\u002Fbadge\u002FWebsite-red)](https:\u002F\u002Fmicrosoft.github.io\u002FWindowsAgentArena)\r\n[![arXiv](https:\u002F\u002Fimg.shields.io\u002Fbadge\u002FPaper-green)](https:\u002F\u002Farxiv.org\u002Fabs\u002F2409.08264)\r\n[![许可证](https:\u002F\u002Fimg.shields.io\u002Fbadge\u002FLicense-MIT-yellow.svg)](https:\u002F\u002Fopensource.org\u002Flicenses\u002FMIT)\r\n[![PRs](https:\u002F\u002Fimg.shields.io\u002Fbadge\u002FAI-Podcast-blue.svg?logo=data:image\u002Fsvg%2bxml;base64,PHN2ZyBmaWxsPSIjZmZmZmZmIiB2aWV3Qm94PSIwIDAgMjQgMjQiIHhtbG5zPSJodHRwOi8vd3d3LnczLm9yZy8yMDAwL3N2ZyI+PGcgaWQ9IlNWR1JlcG9fYmdDYXJyaWVyIiBzdHJva2Utd2lkdGg9IjAiPjwvZz48ZyBpZD0iU1ZHUmVwb190cmFjZXJDYXJyaWVyIiBzdHJva2UtbGluZWNhcD0icm91bmQiIHN0cm9rZS1saW5lam9pbj0icm91bmQiPjwvZz48ZyBpZD0iU1ZHUmVwb19pY29uQ2FycmllciI+PHBhdGggZD0iTTEzLDRWMjBhMSwxLDAsMCwxLTIsMFY0YTEsMSwwLDAsMCwxLDIsMFYxOEEtMSwxLDAsMCwwLDAsNyw2VjE4YTEsMSwwLDAsMCwyLDBWNkExLDEsMCwwLDAsMCwyLDBWOEExLDEsMCwwLDAsMCwyLDBWOEExLDEsMCwwLDAsMCwyLDBWOEExLDEsMCwwLDAsMCwyLDBWOEExLDEsMCwwLDAsMCwyLDBWOEExLDEsMCwwLDAsMCwyLDBWOEExLDEsMCwwLDAsMCwyLDBWOEExLDEsMCwwLDAsMCwyLDBWOEExLDEsMCwwLDAsMCwyLDBWOEExLDEsMCwwLDAsMCwyLDBWOEExLDEsMCwwLDAsMCwyLDBWOEExLDEsMCwwLDAsMCwyLDBWOEExLDEsMCwwLDAsMCwyLDBWOEExLDEsMCwwLDAsMCwyLDBWOEExLDEsMCwwLDAsMCwyLDBWOEExLDEsMCwwLDAsMCwyLDBWOEExLDEsMCwwLDAsMCwyLDBWOEExLDEsMCwwLDAsMCwyLDBWOEExLDEsMCwwLDAsMCwyLDBWOEExLDEsMCwwLDAsMCwyLDBWOEExLDEsMCwwLDAsMCwyLDBWOEExLDEsMCwwLDAsMCwyLDBWOEExLDEsMCwwLDAsMCwyLDBWOEExLDE......\n\n### 3. 
准备 Windows 11 虚拟机\n\n\u003Cdiv align=\"center\">\n    \u003Cvideo src=\"https:\u002F\u002Fgithub.com\u002Fuser-attachments\u002Fassets\u002F6d55b9b5-3242-49af-be20-64f2086108b9\" height=\"500\" alt=\"local_prepare_golden_image\">\n\u003C\u002Fdiv>\n\n#### 3.1 下载 Windows 11 评估版 .iso 文件：\n1. 访问 [Microsoft 评估中心](https:\u002F\u002Finfo.microsoft.com\u002Fww-landing-windows-11-enterprise.html)，接受服务条款，并下载一个 **Windows 11 企业版评估版（90 天试用，英语，美国）** ISO 文件 [约 6GB]。\n2. 下载完成后，将文件重命名为 `setup.iso`，并将其复制到目录 `WindowsAgentArena\u002Fsrc\u002Fwin-arena-container\u002Fvm\u002Fimage`。\n\n#### 3.2 自动设置 Windows 11 黄金镜像：\n在运行 Arena 之前，您需要准备一个新的 WAA 快照（也称为 WAA 黄金镜像）。这个 30GB 的快照代表一个功能齐全的 Windows 11 虚拟机，其中包含运行基准测试所需的所有程序。该虚拟机还托管了一个 Python 服务器，用于接收并执行代理命令。要了解相关组件的更多信息，请参阅我们的 [本地](\u002Fimg\u002Farchitecture-local.png) 和 [云](\u002Fimg\u002Farchitecture-azure.png) 组件示意图。\n\n要准备黄金快照，请**仅运行一次**以下命令：\n```bash\ncd .\u002Fscripts\n.\u002Frun-local.sh --prepare-image true\n```\n您可以在 `http:\u002F\u002Flocalhost:8006` 监控进度。准备过程完全自动化，大约需要 20 分钟。\n\n**请勿在虚拟机准备过程中进行任何操作。当配置过程完成时，虚拟机会自动关闭。**\n\n\u003Cdiv align=\"center\">\n    \u003Cimg src=\"https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Fmicrosoft_WindowsAgentArena_readme_b1ca0770dad6.png\" alt=\"local_prepare_screen_unattend\" height=\"500\"\u002F>\n\u003C\u002Fdiv>\n\n\u003Cdiv align=\"center\">\n    \u003Cimg src=\"https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Fmicrosoft_WindowsAgentArena_readme_fda2c263c130.png\" alt=\"local_prepare_screen_setup\" height=\"500\"\u002F>\n\u003C\u002Fdiv>\n\n最后，您应该会看到名为 `winarena` 的 Docker 容器按预期正常终止，如下所示的日志所示。\n\n\u003Cdiv align=\"center\">\n    \u003Cimg src=\"https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Fmicrosoft_WindowsAgentArena_readme_3f2f2699d268.png\" alt=\"local_prepare_logs_successful\" height=\"200\"\u002F>\n\u003C\u002Fdiv>\n\n\u003Cbr\u002F>\n\n您将在 `WindowsAgentArena\u002Fsrc\u002Fwin-arena-container\u002Fvm\u002Fstorage` 中找到 30GB 的 WAA 黄金镜像，它由以下文件组成：\n\n\u003Cdiv align=\"center\">\n    
\u003Cimg src=\"https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Fmicrosoft_WindowsAgentArena_readme_09ff78330f70.png\" alt=\"run_local_prepare_storage_successful\" height=\"200\"\u002F>\n\u003C\u002Fdiv>\n\n\u003Cbr\u002F>\n\n##### 补充说明\n- 在开发过程中，如果您希望将 `src\u002Fwin-arena-container` 目录中的任何更改包含到 WAA 黄金镜像中，请确保在 `run-local.sh` 脚本中指定 `--skip-build false` 标志（默认为 true）。这将确保构建新的容器镜像，而不是使用预构建的 `windowsarena\u002Fwinarena:latest` 镜像。\n- 如果您之前已经运行过安装流程，并希望从头开始重新进行，请务必删除 `storage` 目录中的内容。\n- 我们建议将此 `storage` 文件夹复制到仓库外部的安全位置，以防您或代理在某个时刻意外损坏虚拟机，从而避免重新设置。\n- 根据您的 Docker 设置，您可能需要使用 `sudo` 来运行上述命令。\n- 如果您正在使用 WSL2？如果遇到 `\u002Fbin\u002Fbash: bad interpreter: No such file or directory` 错误，我们建议将 Bash 脚本从 DOS\u002FWindows 格式转换为 Unix 格式：\n```bash\ncd .\u002Fscripts\nfind . -maxdepth 1 -type f -exec dos2unix {} +\n```\n\n### 4. 在 Arena 中部署代理\n\n#### 4.1 运行基础基准测试\n\n现在您已准备好启动评估。要运行基准代理以完成所有基准任务，请执行以下命令：\n\n```bash\ncd scripts\n.\u002Frun-local.sh\n# 如需查看客户端\u002F代理选项：\n# .\u002Frun-local.sh --help\n```\n\n打开 http:\u002F\u002Flocalhost:8006，您将看到运行代理的 Windows 虚拟机。如果您拥有一台性能强劲的电脑，也可以通过以下命令运行我们论文中最强的代理配置：\n```bash\n.\u002Frun-local.sh --gpu-enabled true --som-origin mixed-omni --a11y-backend uia\n```\n\n运行结束后，您可以使用以下命令显示结果：\n```bash\ncd src\u002Fwin-arena-container\u002Fclient\npython show_results.py --result_dir \u003Cresults_folder_path>\n```\n\n#### 可用配置\n\n以下是我们在研究中使用的 Navi 代理各种超参数组合的比较，您可以通过在运行 `run-local.sh` 脚本时指定 `--som-origin \u003Csom_origin> --a11y-backend \u003Ca11y_backend>` 来覆盖这些配置：\n\n| 命令 | 描述 | 备注 |\n|---------|-------------|--------|\n| `.\u002Frun-local.sh --som-origin mixed-omni --a11y-backend uia` | 将 Omniparser 与辅助功能树信息结合 | ⭐**推荐用于最佳效果** |\n| `.\u002Frun-local.sh --som-origin omni` | 使用 Omniparser 进行屏幕理解 | |\n| `.\u002Frun-local.sh --som-origin oss` | 使用 webparse、groundingdino 和 OCR（TesseractOCR）| 🌲基线 |\n| `.\u002Frun-local.sh --som-origin a11y --a11y-backend uia` | 使用更慢但更准确的辅助功能树 | |\n| `.\u002Frun-local.sh --som-origin a11y --a11y-backend win32` | 使用更快但准确性较低的辅助功能树 
| 🐇最快 |\n| `.\u002Frun-local.sh --som-origin mixed-oss --a11y-backend uia` | 将 OSS 检测结果与辅助功能树结合 |  |\n\n- `--som-origin` 决定 Navi 代理如何检测屏幕元素。\n- `--a11y-backend` 指定辅助功能后端类型（在使用 `a11y` 或混合模式时）。\n\n#### 4.2 本地开发提示\n\n乍一看，在 Docker 容器内运行的代码似乎难以开发和调试。不过，我们提供了一些技巧来简化这一过程。有关更多详细信息，请参阅 [开发提示文档](.\u002Fdocs\u002FDevelopment-Tips.md)，例如：\n- 如何将 VSCode 窗口（带调试器）附加到正在运行的容器；\n- 如何从本地机器修改代理和 Windows 服务器代码，并实时查看容器中的更改。\n\n## 🌐 Azure 部署 -> 并行化基准测试\n\n我们提供了一种无缝方式，可在 Azure ML 计算虚拟机上运行 Windows Agent Arena。此选项可显著缩短在所有基准任务中测试您的代理所需的时间，从数小时\u002F数天缩短至几分钟。\n\n### 1. 设置 Azure 资源组：\n\n- 如果您还没有 Azure 订阅，可以开始[免费试用](https:\u002F\u002Fazure.microsoft.com\u002Fen-us\u002Ffree\u002F)。请记下订阅 ID，我们将在第 3 部分中将其用作 `AZURE_SUBSCRIPTION_ID`。\n- 在 [Azure 门户](https:\u002F\u002Fportal.azure.com\u002F) 中，在您选择的区域创建一个新的资源组（例如 `agents`）。请记下资源组名称，我们将在第 3 部分中将其用作 `AZURE_ML_RESOURCE_GROUP`。\n- 在此资源组中，创建一个 Azure 机器学习资源（例如命名为 `agents_ml`）。请记下机器学习工作区名称，我们将在第 3 部分中将其用作 `AZURE_ML_WORKSPACE_NAME`。在创建向导中，请确保勾选自动创建以下内容的选项：\n    - 存储账户。**注意：** 请记下存储账户名称，我们将在第 2 部分中使用它来上传黄金镜像。\n    - 密钥保管库。\n    - Application Insights。\n    - 【可选】容器注册表。您可以使用 Azure 容器注册表私密地存储自定义 Docker 镜像，而无需将其推送到公共的 Docker Hub。\n\n\u003Cdiv align=\"center\">\n    \u003Cimg src=\"https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Fmicrosoft_WindowsAgentArena_readme_2ffdf80e3081.png\" alt=\"azure_create_ml\" height=\"300\"\u002F>\n\u003C\u002Fdiv>\n\n- 创建完成后，导航到 [Azure 机器学习门户](https:\u002F\u002Fml.azure.com\u002Fhome)，并单击您的工作区（`agents`）。\n\n\u003Cdiv align=\"center\">\n    \u003Cimg src=\"https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Fmicrosoft_WindowsAgentArena_readme_f64af0dd9d57.png\" alt=\"azure_ml_portal\" height=\"200\"\u002F>\n\u003C\u002Fdiv>\n\n- 在工作区中，导航到“笔记本”选项卡。在您的用户分配文件夹中（如图所示），创建一个名为 `compute-instance-startup.sh` 的新 Bash (.sh) 文件。将 `scripts\u002Fazure_files\u002Fcompute-instance-startup.sh` 文件的内容复制到该文件中并保存。每次在 Azure 中启动新的虚拟机时，都会运行此脚本以应用一些基础配置。请记下您保存文件的路径（格式为 `Users\u002F\u003CYOUR_USER>\u002Fcompute-instance-startup.sh`），我们将在第 3 
部分中使用该路径来运行此脚本。\n\n\u003Cdiv align=\"center\">\n    \u003Cimg src=\"https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Fmicrosoft_WindowsAgentArena_readme_83b36eb99195.png\" alt=\"azure_notebook\" height=\"200\"\u002F>\n\u003C\u002Fdiv>\n\n- 【可选】根据您的需求，您可能需要为所在区域申请更多的计算配额。您可以通过导航到 [Azure 配额页面](https:\u002F\u002Fml.azure.com\u002Fquota\u002F) 来进行申请。作为参考，我们目前在基准测试中使用 `Standard_D8_v3` 虚拟机规格，该规格属于 `Standard Dv3 系列专用 vCPU` 类别。每台虚拟机使用 8 个核心。请确保您使用的机器类型支持[嵌套虚拟化](https:\u002F\u002Flearn.microsoft.com\u002Fen-us\u002Fanswers\u002Fquestions\u002F813416\u002Fhow-do-i-know-what-size-azure-vm-supports-nested-v)。\n\n\u003Cdiv align=\"center\">\n    \u003Cimg src=\"https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Fmicrosoft_WindowsAgentArena_readme_10a94c3b6b73.png\" alt=\"azure_quota\" height=\"200\"\u002F>\n\u003C\u002Fdiv>\n\n\n### 2. 将 Windows 11 和 Docker 镜像上传到 Azure\n\n- 将 Windows 11 存储文件夹上传到与默认数据存储关联的 Blob 容器中。默认情况下，Azure 机器学习工作区的基础数据由存储账户通过一个或多个机器学习数据存储提供支持。名为 `workspaceblobstore` 的默认数据存储是在设置工作区时创建的，并链接到 Azure 存储账户下的 Blob 容器。您可以通过访问 [Azure 机器学习数据存储](https:\u002F\u002Fml.azure.com\u002Fdata\u002Fdatastore) 来查看数据存储和容器之间的关联。找到后，您可以通过多种方式上传存储文件夹：\n    - 下载 [Azure 存储资源管理器](https:\u002F\u002Fazure.microsoft.com\u002Fen-us\u002Ffeatures\u002Fstorage-explorer\u002F) 程序，登录并选择 Blob 容器。在完成本地设置步骤后，从您的本地计算机上传 `WindowsAgentArena\u002Fsrc\u002Fwin-arena-container\u002Fvm\u002Fstorage` 文件夹。\u003Cdiv align=\"center\">\u003Cimg src=\"https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Fmicrosoft_WindowsAgentArena_readme_40bdf7fa6ce8.png\" alt=\"azure_blobstore\" height=\"100\"\u002F>\u003C\u002Fdiv>\n    - 或者，您也可以使用 Azure CLI 上传文件夹。要安装 CLI，请按照 [此处](https:\u002F\u002Flearn.microsoft.com\u002Fen-us\u002Fcli\u002Fazure\u002Finstall-azure-cli) 提供的步骤操作。安装完成后，可以使用以下命令：\n        ```bash\n        az login --use-device-code # 仅在提示时需要\n        az storage blob upload-batch --account-name \u003CSTORAGE_ACCOUNT_NAME> --destination \u003CCONTAINER_NAME> --source \u003CLOCAL_FOLDER>\n        # 
参数列表请参阅：https:\u002F\u002Fdocs.microsoft.com\u002Fen-us\u002Fcli\u002Fazure\u002Fstorage\u002Fblob?view=azure-cli-latest\n        ```\n    - 或者，使用 [Azure 门户](https:\u002F\u002Fportal.azure.com\u002F) 界面上传文件夹。导航到存储账户，单击“存储浏览器→Blob 容器”，选择您的容器，然后上传文件夹。由于连接可能会不稳定，不建议对大文件使用此方法。\n    \n- 【可选】如果您不使用默认的 `windowsarena\u002Fwinarena:latest` 镜像，可以将自定义镜像上传到 Azure 容器注册表。您可以按照 [Azure 容器注册表文档](https:\u002F\u002Fdocs.microsoft.com\u002Fen-us\u002Fazure\u002Fcontainer-registry\u002Fcontainer-registry-get-started-portal) 操作：\n    ```bash\n    az login --use-device-code\n    # 如果以下命令不起作用，可能需要执行：az acr login --name \u003CACR_NAME>\n    docker login # 系统会提示您输入 ACR 凭据（用户名 + 密码，可在 Azure 门户中找到）\n    docker tag \u003CIMAGE_NAME> \u003CACR_NAME>.azurecr.io\u002F\u003CIMAGE_NAME>:\u003CTAG>\n    docker push \u003CACR_NAME>.azurecr.io\u002F\u003CIMAGE_NAME>:\u003CTAG>\n    ```\n\n### 3. 环境配置与部署\n\n- 将以下额外的键添加到项目根目录下的 `config.json` 文件中：\n```json\n{\n    ... \u002F\u002F 您之前的配置\n\n    \"AZURE_SUBSCRIPTION_ID\": \"\u003CYOUR_AZURE_SUBSCRIPTION_ID>\", \n    \"AZURE_ML_RESOURCE_GROUP\": \"\u003CYOUR_AZURE_ML_RESOURCE_GROUP>\",\n    \"AZURE_ML_WORKSPACE_NAME\": \"\u003CYOUR_AZURE_ML_WORKSPACE_NAME>\"\n}\n```\n\n- 创建一个名为 `experiments.json` 的新文件，用于指定每次实验运行所需的参数，包括要部署的代理和使用的底层 LLM 模型。您可以在 [`scripts\u002Fexperiments.json`](scripts\\experiments.json) 中找到一个包含多个实验的参考 `experiments.json`：\n```json\n{\n  \"experiment_1\": {\n    \"ci_startup_script_path\": \"Users\u002F\u003CYOUR_USER>\u002Fcompute-instance-startup.sh\", \u002F\u002F 如第 1 节所示\n    \"agent\": \"navi\",\n    \"datastore_input_path\": \"storage\",\n    \"docker_img_name\": \"windowsarena\u002Fwinarena:latest\",\n    \"exp_name\": \"experiment_1\",\n    \"num_workers\": 4,\n    \"use_managed_identity\": false,\n    \"json_name\": \"evaluation_examples_windows\u002Ftest_all.json\",\n    \"model_name\": \"gpt-4-1106-vision-preview\",\n    \"som_origin\": \"oss\", \u002F\u002F 或 a11y，或 mixed-oss\n    \"a11y_backend\": \"win32\" \u002F\u002F 或 uia\n 
 }\n  \u002F\u002F ...\n}\n```\n\n- （可选）您也可以使用 `run_azure.py` 的 `--experiments_json` 和 `--update_json` 参数来生成 `experiments.json`，上述 JSON 等价于以下命令：\n```bash\ncd scripts\npython run_azure.py --experiments_json \"experiments.json\" --update_json --exp_name \"experiment_1\" --ci_startup_script_path \"Users\u002F\u003CYOUR_USER>\u002Fcompute-instance-startup.sh\" --agent \"navi\" --json_name \"evaluation_examples_windows\u002Ftest_all.json\" --num_workers 4 --som_origin oss --a11y_backend win32\n```\n\n- 通过运行以下命令，在 Azure ML Compute 上部署代理：\n```bash\naz login --use-device-code # https:\u002F\u002Flearn.microsoft.com\u002Fen-us\u002Fcli\u002Fazure\u002Finstall-azure-cli\n# 如果有多个租户或订阅，请确保选择正确的租户和订阅：\n# az login --use-device-code --tenant \"\u003CYOUR_AZURE_AD_TENANT_ID>\"\n# az account set --subscription \"\u003CYOUR_AZURE_SUBSCRIPTION_ID>\"\n\n# 确保已在您的 conda 环境中安装 Python 依赖项\n# conda activate winarena\n# pip install -r requirements.txt\n\n# 在激活的 conda 环境中：\ncd scripts\npython run_azure.py --experiments_json \"experiments.json\"\n```\n\n对于 `experiments.json` 中未完成的实验，脚本将执行以下操作：\n1. 创建 `\u003Cnum_workers>` 个 Azure Compute Instance VM。\n2. 每个 VM 运行一个名为 `\u003Cexp_name>` 的 ML 训练作业。\n3. 
作业完成后销毁这些 VM。\n\n运行日志将保存在您上传 Windows 11 镜像的同一 Blob 容器中的 `agent_outputs` 文件夹中。您可以将 `agent_outputs` 文件夹下载到本地，并运行 `show_azure.py` 脚本，以 Markdown 表格的形式查看每个实验的结果。\n\n```bash\ncd scripts\npython show_azure.py --json_config \"experiments.json\" --result_dir \u003Cpath_to_downloaded_agent_outputs_folder>\n```\n\n## 🤖 BYOA：自带你的代理\n想在 Windows Agent Arena 中测试您自己的代理吗？您可以使用我们的默认代理作为模板，在 `src\u002Fwin-arena-container\u002Fclient\u002Fmm_agents` 下创建您自己的文件夹。您只需确保您的 `agent.py` 文件包含 `predict()` 和 `reset()` 函数即可。有关代理开发的更多信息，请参阅 [BYOA 文档](.\u002Fdocs\u002FDevelop-Agent.md)。\n\n## 👩‍💻 开源贡献\n我们欢迎对 Windows Agent Arena 项目的贡献。特别是，我们欢迎：\n- 新的开源代理加入基准测试\n- 向现有类别添加新任务，或直接创建新的类别\n\n如果您有兴趣参与贡献，请查看我们的 [任务开发指南](.\u002Fdocs\u002FDevelop-Tasks.md)。\n\n## ❓ 常见问题解答\n### 基准测试的大致运行时间和成本是多少？\n| 组件 | 成本 | 时间 |\n|----------|----------|----------|\n| Azure Standard_D8_v3 VM    | ~$8 ($0.38\u002F小时 * 40 * 0.5小时)   |    |\n| GPT-4V    | $100   |  ~35分钟，使用 40 个 VM  |\n| GPT-4o    | $100   | ~35分钟，使用 40 个 VM   |\n| GPT-4o-mini    | $15   | ~30分钟，使用 40 个 VM   |\n\n\n### 如何自定义本地运行的资源分配？\n默认情况下，`run-local.sh` 脚本会尝试创建一个具有 8 GB 内存和 8 个 CPU 核心的 QEMU VM。如果您的系统资源有限，可以通过指定所需的内存和 CPU 分配来覆盖这些默认值：\n\n```bash\n.\u002Frun-local.sh --ram-size 4G --cpu-cores 4\n```\n\n### 如何切换 KVM 加速支持？\n如果您的系统不支持 [KVM 加速](https:\u002F\u002Fgithub.com\u002Fdockur\u002Fwindows?tab=readme-ov-file#how-do-i-verify-if-my-system-supports-kvm)，可以通过指定 `--use-kvm false` 标志来禁用它：\n\n```bash\n.\u002Frun-local.sh --use-kvm false\n```\n\n请注意，由于性能问题，不建议在没有 KVM 加速的情况下在本地运行基准测试。在这种情况下，我们建议您准备黄金镜像，以便稍后在 Azure 上运行基准测试。\n\n\n## 👏 致谢\n- [OS World](https:\u002F\u002Fgithub.com\u002Fxlang-ai\u002FOSWorld) 提供了原始的基准测试任务框架。\n- [Dockur](https:\u002F\u002Fgithub.com\u002Fdockur\u002Fwindows) 提供了 WAA 底层的 Docker 基础设施。\n- [GroundingDINO](https:\u002F\u002Fgithub.com\u002FIDEA-Research\u002FGroundingDINO) 提供了我们 Navi 代理中的目标检测模块。\n- [NotebookLM](https:\u002F\u002Fnotebooklm.google.com) 提供了我们的人工智能生成播客。\n\n## 🤝 参与贡献\n本项目欢迎贡献和建议。大多数贡献都需要您同意一份贡献者许可协议 
(CLA)，声明您有权并确实授予我们使用您贡献的权利。有关详细信息，请访问 https:\u002F\u002Fcla.opensource.microsoft.com。\n\n当您提交拉取请求时，CLA 机器人会自动确定您是否需要提供 CLA，并相应地为 PR 添加标记（例如状态检查、评论）。请按照机器人提供的说明操作。您只需在整个使用我们 CLA 的仓库中进行一次此操作即可。\n\n本项目已采用 [微软开源行为准则](https:\u002F\u002Fopensource.microsoft.com\u002Fcodeofconduct\u002F)。有关更多信息，请参阅 [行为准则常见问题解答](https:\u002F\u002Fopensource.microsoft.com\u002Fcodeofconduct\u002Ffaq\u002F)，或如有任何其他疑问或意见，请联系 [opencode@microsoft.com](mailto:opencode@microsoft.com)。\n\n## 🛡️ 商标\n\n本项目可能包含用于项目、产品或服务的商标或标识。微软商标或标识的授权使用须遵守并严格执行[微软商标与品牌指南](https:\u002F\u002Fwww.microsoft.com\u002Fen-us\u002Flegal\u002Fintellectualproperty\u002Ftrademarks\u002Fusage\u002Fgeneral)。\n在本项目的修改版本中使用微软商标或标识，不得造成混淆或暗示微软的赞助关系。\n任何第三方商标或标识的使用均应遵循该第三方的相关政策。","# Windows Agent Arena 快速上手指南\n\nWindows Agent Arena (WAA) 是一个可扩展的 Windows AI 智能体平台，用于测试和基准评估多模态桌面 AI 智能体。它提供了一个可复现且真实的 Windows 操作系统环境，支持在 Azure ML 云基础设施上大规模并行运行智能体。\n\n## 环境准备\n\n在开始之前，请确保您的开发环境满足以下要求：\n\n*   **操作系统**: Linux 或 Windows (推荐使用 **WSL 2**)。\n    *   *Windows 用户提示*: 请务必安装 [Docker Desktop with WSL 2](https:\u002F\u002Fdocs.docker.com\u002Fdesktop\u002Fwsl\u002F)。\n*   **Docker**: 已安装并正在运行 Docker Daemon。\n*   **Python**: 版本 3.9。\n    *   推荐使用 Conda 创建独立环境：`conda create -n winarena python=3.9`\n*   **API Key**: 需要 [OpenAI](https:\u002F\u002Fplatform.openai.com) 或 [Azure OpenAI](https:\u002F\u002Fazure.microsoft.com) 的 API 密钥。\n*   **磁盘空间**: 建议预留至少 40GB 可用空间（用于存放 Windows 11 镜像和容器数据）。\n\n## 安装步骤\n\n### 1. 克隆项目与安装依赖\n\n```bash\ngit clone https:\u002F\u002Fgithub.com\u002Fmicrosoft\u002FWindowsAgentArena.git\ncd WindowsAgentArena\n\n# 激活 conda 环境 (如果已创建)\n# conda activate winarena\n\n# 安装 Python 依赖\npip install -r requirements.txt\n```\n\n### 2. 配置 API 密钥\n\n在项目根目录创建 `config.json` 文件，填入您的 API 密钥：\n\n```json\n{\n    \"OPENAI_API_KEY\": \"\u003C您的_OPENAI_API_KEY>\", \n    \"AZURE_API_KEY\": \"\u003C您的_AZURE_API_KEY>\",  \n    \"AZURE_ENDPOINT\": \"https:\u002F\u002F\u003C您的端点>.openai.azure.com\u002F\" \n}\n```\n*注：根据您使用的服务类型，只需填写对应的字段即可。*\n\n### 3. 
构建 Docker 镜像\n\n首先拉取基础镜像，然后构建本地运行所需的完整镜像：\n\n```bash\n# 拉取基础镜像\ndocker pull windowsarena\u002Fwinarena-base:latest\n\n# 进入脚本目录并构建\ncd scripts\n.\u002Fbuild-container-image.sh\n```\n\n*注意：如果在 WSL2 中遇到 `\u002Fbin\u002Fbash: bad interpreter` 错误，请先执行 `find . -maxdepth 1 -type f -exec dos2unix {} +` 转换脚本格式。*\n\n### 4. 准备 Windows 11 黄金镜像 (Golden Image)\n\n这是最关键的一步：您需要先手动下载 Windows 11 评估版 ISO，随后系统会基于该 ISO 自动完成环境配置（耗时约 20 分钟）。\n\n**前置操作：**\n1. 访问 [Microsoft Evaluation Center](https:\u002F\u002Finfo.microsoft.com\u002Fww-landing-windows-11-enterprise.html) 下载 **Windows 11 Enterprise Evaluation (90-day trial, English, United States)** ISO 文件 (~6GB)。\n2. 将下载的文件重命名为 `setup.iso`。\n3. 将其移动到目录：`WindowsAgentArena\u002Fsrc\u002Fwin-arena-container\u002Fvm\u002Fimage`\n\n**执行初始化：**\n```bash\ncd .\u002Fscripts\n.\u002Frun-local.sh --prepare-image true\n```\n\n*   可通过浏览器访问 `http:\u002F\u002Flocalhost:8006` 监控进度。\n*   **请勿干扰**正在准备的虚拟机，完成后它将自动关闭。\n*   成功后，30GB 的镜像文件将生成在 `src\u002Fwin-arena-container\u002Fvm\u002Fstorage` 目录下（建议备份此文件夹以便后续重用）。\n\n## 基本使用\n\n完成上述步骤后，即可启动基准测试。\n\n### 运行基准测试\n\n在 `scripts` 目录下执行以下命令启动默认智能体测试：\n\n```bash\ncd scripts\n.\u002Frun-local.sh\n```\n\n启动后，访问 `http:\u002F\u002Flocalhost:8006` 即可在浏览器中实时观察 Windows 虚拟机中智能体的操作过程。\n\n### 高级配置示例\n\n若要使用论文中表现最佳的配置（结合 Omniparser 和辅助功能树），请运行：\n\n```bash\n.\u002Frun-local.sh --gpu-enabled true --som-origin mixed-omni --a11y-backend uia\n```\n\n### 查看结果\n\n测试结束后，使用以下命令查看结果报告：\n\n```bash\ncd src\u002Fwin-arena-container\u002Fclient\npython show_results.py --result_dir \u003C结果文件夹路径>\n```\n\n### 常用参数说明\n\n| 参数 | 说明 | 推荐场景 |\n| :--- | :--- | :--- |\n| `--som-origin mixed-omni` | 结合 Omniparser 与辅助功能树 | ⭐ **最佳效果** |\n| `--som-origin oss` | 使用开源模型 (webparse, groundingdino, OCR) | 🌲 基线测试 |\n| `--som-origin a11y --a11y-backend win32` | 仅使用 Win32 辅助功能树 | 🐇 最快速度 |","某大型科技公司的 AI 研发团队正致力于开发一款能自动处理复杂办公流程的多模态桌面助手，急需在真实 Windows 环境中验证其操作能力。\n\n### 没有 WindowsAgentArena 时\n- **环境搭建繁琐**：研究人员需手动配置多台物理机或虚拟机来模拟不同用户场景，耗时数天且难以保证环境一致性。\n- **测试效率低下**：串行执行数百个测试任务（如“整理 
Excel 报表并发送邮件”）需要数周时间，严重拖慢迭代节奏。\n- **评估标准不一**：缺乏统一的基准测试集，不同团队对“任务成功”的定义模糊，导致模型性能对比困难。\n- **复杂场景缺失**：难以模拟需要 Agent 自主启动软件、查找文件等“高难度”初始化操作，模型在真实落地时频频失效。\n\n### 使用 WindowsAgentArena 后\n- **一键部署环境**：基于 Docker 和 Azure ML 基础设施，分钟级即可拉起可复现的真实 Windows OS 容器，彻底消除环境差异。\n- **大规模并行评测**：支持成百上千个 Agent 并行运行，原本需数周的数百项任务测试现在仅需几分钟即可产出结果。\n- **标准化基准打分**：内置多样化的预定义任务库和统一评分机制，让团队能客观量化模型在多模态理解与操作上的进步。\n- **挑战模式进阶**：通过切换至\"hard\"难度模式，强制 Agent 自主完成应用启动与环境设置，有效提升了模型应对真实未知场景的鲁棒性。\n\nWindowsAgentArena 将原本耗时数周的桌面智能体验证过程压缩至分钟级，为多模态 AI 从实验室走向真实办公场景提供了不可或缺的加速引擎。","https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Fmicrosoft_WindowsAgentArena_0b596ef7.png","microsoft","Microsoft","https:\u002F\u002Foss.gittoolsai.com\u002Favatars\u002Fmicrosoft_4900709c.png","Open source projects and samples from Microsoft",null,"opensource@microsoft.com","OpenAtMicrosoft","https:\u002F\u002Fopensource.microsoft.com","https:\u002F\u002Fgithub.com\u002Fmicrosoft",[83,87,91,95,99,103,107],{"name":84,"color":85,"percentage":86},"Python","#3572A5",87.9,{"name":88,"color":89,"percentage":90},"Cuda","#3A4E3A",5.2,{"name":92,"color":93,"percentage":94},"Shell","#89e051",3.4,{"name":96,"color":97,"percentage":98},"PowerShell","#012456",2.2,{"name":100,"color":101,"percentage":102},"JavaScript","#f1e05a",0.7,{"name":104,"color":105,"percentage":106},"C++","#f34b7d",0.6,{"name":108,"color":109,"percentage":110},"Batchfile","#C1F12E",0.1,854,96,"2026-04-17T03:52:59","MIT",4,"Linux, Windows (需通过 WSL 2)","非必需。若运行高性能代理（如 Omniparser），建议使用支持 CUDA 的 NVIDIA GPU；具体型号和显存未说明。","未说明（但生成 Windows 11 黄金镜像需约 30GB 磁盘空间，ISO 文件约 6GB）",{"notes":120,"python":121,"dependencies":122},"1. 必须安装并运行 Docker 守护进程，Windows 用户推荐使用 Docker Desktop 配合 WSL 2。\n2. 需要 OpenAI 或 Azure OpenAI 的 API 密钥。\n3. 强烈建议使用 Conda 创建名为 'winarena' 的 Python 3.9 环境。\n4. 首次运行需下载约 6GB 的 Windows 11 企业版评估镜像，并自动化构建约 30GB 的虚拟机快照（耗时约 20 分钟）。\n5. 
若在 WSL2 中遇到脚本解释器错误，需将 bash 脚本从 DOS 格式转换为 Unix 格式。","3.9",[123,124,125],"Docker","OpenAI API 或 Azure OpenAI API Key","requirements.txt 中定义的库",[127,13,15,14],"其他",[129,130,131,132,133,134,135,136,137],"agentic","ai","ai-agent","ai-research","windows","ai-benchmark","desktop-agent","computer","computer-use","2026-03-27T02:49:30.150509","2026-04-18T14:13:06.319161",[141,146,151,156,161,166],{"id":142,"question_zh":143,"answer_zh":144,"source_url":145},40103,"运行基准测试任务时终端长时间无响应或卡住怎么办？","如果在 WSL 或 Windows 环境下运行，系统构建可能需要超过 30 分钟。如果等待超过 5 分钟仍无响应，建议重试以下步骤：\n1. 从 main 分支拉取最新代码。\n2. 运行 `docker pull windowsarena\u002Fwinarena:latest` 更新镜像。\n3. 删除 `src\u002Fwin-arena-container\u002Fvm\u002Fstorage` 目录下的所有现有文件。\n4. 重新执行 `.\u002Frun-local.sh --prepare-image true` 进行重建。\n请确保项目描述中的系统文件与本地一致，如有差异需清空存储目录后重建。","https:\u002F\u002Fgithub.com\u002Fmicrosoft\u002FWindowsAgentArena\u002Fissues\u002F69",{"id":147,"question_zh":148,"answer_zh":149,"source_url":150},40104,"运行 run-local.sh 时遇到 'Failed to connect to QMP' 错误如何解决？","该问题通常是因为基础镜像未正确构建。请先手动构建基础镜像，执行命令：\n`.\u002Fbuild-container-image.sh --build-base-image true`\n待构建完成后，再正常运行 `.\u002Frun-local.sh` 即可解决。","https:\u002F\u002Fgithub.com\u002Fmicrosoft\u002FWindowsAgentArena\u002Fissues\u002F42",{"id":152,"question_zh":153,"answer_zh":154,"source_url":155},40105,"运行脚本时出现 'exec: bash: no such file or directory' 或 Docker 启动失败错误？","这通常表明 Docker 守护进程配置有问题或无法找到 bash 路径（常见于 Windows Git Bash 环境）。\n建议检查 Docker 是否正确安装并运行。如果是路径问题，参考相关报错信息，确保 Docker 能正确调用 shell。对于 OCI runtime create failed 错误，可参考 StackOverflow 上的解决方案排查 Docker 运行时配置：https:\u002F\u002Fstackoverflow.com\u002Fquestions\u002F48074282\u002Fdocker-container-not-starting-giving-oci-runtime-create-failed","https:\u002F\u002Fgithub.com\u002Fmicrosoft\u002FWindowsAgentArena\u002Fissues\u002F33",{"id":157,"question_zh":158,"answer_zh":159,"source_url":160},40106,"如何运行特定的基准测试任务子集或指定类别？","虽然文档主要介绍全量运行，但遇到准备步骤失败（如 LibreOffice 或 pip 安装失败）导致无法运行时，应优先清理环境。请删除 
`src\u002Fwin-arena-container\u002Fvm\u002Fstorage` 中的内容并重新运行 `.\u002Frun-local.sh --prepare-image true`。关于运行特定子集，目前主要通过配置文件或脚本参数控制，若遇阻请先确保基础环境（镜像和存储）已按上述步骤正确重置。","https:\u002F\u002Fgithub.com\u002Fmicrosoft\u002FWindowsAgentArena\u002Fissues\u002F23",{"id":162,"question_zh":163,"answer_zh":164,"source_url":165},40107,"构建 'agentarena\u002Fwindows-local' 镜像使用的是哪个 Dockerfile？","该项目使用的镜像是基于 [dockur\u002Fwindows](https:\u002F\u002Fgithub.com\u002Fdockur\u002Fwindows)  fork 并简化而来的。具体实现可以参考维护者的仓库：https:\u002F\u002Fgithub.com\u002Ffrancedot\u002Fwindows-local。用户也可以直接使用 dockur\u002Fwindows 镜像并添加缺失的运行脚本来自行构建。","https:\u002F\u002Fgithub.com\u002Fmicrosoft\u002FWindowsAgentArena\u002Fissues\u002F38",{"id":167,"question_zh":168,"answer_zh":169,"source_url":170},40108,"准备 Windows 11 镜像时提示 'A media driver your computer needs is missing' 或找不到 'dockerbridge' 设备怎么办？","这是在 Linux (如 Ubuntu 20.04) 上准备镜像时的常见问题。解决方案已在 Issue #27 中讨论（见引用）。通常可以通过直接复制项目中已有的 `WindowsAgentArena\u002Fsrc\u002Fwin-arena-container\u002Fvm\u002Fstorage` 文件到你的项目对应目录，从而绕过 Windows 安装过程中的驱动缺失问题。","https:\u002F\u002Fgithub.com\u002Fmicrosoft\u002FWindowsAgentArena\u002Fissues\u002F32",[172,177,182,187],{"id":173,"version":174,"summary_zh":175,"released_at":176},323629,"v0.0.4","## 变更内容\n* 由 @francedot 在 https:\u002F\u002Fgithub.com\u002Fmicrosoft\u002FWindowsAgentArena\u002Fpull\u002F6 中修复了超参数表格和示例。\n* 由 @francedot 在 https:\u002F\u002Fgithub.com\u002Fmicrosoft\u002FWindowsAgentArena\u002Fpull\u002F10 中添加了 BYOA 文档。\n* 由 @francedot 在 https:\u002F\u002Fgithub.com\u002Fmicrosoft\u002FWindowsAgentArena\u002Fpull\u002F12 中添加了关于在 WAA 中开发新任务的指导。\n* 文档：由 @eltociear 在 https:\u002F\u002Fgithub.com\u002Fmicrosoft\u002FWindowsAgentArena\u002Fpull\u002F13 中更新了 README.md。\n* 由 @francedot 在 https:\u002F\u002Fgithub.com\u002Fmicrosoft\u002FWindowsAgentArena\u002Fpull\u002F17 中更新了 README.md，并加入了播客相关内容。\n* 由 @dznyu 在 https:\u002F\u002Fgithub.com\u002Fmicrosoft\u002FWindowsAgentArena\u002Fpull\u002F19 中更新了 run.py。\n* 由 @francedot 在 
https:\u002F\u002Fgithub.com\u002Fmicrosoft\u002FWindowsAgentArena\u002Fpull\u002F20 中公开了 RAM_SIZE、CPU_CORES 和 KVM。\n* 由 @francedot 在 https:\u002F\u002Fgithub.com\u002Fmicrosoft\u002FWindowsAgentArena\u002Fpull\u002F14 中增加了对 macOS 主机的支持。\n* 由 @francedot 在 https:\u002F\u002Fgithub.com\u002Fmicrosoft\u002FWindowsAgentArena\u002Fpull\u002F21 中添加了关于 RAM_SIZE、CPU_CORES 和 KVM 的文档。\n* 由 @francedot 在 https:\u002F\u002Fgithub.com\u002Fmicrosoft\u002FWindowsAgentArena\u002Fpull\u002F24 中修复了损坏的镜像和 py 别名。\n\n## 新贡献者\n* @eltociear 在 https:\u002F\u002Fgithub.com\u002Fmicrosoft\u002FWindowsAgentArena\u002Fpull\u002F13 中完成了他们的首次贡献。\n\n**完整变更日志**: https:\u002F\u002Fgithub.com\u002Fmicrosoft\u002FWindowsAgentArena\u002Fcompare\u002Fv0.0.3...v0.0.4","2024-09-28T21:06:11",{"id":178,"version":179,"summary_zh":180,"released_at":181},323630,"v0.0.3","## 变更内容\n* 由 @francedot 在 https:\u002F\u002Fgithub.com\u002Fmicrosoft\u002FWindowsAgentArena\u002Fpull\u002F5 中传播超参数\n\n\n**完整变更日志**: https:\u002F\u002Fgithub.com\u002Fmicrosoft\u002FWindowsAgentArena\u002Fcompare\u002Fv0.0.2...v0.0.3","2024-09-10T18:39:10",{"id":183,"version":184,"summary_zh":185,"released_at":186},323631,"v0.0.2","## 变更内容\n* 由 @Inhenn 在 https:\u002F\u002Fgithub.com\u002Fmicrosoft\u002FWindowsAgentArena\u002Fpull\u002F3 中修复了 README 中的 experimen.json 命令。\n* 由 @francedot 在 https:\u002F\u002Fgithub.com\u002Fmicrosoft\u002FWindowsAgentArena\u002Fpull\u002F4 中添加了本地超参数和相关文档。\n\n## 新贡献者\n* @Inhenn 在 https:\u002F\u002Fgithub.com\u002Fmicrosoft\u002FWindowsAgentArena\u002Fpull\u002F3 中完成了首次贡献。\n* @francedot 在 https:\u002F\u002Fgithub.com\u002Fmicrosoft\u002FWindowsAgentArena\u002Fpull\u002F4 中完成了首次贡献。\n\n**完整变更日志**: https:\u002F\u002Fgithub.com\u002Fmicrosoft\u002FWindowsAgentArena\u002Fcompare\u002Fv0.0.1...v0.0.2","2024-09-10T12:12:31",{"id":188,"version":189,"summary_zh":77,"released_at":190},323632,"v0.0.1","2024-09-06T18:20:24"]