[{"data":1,"prerenderedAt":-1},["ShallowReactive",2],{"similar-xlang-ai--OpenCUA":3,"tool-xlang-ai--OpenCUA":62},[4,18,26,36,46,54],{"id":5,"name":6,"github_repo":7,"description_zh":8,"stars":9,"difficulty_score":10,"last_commit_at":11,"category_tags":12,"status":17},4358,"openclaw","openclaw\u002Fopenclaw","OpenClaw 是一款专为个人打造的本地化 AI 助手，旨在让你在自己的设备上拥有完全可控的智能伙伴。它打破了传统 AI 助手局限于特定网页或应用的束缚，能够直接接入你日常使用的各类通讯渠道，包括微信、WhatsApp、Telegram、Discord、iMessage 等数十种平台。无论你在哪个聊天软件中发送消息，OpenClaw 都能即时响应，甚至支持在 macOS、iOS 和 Android 设备上进行语音交互，并提供实时的画布渲染功能供你操控。\n\n这款工具主要解决了用户对数据隐私、响应速度以及“始终在线”体验的需求。通过将 AI 部署在本地，用户无需依赖云端服务即可享受快速、私密的智能辅助，真正实现了“你的数据，你做主”。其独特的技术亮点在于强大的网关架构，将控制平面与核心助手分离，确保跨平台通信的流畅性与扩展性。\n\nOpenClaw 非常适合希望构建个性化工作流的技术爱好者、开发者，以及注重隐私保护且不愿被单一生态绑定的普通用户。只要具备基础的终端操作能力（支持 macOS、Linux 及 Windows WSL2），即可通过简单的命令行引导完成部署。如果你渴望拥有一个懂你",349277,3,"2026-04-06T06:32:30",[13,14,15,16],"Agent","开发框架","图像","数据工具","ready",{"id":19,"name":20,"github_repo":21,"description_zh":22,"stars":23,"difficulty_score":10,"last_commit_at":24,"category_tags":25,"status":17},3808,"stable-diffusion-webui","AUTOMATIC1111\u002Fstable-diffusion-webui","stable-diffusion-webui 是一个基于 Gradio 构建的网页版操作界面，旨在让用户能够轻松地在本地运行和使用强大的 Stable Diffusion 图像生成模型。它解决了原始模型依赖命令行、操作门槛高且功能分散的痛点，将复杂的 AI 绘图流程整合进一个直观易用的图形化平台。\n\n无论是希望快速上手的普通创作者、需要精细控制画面细节的设计师，还是想要深入探索模型潜力的开发者与研究人员，都能从中获益。其核心亮点在于极高的功能丰富度：不仅支持文生图、图生图、局部重绘（Inpainting）和外绘（Outpainting）等基础模式，还独创了注意力机制调整、提示词矩阵、负向提示词以及“高清修复”等高级功能。此外，它内置了 GFPGAN 和 CodeFormer 等人脸修复工具，支持多种神经网络放大算法，并允许用户通过插件系统无限扩展能力。即使是显存有限的设备，stable-diffusion-webui 也提供了相应的优化选项，让高质量的 AI 艺术创作变得触手可及。",162132,"2026-04-05T11:01:52",[14,15,13],{"id":27,"name":28,"github_repo":29,"description_zh":30,"stars":31,"difficulty_score":32,"last_commit_at":33,"category_tags":34,"status":17},1381,"everything-claude-code","affaan-m\u002Feverything-claude-code","everything-claude-code 是一套专为 AI 编程助手（如 Claude Code、Codex、Cursor 等）打造的高性能优化系统。它不仅仅是一组配置文件，而是一个经过长期实战打磨的完整框架，旨在解决 AI 
代理在实际开发中面临的效率低下、记忆丢失、安全隐患及缺乏持续学习能力等核心痛点。\n\n通过引入技能模块化、直觉增强、记忆持久化机制以及内置的安全扫描功能，everything-claude-code 能显著提升 AI 在复杂任务中的表现，帮助开发者构建更稳定、更智能的生产级 AI 代理。其独特的“研究优先”开发理念和针对 Token 消耗的优化策略，使得模型响应更快、成本更低，同时有效防御潜在的攻击向量。\n\n这套工具特别适合软件开发者、AI 研究人员以及希望深度定制 AI 工作流的技术团队使用。无论您是在构建大型代码库，还是需要 AI 协助进行安全审计与自动化测试，everything-claude-code 都能提供强大的底层支持。作为一个曾荣获 Anthropic 黑客大奖的开源项目，它融合了多语言支持与丰富的实战钩子（hooks），让 AI 真正成长为懂上",159267,2,"2026-04-17T11:29:14",[14,13,35],"语言模型",{"id":37,"name":38,"github_repo":39,"description_zh":40,"stars":41,"difficulty_score":42,"last_commit_at":43,"category_tags":44,"status":17},8272,"opencode","anomalyco\u002Fopencode","OpenCode 是一款开源的 AI 编程助手（Coding Agent），旨在像一位智能搭档一样融入您的开发流程。它不仅仅是一个代码补全插件，而是一个能够理解项目上下文、自主规划任务并执行复杂编码操作的智能体。无论是生成全新功能、重构现有代码，还是排查难以定位的 Bug，OpenCode 都能通过自然语言交互高效完成，显著减少开发者在重复性劳动和上下文切换上的时间消耗。\n\n这款工具专为软件开发者、工程师及技术研究人员设计，特别适合希望利用大模型能力来提升编码效率、加速原型开发或处理遗留代码维护的专业人群。其核心亮点在于完全开源的架构，这意味着用户可以审查代码逻辑、自定义行为策略，甚至私有化部署以保障数据安全，彻底打破了传统闭源 AI 助手的“黑盒”限制。\n\n在技术体验上，OpenCode 提供了灵活的终端界面（Terminal UI）和正在测试中的桌面应用程序，支持 macOS、Windows 及 Linux 全平台。它兼容多种包管理工具，安装便捷，并能无缝集成到现有的开发环境中。无论您是追求极致控制权的资深极客，还是渴望提升产出的独立开发者，OpenCode 都提供了一个透明、可信",144296,1,"2026-04-16T14:50:03",[13,45],"插件",{"id":47,"name":48,"github_repo":49,"description_zh":50,"stars":51,"difficulty_score":32,"last_commit_at":52,"category_tags":53,"status":17},2271,"ComfyUI","Comfy-Org\u002FComfyUI","ComfyUI 是一款功能强大且高度模块化的视觉 AI 引擎，专为设计和执行复杂的 Stable Diffusion 图像生成流程而打造。它摒弃了传统的代码编写模式，采用直观的节点式流程图界面，让用户通过连接不同的功能模块即可构建个性化的生成管线。\n\n这一设计巧妙解决了高级 AI 绘图工作流配置复杂、灵活性不足的痛点。用户无需具备编程背景，也能自由组合模型、调整参数并实时预览效果，轻松实现从基础文生图到多步骤高清修复等各类复杂任务。ComfyUI 拥有极佳的兼容性，不仅支持 Windows、macOS 和 Linux 全平台，还广泛适配 NVIDIA、AMD、Intel 及苹果 Silicon 等多种硬件架构，并率先支持 SDXL、Flux、SD3 等前沿模型。\n\n无论是希望深入探索算法潜力的研究人员和开发者，还是追求极致创作自由度的设计师与资深 AI 绘画爱好者，ComfyUI 
都能提供强大的支持。其独特的模块化架构允许社区不断扩展新功能，使其成为当前最灵活、生态最丰富的开源扩散模型工具之一，帮助用户将创意高效转化为现实。",108322,"2026-04-10T11:39:34",[14,15,13],{"id":55,"name":56,"github_repo":57,"description_zh":58,"stars":59,"difficulty_score":32,"last_commit_at":60,"category_tags":61,"status":17},6121,"gemini-cli","google-gemini\u002Fgemini-cli","gemini-cli 是一款由谷歌推出的开源 AI 命令行工具，它将强大的 Gemini 大模型能力直接集成到用户的终端环境中。对于习惯在命令行工作的开发者而言，它提供了一条从输入提示词到获取模型响应的最短路径，无需切换窗口即可享受智能辅助。\n\n这款工具主要解决了开发过程中频繁上下文切换的痛点，让用户能在熟悉的终端界面内直接完成代码理解、生成、调试以及自动化运维任务。无论是查询大型代码库、根据草图生成应用，还是执行复杂的 Git 操作，gemini-cli 都能通过自然语言指令高效处理。\n\n它特别适合广大软件工程师、DevOps 人员及技术研究人员使用。其核心亮点包括支持高达 100 万 token 的超长上下文窗口，具备出色的逻辑推理能力；内置 Google 搜索、文件操作及 Shell 命令执行等实用工具；更独特的是，它支持 MCP（模型上下文协议），允许用户灵活扩展自定义集成，连接如图像生成等外部能力。此外，个人谷歌账号即可享受免费的额度支持，且项目基于 Apache 2.0 协议完全开源，是提升终端工作效率的理想助手。",100752,"2026-04-10T01:20:03",[45,13,15,14],{"id":63,"github_repo":64,"name":65,"description_en":66,"description_zh":67,"ai_summary_zh":68,"readme_en":69,"readme_zh":70,"quickstart_zh":71,"use_case_zh":72,"hero_image_url":73,"owner_login":74,"owner_name":75,"owner_avatar_url":76,"owner_bio":77,"owner_company":78,"owner_location":78,"owner_email":78,"owner_twitter":79,"owner_website":80,"owner_url":81,"languages":82,"stars":95,"forks":96,"last_commit_at":97,"license":98,"difficulty_score":10,"env_os":99,"env_gpu":100,"env_ram":101,"env_deps":102,"category_tags":109,"github_topics":111,"view_count":32,"oss_zip_url":78,"oss_zip_packed_at":78,"status":17,"created_at":118,"updated_at":119,"faqs":120,"releases":151},8504,"xlang-ai\u002FOpenCUA","OpenCUA","OpenCUA: Open Foundations for Computer-Use Agents","OpenCUA 是一个专为“电脑使用代理”（Computer-Use Agents）打造的开源基础框架，旨在让 AI 像人类一样操作电脑。它通过提供大规模数据集、标注工具、评估基准及预训练模型，解决了当前 AI 在跨操作系统和复杂应用环境中难以精准执行任务、缺乏高质量训练数据以及评估标准不统一的痛点。\n\n这套框架非常适合 AI 研究人员、开发者以及对自动化代理感兴趣的技术团队使用。研究人员可利用其涵盖 3 种操作系统、200+ 应用的 AgentNet 数据集进行模型训练；开发者则能借助 AgentNetTool 高效采集人类操作演示，并利用 AgentNetBench 对模型动作进行离线精准评估。\n\nOpenCUA 的核心亮点在于其端到端的基础模型系列（如 
OpenCUA-7B\u002F32B\u002F72B），这些模型不仅具备强大的任务规划能力，还能在图形界面中实现高精度的视觉定位（Grounding），直接生成可执行的操作指令。其中，OpenCUA-72B 曾在权威榜单 OSWorld-Verified 中位居榜首。此外，项目生态活跃，已支持 vLLM 加速推理及低显存量化版本，大大降低了部署门","OpenCUA 是一个专为“电脑使用代理”（Computer-Use Agents）打造的开源基础框架，旨在让 AI 像人类一样操作电脑。它通过提供大规模数据集、标注工具、评估基准及预训练模型，解决了当前 AI 在跨操作系统和复杂应用环境中难以精准执行任务、缺乏高质量训练数据以及评估标准不统一的痛点。\n\n这套框架非常适合 AI 研究人员、开发者以及对自动化代理感兴趣的技术团队使用。研究人员可利用其涵盖 3 种操作系统、200+ 应用的 AgentNet 数据集进行模型训练；开发者则能借助 AgentNetTool 高效采集人类操作演示，并利用 AgentNetBench 对模型动作进行离线精准评估。\n\nOpenCUA 的核心亮点在于其端到端的基础模型系列（如 OpenCUA-7B\u002F32B\u002F72B），这些模型不仅具备强大的任务规划能力，还能在图形界面中实现高精度的视觉定位（Grounding），直接生成可执行的操作指令。其中，OpenCUA-72B 曾在权威榜单 OSWorld-Verified 中位居榜首。此外，项目生态活跃，已支持 vLLM 加速推理及低显存量化版本，大大降低了部署门槛，是推动通用电脑操作代理发展的坚实基石。","\n\u003Ch1 style=\"\n  font-family:-apple-system,BlinkMacSystemFont,'Segoe UI',Helvetica,Arial,sans-serif;\n  font-size:48px;\n  font-weight:700;\n  line-height:1.25;\n  text-align:center;\n  margin:0 0 24px;\">\n  OpenCUA: Open Foundations for Computer-Use Agents\n\u003C\u002Fh1>\n\n\u003Cp align=\"center\">\n&nbsp&nbsp🌐 \u003Ca href=\"https:\u002F\u002Fopencua.xlang.ai\u002F\">Website\u003C\u002Fa>&nbsp&nbsp | &nbsp&nbsp📑 \u003Ca href=\"https:\u002F\u002Farxiv.org\u002Fabs\u002F2508.09123\">Paper\u003C\u002Fa>&nbsp&nbsp | &nbsp&nbsp🤗 \u003Ca href=\"https:\u002F\u002Fhuggingface.co\u002Fdatasets\u002Fxlangai\u002FAgentNet\">Dataset\u003C\u002Fa>&nbsp&nbsp | &nbsp&nbsp🔎 \u003Ca href=\"https:\u002F\u002Fagentnet_data_viewer.xlang.ai\u002F\">Data Viewer\u003C\u002Fa>&nbsp&nbsp | &nbsp&nbsp🤖 \u003Ca href=\"https:\u002F\u002Fhuggingface.co\u002Fcollections\u002Fxlangai\u002Fopencua-open-foundations-for-computer-use-agents-6882014ebecdbbe46074a68d\">Model\u003C\u002Fa>&nbsp&nbsp | &nbsp&nbsp🔧  \u003Ca href=\"https:\u002F\u002Fagentnet-tool.xlang.ai\u002F\">Tool\u003C\u002Fa>&nbsp&nbsp | &nbsp&nbsp🎮  \u003Ca href=\"https:\u002F\u002Fhuggingface.co\u002Fspaces\u002Fxlangai\u002FOpenCUA-demo\">Model Demo\u003C\u002Fa>&nbsp&nbsp \n\u003C\u002Fp>\n\n\u003Cdiv 
align=\"center\">\n  \u003Cimg src=\"https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Fxlang-ai_OpenCUA_readme_45ed918f76f8.png\" width=\"600\" alt=\"OpenCUA-7B Performance Scaling\">\n\u003C\u002Fdiv>\n\n\u003Cdiv style=\"max-width:900px;margin:0 auto;\">\n\n## 📢 Updates\n- 2026-01-17: 🎉 **vLLM now fully supports OpenCUA-7B, OpenCUA-32B, and OpenCUA-72B!** Thanks to the [Meituan EvoCUA Team](https:\u002F\u002Fgithub.com\u002Fmeituan\u002FEvoCUA) for their contributions to vLLM integration. See [vLLM Serve](model\u002FREADME.md) for usage instructions.\n\n- 2025-12-17: You can now view AgentNet dataset trajectories online via [AgentNet Data Viewer](https:\u002F\u002Fagentnet_data_viewer.xlang.ai\u002F), or use the code in `data\u002Fvis\u002F` to visualize your own trajectory data. See [vis\u002FREADME.md](.\u002Fdata\u002Fvis\u002FREADME.md) for usage instructions. We also summarized the metadata of AgentNet here [Metadata json](https:\u002F\u002Fhuggingface.co\u002Fdatasets\u002Fxlangai\u002FAgentNet\u002Fblob\u002Fmain\u002Fmeta_data_merged.jsonl).\n\n\n- 2025-11-28: VLLM support of OpenCUA is available at [[Model] Add OpenCUA-7B support #29068](https:\u002F\u002Fgithub.com\u002Fvllm-project\u002Fvllm\u002Fpull\u002F29068). Super grateful to [lim4349](https:\u002F\u002Fgithub.com\u002Flim4349) !\n  \n- 2025-10-12:  \u003Cspan style=\"font-weight:bold\">[OpenCUA-7B-exl2](https:\u002F\u002Fhuggingface.co\u002Fsujitvasanth\u002FOpenCUA-7B-exl2) is now live!\u003C\u002Fspan> ⚡️  \n  Thanks to [Sujit Vasanth](https:\u002F\u002Fhuggingface.co\u002Fsujitvasanth) for producing a quantized **exllamav2** version of OpenCUA-7B — enabling much faster inference with lower VRAM usage.  \n\n\n- 2025-10-03: \u003Cspan style=\"color:red; font-weight:bold\">New OpenCUA model!\u003C\u002Fspan>🔥 \n[OpenCUA-72B](https:\u002F\u002Fhuggingface.co\u002Fxlangai\u002FOpenCUA-72B-preview) now ranks #1 on the [OSWorld-Verified leaderboard](https:\u002F\u002Fos-world.github.io\u002F). 
It also has strong grounding ability, 37.3% (SOTA) on UI-Vision \n and 60.8% on ScreenSpot-Pro.\n- 2025-08-13: We released our [paper](https:\u002F\u002Farxiv.org\u002Fabs\u002F2508.09123) and [project page](https:\u002F\u002Fopencua.xlang.ai\u002F). Check it out!\n\n# Introduction\n\u003Cdiv style=\"\n  max-width: 880px;              \u002F* 可按需调节整体宽度 *\u002F\n  margin: 0 auto;               \u002F* 居中容器 *\u002F\n  text-align: justify;          \u002F* 关键：两端对齐 *\u002F\n  text-justify: inter-word;     \u002F* 优化英文对齐效果 *\u002F\n  line-height: 1.6;\">\n  \n\u003Cb>OpenCUA\u003C\u002Fb> is a comprehensive open-source framework for scaling CUA data and foundation models, consisting of: \n- \u003Cb>[AgentNet](https:\u002F\u002Fhuggingface.co\u002Fdatasets\u002Fxlangai\u002FAgentNet)\u003C\u002Fb>: the first large-scale computer-use task dataset spanning 3 operating systems and 200+ applications and websites; \n- **[AgentNetTool](https:\u002F\u002Fagentnet-tool.xlang.ai\u002F)**: an annotation infrastructure that seamlessly captures human computer-use demonstrations; \n- \u003Cb>[AgentNetBench](https:\u002F\u002Fgithub.com\u002Fxlang-ai\u002FOpenCUA\u002Ftree\u002Fmain\u002FAgentNetBench)\u003C\u002Fb>: an offline evaluator that benchmarks model-predicted low-level actions against ground-truth trajectories.\n- **[OpenCUA Models](https:\u002F\u002Fhuggingface.co\u002Fcollections\u002Fxlangai\u002Fopencua-open-foundations-for-computer-use-agents-6882014ebecdbbe46074a68d\")**: end-to-end computer-use foundation models than can produce executable actions in the computer environments with great planning and grounding capabilities.\n\n\nWith the help of OpenCUA framework, our end-to-end agent models demonstrate strong performance across CUA benchmarks. 
In particular, \u003Cb>OpenCUA-72B\u003C\u002Fb> achieves an average success rate of **45.0%** on [OSWorld-Verified](https:\u002F\u002Fos-world.github.io\u002F), \nestablishing a new state-of-the-art (SOTA) among open-source models. \n\n\u003C\u002Fdiv>\n\n\n##  🚀 Quick Start of OpenCUA Models\n\u003Cdiv style=\"border-left: 6px solid #f28c28; background: #fff8e6; padding: 12px 16px; margin: 16px 0;\">\n  \u003Cstrong>⚠️ Important for Qwen-based Models (OpenCUA-7B, OpenCUA-32B, OpenCUA-72B):\u003C\u002Fstrong>\n  \n  To align with our training infrastructure, we have modified the model in two places:\n  \u003Cul style=\"margin-top: 8px;\">\n    \u003Cli>1. Multimodal Rotary Position Embedding (M-RoPE) has been replaced with 1D RoPE\u003C\u002Fstrong>.\u003C\u002Fli>\n    \u003Cli>2. Using the same Tokenizer and ChatTemplate as Kimi-VL.\u003C\u002Fli>\n    \u003Cli>Do not use the default transformers and vllm classes to load the model. Tokenizer and Chat Template should be aligned if training the models.\u003C\u002Fli>\n  \u003C\u002Ful>\n\u003C\u002Fdiv>\n\n\n### Installation & Download\n\nFirst, install the required transformers dependencies:\n\n```bash\nconda create -n opencua python=3.10\nconda activate opencua\npip install -r requirement.txt\n```\n\nDownload the model weight from huggingface:\n```bash\nfrom huggingface_hub import snapshot_download\nsnapshot_download(\n    repo_id=\"xlangai\u002FOpenCUA-7B\",\n    local_dir=\"OpenCUA-7B\",                \n    local_dir_use_symlinks=False  \n)\n```\n\n### 🚀 vLLM Serve\n\nWe recommend using vLLM for production deployment. 
Requires **vllm>=0.12.0** with `--trust-remote-code`.\n\n```bash\n# OpenCUA-7B (single GPU)\nvllm serve xlangai\u002FOpenCUA-7B \\\n  --trust-remote-code \\\n  --served-model-name opencua-7b \\\n  --host 0.0.0.0 \\\n  --port 8000\n\n# OpenCUA-32B (4 GPUs, tensor parallel)\nvllm serve xlangai\u002FOpenCUA-32B \\\n  --trust-remote-code \\\n  --tensor-parallel-size 4 \\\n  --served-model-name opencua-32b \\\n  --host 0.0.0.0 \\\n  --port 8000\n\n# OpenCUA-72B with data parallelism (tp=2, dp=4 for 4 instances on 8 GPUs)\nvllm serve xlangai\u002FOpenCUA-72B \\\n  --trust-remote-code \\\n  --tensor-parallel-size 2 \\\n  --data-parallel-size 4 \\\n  --gpu-memory-utilization 0.85 \\\n  --host 0.0.0.0 \\\n  --port 8000\n```\n\nAdjust `--tensor-parallel-size`, `--data-parallel-size`, and `--gpu-memory-utilization` based on your hardware configuration.\n\nFor more examples and inference code, see [model\u002Finference\u002Fvllm_inference.py](.\u002Fmodel\u002Finference\u002Fvllm_inference.py).\n\n### 🎯 GUI Grounding\n\nFirst, start the vLLM server (using OpenCUA-7B as example):\n```bash\nvllm serve xlangai\u002FOpenCUA-7B \\\n  --trust-remote-code \\\n  --served-model-name opencua-7b \\\n  --host 0.0.0.0 \\\n  --port 8000\n```\n\nThen run the grounding examples:\n```\ncd .\u002Fmodel\u002Finference\u002F\npython vllm_inference.py\n```\n\nOr with HuggingFace Transformers (no server required):\n```\npython huggingface_inference.py\n```\n\n### 🖥️ Computer Use Agent\n**[OpenCUAAgent](https:\u002F\u002Fgithub.com\u002Fxlang-ai\u002FOSWorld\u002Fblob\u002Fmain\u002Fmm_agents\u002Fopencua_agent.py)** is developed in the [OSWorld](https:\u002F\u002Fgithub.com\u002Fxlang-ai\u002FOSWorld) environment based on OpenCUA models. It iteratively perceives the environment via screenshots, produces reflective long CoT as inner monologue, and predicts the next action to be executed. 
OpenCUAAgent uses 3 images in total and L2 CoT format in default.\n\nCommand for running OpenCUA-7B and OpenCUA-32B in OSWorld:\n```\n    python run_multienv_opencua.py \\\n        --headless \\\n        --observation_type screenshot \\\n        --model OpenCUA-32B \\\n        --result_dir .\u002Fresults --test_all_meta_path evaluation_examples\u002Ftest_all_no_gdrive.json \\\n        --max_steps 100 \\\n        --num_envs 30  \\\n        --coordinate_type qwen25\n```\n\n---\n\n## Performance\n\n### Online Agent Evaluation\nOpenCUA models achieves strong performance on **[OSWorld-Verified](https:\u002F\u002Fos-world.github.io\u002F)**. \nOPENCUA-32B achieves the best performance among all open-source models with an average success rate of 34.8%, outperforming prior baselines by large margins. \nIt also closes the gap to proprietary Claude models.\n\u003Cdiv align=\"center\">\n\n| **Model**                        | **15 Steps** | **50 Steps** | **100 Steps** |\n|-------------------------------|:--------:|:--------:|:---------:|\n| **Proprietary**               |          |          |           |\n| OpenAI CUA                    | 26.0     | 31.3     | 31.4      |\n| Seed 1.5-VL                   | 27.9     | —        | 34.1      |\n| Claude 3.7 Sonnet             | 27.1     | 35.8     | 35.9      |\n| Claude 4 Sonnet               | 31.2     | 43.9     | 41.5      |\n| **Open-Source**               |          |          |           |\n| Qwen 2.5-VL-32B-Instruct      | 3.0      | —        | 3.9       |\n| Qwen 2.5-VL-72B-Instruct      | 4.4      | —        | 5.0       |\n| Kimi-VL-A3B                   | 9.7      | —        | 10.3      |\n| UI-TARS-72B-DPO               | 24.0     | 25.8     | 27.1      |\n| UI-TARS-1.5-7B                | 24.5     | 27.3     | 27.4      |\n| OpenCUA-7B *(Ours)*           | 24.3     | 27.9     | 26.6      |\n| OpenCUA-32B *(Ours)*          | **29.7** | **34.1** | 34.8      |\n| **OpenCUA-72B*(Ours)***      | 39.0   | 44.9  | **45.0** 
 |\n\u003C\u002Fdiv>\n\n*OpenCUA scores are the mean of 3 independent runs.*\n\n### GUI Grounding Performance\n\u003Cdiv align=\"center\">\n\n| **Model** | **OSWorld-G** | **ScreenSpot-V2** | **ScreenSpot-Pro** | **UI-Vision** |\n|-------|-----------|---------------|----------------| ---------- |\n| Qwen2.5-VL-7B   | 31.4 | 88.8 | 27.6 |  0.85 |\n| Qwen2.5-VL-32B  | 46.5 | 87.0 | 39.4 | - |\n| UI-TARS-72B     | 57.1 | 90.3 | 38.1 | 25.5 |\n| **OpenCUA-7B**  | 55.3 | 92.3 | 50.0 | 29.7 |\n| **OpenCUA-32B** | **59.6** | **93.4** | 55.3 | 33.3 |\n| **OpenCUA-72B** | 59.2 | 92.9 | **60.8** | **37.3** |\n\u003C\u002Fdiv>\n\n\n### AgentNetBench (Offline Evaluation)\n\u003Cdiv align=\"center\">\n\n| **Model** | **Coordinate Actions** | **Content Actions** | **Function Actions** | **Average** |\n|-------|-------------------|-----------------|------------------|---------|\n| Qwen2.5-VL-7B | 50.7 | 40.8 | 3.1 | 48.0 |\n| Qwen2.5-VL-32B | 66.6 | 47.2 | 41.5 | 64.8 |\n| Qwen2.5-VL-72B | 67.2 | 52.6 | 50.5 | 67.0 |\n| OpenAI CUA          | 71.7 | 57.3 | **80.0** | 73.1 |\n| **OpenCUA-7B**  | 79.0 | 62.0 | 44.3 | 75.2 |\n| **OpenCUA-32B** | **81.9** | 66.1 | 55.7 | **79.1** |\n\u003C\u002Fdiv>\n\n---\n\n## AgentNet Dataset - Large-Scale Computer-Use Dataset\n\n\u003Cdiv align=\"center\">\n  \u003Cimg src=\"https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Fxlang-ai_OpenCUA_readme_435e6fde1fb6.png\" width=\"400\" alt=\"AgentNet Dataset Domain Distribution\">\n\u003C\u002Fdiv>\n\nAgentNet is the first large-scale desktop computer-use agent trajectory dataset, containing 22.6K human-annotated computer-use tasks across Windows, macOS, and Ubuntu systems. 
\n\n👉 **[AgentNet Huggingface Dataset](https:\u002F\u002Fhuggingface.co\u002Fdatasets\u002Fxlangai\u002FAgentNet)**\n\nDownload the dataset here：\n```\npip install -U huggingface_hub\nhuggingface-cli download xlangai\u002FAgentNet --repo-type dataset --local-dir .\u002FAgentNet\n```\n\nUse the following command to unzip the file (For exmaple, Ubuntu data):\n```\ncd path_to_your_zip_files\n\n# Merge all the zips\nzip -s 0 images.zip --out images-full.zip\n\n# Unzip\nunzip images-full.zip -d path_to_your_target_dir\n```\n\nCollecting computer-use agent training data requires 3 steps:\n- Demonstrate human computer-use task via [AgentNetTool](https:\u002F\u002Fagentnet-tool.xlang.ai\u002F);\n- Preprocess the demonstration using [Action Reduction & State-Action Matching](.\u002Fdata\u002Fdata-processor);\n- For each step, [synthesize reflective long CoT](.\u002Fdata\u002Fcot-generator)\n\n\n### 1 AgentNetTool – Annotation & Verification Tool\n\u003Cdiv align=\"center\">\n  \u003Cimg src=\"https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Fxlang-ai_OpenCUA_readme_1a6fa28847d9.png\" width=\"700\" alt=\"AgentNet Tool\">\n\u003C\u002Fdiv>\n\n\nOur **AgentNetTool** is a cross-platform GUI recorder that runs unobtrusively on annotators’ machines. It captures synchronized **screen video**, **mouse\u002Fkeyboard events**, and **accessibility trees**, then provides an in-browser UI for reviewing, trimming, and submitting demonstrations. AgentNet Tool is available on Windows, macOS and Ubuntu. \n\n👉 **[AgentNetTool Document](https:\u002F\u002Fagentnet-tool.xlang.ai\u002F)**\n\n\n\n### 2 DataProcessor – Action Reduction & State–Action Matching\nRaw demonstrations can contain thousands of low-level events that are too dense for model training.  \nThe **DataProcessor** module (`.\u002Fdata\u002Fdata-process\u002F`) performs two key steps:\n\n1. 
**Action Reduction** — merges granular signals into concise, semantically meaningful PyAutoGUI actions (e.g., collapsing mouse moves → click, coalescing scrolls, grouping key-press sequences into text or hotkeys).  \n2. **State–Action Matching** — aligns every reduced action with the *last visually distinct frame* **before** the action begins, avoiding future-information leakage and yielding compact state–action pairs.\n\nThese processed trajectories underlie all downstream training and evaluation.\n\n---\n\n### 3 CoTGenerator – Synthesizing Reflective Long Chain-of-Thought Inner Monologue\nTo boost robustness and interpretability, we augment each trajectory with **reflective long Chain-of-Thought (CoT) reasoning**.  \nThe **CoTGenerator** pipeline (`.\u002Fdata\u002Fcot-generator\u002F`) synthesizes step-level reflections that:\n\n* reflect on the previous action,\n* explain *why* an action is chosen given the current observation and history,  \n* note potential alternative actions, and  \n* forecast the expected next state.\n\nEmpirically, models trained with these rich CoTs scale better with data and generalize across unseen applications.\n\n\n## AgentNetBench\n\n\u003Cdiv align=\"center\">\n  \u003Cimg src=\"https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Fxlang-ai_OpenCUA_readme_ed62ced0018b.png\" width=\"800\" alt=\"AgentNetBench\">\n\u003C\u002Fdiv>\n\n\n**AgentNetBench** (`.\u002FAgentNetBench\u002F`) provides a realistic offline evaluator for OS agent trajectories. It compares model-predicted low-level actions (click, moveTo, write, press, scroll, terminate, etc.) 
against ground-truth human actions and reports detailed metrics.\n\n👉 See **[AgentNetBench\u002FREADME.md](.\u002Fevaluation\u002Fagentnetbench\u002FREADME.md)** for usage instructions.\n\n## TODO\n- [x] **vLLM Support** ✅\n  - vLLM now fully supports OpenCUA-7B, OpenCUA-32B, and OpenCUA-72B.\n  - See [vLLM Serve](#vllm-serve) section for usage instructions.\n  - Thanks to the Meituan EvoCUA Team for their contributions!\n\n- [ ] **Training Code**\n  - OpenCUA models are developed based on the training infrastructure of Kimi Team.\n  - Currently developing the training pipeline based on open-source infrastructure.\n\n## Star History\n\n[![Star History Chart](https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Fxlang-ai_OpenCUA_readme_fe1896dbe37a.png)](https:\u002F\u002Fwww.star-history.com\u002F#xlang-ai\u002FOpenCUA&type=date&legend=top-left)\n\n## Acknowledge\n\u003Cp>\nWe thank Yu Su, Caiming Xiong, and the anonymous reviewers for their insightful discussions and valuable feedback.\nWe are grateful to Moonshot AI for providing training infrastructure and annotated data.\nWe also sincerely appreciate Hao Yang, Zhengtao Wang, and Yanxu Chen from the Kimi Team for their strong infrastructure support and helpful guidance.\nWe thank Chong Peng, Taofeng Xue, and Qiumian Huang from the \u003Ca href=\"https:\u002F\u002Fgithub.com\u002Fmeituan\u002FEvoCUA\" target=\"_blank\">Meituan EvoCUA Team\u003C\u002Fa> for their contributions to vLLM integration.\nThe development of our tool is based on the open-source projects-\u003Ca href=\"https:\u002F\u002Fgithub.com\u002FTheDuckAI\u002FDuckTrack\" target=\"_blank\">DuckTrack\u003C\u002Fa> and \u003Ca href=\"https:\u002F\u002Fgithub.com\u002FOpenAdaptAI\u002FOpenAdapt\" target=\"_blank\">OpenAdapt\u003C\u002Fa>.\nWe are very grateful to their commitment to the open source community. 
Finally, we extend our deepest thanks to all annotators for their tremendous effort and contributions to this project.\n\u003C\u002Fp>\n\n## Research and Commercial Use\n\nOpenCUA (including the model, dataset, tools, and code) may be used for **research, educational, and commercial purposes** under the **MIT License** (see `LICENSE`).\n\n### Citation and Acknowledgement\nIf you use **OpenCUA models** and\u002For the **AgentNet dataset** in any **report, technical report, publication, thesis, presentation, blog post, documentation, or other publicly shared material**, we **kindly ask** that you include an explicit acknowledgement in the main text and cite the OpenCUA paper.\n\n### Prohibited Uses\n- The model, dataset, tool, and code may **not** be used for any purpose or activity that violates applicable laws or regulations in any jurisdiction\n- Use for illegal, unethical, or harmful activities is strictly prohibited\n\n### Disclaimer\n- The authors, contributors, and copyright holders are **not responsible** for any illegal, unethical, or harmful use of the Software, nor for any direct or indirect damages resulting from such use\n- Use of the \"OpenCUA\" name, logo, or trademarks does **not** imply any endorsement or affiliation unless separate written permission is obtained\n- Users are solely responsible for ensuring their use complies with applicable laws and regulations\n\n## Citation\n\nIf you use OpenCUA in your research, please cite our work:\n\n```bibtex\n@misc{wang2025opencuaopenfoundationscomputeruse,\n      title={OpenCUA: Open Foundations for Computer-Use Agents}, \n      author={Xinyuan Wang and Bowen Wang and Dunjie Lu and Junlin Yang and Tianbao Xie and Junli Wang and Jiaqi Deng and Xiaole Guo and Yiheng Xu and Chen Henry Wu and Zhennan Shen and Zhuokai Li and Ryan Li and Xiaochuan Li and Junda Chen and Boyuan Zheng and Peihang Li and Fangyu Lei and Ruisheng Cao and Yeqiao Fu and Dongchan Shin and Martin Shin and Jiarui Hu and Yuyan Wang and 
Jixuan Chen and Yuxiao Ye and Danyang Zhang and Dikang Du and Hao Hu and Huarong Chen and Zaida Zhou and Haotian Yao and Ziwei Chen and Qizheng Gu and Yipu Wang and Heng Wang and Diyi Yang and Victor Zhong and Flood Sung and Y. Charles and Zhilin Yang and Tao Yu},\n      year={2025},\n      eprint={2508.09123},\n      archivePrefix={arXiv},\n      primaryClass={cs.AI},\n      url={https:\u002F\u002Farxiv.org\u002Fabs\u002F2508.09123}, \n}\n```\n\n\n\u003C\u002Fdiv>\n","\u003Ch1 style=\"\n  font-family:-apple-system,BlinkMacSystemFont,'Segoe UI',Helvetica,Arial,sans-serif;\n  font-size:48px;\n  font-weight:700;\n  line-height:1.25;\n  text-align:center;\n  margin:0 0 24px;\">\n  OpenCUA：面向计算机使用代理的开源基础模型\n\u003C\u002Fh1>\n\n\u003Cp align=\"center\">\n&nbsp&nbsp🌐 \u003Ca href=\"https:\u002F\u002Fopencua.xlang.ai\u002F\">官网\u003C\u002Fa>&nbsp&nbsp | &nbsp&nbsp📑 \u003Ca href=\"https:\u002F\u002Farxiv.org\u002Fabs\u002F2508.09123\">论文\u003C\u002Fa>&nbsp&nbsp | &nbsp&nbsp🤗 \u003Ca href=\"https:\u002F\u002Fhuggingface.co\u002Fdatasets\u002Fxlangai\u002FAgentNet\">数据集\u003C\u002Fa>&nbsp&nbsp | &nbsp&nbsp🔎 \u003Ca href=\"https:\u002F\u002Fagentnet_data_viewer.xlang.ai\u002F\">数据查看器\u003C\u002Fa>&nbsp&nbsp | &nbsp&nbsp🤖 \u003Ca href=\"https:\u002F\u002Fhuggingface.co\u002Fcollections\u002Fxlangai\u002Fopencua-open-foundations-for-computer-use-agents-6882014ebecdbbe46074a68d\">模型\u003C\u002Fa>&nbsp&nbsp | &nbsp&nbsp🔧  \u003Ca href=\"https:\u002F\u002Fagentnet-tool.xlang.ai\u002F\">工具\u003C\u002Fa>&nbsp&nbsp | &nbsp&nbsp🎮  \u003Ca href=\"https:\u002F\u002Fhuggingface.co\u002Fspaces\u002Fxlangai\u002FOpenCUA-demo\">模型演示\u003C\u002Fa>&nbsp&nbsp \n\u003C\u002Fp>\n\n\u003Cdiv align=\"center\">\n  \u003Cimg src=\"https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Fxlang-ai_OpenCUA_readme_45ed918f76f8.png\" width=\"600\" alt=\"OpenCUA-7B性能扩展\">\n\u003C\u002Fdiv>\n\n\u003Cdiv style=\"max-width:900px;margin:0 auto;\">\n\n## 📢 最新动态\n- 2026-01-17：🎉 
**vLLM现已全面支持OpenCUA-7B、OpenCUA-32B和OpenCUA-72B！** 感谢[美团EvoCUA团队](https:\u002F\u002Fgithub.com\u002Fmeituan\u002FEvoCUA)对vLLM集成所做的贡献。使用方法请参阅[vLLM Serve](model\u002FREADME.md)。\n\n- 2025-12-17：您现在可以通过[AgentNet数据查看器](https:\u002F\u002Fagentnet_data_viewer.xlang.ai\u002F)在线查看AgentNet数据集轨迹，或使用`data\u002Fvis\u002F`中的代码可视化您自己的轨迹数据。使用说明请参阅[vis\u002FREADME.md](.\u002Fdata\u002Fvis\u002FREADME.md)。我们还在此处汇总了AgentNet的元数据[Metadata json](https:\u002F\u002Fhuggingface.co\u002Fdatasets\u002Fxlangai\u002FAgentNet\u002Fblob\u002Fmain\u002Fmeta_data_merged.jsonl)。\n\n\n- 2025-11-28：OpenCUA的vLLM支持已在[[Model] Add OpenCUA-7B support #29068](https:\u002F\u002Fgithub.com\u002Fvllm-project\u002Fvllm\u002Fpull\u002F29068)中提供。非常感谢[lim4349](https:\u002F\u002Fgithub.com\u002Flim4349)！\n\n- 2025-10-12：\u003Cspan style=\"font-weight:bold\">[OpenCUA-7B-exl2](https:\u002F\u002Fhuggingface.co\u002Fsujitvasanth\u002FOpenCUA-7B-exl2)现已上线！\u003C\u002Fspan> ⚡️  \n  感谢[Sujit Vasanth](https:\u002F\u002Fhuggingface.co\u002Fsujitvasanth)制作了OpenCUA-7B的量化**exllamav2**版本——这使得推理速度更快，同时降低了显存占用。  \n\n\n- 2025-10-03：\u003Cspan style=\"color:red; font-weight:bold\">全新OpenCUA模型！\u003C\u002Fspan>🔥 \n[OpenCUA-72B](https:\u002F\u002Fhuggingface.co\u002Fxlangai\u002FOpenCUA-72B-preview)目前在[OSWorld-Verified排行榜](https:\u002F\u002Fos-world.github.io\u002F)上排名第一。它还具有强大的场景理解能力，在UI-Vision上的准确率为37.3%（SOTA），在ScreenSpot-Pro上的准确率为60.8%。\n- 2025-08-13：我们发布了[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2508.09123)和[项目页面](https:\u002F\u002Fopencua.xlang.ai\u002F)。欢迎查阅！\n\n# 简介\n\u003Cdiv style=\"\n  max-width: 880px;              \u002F* 可按需调节整体宽度 *\u002F\n  margin: 0 auto;               \u002F* 居中容器 *\u002F\n  text-align: justify;          \u002F* 关键：两端对齐 *\u002F\n  text-justify: inter-word;     \u002F* 优化英文对齐效果 *\u002F\n  line-height: 1.6;\">\n  \n\u003Cb>OpenCUA\u003C\u002Fb>是一个用于扩展CUA数据和基础模型的综合性开源框架，包括：\n- 
\u003Cb>[AgentNet](https:\u002F\u002Fhuggingface.co\u002Fdatasets\u002Fxlangai\u002FAgentNet)\u003C\u002Fb>：首个大规模计算机使用任务数据集，覆盖3个操作系统和200多个应用及网站；\n- **[AgentNetTool](https:\u002F\u002Fagentnet-tool.xlang.ai\u002F)**：一个能够无缝捕捉人类计算机使用示范的标注基础设施；\n- \u003Cb>[AgentNetBench](https:\u002F\u002Fgithub.com\u002Fxlang-ai\u002FOpenCUA\u002Ftree\u002Fmain\u002FAgentNetBench)\u003C\u002Fb>：一个离线评估工具，用于将模型预测的低级动作与真实轨迹进行对比。\n- **[OpenCUA模型](https:\u002F\u002Fhuggingface.co\u002Fcollections\u002Fxlangai\u002Fopencua-open-foundations-for-computer-use-agents-6882014ebecdbbe46074a68d\")**：端到端的计算机使用基础模型，能够在计算机环境中生成可执行的操作，并具备出色的规划和场景理解能力。\n\n\n借助OpenCUA框架，我们的端到端代理模型在CUA基准测试中表现出色。特别是，\u003Cb>OpenCUA-72B\u003C\u002Fb>在[OSWorld-Verified](https:\u002F\u002Fos-world.github.io\u002F)上的平均成功率达到**45.0%**，在开源模型中树立了新的SOTA记录。\n\n\u003C\u002Fdiv>\n\n\n##  🚀 OpenCUA模型快速入门\n\u003Cdiv style=\"border-left: 6px solid #f28c28; background: #fff8e6; padding: 12px 16px; margin: 16px 0;\">\n  \u003Cstrong>⚠️ 对基于Qwen的模型（OpenCUA-7B、OpenCUA-32B、OpenCUA-72B）的重要提示：\u003C\u002Fstrong>\n  \n  为与我们的训练基础设施保持一致，我们在两个方面对模型进行了修改：\n  \u003Cul style=\"margin-top: 8px;\">\n    \u003Cli>1. 多模态旋转位置嵌入（M-RoPE）已被1D RoPE取代。\u003C\u002Fli>\n    \u003Cli>2. 
使用与Kimi-VL相同的分词器和聊天模板。\u003C\u002Fli>\n    \u003Cli>请勿使用默认的transformers和vllm类加载模型。如果要训练这些模型，分词器和聊天模板必须保持一致。\u003C\u002Fli>\n  \u003C\u002Ful>\n\u003C\u002Fdiv>\n\n\n### 安装与下载\n\n首先，安装所需的transformers依赖项：\n\n```bash\nconda create -n opencua python=3.10\nconda activate opencua\npip install -r requirement.txt\n```\n\n从Hugging Face下载模型权重：\n```bash\nfrom huggingface_hub import snapshot_download\nsnapshot_download(\n    repo_id=\"xlangai\u002FOpenCUA-7B\",\n    local_dir=\"OpenCUA-7B\",                \n    local_dir_use_symlinks=False  \n)\n```\n\n### 🚀 vLLM服务\n\n我们建议使用vLLM进行生产部署。需要**vllm>=0.12.0**，并启用`--trust-remote-code`选项。\n\n```bash\n# OpenCUA-7B（单GPU）\nvllm serve xlangai\u002FOpenCUA-7B \\\n  --trust-remote-code \\\n  --served-model-name opencua-7b \\\n  --host 0.0.0.0 \\\n  --port 8000\n\n# OpenCUA-32B（4 GPU，张量并行）\nvllm serve xlangai\u002FOpenCUA-32B \\\n  --trust-remote-code \\\n  --tensor-parallel-size 4 \\\n  --served-model-name opencua-32b \\\n  --host 0.0.0.0 \\\n  --port 8000\n\n# OpenCUA-72B，采用数据并行（tp=2，dp=4，共4个实例，分布在8张GPU上）\nvllm serve xlangai\u002FOpenCUA-72B \\\n  --trust-remote-code \\\n  --tensor-parallel-size 2 \\\n  --data-parallel-size 4 \\\n  --gpu-memory-utilization 0.85 \\\n  --host 0.0.0.0 \\\n  --port 8000\n```\n\n请根据您的硬件配置调整`--tensor-parallel-size`、`--data-parallel-size`和`--gpu-memory-utilization`参数。\n\n更多示例和推理代码，请参阅[model\u002Finference\u002Fvllm_inference.py](.\u002Fmodel\u002Finference\u002Fvllm_inference.py)。\n\n### 🎯 GUI场景理解\n\n首先启动vLLM服务器（以OpenCUA-7B为例）：\n```bash\nvllm serve xlangai\u002FOpenCUA-7B \\\n  --trust-remote-code \\\n  --served-model-name opencua-7b \\\n  --host 0.0.0.0 \\\n  --port 8000\n```\n\n然后运行场景理解示例：\n```\ncd .\u002Fmodel\u002Finference\u002F\npython vllm_inference.py\n```\n\n或者使用Hugging Face Transformers（无需服务器）：\n```\npython huggingface_inference.py\n```\n\n### 🖥️ 计算机使用智能体\n**[OpenCUAAgent](https:\u002F\u002Fgithub.com\u002Fxlang-ai\u002FOSWorld\u002Fblob\u002Fmain\u002Fmm_agents\u002Fopencua_agent.py)** 是在 
[OSWorld](https:\u002F\u002Fgithub.com\u002Fxlang-ai\u002FOSWorld) 环境中基于 OpenCUA 模型开发的。它通过截屏迭代地感知环境，生成反思性的长链式思维作为内部独白，并预测下一步要执行的动作。OpenCUA 代理默认使用 3 张图像和 L2 格式的 CoT。\n\n在 OSWorld 中运行 OpenCUA-7B 和 OpenCUA-32B 的命令如下：\n```\n    python run_multienv_opencua.py \\\n        --headless \\\n        --observation_type screenshot \\\n        --model OpenCUA-32B \\\n        --result_dir .\u002Fresults --test_all_meta_path evaluation_examples\u002Ftest_all_no_gdrive.json \\\n        --max_steps 100 \\\n        --num_envs 30  \\\n        --coordinate_type qwen25\n```\n\n---\n\n## 性能\n\n### 在线智能体评估\nOpenCUA 模型在 **[OSWorld-Verified](https:\u002F\u002Fos-world.github.io\u002F)** 上表现出色。\nOpenCUA-32B 以平均 34.8% 的成功率位居所有开源模型之首，大幅超越了之前的基线模型。\n同时，它也缩小了与专有 Claude 模型之间的差距。\n\u003Cdiv align=\"center\">\n\n| **模型**                        | **15 步** | **50 步** | **100 步** |\n|-------------------------------|:--------:|:--------:|:---------:|\n| **专有模型**               |          |          |           |\n| OpenAI CUA                    | 26.0     | 31.3     | 31.4      |\n| Seed 1.5-VL                   | 27.9     | —        | 34.1      |\n| Claude 3.7 Sonnet             | 27.1     | 35.8     | 35.9      |\n| Claude 4 Sonnet               | 31.2     | 43.9     | 41.5      |\n| **开源模型**               |          |          |           |\n| Qwen 2.5-VL-32B-Instruct      | 3.0      | —        | 3.9       |\n| Qwen 2.5-VL-72B-Instruct      | 4.4      | —        | 5.0       |\n| Kimi-VL-A3B                   | 9.7      | —        | 10.3      |\n| UI-TARS-72B-DPO               | 24.0     | 25.8     | 27.1      |\n| UI-TARS-1.5-7B                | 24.5     | 27.3     | 27.4      |\n| OpenCUA-7B *(我们)*           | 24.3     | 27.9     | 26.6      |\n| OpenCUA-32B *(我们)*          | 29.7     | 34.1     | 34.8      |\n| **OpenCUA-72B** *(我们)*      | **39.0** | **44.9** | **45.0**  |\n\u003C\u002Fdiv>\n\n*OpenCUA 的得分是 3 次独立运行的平均值。*\n\n### GUI 定位性能\n\u003Cdiv align=\"center\">\n\n| **模型** | **OSWorld-G** | 
**ScreenSpot-V2** | **ScreenSpot-Pro** | **UI-Vision** |\n|-------|-----------|---------------|----------------| ---------- |\n| Qwen2.5-VL-7B   | 31.4 | 88.8 | 27.6 |  0.85 |\n| Qwen2.5-VL-32B  | 46.5 | 87.0 | 39.4 | - |\n| UI-TARS-72B     | 57.1 | 90.3 | 38.1 | 25.5 |\n| **OpenCUA-7B**  | 55.3 | 92.3 | 50.0 | 29.7 |\n| **OpenCUA-32B** | **59.6** | **93.4** | 55.3 | 33.3 |\n| **OpenCUA-72B** | 59.2 | 92.9 | **60.8** | **37.3** |\n\u003C\u002Fdiv>\n\n\n### AgentNetBench（离线评估）\n\u003Cdiv align=\"center\">\n\n| **模型** | **坐标动作** | **内容动作** | **功能动作** | **平均** |\n|-------|-------------------|-----------------|------------------|---------|\n| Qwen2.5-VL-7B | 50.7 | 40.8 | 3.1 | 48.0 |\n| Qwen2.5-VL-32B | 66.6 | 47.2 | 41.5 | 64.8 |\n| Qwen2.5-VL-72B | 67.2 | 52.6 | 50.5 | 67.0 |\n| OpenAI CUA          | 71.7 | 57.3 | **80.0** | 73.1 |\n| **OpenCUA-7B**  | 79.0 | 62.0 | 44.3 | 75.2 |\n| **OpenCUA-32B** | **81.9** | 66.1 | 55.7 | **79.1** |\n\u003C\u002Fdiv>\n\n---\n\n## AgentNet 数据集——大规模计算机使用数据集\n\n\u003Cdiv align=\"center\">\n  \u003Cimg src=\"https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Fxlang-ai_OpenCUA_readme_435e6fde1fb6.png\" width=\"400\" alt=\"AgentNet 数据集领域分布\">\n\u003C\u002Fdiv>\n\nAgentNet 是首个大规模桌面计算机使用智能体轨迹数据集，包含 22,600 个由人工标注的 Windows、macOS 和 Ubuntu 系统上的计算机使用任务。\n\n👉 **[AgentNet Huggingface 数据集](https:\u002F\u002Fhuggingface.co\u002Fdatasets\u002Fxlangai\u002FAgentNet)**\n\n在这里下载数据集：\n```\npip install -U huggingface_hub\nhuggingface-cli download xlangai\u002FAgentNet --repo-type dataset --local-dir .\u002FAgentNet\n```\n\n使用以下命令解压文件（例如 Ubuntu 数据）：\n```\ncd path_to_your_zip_files\n\n# 合并所有压缩包\nzip -s 0 images.zip --out images-full.zip\n\n# 解压\nunzip images-full.zip -d path_to_your_target_dir\n```\n\n收集计算机使用智能体训练数据需要 3 个步骤：\n- 使用 [AgentNetTool](https:\u002F\u002Fagentnet-tool.xlang.ai\u002F) 展示人类的计算机使用任务；\n- 使用 [动作简化与状态-动作匹配](.\u002Fdata\u002Fdata-processor) 对演示进行预处理；\n- 针对每一步，[合成反思性的长链式思维](.\u002Fdata\u002Fcot-generator)。\n\n\n### 1 AgentNetTool – 
标注与验证工具\n\u003Cdiv align=\"center\">\n  \u003Cimg src=\"https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Fxlang-ai_OpenCUA_readme_1a6fa28847d9.png\" width=\"700\" alt=\"AgentNet 工具\">\n\u003C\u002Fdiv>\n\n\n我们的 **AgentNetTool** 是一款跨平台的 GUI 录制工具，可在标注者的设备上无感运行。它会捕获同步的 **屏幕视频**、**鼠标\u002F键盘事件** 和 **可访问性树**，然后提供一个浏览器内界面用于查看、修剪和提交演示。AgentNet Tool 支持 Windows、macOS 和 Ubuntu。\n\n👉 **[AgentNetTool 文档](https:\u002F\u002Fagentnet-tool.xlang.ai\u002F)**\n\n\n\n### 2 DataProcessor – 动作简化与状态–动作匹配\n原始演示可能包含数千个低层级事件，过于密集，不适合模型训练。\n**DataProcessor** 模块（`.\u002Fdata\u002Fdata-process\u002F`）执行两个关键步骤：\n\n1. **动作简化** — 将细粒度的信号合并为简洁、语义明确的 PyAutoGUI 动作（例如，将连续的鼠标移动归并为单次点击，整合滚动操作，将按键序列分组为文本或快捷键）。  \n2. **状态–动作匹配** — 将每个简化的动作与该动作开始 *之前* 的 *最后一个视觉上不同的帧* 进行对齐，避免未来信息泄露，从而生成紧凑的状态–动作对。\n\n这些经过处理的轨迹是所有下游训练和评估的基础。\n\n---\n\n### 3 CoTGenerator – 合成反思性的长链式思维内部独白\n为了提升鲁棒性和可解释性，我们在每条轨迹中加入了 **反思性的长链式思维（CoT）推理**。\n**CoTGenerator** 流程（`.\u002Fdata\u002Fcot-generator\u002F`）会合成逐步的反思内容，包括：\n\n* 反思前一步的动作，\n* 根据当前观察和历史记录解释为何选择该动作，\n* 提及可能的替代方案，\n* 预测预期的下一状态。\n\n经验表明，使用这些丰富的 CoT 训练的模型能够更好地适应数据规模，并在未见过的应用场景中实现更好的泛化能力。\n\n## AgentNetBench\n\n\u003Cdiv align=\"center\">\n  \u003Cimg src=\"https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Fxlang-ai_OpenCUA_readme_ed62ced0018b.png\" width=\"800\" alt=\"AgentNetBench\">\n\u003C\u002Fdiv>\n\n\n**AgentNetBench** (`.\u002FAgentNetBench\u002F`) 提供了一个用于操作系统代理轨迹的真实离线评估器。它将模型预测的低层操作（点击、移动到、输入、按键、滚动、终止等）与人类的真实操作进行对比，并报告详细的指标。\n\n👉 请参阅 **[AgentNetBench\u002FREADME.md](.\u002Fevaluation\u002Fagentnetbench\u002FREADME.md)** 获取使用说明。\n\n## 待办事项\n- [x] **vLLM 支持** ✅\n  - vLLM 现已完全支持 OpenCUA-7B、OpenCUA-32B 和 OpenCUA-72B。\n  - 使用说明请参阅 [vLLM Serve](#vllm-serve) 部分。\n  - 感谢美团 EvoCUA 团队的贡献！\n\n- [ ] **训练代码**\n  - OpenCUA 模型基于 Kimi 团队的训练基础设施开发。\n  - 目前正在基于开源基础设施开发训练流水线。\n\n## 
星标历史\n\n[![星标历史图](https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Fxlang-ai_OpenCUA_readme_fe1896dbe37a.png)](https:\u002F\u002Fwww.star-history.com\u002F#xlang-ai\u002FOpenCUA&type=date&legend=top-left)\n\n## 致谢\n\u003Cp>\n我们感谢 Yu Su、Caiming Xiong 以及匿名审稿人提供的富有洞见的讨论和宝贵反馈。\n我们衷心感谢 Moonshot AI 提供的训练基础设施和标注数据。\n同时，我们也真诚地感谢 Kimi 团队的 Hao Yang、Zhengtao Wang 和 Yanxu Chen 提供的强大基础设施支持和有益指导。\n此外，我们还要感谢来自 \u003Ca href=\"https:\u002F\u002Fgithub.com\u002Fmeituan\u002FEvoCUA\" target=\"_blank\">美团 EvoCUA 团队\u003C\u002Fa>的 Chong Peng、Taofeng Xue 和 Qiumian Huang 在 vLLM 集成方面的贡献。\n我们的工具开发基于开源项目——\u003Ca href=\"https:\u002F\u002Fgithub.com\u002FTheDuckAI\u002FDuckTrack\" target=\"_blank\">DuckTrack\u003C\u002Fa> 和 \u003Ca href=\"https:\u002F\u002Fgithub.com\u002FOpenAdaptAI\u002FOpenAdapt\" target=\"_blank\">OpenAdapt\u003C\u002Fa>。\n我们非常感激他们对开源社区的奉献精神。最后，我们要向所有标注人员致以最深切的谢意，感谢他们为本项目付出的巨大努力和贡献。\n\u003C\u002Fp>\n\n## 研究与商业用途\nOpenCUA（包括模型、数据集、工具和代码）可在 **MIT 许可证** 下用于 **研究、教育和商业目的**（详见 `LICENSE` 文件）。\n\n### 引用与致谢\n如果您在任何 **报告、技术报告、出版物、论文、演示文稿、博客文章、文档或其他公开分享的材料** 中使用了 **OpenCUA 模型** 和\u002F或 **AgentNet 数据集**，我们**诚恳地请求**您在正文中明确注明致谢，并引用 OpenCUA 论文。\n\n### 禁止用途\n- 该模型、数据集、工具和代码**不得**用于任何违反任何司法管辖区适用法律或法规的目的或活动。\n- 严禁将其用于非法、不道德或有害的活动。\n\n### 免责声明\n- 作者、贡献者及版权所有者对软件的任何非法、不道德或有害使用，以及由此产生的任何直接或间接损害概不负责。\n- 未经另行书面许可，使用“OpenCUA”名称、标志或商标并不意味着任何形式的背书或隶属关系。\n- 用户应自行确保其使用符合相关法律法规。\n\n## 引用\n如果您在研究中使用 OpenCUA，请引用我们的工作：\n\n```bibtex\n@misc{wang2025opencuaopenfoundationscomputeruse,\n      title={OpenCUA: Open Foundations for Computer-Use Agents}, \n      author={Xinyuan Wang and Bowen Wang and Dunjie Lu and Junlin Yang and Tianbao Xie and Junli Wang and Jiaqi Deng and Xiaole Guo and Yiheng Xu and Chen Henry Wu and Zhennan Shen and Zhuokai Li and Ryan Li and Xiaochuan Li and Junda Chen and Boyuan Zheng and Peihang Li and Fangyu Lei and Ruisheng Cao and Yeqiao Fu and Dongchan Shin and Martin Shin and Jiarui Hu and Yuyan Wang and Jixuan Chen and Yuxiao Ye and Danyang Zhang and Dikang Du and Hao Hu 
and Huarong Chen and Zaida Zhou and Haotian Yao and Ziwei Chen and Qizheng Gu and Yipu Wang and Heng Wang and Diyi Yang and Victor Zhong and Flood Sung and Y. Charles and Zhilin Yang and Tao Yu},\n      year={2025},\n      eprint={2508.09123},\n      archivePrefix={arXiv},\n      primaryClass={cs.AI},\n      url={https:\u002F\u002Farxiv.org\u002Fabs\u002F2508.09123}, \n}\n```\n\n\n\u003C\u002Fdiv>","# OpenCUA 快速上手指南\n\nOpenCUA 是一个用于扩展计算机使用（Computer-Use）代理数据和基础模型的开源框架。本指南将帮助您快速部署并运行 OpenCUA 模型（如 OpenCUA-7B\u002F32B\u002F72B）。\n\n## 1. 环境准备\n\n### 系统要求\n- **操作系统**: Linux (推荐 Ubuntu), macOS, 或 Windows (WSL2)\n- **Python 版本**: 3.10\n- **GPU 要求**:\n  - **OpenCUA-7B**: 单卡 GPU (建议显存 ≥ 16GB，量化版本可降低需求)\n  - **OpenCUA-32B**: 多卡并行 (建议 4 张 GPU，支持 Tensor Parallelism)\n  - **OpenCUA-72B**: 多卡并行 (建议 8 张 GPU，支持 Data + Tensor Parallelism)\n- **依赖库**: `vllm` (推荐用于生产部署，版本 ≥ 0.12.0) 或 `transformers`\n\n### ⚠️ 重要注意事项 (基于 Qwen 的模型)\nOpenCUA 模型（7B\u002F32B\u002F72B）对原生 Qwen 架构进行了修改以适配训练基础设施：\n1. **位置编码**: 多模态旋转位置嵌入 (M-RoPE) 已替换为 **1D RoPE**。\n2. **分词器**: 使用与 **Kimi-VL** 相同的 Tokenizer 和 ChatTemplate。\n3. **加载警告**: 请勿直接使用默认的 `transformers` 或 `vllm` 类加载模型，必须启用 `--trust-remote-code` 以确保加载正确的自定义代码。\n\n## 2. 安装步骤\n\n### 创建虚拟环境并安装依赖\n```bash\nconda create -n opencua python=3.10\nconda activate opencua\npip install -r requirement.txt\n```\n> **提示**: 如果下载速度较慢，可添加国内镜像源加速：\n> `pip install -r requirement.txt -i https:\u002F\u002Fpypi.tuna.tsinghua.edu.cn\u002Fsimple`\n\n### 下载模型权重\n使用 Python 脚本从 Hugging Face 下载模型（以 OpenCUA-7B 为例）：\n\n```python\nfrom huggingface_hub import snapshot_download\n\nsnapshot_download(\n    repo_id=\"xlangai\u002FOpenCUA-7B\",\n    local_dir=\"OpenCUA-7B\",                \n    local_dir_use_symlinks=False  \n)\n```\n\n*注：如需下载 32B 或 72B 版本，请相应修改 `repo_id` 为 `xlangai\u002FOpenCUA-32B` 或 `xlangai\u002FOpenCUA-72B`。*\n\n## 3. 
基本使用\n\n推荐使用 **vLLM** 进行推理部署，以获得最佳性能。\n\n### 启动 vLLM 服务\n\n根据您拥有的硬件资源，选择以下对应的启动命令：\n\n**方案 A: 运行 OpenCUA-7B (单卡)**\n```bash\nvllm serve xlangai\u002FOpenCUA-7B \\\n  --trust-remote-code \\\n  --served-model-name opencua-7b \\\n  --host 0.0.0.0 \\\n  --port 8000\n```\n\n**方案 B: 运行 OpenCUA-32B (4 卡并行)**\n```bash\nvllm serve xlangai\u002FOpenCUA-32B \\\n  --trust-remote-code \\\n  --tensor-parallel-size 4 \\\n  --served-model-name opencua-32b \\\n  --host 0.0.0.0 \\\n  --port 8000\n```\n\n**方案 C: 运行 OpenCUA-72B (8 卡，混合并行)**\n```bash\nvllm serve xlangai\u002FOpenCUA-72B \\\n  --trust-remote-code \\\n  --tensor-parallel-size 2 \\\n  --data-parallel-size 4 \\\n  --gpu-memory-utilization 0.85 \\\n  --host 0.0.0.0 \\\n  --port 8000\n```\n\n### 执行推理测试\n\n服务启动后，您可以运行官方提供的推理脚本进行测试。\n\n**方式一：调用 vLLM 服务 (推荐)**\n```bash\ncd .\u002Fmodel\u002Finference\u002F\npython vllm_inference.py\n```\n\n**方式二：直接使用 HuggingFace Transformers (无需启动服务器)**\n如果您只想快速测试而不部署服务，可直接运行：\n```bash\npython huggingface_inference.py\n```\n\n### 进阶：运行 Computer Use Agent\n若需在 OSWorld 环境中运行完整的智能体（包含截图感知、思维链推理及动作执行），请使用以下命令（以 OpenCUA-32B 为例）：\n\n```bash\npython run_multienv_opencua.py \\\n    --headless \\\n    --observation_type screenshot \\\n    --model OpenCUA-32B \\\n    --result_dir .\u002Fresults --test_all_meta_path evaluation_examples\u002Ftest_all_no_gdrive.json \\\n    --max_steps 100 \\\n    --num_envs 30  \\\n    --coordinate_type qwen25\n```","某电商运营团队需要每日跨 Windows 和 macOS 系统，在后台管理系统、ERP 软件及多个网页间重复执行数据核对与报表生成任务。\n\n### 没有 OpenCUA 时\n- **开发成本高昂**：为每个特定软件界面编写自动化脚本（如 Selenium 或 PyAutoGUI）耗时费力，一旦 UI 微调脚本即刻失效。\n- **跨平台能力缺失**：团队成员混用不同操作系统，需维护两套完全不同的自动化逻辑，难以统一调度。\n- **泛化能力极弱**：传统规则机器人无法理解未见过的新页面布局，遇到弹窗或动态元素往往直接卡死。\n- **数据标注困难**：缺乏高效工具记录人工操作轨迹，导致无法积累高质量的“电脑使用”训练数据来优化内部模型。\n\n### 使用 OpenCUA 后\n- **端到端智能执行**：OpenCUA 模型直接理解屏幕图像与自然语言指令，自动规划点击、输入等底层动作，无需硬编码具体坐标。\n- **统一跨系统适配**：基于 AgentNet 数据集训练的 OpenCUA 天然支持多操作系统与数百种应用，一套模型即可覆盖全员设备。\n- **强鲁棒性与泛化**：凭借 SOTA 级的视觉定位能力，即使界面布局变更或出现未知弹窗，OpenCUA 也能像人类一样灵活调整操作策略。\n- 
**闭环数据迭代**：利用 AgentNetTool 无缝捕获员工演示过程，快速转化为训练数据，持续微调 OpenCUA 以适应公司特有软件环境。\n\nOpenCUA 将繁琐的界面自动化从“写代码”升级为“教模型”，让智能体真正具备像人类一样操作任意电脑的通用能力。","https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Fxlang-ai_OpenCUA_1a6fa288.png","xlang-ai","XLANG Lab","https:\u002F\u002Foss.gittoolsai.com\u002Favatars\u002Fxlang-ai_02033ec0.png","Developing embodied AI agents that empower users to use language to interact with digital and physical environments to carry out real-world tasks.",null,"XLangNLP","https:\u002F\u002Fxlang.ai","https:\u002F\u002Fgithub.com\u002Fxlang-ai",[83,87,91],{"name":84,"color":85,"percentage":86},"Python","#3572A5",96.1,{"name":88,"color":89,"percentage":90},"HTML","#e34c26",3.8,{"name":92,"color":93,"percentage":94},"Shell","#89e051",0.1,737,97,"2026-04-16T17:18:56","MIT","Linux, macOS, Windows","运行推理推荐 NVIDIA GPU。OpenCUA-7B 单卡即可；OpenCUA-32B 需 4 卡张量并行；OpenCUA-72B 推荐 8 卡（2 卡张量并行 +4 数据并行）。显存需求取决于模型大小及量化版本（如 exllamav2 可降低显存），具体未说明，但需配合 --gpu-memory-utilization 参数调整。","未说明",{"notes":103,"python":104,"dependencies":105},"1. 基于 Qwen 的模型（7B\u002F32B\u002F72B）修改了位置编码（使用 1D RoPE 替代 M-RoPE）并使用了与 Kimi-VL 相同的 Tokenizer 和 ChatTemplate，加载时务必添加 --trust-remote-code 参数，不要使用默认的 transformers 类直接加载。\n2. 提供量化版本 OpenCUA-7B-exl2 以降低显存占用并加速推理。\n3. AgentNetTool 标注工具支持跨平台（Windows\u002FmacOS\u002FUbuntu）运行以采集数据。","3.10",[106,107,108],"transformers","vllm>=0.12.0","huggingface_hub",[13,110,16,35],"其他",[112,113,114,115,116,117],"benchmark","computer-use-agent","dataset","foundation-models","gui","vision-language-model","2026-03-27T02:49:30.150509","2026-04-18T00:45:43.686428",[121,126,131,136,141,146],{"id":122,"question_zh":123,"answer_zh":124,"source_url":125},38069,"如何正确解压分卷压缩的 AgentNet 数据集图片文件？","不要直接使用 `unzip` 或手动 `cat` 合并文件，这会导致校验错误。请使用以下命令正确合并并解压：\n1. 进入包含所有分卷文件的目录。\n2. 运行 `zip -s 0 images.zip --out images-full.zip` 将分卷合并为完整文件。\n3. 
运行 `unzip images-full.zip -d \u003C目标目录>` 进行解压。","https:\u002F\u002Fgithub.com\u002Fxlang-ai\u002FOpenCUA\u002Fissues\u002F10",{"id":127,"question_zh":128,"answer_zh":129,"source_url":130},38070,"为什么 OpenCUAAgent 在推理测试中默认使用 L2 CoT 格式，而不是包含更长历史记录的 L3 格式？","这是基于实验结果的决定。理论上更长的历史记录包含更多信息，但实验发现其帮助微乎其微甚至产生负面影响。原因可能是思维历史（thought history）中有用信息过于稀疏，而动作历史（action history）已足以让代理知道已完成的操作。此外，思维历史会消耗更多的训练和推理 Token。目前动作历史是更高效有效的表示形式。","https:\u002F\u002Fgithub.com\u002Fxlang-ai\u002FOpenCUA\u002Fissues\u002F12",{"id":132,"question_zh":133,"answer_zh":134,"source_url":135},38071,"使用 AgentNetTool 录制数据后，如何获取本地文件并生成 CoT？","最近版本已移除登录功能，录制的演示数据会自动存储在本地安装路径下。例如在 Windows 上路径通常为：`agentnet-annotator-win32-x64-main-\u003C日期>\\resources\\backend_internal\\Recordings`。获取 recordings 文件夹后，可以使用项目 `data` 目录下的代码进行处理或添加 CoT（如使用 extract_raw 和 raw_to_standardized 脚本处理）。","https:\u002F\u002Fgithub.com\u002Fxlang-ai\u002FOpenCUA\u002Fissues\u002F15",{"id":137,"question_zh":138,"answer_zh":139,"source_url":140},38072,"在哪里可以找到 Data Viewer 中显示的详细元数据（如 complexity, applications 等）？","Data Viewer 中显示的扩展元数据字段已开源，可以通过以下两个位置获取：\n1. HuggingFace 数据集：https:\u002F\u002Fhuggingface.co\u002Fdatasets\u002Fxlangai\u002FAgentNet\u002Fblob\u002Fmain\u002Fmeta_data_merged.jsonl\n2. 
GitHub 仓库文件：https:\u002F\u002Fgithub.com\u002Fxlang-ai\u002FOpenCUA\u002Ftree\u002Fmain\u002Fdata\u002Fvis","https:\u002F\u002Fgithub.com\u002Fxlang-ai\u002FOpenCUA\u002Fissues\u002F49",{"id":142,"question_zh":143,"answer_zh":144,"source_url":145},38073,"当网页正在加载（显示白屏或旋转图标）时，模型为什么不执行“等待”操作？","这是一个典型的幻觉现象，即模型的思维（Thought）与动作（Action）未完全对齐。由于包含“等待”动作的任务在整个数据集中占比较小，模型选择该动作的概率较低。即使明确指令要求等待，模型仍可能生成不相关的动作。这反映了模型在处理低频率场景时的局限性。","https:\u002F\u002Fgithub.com\u002Fxlang-ai\u002FOpenCUA\u002Fissues\u002F35",{"id":147,"question_zh":148,"answer_zh":149,"source_url":150},38074,"近期是否有计划基于 Qwen3-VL 训练新版本的 OpenCUA？","短期内没有计划训练新的基础模型。团队目前正在探索 72B 模型的相关工作。虽然 Qwen3-VL 表现优异且值得关注，但目前的重点不在立即切换基座模型进行重新训练。","https:\u002F\u002Fgithub.com\u002Fxlang-ai\u002FOpenCUA\u002Fissues\u002F43",[152],{"id":153,"version":154,"summary_zh":78,"released_at":155},306242,"v1.0.0","2025-08-11T16:21:27"]