[{"data":1,"prerenderedAt":-1},["ShallowReactive",2],{"similar-zubair-irshad--Awesome-Robotics-3D":3,"tool-zubair-irshad--Awesome-Robotics-3D":64},[4,17,27,35,43,56],{"id":5,"name":6,"github_repo":7,"description_zh":8,"stars":9,"difficulty_score":10,"last_commit_at":11,"category_tags":12,"status":16},3808,"stable-diffusion-webui","AUTOMATIC1111\u002Fstable-diffusion-webui","stable-diffusion-webui 是一个基于 Gradio 构建的网页版操作界面，旨在让用户能够轻松地在本地运行和使用强大的 Stable Diffusion 图像生成模型。它解决了原始模型依赖命令行、操作门槛高且功能分散的痛点，将复杂的 AI 绘图流程整合进一个直观易用的图形化平台。\n\n无论是希望快速上手的普通创作者、需要精细控制画面细节的设计师，还是想要深入探索模型潜力的开发者与研究人员，都能从中获益。其核心亮点在于极高的功能丰富度：不仅支持文生图、图生图、局部重绘（Inpainting）和外绘（Outpainting）等基础模式，还独创了注意力机制调整、提示词矩阵、负向提示词以及“高清修复”等高级功能。此外，它内置了 GFPGAN 和 CodeFormer 等人脸修复工具，支持多种神经网络放大算法，并允许用户通过插件系统无限扩展能力。即使是显存有限的设备，stable-diffusion-webui 也提供了相应的优化选项，让高质量的 AI 艺术创作变得触手可及。",162132,3,"2026-04-05T11:01:52",[13,14,15],"开发框架","图像","Agent","ready",{"id":18,"name":19,"github_repo":20,"description_zh":21,"stars":22,"difficulty_score":23,"last_commit_at":24,"category_tags":25,"status":16},1381,"everything-claude-code","affaan-m\u002Feverything-claude-code","everything-claude-code 是一套专为 AI 编程助手（如 Claude Code、Codex、Cursor 等）打造的高性能优化系统。它不仅仅是一组配置文件，而是一个经过长期实战打磨的完整框架，旨在解决 AI 代理在实际开发中面临的效率低下、记忆丢失、安全隐患及缺乏持续学习能力等核心痛点。\n\n通过引入技能模块化、直觉增强、记忆持久化机制以及内置的安全扫描功能，everything-claude-code 能显著提升 AI 在复杂任务中的表现，帮助开发者构建更稳定、更智能的生产级 AI 代理。其独特的“研究优先”开发理念和针对 Token 消耗的优化策略，使得模型响应更快、成本更低，同时有效防御潜在的攻击向量。\n\n这套工具特别适合软件开发者、AI 研究人员以及希望深度定制 AI 工作流的技术团队使用。无论您是在构建大型代码库，还是需要 AI 协助进行安全审计与自动化测试，everything-claude-code 都能提供强大的底层支持。作为一个曾荣获 Anthropic 黑客大奖的开源项目，它融合了多语言支持与丰富的实战钩子（hooks），让 AI 真正成长为懂上",140436,2,"2026-04-05T23:32:43",[13,15,26],"语言模型",{"id":28,"name":29,"github_repo":30,"description_zh":31,"stars":32,"difficulty_score":23,"last_commit_at":33,"category_tags":34,"status":16},2271,"ComfyUI","Comfy-Org\u002FComfyUI","ComfyUI 是一款功能强大且高度模块化的视觉 AI 引擎，专为设计和执行复杂的 Stable Diffusion 图像生成流程而打造。它摒弃了传统的代码编写模式，采用直观的节点式流程图界面，让用户通过连接不同的功能模块即可构建个性化的生成管线。\n\n这一设计巧妙解决了高级 AI 
绘图工作流配置复杂、灵活性不足的痛点。用户无需具备编程背景，也能自由组合模型、调整参数并实时预览效果，轻松实现从基础文生图到多步骤高清修复等各类复杂任务。ComfyUI 拥有极佳的兼容性，不仅支持 Windows、macOS 和 Linux 全平台，还广泛适配 NVIDIA、AMD、Intel 及苹果 Silicon 等多种硬件架构，并率先支持 SDXL、Flux、SD3 等前沿模型。\n\n无论是希望深入探索算法潜力的研究人员和开发者，还是追求极致创作自由度的设计师与资深 AI 绘画爱好者，ComfyUI 都能提供强大的支持。其独特的模块化架构允许社区不断扩展新功能，使其成为当前最灵活、生态最丰富的开源扩散模型工具之一，帮助用户将创意高效转化为现实。",107662,"2026-04-03T11:11:01",[13,14,15],{"id":36,"name":37,"github_repo":38,"description_zh":39,"stars":40,"difficulty_score":23,"last_commit_at":41,"category_tags":42,"status":16},3704,"NextChat","ChatGPTNextWeb\u002FNextChat","NextChat 是一款轻量且极速的 AI 助手，旨在为用户提供流畅、跨平台的大模型交互体验。它完美解决了用户在多设备间切换时难以保持对话连续性，以及面对众多 AI 模型不知如何统一管理的痛点。无论是日常办公、学习辅助还是创意激发，NextChat 都能让用户随时随地通过网页、iOS、Android、Windows、MacOS 或 Linux 端无缝接入智能服务。\n\n这款工具非常适合普通用户、学生、职场人士以及需要私有化部署的企业团队使用。对于开发者而言，它也提供了便捷的自托管方案，支持一键部署到 Vercel 或 Zeabur 等平台。\n\nNextChat 的核心亮点在于其广泛的模型兼容性，原生支持 Claude、DeepSeek、GPT-4 及 Gemini Pro 等主流大模型，让用户在一个界面即可自由切换不同 AI 能力。此外，它还率先支持 MCP（Model Context Protocol）协议，增强了上下文处理能力。针对企业用户，NextChat 提供专业版解决方案，具备品牌定制、细粒度权限控制、内部知识库整合及安全审计等功能，满足公司对数据隐私和个性化管理的高标准要求。",87618,"2026-04-05T07:20:52",[13,26],{"id":44,"name":45,"github_repo":46,"description_zh":47,"stars":48,"difficulty_score":23,"last_commit_at":49,"category_tags":50,"status":16},2268,"ML-For-Beginners","microsoft\u002FML-For-Beginners","ML-For-Beginners 是由微软推出的一套系统化机器学习入门课程，旨在帮助零基础用户轻松掌握经典机器学习知识。这套课程将学习路径规划为 12 周，包含 26 节精炼课程和 52 道配套测验，内容涵盖从基础概念到实际应用的完整流程，有效解决了初学者面对庞大知识体系时无从下手、缺乏结构化指导的痛点。\n\n无论是希望转型的开发者、需要补充算法背景的研究人员，还是对人工智能充满好奇的普通爱好者，都能从中受益。课程不仅提供了清晰的理论讲解，还强调动手实践，让用户在循序渐进中建立扎实的技能基础。其独特的亮点在于强大的多语言支持，通过自动化机制提供了包括简体中文在内的 50 多种语言版本，极大地降低了全球不同背景用户的学习门槛。此外，项目采用开源协作模式，社区活跃且内容持续更新，确保学习者能获取前沿且准确的技术资讯。如果你正寻找一条清晰、友好且专业的机器学习入门之路，ML-For-Beginners 将是理想的起点。",84991,"2026-04-05T10:45:23",[14,51,52,53,15,54,26,13,55],"数据工具","视频","插件","其他","音频",{"id":57,"name":58,"github_repo":59,"description_zh":60,"stars":61,"difficulty_score":10,"last_commit_at":62,"category_tags":63,"status":16},3128,"ragflow","infiniflow\u002Fragflow","RAGFlow 
是一款领先的开源检索增强生成（RAG）引擎，旨在为大语言模型构建更精准、可靠的上下文层。它巧妙地将前沿的 RAG 技术与智能体（Agent）能力相结合，不仅支持从各类文档中高效提取知识，还能让模型基于这些知识进行逻辑推理和任务执行。\n\n在大模型应用中，幻觉问题和知识滞后是常见痛点。RAGFlow 通过深度解析复杂文档结构（如表格、图表及混合排版），显著提升了信息检索的准确度，从而有效减少模型“胡编乱造”的现象，确保回答既有据可依又具备时效性。其内置的智能体机制更进一步，使系统不仅能回答问题，还能自主规划步骤解决复杂问题。\n\n这款工具特别适合开发者、企业技术团队以及 AI 研究人员使用。无论是希望快速搭建私有知识库问答系统，还是致力于探索大模型在垂直领域落地的创新者，都能从中受益。RAGFlow 提供了可视化的工作流编排界面和灵活的 API 接口，既降低了非算法背景用户的上手门槛，也满足了专业开发者对系统深度定制的需求。作为基于 Apache 2.0 协议开源的项目，它正成为连接通用大模型与行业专有知识之间的重要桥梁。",77062,"2026-04-04T04:44:48",[15,14,13,26,54],{"id":65,"github_repo":66,"name":67,"description_en":68,"description_zh":69,"ai_summary_zh":70,"readme_en":71,"readme_zh":72,"quickstart_zh":73,"use_case_zh":74,"hero_image_url":75,"owner_login":76,"owner_name":77,"owner_avatar_url":78,"owner_bio":79,"owner_company":80,"owner_location":81,"owner_email":82,"owner_twitter":83,"owner_website":84,"owner_url":85,"languages":86,"stars":87,"forks":88,"last_commit_at":89,"license":86,"difficulty_score":90,"env_os":91,"env_gpu":92,"env_ram":92,"env_deps":93,"category_tags":96,"github_topics":97,"view_count":23,"oss_zip_url":86,"oss_zip_packed_at":86,"status":16,"created_at":117,"updated_at":118,"faqs":119,"releases":120},3029,"zubair-irshad\u002FAwesome-Robotics-3D","Awesome-Robotics-3D","A curated list of 3D Vision papers relating to Robotics domain in the era of large models i.e. 
LLMs\u002FVLMs, inspired by awesome-computer-vision, including papers, codes, and related websites","Awesome-Robotics-3D 是一个专注于大模型时代机器人 3D 视觉领域的精选资源库。它系统性地整理了与机器人技术密切相关的 3D 视觉学术论文、开源代码实现及相关网站，旨在填补该细分领域高质量资料聚合的空白。\n\n随着大型语言模型（LLM）和视觉 - 语言模型（VLM）的兴起，如何让机器人更好地理解和操作三维世界成为研究热点，但相关成果分散且难以追踪。Awesome-Robotics-3D 通过分类梳理，有效解决了研究人员查找前沿文献和技术复现困难的问题。其内容涵盖策略学习（Policy Learning）、预训练方法、多模态大模型应用、3D 表征技术以及仿真数据集等多个核心板块，并收录了如 SAM2Act、3D Diffuser Actor 等结合扩散模型与 3D 场景表示的最新突破性工作。\n\n这份资源特别适合机器人学、计算机视觉领域的研究人员、算法工程师及高校学生使用。无论是希望快速把握\"3D 高斯泼溅在机器人中的应用”等新兴趋势，还是寻找具体的代码基线进行二次开发，用户都能在此获得极具价值的指引。作为一个由社区共同维护的开放项目，它持续更新，是探索智能机器人 3D 感","Awesome-Robotics-3D 是一个专注于大模型时代机器人 3D 视觉领域的精选资源库。它系统性地整理了与机器人技术密切相关的 3D 视觉学术论文、开源代码实现及相关网站，旨在填补该细分领域高质量资料聚合的空白。\n\n随着大型语言模型（LLM）和视觉 - 语言模型（VLM）的兴起，如何让机器人更好地理解和操作三维世界成为研究热点，但相关成果分散且难以追踪。Awesome-Robotics-3D 通过分类梳理，有效解决了研究人员查找前沿文献和技术复现困难的问题。其内容涵盖策略学习（Policy Learning）、预训练方法、多模态大模型应用、3D 表征技术以及仿真数据集等多个核心板块，并收录了如 SAM2Act、3D Diffuser Actor 等结合扩散模型与 3D 场景表示的最新突破性工作。\n\n这份资源特别适合机器人学、计算机视觉领域的研究人员、算法工程师及高校学生使用。无论是希望快速把握\"3D 高斯泼溅在机器人中的应用”等新兴趋势，还是寻找具体的代码基线进行二次开发，用户都能在此获得极具价值的指引。作为一个由社区共同维护的开放项目，它持续更新，是探索智能机器人 3D 感知与决策技术不可或缺的导航图。","# Awesome-Robotics-3D [![Awesome](https:\u002F\u002Fcdn.rawgit.com\u002Fsindresorhus\u002Fawesome\u002Fd7305f38d29fed78fa85652e3a63e154dd8e8829\u002Fmedia\u002Fbadge.svg)](https:\u002F\u002Fgithub.com\u002Fsindresorhus\u002Fawesome) [![Maintenance](https:\u002F\u002Fimg.shields.io\u002Fbadge\u002FMaintained%3F-yes-green.svg)](https:\u002F\u002FGitHub.com\u002FNaereen\u002FStrapDown.js\u002Fgraphs\u002Fcommit-activity) [![PR's Welcome](https:\u002F\u002Fimg.shields.io\u002Fbadge\u002FPRs-welcome-brightgreen.svg?style=flat)](http:\u002F\u002Fmakeapullrequest.com)  \u003Ca href=\"\" target='_blank'>\u003Cimg src=\"https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Fzubair-irshad_Awesome-Robotics-3D_readme_e2f07aa91fb4.png\"> \u003C\u002Fa>\r\n\r\n\u003Cdiv align=\"center\">\r\n    \u003Cimg 
src=\"https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Fzubair-irshad_Awesome-Robotics-3D_readme_12e489837e7c.jpg\" width=\"100%\">\r\n\u003C\u002Fdiv>\r\n\r\n## ✨ About\r\n\r\nThis repo contains a curated list of **3D Vision papers relating to Robotics domain in the era of large models i.e. LLMs\u002FVLMs**, inspired by [awesome-computer-vision](https:\u002F\u002Fgithub.com\u002Fjbhuang0604\u002Fawesome-computer-vision) \u003Cbr>\r\n\r\n#### Please feel free to send me [pull requests](https:\u002F\u002Fgithub.com\u002Fzubair-irshad\u002FAwesome-Robotics-3D\u002Fblob\u002Fmain\u002Fhow-to-PR.md) or [email](mailto:muhammadzubairirshad@gmail.com) to add papers! \r\n\r\nIf you find this repository useful, please consider [citing](#citation) 📝 and STARing ⭐ this list. \r\n\r\nFeel free to share this list with others! List curated and maintained by [Zubair Irshad](https:\u002F\u002Fzubairirshad.com). If you have any questions, please get in touch!\r\n\r\n:fire: Other relevant survey papers:\r\n\r\n* \"Neural Fields in Robotics\", *arXiv, Oct 2024*. [[Paper](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2410.20220)]\r\n\r\n* \"When LLMs step into the 3D World: A Survey and Meta-Analysis of 3D Tasks via Multi-modal Large Language Models\", *arXiv, May 2024*. [[Paper](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2405.10255)]\r\n\r\n* \"3D Gaussian Splatting in Robotics: A Survey\", *arXiv, Oct 2024*. [[Paper](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2410.12262)]\r\n\r\n* \"A Comprehensive Study of 3-D Vision-Based Robot Manipulation\", *TCYB 2021*. 
[[Paper](https:\u002F\u002Fieeexplore.ieee.org\u002Fdocument\u002F9541299)]\r\n\r\n---\r\n## 🏠 Overview\r\n\r\n  - [Policy Learning](#policy-learning)\r\n  - [Pretraining](#pretraining)\r\n  - [VLM and LLM](#vlm-and-llm)\r\n  - [Representations](#representation)\r\n  - [Simulations, Datasets and Benchmarks](#simulations-datasets-and-benchmarks)\r\n  - [Citation](#citation)\r\n \r\n---\r\n\r\n## Policy Learning\r\n\r\n* **SAM2Act**: \"Integrating Visual Foundation Model with a Memory Architecture for Robotic Manipulation\", *ICML 2025*. [[Paper](https:\u002F\u002Farxiv.org\u002Fabs\u002F2501.18564)] [[Webpage](https:\u002F\u002Fsam2act.github.io\u002F)] [[Code](https:\u002F\u002Fgithub.com\u002Fsam2act\u002FSAM2Act)]\r\n\r\n* **3D Diffuser Actor**: \"Policy diffusion with 3d scene representations\", *arXiv Feb 2024*. [[Paper](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2402.10885)] [[Webpage](https:\u002F\u002F3d-diffuser-actor.github.io\u002F)] [[Code](https:\u002F\u002Fgithub.com\u002Fnickgkan\u002F3d_diffuser_actor)]\r\n\r\n* **3D Diffusion Policy**: \"Generalizable Visuomotor Policy Learning via Simple 3D Representations\", *RSS 2024*. [[Paper](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2403.03954)] [[Webpage](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2403.039545)] [[Code](https:\u002F\u002Fgithub.com\u002FYanjieZe\u002F3D-Diffusion-Policy)]\r\n\r\n* **DNAct**: \"Diffusion Guided Multi-Task 3D Policy Learning\", *arXiv Mar 2024*. [[Paper](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2403.04115 )] [[Webpage](https:\u002F\u002Fdnact.github.io\u002F)]\r\n\r\n* **ManiCM**: \"Real-time 3D Diffusion Policy via Consistency Model for Robotic Manipulation\", *arXiv Jun 2024*. 
[[Paper](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2406.01586)] [[Webpage](https:\u002F\u002Fmanicm-fast.github.io\u002F)] [[Code](https:\u002F\u002Fgithub.com\u002FManiCM-fast\u002FManiCM)]\r\n\r\n* **HDP**: \"Hierarchical Diffusion Policy for Kinematics-Aware Multi-Task Robotic Manipulation\", *CVPR 2024*. [[Paper](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2403.03890)] [[Webpage](https:\u002F\u002Fyusufma03.github.io\u002Fprojects\u002Fhdp\u002F)] [[Code](https:\u002F\u002Fgithub.com\u002Fdyson-ai\u002Fhdp)]\r\n\r\n* **Imagination Policy**: \"Using Generative Point Cloud Models for Learning Manipulation Policies\", *arXiv Jun 2024*. [[Paper](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2406.11740)] [[Webpage](https:\u002F\u002Fhaojhuang.github.io\u002Fimagine_page\u002F)]\r\n\r\n* **PCWM**: \"Point Cloud Models Improve Visual Robustness in Robotic Learners\", *ICRA 2024*. [[Paper](https:\u002F\u002Farxiv.org\u002Fabs\u002F2404.18926)] [[Webpage](https:\u002F\u002Fpvskand.github.io\u002Fprojects\u002FPCWM)]\r\n\r\n* **RVT**: \"Generalizable Visuomotor Policy Learning via Simple 3D Representations\", *CORL 2023*. [[Paper](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2306.14896)] [[Webpage](https:\u002F\u002Frobotic-view-transformer.github.io\u002F)] [[Code](https:\u002F\u002Fgithub.com\u002Fnvlabs\u002Frvt)]\r\n\r\n* **Act3D**: \"3D Feature Field Transformers for Multi-Task Robotic Manipulation\", *CORL 2023*. [[Paper](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2306.17817)] [[Webpage](https:\u002F\u002Fact3d.github.io\u002F)] [[Code](https:\u002F\u002Fgithub.com\u002Fzhouxian\u002Fchained-diffuser)]\r\n\r\n* **VIHE**: \"Transformer-Based 3D Object Manipulation Using Virtual In-Hand View\", *arXiv, Mar 2024*. 
[[Paper](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2403.11461)] [[Webpage](https:\u002F\u002Fvihe-3d.github.io\u002F)] [[Code](https:\u002F\u002Fgithub.com\u002Fdoublelei\u002FVIHE.git)]\r\n\r\n* **SGRv2**: \"Leveraging Locality to Boost Sample Efficiency in Robotic Manipulation\", *arXiv, Jun 2024*. [[Paper](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2406.10615)] [[Webpage](https:\u002F\u002Fsgrv2-robot.github.io\u002F)] \r\n\r\n* **Sigma-Agent**: \"Contrastive Imitation Learning for Language-guided Multi-Task Robotic Manipulation\", *arXiv June 2024*. [[Paper](https:\u002F\u002Farxiv.org\u002Fabs\u002F2406.09738)]\r\n\r\n* **RVT-2**: \"Learning Precise Manipulation from Few Demonstrations\", *RSS 2024*. [[Paper](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2406.08545)] [[Webpage](https:\u002F\u002Frobotic-view-transformer-2.github.io\u002F)] [[Code](https:\u002F\u002Fgithub.com\u002Fnvlabs\u002Frvt)]\r\n\r\n* **SAM-E**: \"Leveraging Visual Foundation Model with Sequence Imitation for Embodied Manipulation\", *ICML 2024*. [[Paper](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2405.19586)] [[Webpage](https:\u002F\u002Farxiv.org\u002Fabs\u002F2405.19586)] [[Code](https:\u002F\u002Fgithub.com\u002Fpipixiaqishi1\u002FSAM-E)]\r\n\r\n* **RISE**: \"3D Perception Makes Real-World Robot Imitation Simple and Effective\", *arXiv, Apr 2024*. [[Paper](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2404.12281)] [[Webpage](https:\u002F\u002Fgithub.com\u002Frise-policy\u002Frise)]  [[Code](https:\u002F\u002Fgithub.com\u002Frise-policy\u002FRISE)]\r\n\r\n* **Polarnet**: \"3D Point Clouds for Language-Guided Robotic Manipulation\", *CORL 2023*. [[Paper](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2306.17817)] [[Webpage](https:\u002F\u002Fwww.di.ens.fr\u002Fwillow\u002Fresearch\u002Fpolarnet\u002F)] [[Code](https:\u002F\u002Fgithub.com\u002Fvlc-robot\u002Fpolarnet\u002F)]\r\n\r\n* **Chaineddiffuser**: \"Unifying Trajectory Diffusion and Keypose Prediction for Robotic Manipulation\", *CORL 2023*. 
[[Paper](https:\u002F\u002Fproceedings.mlr.press\u002Fv229\u002Fxian23a\u002Fxian23a.pdf)] [[Webpage](https:\u002F\u002Fchained-diffuser.github.io\u002F)] [[Code](https:\u002F\u002Fgithub.com\u002Fzhouxian\u002Fchained-diffuser)]\r\n\r\n* **Pointcloud_RL**: \"On the Efficacy of 3D Point Cloud Reinforcement Learning\", *arXiv, June 2023*. [[Paper](https:\u002F\u002Farxiv.org\u002Fabs\u002F2306.06799)] [[Code](https:\u002F\u002Fgithub.com\u002Flz1oceani\u002Fpointcloud_rl)]\r\n\r\n* **Perceiver-Actor**: \"A Multi-Task Transformer for Robotic Manipulation\", *CORL 2022*. [[Paper](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2209.05451)] [[Webpage](https:\u002F\u002Fcliport.github.io\u002F)] [[Code](https:\u002F\u002Fgithub.com\u002Fcliport\u002Fcliport)]\r\n\r\n* **CLIPort**: \"What and Where Pathways for Robotic Manipulation\", *CORL 2021*. [[Paper](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2109.12098.pdf)] [[Webpage](https:\u002F\u002Fwww.di.ens.fr\u002Fwillow\u002Fresearch\u002Fpolarnet\u002F)] [[Code](https:\u002F\u002Fgithub.com\u002Fvlc-robot\u002Fpolarnet\u002F)]\r\n\r\n* **Polarnet**: \"3D Point Clouds for Language-Guided Robotic Manipulation\", *CORL 2023*. [[Paper](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2306.17817)] [[Webpage](https:\u002F\u002Fwww.di.ens.fr\u002Fwillow\u002Fresearch\u002Fpolarnet\u002F)] [[Code](https:\u002F\u002Fgithub.com\u002Fvlc-robot\u002Fpolarnet\u002F)]\r\n\r\n\r\n\r\n---\r\n## Pretraining\r\n\r\n* **3D-MVP**: \"3D Multiview Pretraining for Robotic Manipulation\", *arXiv, June 2024*. [[Paper](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2406.18158)] [[Webpage](https:\u002F\u002Fjasonqsy.github.io\u002F3DMVP\u002F)]\r\n\r\n* **DexArt**: \"Benchmarking Generalizable Dexterous Manipulation with Articulated Objects\", *CVPR 2023*. 
[[Paper](https:\u002F\u002Fwww.chenbao.tech\u002Fdexart\u002Fstatic\u002Fpaper\u002Fdexart.pdf)] [[Webpage](https:\u002F\u002Fwww.chenbao.tech\u002Fdexart\u002F)] [[Code](https:\u002F\u002Fgithub.com\u002FKami-code\u002Fdexart-release)]\r\n\r\n* **RoboUniView**: \"Visual-Language Model with Unified View Representation for Robotic Manipulaiton\", *arXiv, Jun 2023*. [[Paper](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2406.18977)] [[Website](https:\u002F\u002Fliufanfanlff.github.io\u002FRoboUniview.github.io\u002F)] [[Code](https:\u002F\u002Fgithub.com\u002Fliufanfanlff\u002FRoboUniviewd)] \r\n\r\n* **SUGAR**: \"Pre-training 3D Visual Representations for Robotics\", *CVPR 2024*. [[Paper](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2404.01491)] [[Webpage](https:\u002F\u002Fcshizhe.github.io\u002Fprojects\u002Frobot_sugar.html)] [[Code](https:\u002F\u002Fgithub.com\u002Fcshizhe\u002Frobot_sugar)]\r\n\r\n* **DPR**: \"Visual Robotic Manipulation with Depth-Aware Pretraining\", *arXiv, Jan 2024*. [[Paper](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2401.09038)]\r\n\r\n* **MV-MWM**: \"Multi-View Masked World Models for Visual Robotic Manipulation\", *ICML 2023*. [[Paper](https:\u002F\u002Fproceedings.mlr.press\u002Fv202\u002Fseo23a\u002Fseo23a.pdf)] [[Code](https:\u002F\u002Fgithub.com\u002Fyounggyoseo\u002FMV-MWM)] \r\n\r\n* **Point Cloud Matters**: \"Rethinking the Impact of Different Observation Spaces on Robot Learning\", *arXiv, Feb 2024*. [[Paper](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2402.02500)] [[Code](https:\u002F\u002Fgithub.com\u002FHaoyiZhu\u002FPointCloudMatters)] \r\n\r\n* **RL3D**: \"Visual Reinforcement Learning with Self-Supervised 3D Representations\", *IROS 2023*. 
[[Paper](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2210.07241)] [[Website](https:\u002F\u002Fgithub.com\u002FYanjieZe\u002Frl3d)] [[Code](https:\u002F\u002Fgithub.com\u002FYanjieZe\u002Frl3d)] \r\n\r\n---\r\n\r\n## VLM and LLM\r\n\r\n* **RoboTracer**: \"Mastering Spatial Trace with Reasoning in Vision-Language Models for Robotics\", *ArXiv 2025*. [[Paper](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2512.13660)] [[Website](https:\u002F\u002Fzhoues.github.io\u002FRoboTracer\u002F)] \r\n\r\n* **RoboRefer**: \"Towards Spatial Referring with Reasoning in Vision-Language Models for Robotics\", *ArXiv 2025*. [[Paper](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2506.04308)] [[Website](https:\u002F\u002Fzhoues.github.io\u002FRoboRefer\u002F)] \r\n\r\n* **AHA**: \"A Vision-Language-Model for Detecting and Reasoning over Failures in Robotic Manipulation\", *ArXiv 2024*. [[Paper](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2410.00371)] [[Website](https:\u002F\u002Faha-vlm.github.io\u002F)] \r\n\r\n* **ShapeLLM**: \"ShapeLLM: Universal 3D Object Understanding for Embodied Interaction\", *ECCV 2024*. [[Paper\u002FPDF](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2402.17766)] [[Code](https:\u002F\u002Fgithub.com\u002Fqizekun\u002FShapeLLM)] [[Website](https:\u002F\u002Fqizekun.github.io\u002Fshapellm\u002F)]\r\n\r\n* **3D-VLA**: \"3D Vision-Language-Action Generative World Model\", *ICML 2024*. [[Paper](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2403.09631)] [[Website](https:\u002F\u002Fvis-www.cs.umass.edu\u002F3dvla\u002F)] [[Code](https:\u002F\u002Fgithub.com\u002FUMass-Foundation-Model\u002F3D-VLA)] \r\n\r\n* **RoboPoint**: \"A Vision-Language Model for Spatial Affordance Prediction for Robotics\", *CORL 2024*. 
[[Paper](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2210.07241)] [[Website](https:\u002F\u002Frobo-point.github.io\u002F)] [[Demo](https:\u002F\u002Frobo-point.github.io\u002F)]\r\n\r\n* **Open6DOR**: \"Benchmarking Open-instruction 6-DoF Object Rearrangement and A VLM-based Approach\", *IROS 2024*. [[Paper](https:\u002F\u002Fopenreview.net\u002Fpdf?id=axAmAy3Ghl)] [[Website](https:\u002F\u002Fpku-epic.github.io\u002FOpen6DOR\u002F)] [[Code](https:\u002F\u002F007e03d34429a2517b.gradio.live\u002F)]\r\n\r\n* **ReasoningGrasp**: \"Reasoning Grasping via Multimodal Large Language Model\", *CORL 2024*. [[Paper](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2402.06798)]\r\n\r\n* **SpatialVLM**: \"Endowing Vision-Language Models with Spatial Reasoning Capabilities\", *CVPR 2024*. [[Paper](https:\u002F\u002Farxiv.org\u002Fabs\u002F2401.12168)] [[Website](https:\u002F\u002Fspatial-vlm.github.io\u002F)] [[Code](https:\u002F\u002Fspatial-vlm.github.io\u002F#community-implementation)]\r\n\r\n* **SpatialRGPT**: \"Grounded Spatial Reasoning in Vision Language Model\", *arXiv, June 2024*. [[Paper](https:\u002F\u002Farxiv.org\u002Fabs\u002F2406.01584)] [[Website](https:\u002F\u002Fwww.anjiecheng.me\u002FSpatialRGPT)]\r\n\r\n* **Scene-LLM**: \"Extending Language Model for 3D Visual Understanding and Reasoning\", *arXiv, Mar 2024*. [[Paper](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2403.11401)] \r\n\r\n* **ManipLLM**: \"Embodied Multimodal Large Language Model for Object-Centric Robotic Manipulation \", *CVPR 2024*. [[Paper](https:\u002F\u002Fopenaccess.thecvf.com\u002Fcontent\u002FCVPR2024\u002Fpapers\u002FLi_ManipLLM_Embodied_Multimodal_Large_Language_Model_for_Object-Centric_Robotic_Manipulation_CVPR_2024_paper.pdf)] [[Website](https:\u002F\u002Fsites.google.com\u002Fview\u002Fmanipllm)] [[Code](https:\u002F\u002Fgithub.com\u002Fclorislili\u002FManipLLM)]\r\n\r\n* **Manipulate-Anything**: \"Manipulate-Anything: Automating Real-World Robots using Vision-Language Models\", *CoRL, 2024*. 
[[Paper](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2406.18915)] [[Website](https:\u002F\u002Frobot-ma.github.io\u002F)]\r\n  \r\n* **MOKA**: \"Open-Vocabulary Robotic Manipulation through Mark-Based Visual Prompting\", *RSS 2024*. [[Paper](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2403.03174)] [[Website](https:\u002F\u002Fmoka-manipulation.github.io\u002F)] [[Code](https:\u002F\u002Fgithub.com\u002Fmoka-manipulation\u002Fmoka)] \r\n\r\n* **Agent3D-Zero**: \"An Agent for Zero-shot 3D Understanding\", *arXIv, Mar 2024*. [[Paper](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2403.11835)] [[Website](https:\u002F\u002Fzhangsha1024.github.io\u002FAgent3D-Zero\u002F)] [[Code](https:\u002F\u002Fgithub.com\u002Fzhangsha1024\u002FAgent3d-zero-code)] \r\n\r\n* **MultiPLY**: \"A Multisensory Object-Centric Embodied Large Language Model in 3D World\", *CVPR 2024*. [[Paper](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2401.08577)] [[Website](https:\u002F\u002Fvis-www.cs.umass.edu\u002Fmultiply\u002F)] [[Code](https:\u002F\u002Fgithub.com\u002FUMass-Foundation-Model\u002FMultiPLY)] \r\n\r\n* **ThinkGrasp**: \"A Vision-Language System for Strategic Part Grasping in Clutter\", *arXiv, Jul 2024*. [[Paper](https:\u002F\u002Farxiv.org\u002Fabs\u002F2407.11298)] [[Website](https:\u002F\u002Fh-freax.github.io\u002Fthinkgrasp_page\u002F)]\r\n\r\n* **VoxPoser**: \"Composable 3D Value Maps for Robotic Manipulation with Language Models\", *CORL 2023*. [[Paper](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2307.05973)] [[Website](https:\u002F\u002Fvoxposer.github.io\u002F)] [[Code](https:\u002F\u002Fgithub.com\u002Fhuangwl18\u002FVoxPoser)] \r\n\r\n* **Dream2Real**: \"Zero-Shot 3D Object Rearrangement with Vision-Language Models\", *ICRA 2024*. 
[[Paper](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2312.04533.pdf)] [[Website](https:\u002F\u002Fwww.robot-learning.uk\u002Fdream2real)] [[Code](https:\u002F\u002Fgithub.com\u002FFlyCole\u002FDream2Real)] \r\n\r\n* **LEO**: \"An Embodied Generalist Agent in 3D World\", *ICML 2024*. [[Paper](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2311.12871)] [[Website](https:\u002F\u002Fembodied-generalist.github.io\u002F)] [[Code](https:\u002F\u002Fgithub.com\u002Fembodied-generalist\u002Fembodied-generalist)] \r\n\r\n* **SpatialPIN**: \"Enhancing Spatial Reasoning Capabilities of Vision-Language Models through Prompting and Interacting 3D Priors\", *arXiv, Mar 2024*. [[Paper](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2403.13438)] [[Website](https:\u002F\u002Fdannymcy.github.io\u002Fzeroshot_task_hallucination\u002F)]\r\n\r\n* **SpatialBot**: \"Precise Spatial Understanding with Vision Language Models\", *arXiv, Jun 2024*. [[Paper](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2406.13642)] [[Code](https:\u002F\u002Fgithub.com\u002FBAAI-DCAI\u002FSpatialBot)]\r\n\r\n* **COME-robot**: \"Closed-Loop Open-Vocabulary Mobile Manipulation with GPT-4V\", *arXiv, Apr 2024*. [[Paper](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2404.10220)] [[Website](https:\u002F\u002Fcome-robot.github.io\u002F)]\r\n\r\n* **3D-LLM**: \"Open-Vocabulary Robotic Manipulation through Mark-Based Visual Prompting\", *Neurips 2023*. [[Paper](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2307.12981)] [[Website](https:\u002F\u002Fvis-www.cs.umass.edu\u002F3dllm\u002F)] [[Code](https:\u002F\u002Fgithub.com\u002FUMass-Foundation-Model\u002F3D-LLM)] \r\n\r\n* **VLMaps**: \"Visual Language Maps for Robot Navigation\", *ICRA 2023*. 
[[Paper](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2210.05714)] [[Website](https:\u002F\u002Fvlmaps.github.io\u002F)] [[Code](https:\u002F\u002Fgithub.com\u002Fvlmaps\u002Fvlmaps.gita)] \r\n\r\n* **MoMa-LLM**: \"Language-Grounded Dynamic Scene Graphs for Interactive Object Search with Mobile Manipulation\", *RA-L 2024*. [[Paper](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2403.08605)] [[Website](http:\u002F\u002Fmoma-llm.cs.uni-freiburg.de\u002F)] [[Code](https:\u002F\u002Fgithub.com\u002Frobot-learning-freiburg\u002FMoMa-LLM)]\r\n\r\n* **LGrasp6D**: \"Language-Driven 6-DoF Grasp Detection Using Negative Prompt Guidance\", *ECCV 2024*. [[Paper](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2407.13842)] [[Website](https:\u002F\u002Fairvlab.github.io\u002Fgrasp-anything\u002F)]\r\n\r\n* **OpenAD**: \"Open-Vocabulary Affordance Detection in 3D Point Clouds\", *IROS 2023*. [[Paper](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2303.02401)] [[Website](https:\u002F\u002Fopenad2023.github.io\u002F)] [[Code](https:\u002F\u002Fgithub.com\u002FFsoft-AIC\u002FOpen-Vocabulary-Affordance-Detection-in-3D-Point-Clouds)]\r\n\r\n* **3DAPNet**: \"Language-Conditioned Affordance-Pose Detection in 3D Point Clouds\", *ICRA 2024*. [[Paper](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2309.10911)] [[Website](https:\u002F\u002F3dapnet.github.io\u002F)] [[Code](https:\u002F\u002Fgithub.com\u002FFsoft-AIC\u002FLanguage-Conditioned-Affordance-Pose-Detection-in-3D-Point-Clouds)]\r\n\r\n* **OpenKD**: \"Open-Vocabulary Affordance Detection using Knowledge Distillation and Text-Point Correlation\", *ICRA 2024*. [[Paper](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2309.10932)] [[Code](https:\u002F\u002Fgithub.com\u002FFsoft-AIC\u002FOpen-Vocabulary-Affordance-Detection-using-Knowledge-Distillation-and-Text-Point-Correlation)]\r\n\r\n* **PARIS3D**: \"Reasoning Based 3D Part Segmentation Using Large Multimodal Model\", *ECCV 2024*. 
[[Paper](https:\u002F\u002Farxiv.org\u002Fabs\u002F2404.03836)] [[Code](https:\u002F\u002Fgithub.com\u002FAmrinKareem\u002FPARIS3D)]\r\n\r\n\r\n---\r\n## Representation\r\n\r\n* **RoVi-Aug**: \"Robot and Viewpoint Augmentation for Cross-Embodiment Robot Learning\", *CORL 2024*. [[Paper](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2409.03403)] [[Webpage](https:\u002F\u002Frovi-aug.github.io\u002F)]\r\n\r\n* **Vista**: \"View-Invariant Policy Learning via Zero-Shot Novel View Synthesis\", *CORL 2024*. [[Paper](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2409.03685)] [[Webpage](https:\u002F\u002Fs-tian.github.io\u002Fprojects\u002Fvista\u002F)] [[Code](https:\u002F\u002Fgithub.com\u002Fs-tian\u002FVISTA)]\r\n\r\n* **GraspSplats**: \"Efficient Manipulation with 3D Feature Splatting\", *CORL 2024*. [[Paper](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2409.02084)] [[Webpage](https:\u002F\u002Fgraspsplats.github.io\u002F)] [[Code](https:\u002F\u002Fgithub.com\u002Fjimazeyu\u002FGraspSplats)]\r\n\r\n* **RAM**: \"Retrieval-Based Affordance Transfer for Generalizable Zero-Shot Robotic Manipulation\", *CORL 2024*. [[Paper](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2407.04689)] [[Webpage](https:\u002F\u002Fyxkryptonite.github.io\u002FRAM\u002F)] [[Code](https:\u002F\u002Fgithub.com\u002FyxKryptonite\u002FRAM_code)]\r\n\r\n* **Language-Embedded Gaussian Splats (LEGS)**: \"Incrementally Building Room-Scale Representations with a Mobile Robot\", *IROS 2024*. [[Paper](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2409.18108)] [[Webpage](https:\u002F\u002Fberkeleyautomation.github.io\u002FLEGS\u002F)]\r\n\r\n* **Splat-MOVER**: \"Multi-Stage, Open-Vocabulary Robotic Manipulation via Editable Gaussian Splatting\", *arXiv May 2024*. [[Paper](https:\u002F\u002Farxiv.org\u002Fabs\u002F2405.04378)] [[Webpage](https:\u002F\u002Fsplatmover.github.io\u002F)]\r\n\r\n* **GNFactor**: \"Multi-Task Real Robot Learning with Generalizable Neural Feature Fields\", *CORL 2023*. 
[[Paper](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2308.16891)] [[Webpage](https:\u002F\u002Fyanjieze.com\u002FGNFactor\u002F)] [[Code](https:\u002F\u002Fgithub.com\u002FYanjieZe\u002FGNFactor)]\r\n\r\n* **ManiGaussian**: \"Dynamic Gaussian Splatting for Multi-task Robotic Manipulation\", *ECCV 2024*. [[Paper](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2403.08321)] [[Webpage](https:\u002F\u002Fguanxinglu.github.io\u002FManiGaussian\u002F)] [[Code](https:\u002F\u002Fgithub.com\u002FGuanxingLu\u002FManiGaussian)]\r\n\r\n* **GaussianGrasper**: \"3D Language Gaussian Splatting for Open-vocabulary Robotic Grasping\", *arXiv Mar 2024*. [[Paper](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2403.09637.pdf)] [[Webpage](https:\u002F\u002Fmrsecant.github.io\u002FGaussianGrasper\u002F)] [[Code](https:\u002F\u002Fgithub.com\u002FMrSecant\u002FGaussianGrasper)]\r\n\r\n* **ORION**: \"Vision-based Manipulation from Single Human Video with Open-World Object Graphs\", *arXiv May 2024*. [[Paper](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2405.20321)] [[Webpage](https:\u002F\u002Fut-austin-rpl.github.io\u002FORION-release)]\r\n\r\n* **ConceptGraphs**: \"Open-Vocabulary 3D Scene Graphs for Perception and Planning\", *ICRA 2024*. [[Paper](http:\u002F\u002Farxiv.org\u002Fpdf\u002F2309.16650)] [[Webpage](https:\u002F\u002Fconcept-graphs.github.io\u002F)] [[Code](https:\u002F\u002Fgithub.com\u002Fconcept-graphs\u002Fconcept-graphs)]\r\n\r\n* **SparseDFF**: \"Sparse-View Feature Distillation for One-Shot Dexterous Manipulation\", *ICLR 2024*. [[Paper](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2310.16838)] [[Webpage](https:\u002F\u002Fhelloqxwang.github.io\u002FSparseDFF\u002F)]\r\n\r\n* **GROOT**: \"Learning Generalizable Manipulation Policies with Object-Centric 3D Representations\", *CORL 2023*. 
[[Paper](http:\u002F\u002Farxiv.org\u002Fabs\u002F2310.14386)] [[Webpage](https:\u002F\u002Fut-austin-rpl.github.io\u002FGROOT\u002F)] [[Code](https:\u002F\u002Fgithub.com\u002FUT-Austin-RPL\u002FGROOT)]\r\n\r\n* **Distilled Feature Fields**: \"Enable Few-Shot Language-Guided Manipulation\", *CORL 2023*. [[Paper](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2308.07931)] [[Webpage](https:\u002F\u002Ff3rm.github.io\u002F)] [[Code](https:\u002F\u002Fgithub.com\u002Ff3rm\u002Ff3rm)]\r\n\r\n* **SGR**: \"A Universal Semantic-Geometric Representation for Robotic Manipulation\", *CORL 2023*. [[Paper](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2306.10474.pdf)] [[Webpage](https:\u002F\u002Fsemantic-geometric-representation.github.io\u002F)] [[Code](https:\u002F\u002Fgithub.com\u002FTongZhangTHU\u002Fsgr)]\r\n\r\n* **OVMM**: \"Open-vocabulary Mobile Manipulation in Unseen Dynamic Environments with 3D Semantic Maps\", *arXiv, Jun 2024*. [[Paper](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2406.18115)] \r\n\r\n* **CLIP-Fields**: \"Weakly Supervised Semantic Fields for Robotic Memory\", *RSS 2023*. [[Paper](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2210.05663)] [[Webpage](https:\u002F\u002Fmahis.life\u002Fclip-fields)] [[Code](https:\u002F\u002Fgithub.com\u002Fnotmahi\u002Fclip-fields)]\r\n\r\n* **NeRF in the Palm of Your Hand**: \"Corrective Augmentation for Robotics via Novel-View Synthesis\", *CVPR 2023*. [[Paper](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2301.08556)] [[Webpage](https:\u002F\u002Fbland.website\u002Fspartn\u002F)]\r\n\r\n* **JCR**: \"Unifying Scene Representation and Hand-Eye Calibration with 3D Foundation Models\", *arXiv, Apr 2024*. [[Paper](https:\u002F\u002Farxiv.org\u002Fabpdfs\u002F2404.11683v1)] [[Code](https:\u002F\u002Fgithub.com\u002Ftomtang502\u002Farm_3d_reconstruction)]\r\n\r\n* **D3Fields**: \"Dynamic 3D Descriptor Fields for Zero-Shot Generalizable Robotic Manipulation\", *arXiv, Sep 2023*. 
[[Paper](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2309.16118)] [[Webpage](https:\u002F\u002Frobopil.github.io\u002Fd3fields\u002F)] [[Code](https:\u002F\u002Fgithub.com\u002FWangYixuan12\u002Fd3fields)]\r\n\r\n* **SayPlan**: \"Grounding Large Language Models using 3D Scene Graphs for Scalable Robot Task Planning\", *CORL 2023*. [[Paper](https:\u002F\u002Fopenreview.net\u002Fpdf?id=wMpOMO0Ss7a)] [[Webpage](https:\u002F\u002Fsayplan.github.io\u002F)]\r\n\r\n* **Dex-NeRF**: \"Using a Neural Radiance field to Grasp Transparent Objects\", *CORL 2021*. [[Paper](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2110.14217)] [[Webpage](https:\u002F\u002Fsites.google.com\u002Fview\u002Fdex-nerf)]\r\n\r\n---\r\n## Simulations, Datasets and Benchmarks\r\n\r\n* **RoboTracer**: \"Mastering Spatial Trace with Reasoning in Vision-Language Models for Robotics\", *ArXiv 2025*. [[Paper](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2512.13660)] [[Website](https:\u002F\u002Fzhoues.github.io\u002FRoboTracer\u002F)] \r\n\r\n* **RoboRefer**: \"Towards Spatial Referring with Reasoning in Vision-Language Models for Robotics\", *ArXiv 2025*. [[Paper](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2506.04308)] [[Website](https:\u002F\u002Fzhoues.github.io\u002FRoboRefer\u002F)] \r\n\r\n* **The Colosseum**: \"A Benchmark for Evaluating Generalization for Robotic Manipulation\", *RSS 2024*. [[Paper](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2402.08191)] [[Website](https:\u002F\u002Frobot-colosseum.github.io\u002F)] [[Code](https:\u002F\u002Fgithub.com\u002Frobot-colosseum\u002Frobot-colosseum)] \r\n\r\n* **OpenEQA**: \"Embodied Question Answering in the Era of Foundation Models\", *CVPR 2024*. [[Paper](https:\u002F\u002Fopen-eqa.github.io\u002Fassets\u002Fpdfs\u002Fpaper.pdf)] [[Website](https:\u002F\u002Fopen-eqa.github.io\u002F)] [[Code](https:\u002F\u002Fgithub.com\u002Ffacebookresearch\u002Fopen-eqa)] \r\n\r\n* **DROID**: \"A Large-Scale In-the-Wild Robot Manipulation Dataset\", *RSS 2024*. 
[[Paper](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2403.12945)] [[Website](https:\u002F\u002Fdroid-dataset.github.io\u002F)] [[Code](https:\u002F\u002Fgithub.com\u002Fdroid-dataset\u002Fdroid)] \r\n\r\n* **RH20T**: \"A Comprehensive Robotic Dataset for Learning Diverse Skills in One-Shot\", *ICRA 2024*. [[Paper](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2307.00595)] [[Website](https:\u002F\u002Frh20t.github.io\u002F)] [[Code](https:\u002F\u002Fgithub.com\u002Frh20t\u002Frh20t_api)] \r\n\r\n* **Gen2Sim**: \"Scaling up Robot Learning in Simulation with Generative Models\", *ICRA 2024*. [[Paper](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2310.18308)] [[Website](https:\u002F\u002Fgen2sim.github.io\u002F)] [[Code](https:\u002F\u002Fgithub.com\u002Fpushkalkatara\u002FGen2Sim)] \r\n\r\n* **BEHAVIOR Vision Suite**: \"Customizable Dataset Generation via Simulation\", *CVPR 2024*. [[Paper](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2405.09546)] [[Website](https:\u002F\u002Fbehavior-vision-suite.github.io\u002F)] [[Code](https:\u002F\u002Fgithub.com\u002Fbehavior-vision-suite\u002Fbehavior-vision-suite.github.io)] \r\n\r\n* **RoboCasa**: \"Large-Scale Simulation of Everyday Tasks for Generalist Robots\", *RSS 2024*. [[Paper](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2406.02523)] [[Website](https:\u002F\u002Frobocasa.ai\u002F)] [[Code](https:\u002F\u002Fgithub.com\u002Frobocasa\u002Frobocasa)] \r\n\r\n* **ARNOLD**: \"ARNOLD: A Benchmark for Language-Grounded Task Learning With Continuous States in Realistic 3D Scenes\", *ICCV 2023*. [[Paper](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2304.04321)] [[Webpage](https:\u002F\u002Farnold-benchmark.github.io\u002F)] [[Code](https:\u002F\u002Fgithub.com\u002Farnold-benchmark\u002Farnold)]\r\n\r\n* **VIMA**: \"General Robot Manipulation with Multimodal Prompts\", *ICML 2023*. 
[[Paper](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2210.03094)] [[Website](https:\u002F\u002Fvimalabs.github.io\u002F)] [[Code](https:\u002F\u002Fgithub.com\u002Fvimalabs\u002FVIMA)] \r\n\r\n* **ManiSkill2**: \"A Unified Benchmark for Generalizable Manipulation Skills\", *ICLR 2023*. [[Paper](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2302.04659)] [[Website](https:\u002F\u002Fmaniskill2.github.io\u002F)] [[Code](https:\u002F\u002Fgithub.com\u002Fhaosulab\u002FManiSkill2)] \r\n\r\n* **Robo360**: \"A 3D Omnispective Multi-Material Robotic Manipulation Dataset\", *arXiv, Dec 2023*. [[Paper](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2312.06686)]\r\n\r\n* **AR2-D2**: \"Training a Robot Without a Robot\", *CORL 2023*. [[Paper](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2306.13818)] [[Website](https:\u002F\u002Far2d2.site\u002F)] [[Code](https:\u002F\u002Fgithub.com\u002Fjiafei1224\u002FAR2-D2)] \r\n\r\n* **Habitat 2.0**: \"Training Home Assistants to Rearrange their Habitat\", *NeurIPS 2021*. [[Paper](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2106.14405)] [[Website](https:\u002F\u002Faihabitat.org\u002Fdocs\u002Fhabitat2\u002F)] [[Code](https:\u002F\u002Fgithub.com\u002Ffacebookresearch\u002Fhabitat-lab)] \r\n\r\n* **VL-Grasp**: \"a 6-Dof Interactive Grasp Policy for Language-Oriented Objects in Cluttered Indoor Scenes\", *IROS 2023*. [[Paper](https:\u002F\u002Fieeexplore.ieee.org\u002Fdocument\u002F10341379)] [[Code](https:\u002F\u002Fgithub.com\u002Fluyh20\u002Fvl-grasp)] \r\n\r\n* **OCID-Ref**: \"A 3D Robotic Dataset with Embodied Language for Clutter Scene Grounding\", *NAACL 2021*. [[Paper](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2103.07679)] [[Code](https:\u002F\u002Fgithub.com\u002Flluma\u002FOCID-Ref)]  \r\n\r\n* **ManipulaTHOR**: \"A Framework for Visual Object Manipulation\", *CVPR 2021*. 
[[Paper](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2104.11213.pdf)] [[Website](https:\u002F\u002Fai2thor.allenai.org\u002Fmanipulathor\u002F)] [[Code](https:\u002F\u002Fai2thor.allenai.org\u002Fmanipulathor\u002F)] \r\n\r\n* **RoboTHOR**: \"An Open Simulation-to-Real Embodied AI Platform\", *CVPR 2020*. [[Paper](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2004.06799)] [[Website](https:\u002F\u002Fai2thor.allenai.org\u002Frobothor\u002F)] [[Code](https:\u002F\u002Fgithub.com\u002Fallenai\u002Fai2thor)]\r\n\r\n* **HabiCrowd**: \"HabiCrowd: A High Performance Simulator for Crowd-Aware Visual Navigation\", *IROS 2024*. [[Paper](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2306.11377)] [[Website](https:\u002F\u002Fhabicrowd.github.io\u002F)] [[Code](https:\u002F\u002Fgithub.com\u002FFsoft-AIC\u002FHabiCrowd)]\r\n\r\n----\r\n\r\n[![Star History Chart](https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Fzubair-irshad_Awesome-Robotics-3D_readme_a72dbe8072cf.png)](https:\u002F\u002Fstar-history.com\u002F#zubair-irshad\u002FAwesome-Robotics-3D&Date)\r\n\r\n----\r\n## Citation\r\nIf you find this repository useful, please consider citing this list:\r\n```\r\n@misc{irshad2024roboticd3D,\r\n    title = {Awesome Robotics 3D - A curated list of resources on 3D vision papers relating to robotics},\r\n    author = {Muhammad Zubair Irshad},\r\n    journal = {GitHub repository},\r\n    url = {https:\u002F\u002Fgithub.com\u002Fzubair-irshad\u002FAwesome-Robotics-3D},\r\n    year = {2024},\r\n}\r\n```\r\n","# Awesome-Robotics-3D [![Awesome](https:\u002F\u002Fcdn.rawgit.com\u002Fsindresorhus\u002Fawesome\u002Fd7305f38d29fed78fa85652e3a63e154dd8e8829\u002Fmedia\u002Fbadge.svg)](https:\u002F\u002Fgithub.com\u002Fsindresorhus\u002Fawesome) [![维护中](https:\u002F\u002Fimg.shields.io\u002Fbadge\u002FMaintained%3F-yes-green.svg)](https:\u002F\u002FGitHub.com\u002FNaereen\u002FStrapDown.js\u002Fgraphs\u002Fcommit-activity) 
[![欢迎提交PR](https:\u002F\u002Fimg.shields.io\u002Fbadge\u002FPRs-welcome-brightgreen.svg?style=flat)](http:\u002F\u002Fmakeapullrequest.com)  \u003Ca href=\"\" target='_blank'>\u003Cimg src=\"https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Fzubair-irshad_Awesome-Robotics-3D_readme_e2f07aa91fb4.png\"> \u003C\u002Fa>\n\n\u003Cdiv align=\"center\">\n    \u003Cimg src=\"https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Fzubair-irshad_Awesome-Robotics-3D_readme_12e489837e7c.jpg\" width=\"100%\">\n\u003C\u002Fdiv>\n\n## ✨ 关于\n\n本仓库收录了一份精心整理的**与机器人领域相关的3D视觉论文清单，聚焦于大模型时代（即LLMs\u002FVLMs）**，灵感来源于[awesome-computer-vision](https:\u002F\u002Fgithub.com\u002Fjbhuang0604\u002Fawesome-computer-vision)。\u003Cbr>\n\n#### 欢迎随时向我发送[拉取请求](https:\u002F\u002Fgithub.com\u002Fzubair-irshad\u002FAwesome-Robotics-3D\u002Fblob\u002Fmain\u002Fhow-to-PR.md)或[邮件](mailto:muhammadzubairirshad@gmail.com)，以添加新的论文！ \n\n如果您觉得本仓库有所帮助，请考虑[引用](#citation)并为这份列表点个赞⭐。 \n\n也欢迎您将此列表分享给他人！ 该列表由[Zubair Irshad](https:\u002F\u002Fzubairirshad.com)整理并维护。 如有任何问题，欢迎随时联系！\n\n:fire: 其他相关综述论文：\n\n* “机器人中的神经场”，*arXiv，2024年10月*。[[论文](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2410.20220)]\n\n* “当LLM步入3D世界：基于多模态大型语言模型的3D任务综述与元分析”，*arXiv，2024年5月*。[[论文](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2405.10255)]\n\n* “机器人领域的3D高斯泼溅技术综述”，*arXiv，2024年10月*。[[论文](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2410.12262)]\n\n* “基于3D视觉的机器人操作综合研究”，*TCYB 2021*。[[论文](https:\u002F\u002Fieeexplore.ieee.org\u002Fdocument\u002F9541299)]\n\n---\n## 🏠 概览\n\n  - [策略学习](#policy-learning)\n  - [预训练](#pretraining)\n  - [VLM与LLM](#vlm-and-llm)\n  - [表示方法](#representation)\n  - [仿真、数据集与基准测试](#simulations-datasets-and-benchmarks)\n  - [引用](#citation)\n \n---\n\n## 策略学习\n\n* **SAM2Act**: “将视觉基础模型与记忆架构结合用于机器人操作”，*ICML 2025*。[[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2501.18564)] [[网页](https:\u002F\u002Fsam2act.github.io\u002F)] [[代码](https:\u002F\u002Fgithub.com\u002Fsam2act\u002FSAM2Act)]\n\n* **3D Diffuser Actor**: “基于3D场景表示的策略扩散”，*arXiv 
2024年2月*。[[论文](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2402.10885)] [[网页](https:\u002F\u002F3d-diffuser-actor.github.io\u002F)] [[代码](https:\u002F\u002Fgithub.com\u002Fnickgkan\u002F3d_diffuser_actor)]\n\n* **3D Diffusion Policy**: “通过简单的3D表示实现可泛化的视觉—运动策略学习”，*RSS 2024*。[[论文](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2403.03954)] [[网页](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2403.039545)] [[代码](https:\u002F\u002Fgithub.com\u002FYanjieZe\u002F3D-Diffusion-Policy)]\n\n* **DNAct**: “扩散引导的多任务3D策略学习”，*arXiv 2024年3月*。[[论文](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2403.04115)] [[网页](https:\u002F\u002Fdnact.github.io\u002F)]\n\n* **ManiCM**: “基于一致性模型的实时3D扩散策略，用于机器人操作”，*arXiv 2024年6月*。[[论文](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2406.01586)] [[网页](https:\u002F\u002Fmanicm-fast.github.io\u002F)] [[代码](https:\u002F\u002Fgithub.com\u002FManiCM-fast\u002FManiCM)]\n\n* **HDP**: “面向运动学感知的多任务机器人操作的层次化扩散策略”，*CVPR 2024*。[[论文](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2403.03890)] [[网页](https:\u002F\u002Fyusufma03.github.io\u002Fprojects\u002Fhdp\u002F)] [[代码](https:\u002F\u002Fgithub.com\u002Fdyson-ai\u002Fhdp)]\n\n* **Imagination Policy**: “利用生成式点云模型学习操作策略”，*arXiv 2024年6月*。[[论文](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2406.11740)] [[网页](https:\u002F\u002Fhaojhuang.github.io\u002Fimagine_page\u002F)]\n\n* **PCWM**: “点云模型提升机器人学习者的视觉鲁棒性”，*ICRA 2024*。[[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2404.18926)] [[网页](https:\u002F\u002Fpvskand.github.io\u002Fprojects\u002FPCWM)]\n\n* **RVT**: “通过简单的3D表示实现可泛化的视觉—运动策略学习”，*CORL 2023*。[[论文](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2306.14896)] [[网页](https:\u002F\u002Frobotic-view-transformer.github.io\u002F)] [[代码](https:\u002F\u002Fgithub.com\u002Fnvlabs\u002Frvt)]\n\n* **Act3D**: “用于多任务机器人操作的3D特征场变换器”，*CORL 2023*。[[论文](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2306.17817)] [[网页](https:\u002F\u002Fact3d.github.io\u002F)] [[代码](https:\u002F\u002Fgithub.com\u002Fzhouxian\u002Fchained-diffuser)]\n\n* **VIHE**: 
“基于Transformer的虚拟手内视角3D物体操作”，*arXiv 2024年3月*。[[论文](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2403.11461)] [[网页](https:\u002F\u002Fvihe-3d.github.io\u002F)] [[代码](https:\u002F\u002Fgithub.com\u002Fdoublelei\u002FVIHE.git)]\n\n* **SGRv2**: “利用局部性提升机器人操作中的样本效率”，*arXiv 2024年6月*。[[论文](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2406.10615)] [[网页](https:\u002F\u002Fsgrv2-robot.github.io\u002F)]\n\n* **Sigma-Agent**: “面向语言指导的多任务机器人操作的对比模仿学习”，*arXiv 2024年6月*。[[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2406.09738)]\n\n* **RVT-2**: “从少量示范中学习精确操作”，*RSS 2024*。[[论文](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2406.08545)] [[网页](https:\u002F\u002Frobotic-view-transformer-2.github.io\u002F)] [[代码](https:\u002F\u002Fgithub.com\u002Fnvlabs\u002Frvt)]\n\n* **SAM-E**: “利用视觉基础模型结合序列模仿进行具身操作”，*ICML 2024*。[[论文](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2405.19586)] [[网页](https:\u002F\u002Farxiv.org\u002Fabs\u002F2405.19586)] [[代码](https:\u002F\u002Fgithub.com\u002Fpipixiaqishi1\u002FSAM-E)]\n\n* **RISE**: “3D感知使现实世界机器人模仿简单而有效”，*arXiv 2024年4月*。[[论文](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2404.12281)] [[网页](https:\u002F\u002Fgithub.com\u002Frise-policy\u002Frise)] [[代码](https:\u002F\u002Fgithub.com\u002Frise-policy\u002FRISE)]\n\n* **Polarnet**: “用于语言指导机器人操作的3D点云”，*CORL 2023*。[[论文](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2306.17817)] [[网页](https:\u002F\u002Fwww.di.ens.fr\u002Fwillow\u002Fresearch\u002Fpolarnet\u002F)] [[代码](https:\u002F\u002Fgithub.com\u002Fvlc-robot\u002Fpolarnet\u002F)]\n\n* **Chaineddiffuser**: “统一轨迹扩散与关键姿态预测用于机器人操作”，*CORL 2023*。[[论文](https:\u002F\u002Fproceedings.mlr.press\u002Fv229\u002Fxian23a\u002Fxian23a.pdf)] [[网页](https:\u002F\u002Fchained-diffuser.github.io\u002F)] [[代码](https:\u002F\u002Fgithub.com\u002Fzhouxian\u002Fchained-diffuser)]\n\n* **Pointcloud_RL**: “关于3D点云强化学习的有效性”，*arXiv 2023年6月*。[[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2306.06799)] [[代码](https:\u002F\u002Fgithub.com\u002Flz1oceani\u002Fpointcloud_rl)]\n\n* **Perceiver-Actor**: 
“用于机器人操作的多任务Transformer”，*CORL 2022*。[[论文](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2209.05451)] [[网页](https:\u002F\u002Fcliport.github.io\u002F)] [[代码](https:\u002F\u002Fgithub.com\u002Fcliport\u002Fcliport)]\n\n* **CLIPort**: “机器人操作中的‘什么’和‘哪里’路径”，*CORL 2021*。[[论文](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2109.12098.pdf)] [[网页](https:\u002F\u002Fwww.di.ens.fr\u002Fwillow\u002Fresearch\u002Fpolarnet\u002F)] [[代码](https:\u002F\u002Fgithub.com\u002Fvlc-robot\u002Fpolarnet\u002F)]\n\n* **Polarnet**: “用于语言指导机器人操作的3D点云”，*CORL 2023*。[[论文](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2306.17817)] [[网页](https:\u002F\u002Fwww.di.ens.fr\u002Fwillow\u002Fresearch\u002Fpolarnet\u002F)] [[代码](https:\u002F\u002Fgithub.com\u002Fvlc-robot\u002Fpolarnet\u002F)]\n\n\n\n---\n## 预训练\n\n* **3D-MVP**: “用于机器人操作的3D多视角预训练”，*arXiv 2024年6月*。[[论文](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2406.18158)] [[网页](https:\u002F\u002Fjasonqsy.github.io\u002F3DMVP\u002F)]\n\n* **DexArt**: “使用关节物体基准测试可泛化的灵巧操作”，*CVPR 2023*。[[论文](https:\u002F\u002Fwww.chenbao.tech\u002Fdexart\u002Fstatic\u002Fpaper\u002Fdexart.pdf)] [[网页](https:\u002F\u002Fwww.chenbao.tech\u002Fdexart\u002F)] [[代码](https:\u002F\u002Fgithub.com\u002FKami-code\u002Fdexart-release)]\n\n* **RoboUniView**: “具有统一视图表示的视觉—语言模型用于机器人操作”，*arXiv 2023年6月*。[[论文](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2406.18977)] [[网站](https:\u002F\u002Fliufanfanlff.github.io\u002FRoboUniview.github.io\u002F)] [[代码](https:\u002F\u002Fgithub.com\u002Fliufanfanlff\u002FRoboUniviewd)]\n\n* **SUGAR**: “为机器人技术预训练3D视觉表示”，*CVPR 2024*。[[论文](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2404.01491)] [[网页](https:\u002F\u002Fcshizhe.github.io\u002Fprojects\u002Frobot_sugar.html)] [[代码](https:\u002F\u002Fgithub.com\u002Fcshizhe\u002Frobot_sugar)]\n\n* **DPR**: “带有深度感知预训练的视觉机器人操作”，*arXiv 2024年1月*。[[论文](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2401.09038)]\n\n* **MV-MWM**: “用于视觉机器人操作的多视角掩码世界模型”，*ICML 
2023*。[[论文](https:\u002F\u002Fproceedings.mlr.press\u002Fv202\u002Fseo23a\u002Fseo23a.pdf)] [[代码](https:\u002F\u002Fgithub.com\u002Fyounggyoseo\u002FMV-MWM)]\n\n* **Point Cloud Matters**: “重新思考不同观测空间对机器人学习的影响”，*arXiv 2024年2月*。[[论文](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2402.02500)] [[代码](https:\u002F\u002Fgithub.com\u002FHaoyiZhu\u002FPointCloudMatters)]\n\n* **RL3D**: “带有自监督3D表示的视觉强化学习”，*IROS 2023*。[[论文](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2210.07241)] [[网站](https:\u002F\u002Fgithub.com\u002FYanjieZe\u002Frl3d)] [[代码](https:\u002F\u002Fgithub.com\u002FYanjieZe\u002Frl3d)] \n\n---\n\n## 视觉语言模型与大语言模型\n\n* **RoboTracer**：“在机器人领域，通过视觉-语言模型中的推理掌握空间轨迹”，*ArXiv 2025*。[[论文](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2512.13660)] [[网站](https:\u002F\u002Fzhoues.github.io\u002FRoboTracer\u002F)]\n\n* **RoboRefer**：“面向机器人的视觉-语言模型中基于推理的空间指代”，*ArXiv 2025*。[[论文](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2506.04308)] [[网站](https:\u002F\u002Fzhoues.github.io\u002FRoboRefer\u002F)]\n\n* **AHA**：“用于检测和推理机器人操作中故障的视觉-语言模型”，*ArXiv 2024*。[[论文](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2410.00371)] [[网站](https:\u002F\u002Faha-vlm.github.io\u002F)]\n\n* **ShapeLLM**：“ShapeLLM：面向具身交互的通用3D物体理解”，*ECCV 2024*。[[论文\u002FPDF](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2402.17766)] [[代码](https:\u002F\u002Fgithub.com\u002Fqizekun\u002FShapeLLM)] [[网站](https:\u002F\u002Fqizekun.github.io\u002Fshapellm\u002F)]\n\n* **3D-VLA**：“3D视觉-语言-动作生成式世界模型”，*ICML 2024*。[[论文](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2403.09631)] [[网站](https:\u002F\u002Fvis-www.cs.umass.edu\u002F3dvla\u002F)] [[代码](https:\u002F\u002Fgithub.com\u002FUMass-Foundation-Model\u002F3D-VLA)]\n\n* **RoboPoint**：“用于机器人空间可用性预测的视觉-语言模型”，*CORL 2024*。[[论文](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2210.07241)] [[网站](https:\u002F\u002Frobo-point.github.io\u002F)] [[演示](https:\u002F\u002Frobo-point.github.io\u002F)]\n\n* **Open6DOR**：“开放指令下6自由度物体重排的基准测试及基于VLM的方法”，*IROS 
2024*。[[论文](https:\u002F\u002Fopenreview.net\u002Fpdf?id=axAmAy3Ghl)] [[网站](https:\u002F\u002Fpku-epic.github.io\u002FOpen6DOR\u002F)] [[代码](https:\u002F\u002F007e03d34429a2517b.gradio.live\u002F)]\n\n* **ReasoningGrasp**：“通过多模态大语言模型进行推理抓取”，*CORL 2024*。[[论文](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2402.06798)]\n\n* **SpatialVLM**：“赋予视觉-语言模型空间推理能力”，*CVPR 2024*。[[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2401.12168)] [[网站](https:\u002F\u002Fspatial-vlm.github.io\u002F)] [[代码](https:\u002F\u002Fspatial-vlm.github.io\u002F#community-implementation)]\n\n* **SpatialRGPT**：“视觉语言模型中的 grounded空间推理”，*arXiv，2024年6月*。[[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2406.01584)] [[网站](https:\u002F\u002Fwww.anjiecheng.me\u002FSpatialRGPT)]\n\n* **Scene-LLM**：“扩展语言模型以实现3D视觉理解和推理”，*arXiv，2024年3月*。[[论文](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2403.11401)]\n\n* **ManipLLM**：“面向以物体为中心的机器人操作的具身多模态大语言模型”，*CVPR 2024*。[[论文](https:\u002F\u002Fopenaccess.thecvf.com\u002Fcontent\u002FCVPR2024\u002Fpapers\u002FLi_ManipLLM_Embodied_Multimodal_Large_Language_Model_for_Object-Centric_Robotic_Manipulation_CVPR_2024_paper.pdf)] [[网站](https:\u002F\u002Fsites.google.com\u002Fview\u002Fmanipllm)] [[代码](https:\u002F\u002Fgithub.com\u002Fclorislili\u002FManipLLM)]\n\n* **Manipulate-Anything**：“Manipulate-Anything：利用视觉-语言模型自动化现实世界机器人”，*CoRL，2024年*。[[论文](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2406.18915)] [[网站](https:\u002F\u002Frobot-ma.github.io\u002F)]\n\n* **MOKA**：“通过基于标记的视觉提示实现开放词汇的机器人操作”，*RSS 2024*。[[论文](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2403.03174)] [[网站](https:\u002F\u002Fmoka-manipulation.github.io\u002F)] [[代码](https:\u002F\u002Fgithub.com\u002Fmoka-manipulation\u002Fmoka)]\n\n* **Agent3D-Zero**：“用于零样本3D理解的智能体”，*arXiv，2024年3月*。[[论文](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2403.11835)] [[网站](https:\u002F\u002Fzhangsha1024.github.io\u002FAgent3D-Zero\u002F)] [[代码](https:\u002F\u002Fgithub.com\u002Fzhangsha1024\u002FAgent3d-zero-code)]\n\n* 
**MultiPLY**：“3D世界中以物体为中心的多感官具身大语言模型”，*CVPR 2024*。[[论文](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2401.08577)] [[网站](https:\u002F\u002Fvis-www.cs.umass.edu\u002Fmultiply\u002F)] [[代码](https:\u002F\u002Fgithub.com\u002FUMass-Foundation-Model\u002FMultiPLY)]\n\n* **ThinkGrasp**：“用于杂乱环境中战略性部分抓取的视觉-语言系统”，*arXiv，2024年7月*。[[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2407.11298)] [[网站](https:\u002F\u002Fh-freax.github.io\u002Fthinkgrasp_page\u002F)]\n\n* **VoxPoser**：“结合语言模型的可组合3D价值地图用于机器人操作”，*CORL 2023*。[[论文](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2307.05973)] [[网站](https:\u002F\u002Fvoxposer.github.io\u002F)] [[代码](https:\u002F\u002Fgithub.com\u002Fhuangwl18\u002FVoxPoser)]\n\n* **Dream2Real**：“利用视觉-语言模型实现零样本3D物体重排”，*ICRA 2024*。[[论文](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2312.04533.pdf)] [[网站](https:\u002F\u002Fwww.robot-learning.uk\u002Fdream2real)] [[代码](https:\u002F\u002Fgithub.com\u002FFlyCole\u002FDream2Real)]\n\n* **LEO**：“3D世界中的具身通用智能体”，*ICML 2024*。[[论文](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2311.12871)] [[网站](https:\u002F\u002Fembodied-generalist.github.io\u002F)] [[代码](https:\u002F\u002Fgithub.com\u002Fembodied-generalist\u002Fembodied-generalist)]\n\n* **SpatialPIN**：“通过提示和交互式3D先验增强视觉-语言模型的空间推理能力”，*arXiv，2024年3月*。[[论文](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2403.13438)] [[网站](https:\u002F\u002Fdannymcy.github.io\u002Fzeroshot_task_hallucination\u002F)]\n\n* **SpatialBot**：“借助视觉语言模型实现精确的空间理解”，*arXiv，2024年6月*。[[论文](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2406.13642)] [[代码](https:\u002F\u002Fgithub.com\u002FBAAI-DCAI\u002FSpatialBot)]\n\n* **COME-robot**：“使用GPT-4V实现闭环开放词汇移动操作”，*arXiv，2024年4月*。[[论文](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2404.10220)] [[网站](https:\u002F\u002Fcome-robot.github.io\u002F)]\n\n* **3D-LLM**：“通过基于标记的视觉提示实现开放词汇的机器人操作”，*NeurIPS 2023*。[[论文](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2307.12981)] [[网站](https:\u002F\u002Fvis-www.cs.umass.edu\u002F3dllm\u002F)] 
[[代码](https:\u002F\u002Fgithub.com\u002FUMass-Foundation-Model\u002F3D-LLM)]\n\n* **VLMaps**：“用于机器人导航的视觉语言地图”，*ICRA 2023*。[[论文](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2210.05714)] [[网站](https:\u002F\u002Fvlmaps.github.io\u002F)] [[代码](https:\u002F\u002Fgithub.com\u002Fvlmaps\u002Fvlmaps.gita)]\n\n* **MoMa-LLM**：“基于语言的动态场景图，用于移动操作中的交互式物体搜索”，*RA-L 2024*。[[论文](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2403.08605)] [[网站](http:\u002F\u002Fmoma-llm.cs.uni-freiburg.de\u002F)] [[代码](https:\u002F\u002Fgithub.com\u002Frobot-learning-freiburg\u002FMoMa-LLM)]\n\n* **LGrasp6D**：“利用负向提示引导的语言驱动6自由度抓取检测”，*ECCV 2024*。[[论文](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2407.13842)] [[网站](https:\u002F\u002Fairvlab.github.io\u002Fgrasp-anything\u002F)]\n\n* **OpenAD**：“3D点云中的开放词汇可用性检测”，*IROS 2023*。[[论文](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2303.02401)] [[网站](https:\u002F\u002Fopenad2023.github.io\u002F)] [[代码](https:\u002F\u002Fgithub.com\u002FFsoft-AIC\u002FOpen-Vocabulary-Affordance-Detection-in-3D-Point-Clouds)]\n\n* **3DAPNet**：“3D点云中基于语言条件的可用性-姿态检测”，*ICRA 2024*。[[论文](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2309.10911)] [[网站](https:\u002F\u002F3dapnet.github.io\u002F)] [[代码](https:\u002F\u002Fgithub.com\u002FFsoft-AIC\u002FLanguage-Conditioned-Affordance-Pose-Detection-in-3D-Point-Clouds)]\n\n* **OpenKD**：“利用知识蒸馏和文本-点相关性进行开放词汇可用性检测”，*ICRA 2024*。[[论文](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2309.10932)] [[代码](https:\u002F\u002Fgithub.com\u002FFsoft-AIC\u002FOpen-Vocabulary-Affordance-Detection-using-Knowledge-Distillation-and-Text-Point-Correlation)]\n\n* **PARIS3D**：“基于推理的大规模多模态模型进行3D部件分割”，*ECCV 2024*。[[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2404.03836)] [[代码](https:\u002F\u002Fgithub.com\u002FAmrinKareem\u002FPARIS3D)]\n\n\n---\n\n## 表示方法\r\n\r\n* **RoVi-Aug**：“用于跨具身机器人学习的机器人与视点增强”，*CORL 2024*。[[论文](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2409.03403)] [[网页](https:\u002F\u002Frovi-aug.github.io\u002F)]\r\n\r\n* **Vista**：“通过零样本新视图合成实现视图不变策略学习”，*CORL 
2024*。[[论文](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2409.03685)] [[网页](https:\u002F\u002Fs-tian.github.io\u002Fprojects\u002Fvista\u002F)] [[代码](https:\u002F\u002Fgithub.com\u002Fs-tian\u002FVISTA)]\r\n\r\n* **GraspSplats**：“基于3D特征泼溅的高效操作”，*CORL 2024*。[[论文](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2409.02084)] [[网页](https:\u002F\u002Fgraspsplats.github.io\u002F)] [[代码](https:\u002F\u002Fgithub.com\u002Fjimazeyu\u002FGraspSplats)]\r\n\r\n* **RAM**：“基于检索的可及性迁移，用于可泛化的零样本机器人操作”，*CORL 2024*。[[论文](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2407.04689)] [[网页](https:\u002F\u002Fyxkryptonite.github.io\u002FRAM\u002F)] [[代码](https:\u002F\u002Fgithub.com\u002FyxKryptonite\u002FRAM_code)]\r\n\r\n* **语言嵌入高斯泼溅（LEGS）**：“利用移动机器人增量构建房间尺度表示”，*IROS 2024*。[[论文](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2409.18108)] [[网页](https:\u002F\u002Fberkeleyautomation.github.io\u002FLEGS\u002F)]\r\n\r\n* **Splat-MOVER**：“通过可编辑的高斯泼溅实现多阶段、开放词汇的机器人操作”，*arXiv 2024年5月*。[[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2405.04378)] [[网页](https:\u002F\u002Fsplatmover.github.io\u002F)]\r\n\r\n* **GNFactor**：“使用可泛化的神经特征场进行多任务真实机器人学习”，*CORL 2023*。[[论文](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2308.16891)] [[网页](https:\u002F\u002Fyanjieze.com\u002FGNFactor\u002F)] [[代码](https:\u002F\u002Fgithub.com\u002FYanjieZe\u002FGNFactor)]\r\n\r\n* **ManiGaussian**：“用于多任务机器人操作的动态高斯泼溅”，*ECCV 2024*。[[论文](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2403.08321)] [[网页](https:\u002F\u002Fguanxinglu.github.io\u002FManiGaussian\u002F)] [[代码](https:\u002F\u002Fgithub.com\u002FGuanxingLu\u002FManiGaussian)]\r\n\r\n* **GaussianGrasper**：“用于开放词汇机器人抓取的3D语言高斯泼溅”，*arXiv 2024年3月*。[[论文](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2403.09637.pdf)] [[网页](https:\u002F\u002Fmrsecant.github.io\u002FGaussianGrasper\u002F)] [[代码](https:\u002F\u002Fgithub.com\u002FMrSecant\u002FGaussianGrasper)]\r\n\r\n* **ORION**：“基于视觉的单人视频操作，结合开放世界物体图”，*arXiv 2024年5月*。[[论文](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2405.20321)] 
[[网页](https:\u002F\u002Fut-austin-rpl.github.io\u002FORION-release)]\r\n\r\n* **ConceptGraphs**：“用于感知和规划的开放词汇3D场景图”，*ICRA 2024*。[[论文](http:\u002F\u002Farxiv.org\u002Fpdf\u002F2309.16650)] [[网页](https:\u002F\u002Fconcept-graphs.github.io\u002F)] [[代码](https:\u002F\u002Fgithub.com\u002Fconcept-graphs\u002Fconcept-graphs)]\r\n\r\n* **SparseDFF**：“稀疏视角特征蒸馏，用于一次完成灵巧操作”，*ICLR 2024*。[[论文](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2310.16838)] [[网页](https:\u002F\u002Fhelloqxwang.github.io\u002FSparseDFF\u002F)]\r\n\r\n* **GROOT**：“利用以对象为中心的3D表示学习可泛化的操作策略”，*CORL 2023*。[[论文](http:\u002F\u002Farxiv.org\u002Fabs\u002F2310.14386)] [[网页](https:\u002F\u002Fut-austin-rpl.github.io\u002FGROOT\u002F)] [[代码](https:\u002F\u002Fgithub.com\u002FUT-Austin-RPL\u002FGROOT)]\r\n\r\n* **蒸馏特征场**：“实现少量示例下的语言引导操作”，*CORL 2023*。[[论文](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2308.07931)] [[网页](https:\u002F\u002Ff3rm.github.io\u002F)] [[代码](https:\u002F\u002Fgithub.com\u002Ff3rm\u002Ff3rm)]\r\n\r\n* **SGR**：“一种用于机器人操作的通用语义—几何表示”，*CORL 2023*。[[论文](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2306.10474.pdf)] [[网页](https:\u002F\u002Fsemantic-geometric-representation.github.io\u002F)] [[代码](https:\u002F\u002Fgithub.com\u002FTongZhangTHU\u002Fsgr)]\r\n\r\n* **OVMM**：“在未见的动态环境中，利用3D语义地图实现开放词汇的移动操作”，*arXiv，2024年6月*。[[论文](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2406.18115)]\r\n\r\n* **CLIP-Fields**：“用于机器人记忆的弱监督语义场”，*RSS 2023*。[[论文](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2210.05663)] [[网页](https:\u002F\u002Fmahis.life\u002Fclip-fields)] [[代码](https:\u002F\u002Fgithub.com\u002Fnotmahi\u002Fclip-fields)]\r\n\r\n* **掌中NeRF**：“通过新视图合成对机器人进行修正增强”，*CVPR 2023*。[[论文](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2301.08556)] [[网页](https:\u002F\u002Fbland.website\u002Fspartn\u002F)]\r\n\r\n* **JCR**：“利用3D基础模型统一场景表示与手眼标定”，*arXiv，2024年4月*。[[论文](https:\u002F\u002Farxiv.org\u002Fabpdfs\u002F2404.11683v1)] [[代码](https:\u002F\u002Fgithub.com\u002Ftomtang502\u002Farm_3d_reconstruction)]\r\n\r\n* 
**D3Fields**：“用于零样本可泛化机器人操作的动态3D描述子场”，*arXiv，2023年9月*。[[论文](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2309.16118)] [[网页](https:\u002F\u002Frobopil.github.io\u002Fd3fields\u002F)] [[代码](https:\u002F\u002Fgithub.com\u002FWangYixuan12\u002Fd3fields)]\r\n\r\n* **SayPlan**：“利用3D场景图接地大型语言模型，实现可扩展的机器人任务规划”，*CORL 2023*。[[论文](https:\u002F\u002Fopenreview.net\u002Fpdf?id=wMpOMO0Ss7a)] [[网页](https:\u002F\u002Fsayplan.github.io\u002F)]\r\n\r\n* **Dex-NeRF**：“使用神经辐射场抓取透明物体”，*CORL 2021*。[[论文](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2110.14217)] [[网页](https:\u002F\u002Fsites.google.com\u002Fview\u002Fdex-nerf)]\r\n\r\n---\n\n## 模拟环境、数据集和基准测试\n\n* **RoboTracer**：“通过视觉-语言模型中的推理掌握机器人领域的空间轨迹”，*ArXiv 2025*。[[论文](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2512.13660)] [[官网](https:\u002F\u002Fzhoues.github.io\u002FRoboTracer\u002F)] \n\n* **RoboRefer**：“面向机器人的视觉-语言模型中基于推理的空间指代”，*ArXiv 2025*。[[论文](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2506.04308)] [[官网](https:\u002F\u002Fzhoues.github.io\u002FRoboRefer\u002F)] \n\n* **The Colosseum**：“用于评估机器人操作泛化能力的基准测试”，*RSS 2024*。[[论文](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2402.08191)] [[官网](https:\u002F\u002Frobot-colosseum.github.io\u002F)] [[代码](https:\u002F\u002Fgithub.com\u002Frobot-colosseum\u002Frobot-colosseum)] \n\n* **OpenEQA**：“基础模型时代下的具身问答”，*CVPR 2024*。[[论文](https:\u002F\u002Fopen-eqa.github.io\u002Fassets\u002Fpdfs\u002Fpaper.pdf)] [[官网](https:\u002F\u002Fopen-eqa.github.io\u002F)] [[代码](https:\u002F\u002Fgithub.com\u002Ffacebookresearch\u002Fopen-eqa)] \n\n* **DROID**：“大规模野外机器人操作数据集”，*RSS 2024*。[[论文](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2403.12945)] [[官网](https:\u002F\u002Fdroid-dataset.github.io\u002F)] [[代码](https:\u002F\u002Fgithub.com\u002Fdroid-dataset\u002Fdroid)] \n\n* **RH20T**：“用于一次性学习多样化技能的综合性机器人数据集”，*ICRA 2024*。[[论文](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2307.00595)] [[官网](https:\u002F\u002Frh20t.github.io\u002F)] [[代码](https:\u002F\u002Fgithub.com\u002Frh20t\u002Frh20t_api)] \n\n* 
**Gen2Sim**：“利用生成式模型扩展仿真中的机器人学习”，*ICRA 2024*。[[论文](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2310.18308)] [[官网](https:\u002F\u002Fgen2sim.github.io\u002F)] [[代码](https:\u002F\u002Fgithub.com\u002Fpushkalkatara\u002FGen2Sim)] \n\n* **BEHAVIOR Vision Suite**：“通过仿真实现可定制的数据集生成”，*CVPR 2024*。[[论文](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2405.09546)] [[官网](https:\u002F\u002Fbehavior-vision-suite.github.io\u002F)] [[代码](https:\u002F\u002Fgithub.com\u002Fbehavior-vision-suite\u002Fbehavior-vision-suite.github.io)] \n\n* **RoboCasa**：“面向通用型机器人的大规模日常任务仿真”，*RSS 2024*。[[论文](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2406.02523)] [[官网](https:\u002F\u002Frobocasa.ai\u002F)] [[代码](https:\u002F\u002Fgithub.com\u002Frobocasa\u002Frobocasa)] \n\n* **ARNOLD**：“ARNOLD：在真实3D场景中基于连续状态的语言接地任务学习基准”，*ICCV 2023*。[[论文](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2304.04321)] [[网页](https:\u002F\u002Farnold-benchmark.github.io\u002F)] [[代码](https:\u002F\u002Fgithub.com\u002Farnold-benchmark\u002Farnold)] \n\n* **VIMA**：“利用多模态提示进行通用机器人操作”，*ICML 2023*。[[论文](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2210.03094)] [[官网](https:\u002F\u002Fvimalabs.github.io\u002F)] [[代码](https:\u002F\u002Fgithub.com\u002Fvimalabs\u002FVIMA)] \n\n* **ManiSkill2**：“用于可泛化操作技能的统一基准”，*ICLR 2023*。[[论文](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2302.04659)] [[官网](https:\u002F\u002Fmaniskill2.github.io\u002F)] [[代码](https:\u002F\u002Fgithub.com\u002Fhaosulab\u002FManiSkill2)] \n\n* **Robo360**：“包含多种材质的3D全方位机器人操作数据集”，*arXiv，2023年12月*。[[论文](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2312.06686)] \n\n* **AR2-D2**：“无需机器人即可训练机器人”，*CORL 2023*。[[论文](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2306.13818)] [[官网](https:\u002F\u002Far2d2.site\u002F)] [[代码](https:\u002F\u002Fgithub.com\u002Fjiafei1224\u002FAR2-D2)] \n\n* **Habitat 2.0**：“训练家庭助手重新布置其居住环境”，*NeurIPS 2021*。[[论文](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2106.14405)] [[官网](https:\u002F\u002Faihabitat.org\u002Fdocs\u002Fhabitat2\u002F)] 
[[代码](https:\u002F\u002Fgithub.com\u002Ffacebookresearch\u002Fhabitat-lab)] \n\n* **VL-Grasp**：“针对杂乱室内场景中以语言为导向物体的6自由度交互式抓取策略”，*IROS 2023*。[[论文](https:\u002F\u002Fieeexplore.ieee.org\u002Fdocument\u002F10341379)] [[代码](https:\u002F\u002Fgithub.com\u002Fluyh20\u002Fvl-grasp)] \n\n* **OCID-Ref**：“包含具身语言的3D机器人数据集，用于杂乱场景中的对象定位”，*NAACL 2021*。[[论文](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2103.07679)] [[代码](https:\u002F\u002Fgithub.com\u002Flluma\u002FOCID-Ref)]  \n\n* **ManipulaTHOR**：“用于视觉对象操作的框架”，*CVPR 2021*。[[论文](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2104.11213.pdf)] [[官网](https:\u002F\u002Fai2thor.allenai.org\u002Fmanipulathor\u002F)] [[代码](https:\u002F\u002Fai2thor.allenai.org\u002Fmanipulathor\u002F)] \n\n* **RoboTHOR**：“开放的模拟到现实具身AI平台”，*CVPR 2020*。[[论文](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2004.06799)] [[官网](https:\u002F\u002Fai2thor.allenai.org\u002Frobothor\u002F)] [[代码](https:\u002F\u002Fgithub.com\u002Fallenai\u002Fai2thor)] \n\n* **HabiCrowd**：“HabiCrowd：用于人群感知视觉导航的高性能仿真器”，*IROS 2024*。[[论文](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2306.11377)] [[官网](https:\u002F\u002Fhabicrowd.github.io\u002F)] [[代码](https:\u002F\u002Fgithub.com\u002FFsoft-AIC\u002FHabiCrowd)] \n\n----\n\n[![Star历史图](https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Fzubair-irshad_Awesome-Robotics-3D_readme_a72dbe8072cf.png)](https:\u002F\u002Fstar-history.com\u002F#zubair-irshad\u002FAwesome-Robotics-3D&Date)\n\n----\n\n## 引用\n如果您觉得本仓库有用，请考虑引用此列表：\n```bibtex\n@misc{irshad2024roboticd3D,\n    title = {Awesome Robotics 3D - 关于机器人相关3D视觉论文的精选资源列表},\n    author = {Muhammad Zubair Irshad},\n    journal = {GitHub仓库},\n    url = {https:\u002F\u002Fgithub.com\u002Fzubair-irshad\u002FAwesome-Robotics-3D},\n    year = {2024},\n}\n```","# Awesome-Robotics-3D 快速上手指南\n\n**Awesome-Robotics-3D** 并非一个单一的可执行软件或 Python 库，而是一个**精选资源列表仓库**。它汇集了在大模型（LLM\u002FVLM）时代下，与机器人领域相关的 3D 视觉论文、代码实现、数据集和基准测试。\n\n本指南将指导你如何获取该列表，并快速启动其中收录的典型项目（如策略学习或 VLM 模型）。\n\n## 环境准备\n\n由于列表中包含多个不同的开源项目，建议先准备好通用的开发环境。大多数项目基于 
PyTorch 生态。\n\n### 系统要求\n- **操作系统**: Linux (Ubuntu 20.04\u002F22.04 推荐) 或 macOS\n- **GPU**: NVIDIA GPU (推荐显存 ≥ 16GB，用于运行 3D 扩散策略或大模型)\n- **CUDA**: 版本需与安装的 PyTorch 匹配 (通常建议 CUDA 11.8 或 12.1+)\n\n### 前置依赖\n确保已安装以下基础工具：\n- Python 3.8+\n- Git\n- Conda (推荐用于管理不同项目的虚拟环境)\n\n```bash\n# 检查 Python 和 Git 版本\npython --version\ngit --version\n```\n\n## 安装步骤\n\n### 1. 克隆资源列表仓库\n首先获取完整的论文和代码索引列表。\n\n```bash\ngit clone https:\u002F\u002Fgithub.com\u002Fzubair-irshad\u002FAwesome-Robotics-3D.git\ncd Awesome-Robotics-3D\n```\n\n### 2. 选择并安装具体项目\n浏览仓库中的 `README.md` 或分类章节（如 [Policy Learning](#policy-learning), [VLM and LLM](#vlm-and-llm)），找到你感兴趣的项目链接。\n\n**示例：以 \"3D Diffusion Policy\" 为例进行安装**\n\n大多数项目会提供独立的 GitHub 仓库链接。以下是通用的安装流程：\n\n```bash\n# 创建独立的虚拟环境 (避免依赖冲突)\nconda create -n robotics3d python=3.9\nconda activate robotics3d\n\n# 假设目标项目地址为 https:\u002F\u002Fgithub.com\u002FYanjieZe\u002F3D-Diffusion-Policy\ngit clone https:\u002F\u002Fgithub.com\u002FYanjieZe\u002F3D-Diffusion-Policy.git\ncd 3D-Diffusion-Policy\n\n# 安装项目依赖 (国内用户推荐使用清华源加速)\npip install -r requirements.txt -i https:\u002F\u002Fpypi.tuna.tsinghua.edu.cn\u002Fsimple\n\n# 安装项目本身 (如果存在 setup.py 或 pyproject.toml)\npip install -e . -i https:\u002F\u002Fpypi.tuna.tsinghua.edu.cn\u002Fsimple\n```\n\n> **提示**：每个具体项目的依赖可能不同（如需要特定的 Mujoco, Isaac Gym 或 CUDA 版本），请务必查阅该项目主页的 `Installation` 章节。\n\n## 基本使用\n\n由于这是一个资源聚合列表，\"使用\"通常指运行列表中某个具体算法的演示或训练脚本。\n\n### 场景示例：运行预训练模型进行推理\n以 **3D Diffusion Policy** 为例，在配置好环境后，通常可以通过以下命令加载预训练权重并进行简单的推理演示。\n\n```bash\n# 进入项目目录\ncd 3D-Diffusion-Policy\n\n# 下载预训练权重 (具体命令参考该项目 README，此处为示例)\nwget https:\u002F\u002Fhuggingface.co\u002F...\u002Fcheckpoint.pt -P checkpoints\u002F\n\n# 运行推理演示\npython scripts\u002Fplay_policy.py \\\n    --config configs\u002Fcube_push.yaml \\\n    --checkpoint checkpoints\u002Fcheckpoint.pt\n```\n\n### 如何查找更多用法\n1. 打开本地克隆的 `Awesome-Robotics-3D\u002FREADME.md` 文件。\n2. 搜索关键词（如 `VLM`, `Grasping`, `Simulation`）。\n3. 点击对应论文的 **[Code]** 链接跳转至源代码仓库。\n4. 
在源代码仓库中查看 `Usage` 或 `Quick Start` 部分获取具体命令。\n\n### 贡献与更新\n如果你想添加新的论文或工具到该列表：\n```bash\n# 创建分支\ngit checkout -b add-new-paper\n\n# 编辑 README.md 添加新条目\n# ... 编辑内容 ...\n\n# 提交并推送\ngit add README.md\ngit commit -m \"Add new paper: [Paper Name]\"\ngit push origin add-new-paper\n```\n随后在 GitHub 页面发起 Pull Request。","某机器人初创公司的算法团队正致力于开发一款能理解自然语言指令并执行复杂抓取任务的智能机械臂，急需整合最新的 3D 视觉与大模型技术。\n\n### 没有 Awesome-Robotics-3D 时\n- **文献检索如大海捞针**：研究人员需在 arXiv、GitHub 和各类会议网站间手动穿梭，难以区分哪些论文真正结合了 LLM\u002FVLM 与 3D 机器人任务，耗时数周仍可能遗漏关键成果。\n- **代码复现门槛极高**：找到论文后，往往发现官方代码未开源或链接失效，缺乏统一入口验证算法的可用性，导致大量时间浪费在寻找可运行代码上。\n- **技术选型盲目试错**：面对分散的策略学习（Policy Learning）和预训练（Pretraining）方法，团队难以快速对比如\"3D Diffuser Actor\"与\"ManiCM\"等方案的优劣，容易选错技术路线。\n- **前沿动态更新滞后**：大模型领域迭代极快，人工追踪难以实时获取如\"Neural Fields in Robotics\"等最新综述，导致研发方案刚起步就已落后于社区主流。\n\n### 使用 Awesome-Robotics-3D 后\n- **一站式精准定位资源**：团队直接通过分类目录（如 VLM and LLM、Representations）锁定目标，瞬间获取包含论文、网页演示及代码链接的完整清单，调研效率提升十倍。\n- **即拿即用的代码生态**：每个条目均附带经过验证的代码仓库链接（如 SAM2Act、HDP），工程师可立即克隆并测试基线模型，大幅缩短从理论到原型的周期。\n- **科学决策技术路线**：借助 curated list 中对不同策略学习方法的系统梳理，团队快速对比出适合自身场景的“分层扩散策略”，避免了盲目的重复造轮子。\n- **同步全球最新进展**：依托社区的持续维护，团队能第一时间掌握 2024-2025 年的最新突破（如 ICML 2025 的新作），确保产品架构始终处于行业前沿。\n\nAwesome-Robotics-3D 将原本碎片化、高成本的科研探索过程，转化为高效、结构化的技术落地流程，是连接学术前沿与机器人工程实践的关键桥梁。","https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Fzubair-irshad_Awesome-Robotics-3D_12e48983.jpg","zubair-irshad","Zubair Irshad","https:\u002F\u002Foss.gittoolsai.com\u002Favatars\u002Fzubair-irshad_ec5c139f.jpg","🔭 Machine Learning Research Scientist @ToyotaResearch\r\n🔭 Researching 3D Vision | Scene Understanding | Embodied AI \r\n🎓 PhD in AI and ML from @GeorgiaTech","@GeorgiaTech @TRI-ML @GT-RIPL","Silicon Valley, CA, USA","muhammadzubairirshad@gmail.com","mzubairirshad","https:\u002F\u002Fzubairirshad.com","https:\u002F\u002Fgithub.com\u002Fzubair-irshad",null,806,41,"2026-04-01T19:57:39",1,"","未说明",{"notes":94,"python":92,"dependencies":95},"该仓库是一个论文和资源列表（Awesome List），并非一个可直接运行的单一软件工具。它收录了多个独立的机器人 3D 视觉项目（如 SAM2Act, 
3D Diffuser Actor 等），每个子项目都有各自独立的运行环境、依赖库和硬件要求。用户需访问列表中具体项目的代码仓库链接以获取相应的安装和运行指南。",[],[54,14,13,26],[98,99,100,101,102,103,104,105,106,107,108,109,110,111,112,113,114,115,116],"3d","benchmarks","computer-vision","gaussian-splatting","grasping","llm","manipulation","nerf","pointclouds","policy-learning","pretraining","robotics","scene-graph","simulations","vision-language-model","vlm","diffusion-models","foundation-models","navigation","2026-03-27T02:49:30.150509","2026-04-06T08:46:50.521867",[],[]]