[{"data":1,"prerenderedAt":-1},["ShallowReactive",2],{"similar-peteanderson80--Matterport3DSimulator":3,"tool-peteanderson80--Matterport3DSimulator":64},[4,18,28,36,44,52],{"id":5,"name":6,"github_repo":7,"description_zh":8,"stars":9,"difficulty_score":10,"last_commit_at":11,"category_tags":12,"status":17},4358,"openclaw","openclaw\u002Fopenclaw","OpenClaw 是一款专为个人打造的本地化 AI 助手，旨在让你在自己的设备上拥有完全可控的智能伙伴。它打破了传统 AI 助手局限于特定网页或应用的束缚，能够直接接入你日常使用的各类通讯渠道，包括微信、WhatsApp、Telegram、Discord、iMessage 等数十种平台。无论你在哪个聊天软件中发送消息，OpenClaw 都能即时响应，甚至支持在 macOS、iOS 和 Android 设备上进行语音交互，并提供实时的画布渲染功能供你操控。\n\n这款工具主要解决了用户对数据隐私、响应速度以及“始终在线”体验的需求。通过将 AI 部署在本地，用户无需依赖云端服务即可享受快速、私密的智能辅助，真正实现了“你的数据，你做主”。其独特的技术亮点在于强大的网关架构，将控制平面与核心助手分离，确保跨平台通信的流畅性与扩展性。\n\nOpenClaw 非常适合希望构建个性化工作流的技术爱好者、开发者，以及注重隐私保护且不愿被单一生态绑定的普通用户。只要具备基础的终端操作能力（支持 macOS、Linux 及 Windows WSL2），即可通过简单的命令行引导完成部署。如果你渴望拥有一个懂你",349277,3,"2026-04-06T06:32:30",[13,14,15,16],"Agent","开发框架","图像","数据工具","ready",{"id":19,"name":20,"github_repo":21,"description_zh":22,"stars":23,"difficulty_score":24,"last_commit_at":25,"category_tags":26,"status":17},1381,"everything-claude-code","affaan-m\u002Feverything-claude-code","everything-claude-code 是一套专为 AI 编程助手（如 Claude Code、Codex、Cursor 等）打造的高性能优化系统。它不仅仅是一组配置文件，而是一个经过长期实战打磨的完整框架，旨在解决 AI 代理在实际开发中面临的效率低下、记忆丢失、安全隐患及缺乏持续学习能力等核心痛点。\n\n通过引入技能模块化、直觉增强、记忆持久化机制以及内置的安全扫描功能，everything-claude-code 能显著提升 AI 在复杂任务中的表现，帮助开发者构建更稳定、更智能的生产级 AI 代理。其独特的“研究优先”开发理念和针对 Token 消耗的优化策略，使得模型响应更快、成本更低，同时有效防御潜在的攻击向量。\n\n这套工具特别适合软件开发者、AI 研究人员以及希望深度定制 AI 工作流的技术团队使用。无论您是在构建大型代码库，还是需要 AI 协助进行安全审计与自动化测试，everything-claude-code 都能提供强大的底层支持。作为一个曾荣获 Anthropic 黑客大奖的开源项目，它融合了多语言支持与丰富的实战钩子（hooks），让 AI 真正成长为懂上",160015,2,"2026-04-18T11:30:52",[14,13,27],"语言模型",{"id":29,"name":30,"github_repo":31,"description_zh":32,"stars":33,"difficulty_score":10,"last_commit_at":34,"category_tags":35,"status":17},4487,"LLMs-from-scratch","rasbt\u002FLLMs-from-scratch","LLMs-from-scratch 是一个基于 PyTorch 的开源教育项目，旨在引导用户从零开始一步步构建一个类似 ChatGPT 
的大型语言模型（LLM）。它不仅是同名技术著作的官方代码库，更提供了一套完整的实践方案，涵盖模型开发、预训练及微调的全过程。\n\n该项目主要解决了大模型领域“黑盒化”的学习痛点。许多开发者虽能调用现成模型，却难以深入理解其内部架构与训练机制。通过亲手编写每一行核心代码，用户能够透彻掌握 Transformer 架构、注意力机制等关键原理，从而真正理解大模型是如何“思考”的。此外，项目还包含了加载大型预训练权重进行微调的代码，帮助用户将理论知识延伸至实际应用。\n\nLLMs-from-scratch 特别适合希望深入底层原理的 AI 开发者、研究人员以及计算机专业的学生。对于不满足于仅使用 API，而是渴望探究模型构建细节的技术人员而言，这是极佳的学习资源。其独特的技术亮点在于“循序渐进”的教学设计：将复杂的系统工程拆解为清晰的步骤，配合详细的图表与示例，让构建一个虽小但功能完备的大模型变得触手可及。无论你是想夯实理论基础，还是为未来研发更大规模的模型做准备",90106,"2026-04-06T11:19:32",[27,15,13,14],{"id":37,"name":38,"github_repo":39,"description_zh":40,"stars":41,"difficulty_score":24,"last_commit_at":42,"category_tags":43,"status":17},8553,"spec-kit","github\u002Fspec-kit","Spec Kit 是一款专为提升软件开发效率而设计的开源工具包，旨在帮助团队快速落地“规格驱动开发”（Spec-Driven Development）模式。传统开发中，需求文档往往与代码实现脱节，导致沟通成本高且结果不可控；而 Spec Kit 通过将规格说明书转化为可执行的指令，让 AI 直接依据明确的业务场景生成高质量代码，从而减少从零开始的随意编码，确保产出结果的可预测性。\n\n该工具特别适合希望利用 AI 辅助编程的开发者、技术负责人及初创团队。无论是启动全新项目还是在现有工程中引入规范化流程，用户只需通过简单的命令行操作，即可初始化项目并集成主流的 AI 编程助手。其核心技术亮点在于“规格即代码”的理念，支持社区扩展与预设模板，允许用户根据特定技术栈定制开发流程。此外，Spec Kit 强调官方维护的安全性，提供稳定的版本管理，帮助开发者在享受 AI 红利的同时，依然牢牢掌握架构设计的主动权，真正实现从“凭感觉写代码”到“按规格建系统”的转变。",88749,"2026-04-17T09:48:14",[27,15,13,14],{"id":45,"name":46,"github_repo":47,"description_zh":48,"stars":49,"difficulty_score":24,"last_commit_at":50,"category_tags":51,"status":17},3704,"NextChat","ChatGPTNextWeb\u002FNextChat","NextChat 是一款轻量且极速的 AI 助手，旨在为用户提供流畅、跨平台的大模型交互体验。它完美解决了用户在多设备间切换时难以保持对话连续性，以及面对众多 AI 模型不知如何统一管理的痛点。无论是日常办公、学习辅助还是创意激发，NextChat 都能让用户随时随地通过网页、iOS、Android、Windows、MacOS 或 Linux 端无缝接入智能服务。\n\n这款工具非常适合普通用户、学生、职场人士以及需要私有化部署的企业团队使用。对于开发者而言，它也提供了便捷的自托管方案，支持一键部署到 Vercel 或 Zeabur 等平台。\n\nNextChat 的核心亮点在于其广泛的模型兼容性，原生支持 Claude、DeepSeek、GPT-4 及 Gemini Pro 等主流大模型，让用户在一个界面即可自由切换不同 AI 能力。此外，它还率先支持 MCP（Model Context Protocol）协议，增强了上下文处理能力。针对企业用户，NextChat 
提供专业版解决方案，具备品牌定制、细粒度权限控制、内部知识库整合及安全审计等功能，满足公司对数据隐私和个性化管理的高标准要求。",87618,"2026-04-05T07:20:52",[14,27],{"id":53,"name":54,"github_repo":55,"description_zh":56,"stars":57,"difficulty_score":24,"last_commit_at":58,"category_tags":59,"status":17},2268,"ML-For-Beginners","microsoft\u002FML-For-Beginners","ML-For-Beginners 是由微软推出的一套系统化机器学习入门课程，旨在帮助零基础用户轻松掌握经典机器学习知识。这套课程将学习路径规划为 12 周，包含 26 节精炼课程和 52 道配套测验，内容涵盖从基础概念到实际应用的完整流程，有效解决了初学者面对庞大知识体系时无从下手、缺乏结构化指导的痛点。\n\n无论是希望转型的开发者、需要补充算法背景的研究人员，还是对人工智能充满好奇的普通爱好者，都能从中受益。课程不仅提供了清晰的理论讲解，还强调动手实践，让用户在循序渐进中建立扎实的技能基础。其独特的亮点在于强大的多语言支持，通过自动化机制提供了包括简体中文在内的 50 多种语言版本，极大地降低了全球不同背景用户的学习门槛。此外，项目采用开源协作模式，社区活跃且内容持续更新，确保学习者能获取前沿且准确的技术资讯。如果你正寻找一条清晰、友好且专业的机器学习入门之路，ML-For-Beginners 将是理想的起点。",85267,"2026-04-18T11:00:28",[15,16,60,61,13,62,27,14,63],"视频","插件","其他","音频",{"id":65,"github_repo":66,"name":67,"description_en":68,"description_zh":69,"ai_summary_zh":70,"readme_en":71,"readme_zh":72,"quickstart_zh":73,"use_case_zh":74,"hero_image_url":75,"owner_login":76,"owner_name":77,"owner_avatar_url":78,"owner_bio":79,"owner_company":80,"owner_location":81,"owner_email":79,"owner_twitter":82,"owner_website":83,"owner_url":84,"languages":85,"stars":114,"forks":115,"last_commit_at":116,"license":117,"difficulty_score":10,"env_os":118,"env_gpu":119,"env_ram":120,"env_deps":121,"category_tags":132,"github_topics":133,"view_count":24,"oss_zip_url":79,"oss_zip_packed_at":79,"status":17,"created_at":141,"updated_at":142,"faqs":143,"releases":178},9174,"peteanderson80\u002FMatterport3DSimulator","Matterport3DSimulator","AI Research Platform for Reinforcement Learning from Real Panoramic Images.","Matterport3DSimulator 是一个专为强化学习研究打造的 AI 仿真平台，旨在让智能体在真实的 3D 室内环境中，通过视觉信息进行交互与导航。它有效解决了传统仿真器依赖合成数据、缺乏真实世界视觉复杂度的痛点，为计算机视觉、自然语言处理与机器人学的交叉研究提供了高保真的实验场。\n\n该工具主要面向人工智能研究人员、算法开发者及机器人学专家，特别适用于需要训练智能体理解“视觉 - 语言”指令（如根据文字描述在陌生建筑中寻路）的场景。其核心亮点在于基于 Matterport3D 数据集，提供了 90 个涵盖住宅、办公室等真实场景的 360 度全景 RGB-D 图像，而非计算机生成的模拟画面，从而极大提升了训练数据的真实性。\n\n技术上，Matterport3DSimulator 支持高效的离线渲染，在高性能 GPU 
上可实现每秒千帧的处理速度，并具备批量代理模拟能力，显著提升了训练效率。同时，它提供灵活的 Python 和 C++ 接口，允许用户自定义相机参数与图像分辨率，并支持 Docker 部署以降低环境配置难度。无论是探索视觉导航算法，还是验证多模态交互模型，这都是一个强大","Matterport3DSimulator 是一个专为强化学习研究打造的 AI 仿真平台，旨在让智能体在真实的 3D 室内环境中，通过视觉信息进行交互与导航。它有效解决了传统仿真器依赖合成数据、缺乏真实世界视觉复杂度的痛点，为计算机视觉、自然语言处理与机器人学的交叉研究提供了高保真的实验场。\n\n该工具主要面向人工智能研究人员、算法开发者及机器人学专家，特别适用于需要训练智能体理解“视觉 - 语言”指令（如根据文字描述在陌生建筑中寻路）的场景。其核心亮点在于基于 Matterport3D 数据集，提供了 90 个涵盖住宅、办公室等真实场景的 360 度全景 RGB-D 图像，而非计算机生成的模拟画面，从而极大提升了训练数据的真实性。\n\n技术上，Matterport3DSimulator 支持高效的离线渲染，在高性能 GPU 上可实现每秒千帧的处理速度，并具备批量代理模拟能力，显著提升了训练效率。同时，它提供灵活的 Python 和 C++ 接口，允许用户自定义相机参数与图像分辨率，并支持 Docker 部署以降低环境配置难度。无论是探索视觉导航算法，还是验证多模态交互模型，这都是一个强大且专业的开源基石。","# Matterport3D Simulator\nAI Research Platform for Reinforcement Learning from Real Panoramic Images.\n\nThe Matterport3D Simulator enables development of AI **agents that interact with real 3D environments using visual information** (RGB-D images). It is primarily intended for research in deep reinforcement learning, at the intersection of computer vision, natural language processing and robotics.\n\n![Concept](https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Fpeteanderson80_Matterport3DSimulator_readme_8047a4cda0c1.jpg)\n\nVisit the main [website](https:\u002F\u002Fbringmeaspoon.org\u002F) to view a demo.\n\n*NEW February 2019*: We have released several updates. The simulator is now dockerized, it supports batches of agents instead of just a single agent, and it is far more efficient (faster) than before. Also, it now outputs depth maps as well as RGB images. As a consequence, there are some changes to the original API (mainly, all inputs and outputs are now batched). Therefore, to mark the first release we have tagged it as [v0.1](https:\u002F\u002Fgithub.com\u002Fpeteanderson80\u002FMatterport3DSimulator\u002Ftree\u002Fv0.1) for any users that don't want to change to the new version. 
\n\n## Features\n- Dataset consisting of 90 different predominantly indoor environments,\n- Outputs RGB and depth images\n- All images and depth maps are real, not synthetic (providing much more visual complexity),\n- API for C++ and Python\n- Customizable image resolution, camera parameters, etc,\n- Supports off-screen rendering (both GPU and CPU based)\n- Fast (Around 1000 fps RGB-D off-screen rendering at 640x480 resolution using a Titan X GPU)\n- Unit tests for the rendering pipeline and agent's motions etc\n- Future releases may support class and instance object segmentations.\n\n## Reference\n\nThe Matterport3D Simulator and the Room-to-Room (R2R) navigation dataset are described in:\n- [Vision-and-Language Navigation: Interpreting visually-grounded navigation instructions in real environments](https:\u002F\u002Farxiv.org\u002Fabs\u002F1711.07280).\n\nIf you use the simulator or our dataset, please cite our paper (CVPR 2018 spotlight oral):\n\n### Bibtex:\n```\n@inproceedings{mattersim,\n  title={{Vision-and-Language Navigation}: Interpreting visually-grounded navigation instructions in real environments},\n  author={Peter Anderson and Qi Wu and Damien Teney and Jake Bruce and Mark Johnson and Niko S{\\\"u}nderhauf and Ian Reid and Stephen Gould and Anton van den Hengel},\n  booktitle={Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (CVPR)},\n  year={2018}\n}\n```\n\n## Simulator Data\n\nMatterport3D Simulator is based on densely sampled 360-degree indoor RGB-D images from the [Matterport3D dataset](https:\u002F\u002Fniessner.github.io\u002FMatterport\u002F). The dataset consists of 90 different indoor environments, including homes, offices, churches and hotels. Each environment contains full 360-degree RGB-D scans from between 8 and 349 viewpoints, spread on average 2.25m apart throughout the entire walkable floorplan of the scene.\n\n### Actions\n\nAt each viewpoint location, the agent can pan and elevate the camera. 
The agent can also choose to move between viewpoints. The precise details of the agent's observations and actions are [described below](#simulator-api) and in the paper.\n\n### Room-to-Room (R2R) Navigation Task\n\nThe simulator includes the training data and evaluation metrics for the Room-to-Room (R2R) Navigation task, which requires an autonomous agent to follow a natural language navigation instruction to navigate to a goal location in a previously unseen building. Please refer to [specific instructions](tasks\u002FR2R\u002FREADME.md) to setup and run this task. There is a test server and leaderboard available at [EvalAI](https:\u002F\u002Fevalai.cloudcv.org\u002Fweb\u002Fchallenges\u002Fchallenge-page\u002F97\u002Foverview).\n\n## Installation \u002F Build Instructions\n\nWe recommend using our [Dockerfile](Dockerfile) to install the simulator. The simulator can also be [built without docker](#building-without-docker) but satisfying the project dependencies may be more difficult.\n\n### Prerequisites\n\n- Nvidia GPU with driver >= 396.37\n- Install [docker](https:\u002F\u002Fdocs.docker.com\u002Fengine\u002Finstallation\u002F)\n- Install [nvidia-docker2.0](https:\u002F\u002Fgithub.com\u002Fnvidia\u002Fnvidia-docker\u002Fwiki\u002FInstallation-(version-2.0))\n- Note: CUDA \u002F CuDNN toolkits do not need to be installed (these are provided by the docker image)\n\n### Clone Repo\n\nClone the Matterport3DSimulator repository:\n```\n# Make sure to clone with --recursive\ngit clone --recursive https:\u002F\u002Fgithub.com\u002Fpeteanderson80\u002FMatterport3DSimulator.git\ncd Matterport3DSimulator\n```\n\nIf you didn't clone with the `--recursive` flag, then you'll need to manually clone the pybind submodule from the top-level directory:\n```\ngit submodule update --init --recursive\n```\n\n### Dataset Download\n\nTo use the simulator you must first download the [Matterport3D Dataset](https:\u002F\u002Fniessner.github.io\u002FMatterport\u002F) which is available 
after requesting access [here](https:\u002F\u002Fniessner.github.io\u002FMatterport\u002F). The download script that will be provided allows for downloading of selected data types. At minimum you must download the `matterport_skybox_images`. If you wish to use depth outputs then also download `undistorted_depth_images` and `undistorted_camera_parameters`.\n\nSet an environment variable to the location of the **unzipped** dataset, where \u003CPATH> is the full absolute path (not a relative path or symlink) to the directory containing the individual matterport scan directories (17DRP5sb8fy, 2t7WUuJeko7, etc):\n```\nexport MATTERPORT_DATA_DIR=\u003CPATH>\n```\n\nNote that if \u003CPATH> is a remote sshfs mount, you will need to mount it with the `-o allow_root` option or the docker container won't be able to access this directory. \n\n### Building using Docker\n\nBuild the docker image:\n```\ndocker build -t mattersim:9.2-devel-ubuntu18.04 .\n```\n\nRun the docker container, mounting both the git repo and the dataset:\n```\nnvidia-docker run -it --mount type=bind,source=$MATTERPORT_DATA_DIR,target=\u002Froot\u002Fmount\u002FMatterport3DSimulator\u002Fdata\u002Fv1\u002Fscans --volume `pwd`:\u002Froot\u002Fmount\u002FMatterport3DSimulator mattersim:9.2-devel-ubuntu18.04\n```\n\nNow (from inside the docker container), build the simulator code:\n```\ncd \u002Froot\u002Fmount\u002FMatterport3DSimulator\nmkdir build && cd build\ncmake -DEGL_RENDERING=ON ..\nmake\ncd ..\u002F\n```\n\n#### Rendering Options (GPU, CPU, off-screen)\n\nNote that there are three rendering options, which are selected using [cmake](https:\u002F\u002Fcmake.org\u002F) options during the build process (by varying line 3 in the build commands immediately above):\n- GPU rendering using OpenGL (requires an X server): `cmake ..` (default)\n- Off-screen GPU rendering using [EGL](https:\u002F\u002Fwww.khronos.org\u002Fegl\u002F): `cmake -DEGL_RENDERING=ON ..`\n- Off-screen CPU rendering using 
[OSMesa](https:\u002F\u002Fwww.mesa3d.org\u002Fosmesa.html): `cmake -DOSMESA_RENDERING=ON ..`\n\nThe recommended (fast) approach for training agents is using off-screen GPU rendering (EGL).\n\n### Dataset Preprocessing\n\nTo make data loading faster and to reduce memory usage we preprocess the `matterport_skybox_images` by downscaling and combining all cube faces into a single image. While still inside the docker container, run the following script:\n```\n.\u002Fscripts\u002Fdownsize_skybox.py\n```\n\nThis will take a while depending on the number of processes used (which is a setting in the script). \n\nAfter completion, the `matterport_skybox_images` subdirectories in the dataset will contain image files with filename format `\u003CPANO_ID>_skybox_small.jpg`. By default images are downscaled by 50% and 20 processes are used.\n\n#### Depth Outputs\n\nIf you need depth outputs as well as RGB (via `sim.setDepthEnabled(True)`), precompute matching depth skybox images by running this script:\n```\n.\u002Fscripts\u002Fdepth_to_skybox.py\n```\n\nDepth skyboxes are generated from the `undistorted_depth_images` using a simple blending approach. As the depth images contain many missing values (corresponding to shiny, bright, transparent, and distant surfaces, which are common in the dataset) we apply a simple crossbilateral filter based on the [NYUv2](https:\u002F\u002Fcs.nyu.edu\u002F~silberman\u002Fdatasets\u002Fnyu_depth_v2.html) code to fill all but the largest holes. A couple of things to keep in mind:\n- We assume that the `undistorted depth images` are aligned to the `matterport_skybox_images`, but in fact this alignment is not perfect. 
For certain applications where better alignment is required (e.g., generating RGB pointclouds) it might be necessary to replace the `matterport_skybox_images` by stitching together `undistorted_color_images` (which are perfectly aligned to the `undistorted_depth_images`).\n- In the generated depth skyboxes, the depth value is the euclidean distance from the camera center (not the distance in the z direction). This is corrected by the simulator (see Simulator API, below).\n\n\n### Running Tests\n\nNow (still from inside the docker container), run the unit tests:\n```\n.\u002Fbuild\u002Ftests ~Timing\n```\n\nAssuming all tests pass, `sim_imgs` will now contain some test images rendered by the simulator. You may also wish to test the rendering frame rate. The following command will try to load all the Matterport environments into memory (requiring around 50 GB memory), and then some information about the rendering frame rate (at 640x480 resolution, RGB outputs only) will be printed to stdout:\n```\n.\u002Fbuild\u002Ftests Timing\n```\n\nThe timing test must be run individually from the other tests to get accurate results. Note that the Timing test will fail if there is insufficient memory. As long as all the other tests pass (i.e., `.\u002Fbuild\u002Ftests ~Timing`) then the install is good. 
Refer to the [Catch](https:\u002F\u002Fgithub.com\u002Fphilsquared\u002FCatch) documentation for unit test configuration options.\n\nNow exit the docker container:\n```\nexit\n```\n\n## Interactive Demo\n\nTo run an interactive demo, after completing the Installation \u002F Build Instructions above, run the docker container while sharing the host's X server and DISPLAY environment variable with the container:\n```\nxhost +\nnvidia-docker run -it -e DISPLAY -v \u002Ftmp\u002F.X11-unix:\u002Ftmp\u002F.X11-unix --mount type=bind,source=$MATTERPORT_DATA_DIR,target=\u002Froot\u002Fmount\u002FMatterport3DSimulator\u002Fdata\u002Fv1\u002Fscans,readonly --volume `pwd`:\u002Froot\u002Fmount\u002FMatterport3DSimulator mattersim:9.2-devel-ubuntu18.04\ncd \u002Froot\u002Fmount\u002FMatterport3DSimulator\n```\n\nIf you get an error like `Error: BadShmSeg (invalid shared segment parameter) 128` you may also need to include `-e=\"QT_X11_NO_MITSHM=1\"` in the docker run command above.\n\nCommands for running both python and C++ demos are provided below. These are very simple demos designed to illustrate the use of the simulator in python and C++. By default, *these demos have depth rendering off*. Check the code and turn it on if you have preprocessed the depth outputs and want to see depth as well (see Depth Outputs above). These demos should work regardless of which rendering option was used when building the simulator. \n\nPython demo:\n```\npython3 src\u002Fdriver\u002Fdriver.py\n```\nC++ demo:\n```\nbuild\u002Fmattersim_main\n```\n\nThe javascript code in the `web` directory can also be used as an interactive demo, or to generate videos from the simulator in first-person view, or as an interface on Amazon Mechanical Turk to collect natural language instruction data. \n\n\n### Building without Docker\n\nThe simulator can be built outside of a docker container using the cmake build commands described above. 
However, this is not the recommended approach, as all dependencies will need to be installed locally and may conflict with existing libraries. The main requirements are:\n- Ubuntu >= 14.04\n- Nvidia-driver with CUDA installed \n- C++ compiler with C++11 support\n- [CMake](https:\u002F\u002Fcmake.org\u002F) >= 3.10\n- [OpenCV](http:\u002F\u002Fopencv.org\u002F) >= 2.4 including 3.x\n- [OpenGL](https:\u002F\u002Fwww.opengl.org\u002F)\n- [GLM](https:\u002F\u002Fglm.g-truc.net\u002F0.9.8\u002Findex.html)\n- [Numpy](http:\u002F\u002Fwww.numpy.org\u002F)\n\nOptional dependencies (depending on the cmake rendering options):\n- [OSMesa](https:\u002F\u002Fwww.mesa3d.org\u002Fosmesa.html) for OSMesa backend support\n- [epoxy](https:\u002F\u002Fgithub.com\u002Fanholt\u002Flibepoxy) for EGL backend support\n\nThe provided [Dockerfile](Dockerfile) contains install commands for most of these libraries. For example, to install OpenGL and related libraries:\n```\nsudo apt-get install libjsoncpp-dev libepoxy-dev libglm-dev libosmesa6 libosmesa6-dev libglew-dev\n```\n\n### Simulator API\n\nThe simulator [API in Python](src\u002Flib_python\u002FMatterSimPython.cpp) exactly matches the extensively commented [MatterSim.hpp](include\u002FMatterSim.hpp) C++ header file, but using python lists in place of C++ std::vectors etc. In general, there are various functions beginning with `set` that set the agent and simulator configuration (such as batch size, rendering parameters, enabling depth output etc). 
For training agents, we recommend setting `setPreloadingEnabled(True)`, `setBatchSize(X)` and `setCacheSize(2X)`, where X is the desired batch size, e.g.:\n```\nimport MatterSim\nsim = MatterSim.Simulator()\nsim.setCameraResolution(640, 480)\nsim.setPreloadingEnabled(True)\nsim.setDepthEnabled(True)\nsim.setBatchSize(100)\nsim.setCacheSize(200) # cacheSize 200 uses about 1.2GB of GPU memory for caching pano textures\n``` \n\nWhen preloading is enabled, all the pano images will be loaded into memory before starting. Preloading takes several minutes and requires around 50G memory for RGB output (about 80G if depth output is enabled), but rendering is much faster. \n\nTo start the simulator, call `initialize` followed by the `newEpisode` function, which takes as arguments a list of scanIds, a list of viewpoint ids, a list of headings (in radians), and a list of camera elevations (in radians), e.g.:\n```\nsim.initialize()\n# Assuming batchSize = 1\nsim.newEpisode(['2t7WUuJeko7'], ['1e6b606b44df4a6086c0f97e826d4d15'], [0], [0])\n```\n\nHeading is defined from the y-axis with the z-axis up (turning right is positive). Camera elevation is measured from the horizon defined by the x-y plane (up is positive). There is also a `newRandomEpisode` function which only requires a list of scanIds, and randomly determines a viewpoint and heading (with zero camera elevation). \n\nInteraction with the simulator is through the `makeAction` function, which takes as arguments a list of navigable location indices, a list of heading changes (in radians) and a list of elevation changes (in radians). The navigable location indices select which nearby camera viewpoint the agent should move to. *By default, only camera viewpoints that are within the agent's current field of view are considered navigable, unless restricted navigation is turned off* (i.e., the agent can't move backwards, for example). For agent `n`, navigable locations are given by `getState()[n].navigableLocations`. 
Index 0 always contains the current viewpoint (i.e., the agent always has the option to stay in the same place). As the navigation graph is irregular, the remaining viewpoints are sorted by their angular distance from the centre of the image, so index 1 (if available) will approximate moving directly forward. For example, to turn 30 degrees left without moving (keeping camera elevation unchanged): \n```\nsim.makeAction([0], [-0.523599], [0])\n```\n\nAt any time the simulator state can be returned by calling `getState`. The returned state contains a list of objects (one for each agent in the batch), with attributes as in the following example:\n```javascript\n[\n  {\n    \"scanId\" : \"2t7WUuJeko7\"  \u002F\u002F Which building the agent is in\n    \"step\" : 5,               \u002F\u002F Number of frames since the last newEpisode() call\n    \"rgb\" : \u003Cimage>,          \u002F\u002F 8 bit image (in BGR channel order), access with np.array(rgb, copy=False)\n    \"depth\" : \u003Cimage>,        \u002F\u002F 16 bit single-channel image containing the pixel's distance in the z-direction from the camera center \n                              \u002F\u002F (not the euclidean distance from the camera center), 0.25 mm per value (divide by 4000 to get meters). \n                              \u002F\u002F A zero value denotes 'no reading'. 
Access with np.array(depth, copy=False)\n    \"location\" : {            \u002F\u002F The agent's current 3D location\n        \"viewpointId\" : \"1e6b606b44df4a6086c0f97e826d4d15\",  \u002F\u002F Viewpoint identifier\n        \"ix\" : 5,                                            \u002F\u002F Viewpoint index, used by simulator\n        \"x\" : 3.59775996208,                                 \u002F\u002F 3D position in world coordinates\n        \"y\" : -0.837355971336,\n        \"z\" : 1.68884003162,\n        \"rel_heading\" : 0,                                   \u002F\u002F Robot relative coords to this location\n        \"rel_elevation\" : 0,\n        \"rel_distance\" : 0\n    }\n    \"heading\" : 3.141592,     \u002F\u002F Agent's current camera heading in radians\n    \"elevation\" : 0,          \u002F\u002F Agent's current camera elevation in radians\n    \"viewIndex\" : 0,          \u002F\u002F Index of the agent's current viewing angle [0-35] (only valid with discretized viewing angles)\n                              \u002F\u002F [0-11] is looking down, [12-23] is looking at horizon, [24-35] is looking up\n    \"navigableLocations\": [   \u002F\u002F List of viewpoints you can move to. Index 0 is always the current viewpoint, i.e. 
don't move.\n        {                     \u002F\u002F The remaining valid viewpoints are sorted by their angular distance from the image centre.\n            \"viewpointId\" : \"1e6b606b44df4a6086c0f97e826d4d15\",  \u002F\u002F Viewpoint identifier\n            \"ix\" : 5,                                            \u002F\u002F Viewpoint index, used by simulator\n            \"x\" : 3.59775996208,                                 \u002F\u002F 3D position in world coordinates\n            \"y\" : -0.837355971336,\n            \"z\" : 1.68884003162,\n            \"rel_heading\" : 0,                                   \u002F\u002F Robot relative coords to this location\n            \"rel_elevation\" : 0,\n            \"rel_distance\" : 0\n        },\n        {\n            \"viewpointId\" : \"1e3a672fa1d24d668866455162e5b58a\",  \u002F\u002F Viewpoint identifier\n            \"ix\" : 14,                                           \u002F\u002F Viewpoint index, used by simulator\n            \"x\" : 4.03619003296,                                 \u002F\u002F 3D position in world coordinates\n            \"y\" : 1.11550998688,\n            \"z\" : 1.65892004967,\n            \"rel_heading\" : 0.220844170027,                      \u002F\u002F Robot relative coords to this location\n            \"rel_elevation\" : -0.0149478448723,\n            \"rel_distance\" : 2.00169944763\n        },\n        {...}\n    ]\n  }\n]\n```\n\nRefer to [src\u002Fdriver\u002Fdriver.py](src\u002Fdriver\u002Fdriver.py) for example usage. To build html docs for C++ classes in the `doxygen` directory, run this command and navigate in your browser to `doxygen\u002Fhtml\u002Findex.html`:\n```\ndoxygen\n```\n\n\n### Precomputing ResNet Image Features\n\nIn our initial work using this simulator, we discretized heading and elevation into 30 degree increments, and precomputed image features for each view. 
Now that the simulator is much faster, this is no longer necessary, but for completeness we include the details of this setting below.\n\nWe generate image features using Caffe. To replicate our approach, first download and save some Caffe ResNet-152 weights into the `models` directory. We experiment with weights pretrained on [ImageNet](https:\u002F\u002Fgithub.com\u002FKaimingHe\u002Fdeep-residual-networks), and also weights finetuned on the [Places365](https:\u002F\u002Fgithub.com\u002FCSAILVision\u002Fplaces365) dataset. The script `scripts\u002Fprecompute_features.py` can then be used to precompute ResNet-152 features. Features are saved in tsv format in the `img_features` directory. \n\nAlternatively, skip the generation and just download and extract our tsv files into the `img_features` directory:\n- [ResNet-152-imagenet features [380K\u002F2.9GB]](https:\u002F\u002Fwww.dropbox.com\u002Fs\u002Fo57kxh2mn5rkx4o\u002FResNet-152-imagenet.zip?dl=1)\n- [ResNet-152-places365 features [380K\u002F2.9GB]](https:\u002F\u002Fwww.dropbox.com\u002Fs\u002F85tpa6tc3enl5ud\u002FResNet-152-places365.zip?dl=1)\n\n\n### Directory Structure\n\n- `connectivity`: Json navigation graphs.\n- `webgl_imgs`: Contains dataset views rendered with javascript (for test comparisons).\n- `sim_imgs`: Will contain simulator rendered images after running tests.\n- `models`: Caffe models for precomputing ResNet image features.\n- `img_features`: Storage for precomputed image features.\n- `data`: Matterport3D dataset.\n- `tasks`: Currently just the Room-to-Room (R2R) navigation task.\n- `web`: Javascript code for visualizing trajectories and collecting annotations using Amazon Mechanical Turk (AMT).\n\nOther directories are mostly self-explanatory.\n\n\n## License\n\nThe Matterport3D dataset, and data derived from it, is released under the [Matterport3D Terms of Use](http:\u002F\u002Fdovahkiin.stanford.edu\u002Fmatterport\u002Fpublic\u002FMP_TOS.pdf). 
Our code is released under the MIT license.\n\n## Acknowledgements\n\nWe would like to thank Matterport for allowing the Matterport3D dataset to be used by the academic community. This project is supported by a Facebook ParlAI Research Award and by the [Australian Centre for Robotic Vision](https:\u002F\u002Fwww.roboticvision.org\u002F).\n\n## Contributing\n\nWe welcome contributions from the community. All submissions require review and in most cases would require tests.\n","# Matterport3D 模拟器\n基于真实全景图像的强化学习人工智能研究平台。\n\nMatterport3D 模拟器使开发者能够构建**利用视觉信息（RGB-D 图像）与真实 3D 环境交互的人工智能智能体**。它主要用于深度强化学习领域的研究，该领域横跨计算机视觉、自然语言处理和机器人技术。\n\n![概念图](https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Fpeteanderson80_Matterport3DSimulator_readme_8047a4cda0c1.jpg)\n\n访问主[网站](https:\u002F\u002Fbringmeaspoon.org\u002F)观看演示。\n\n*新增：2019年2月* 我们发布了多项更新。现在模拟器已容器化，支持批量智能体而不仅限于单个智能体，并且效率（速度）比之前大幅提升。此外，它现在不仅能输出 RGB 图像，还能输出深度图。因此，原始 API 发生了一些变化（主要是所有输入和输出都改为批量处理）。为了区分首次发布版本，我们为那些暂不升级到新版本的用户标记了 [v0.1](https:\u002F\u002Fgithub.com\u002Fpeteanderson80\u002FMatterport3DSimulator\u002Ftree\u002Fv0.1) 版本。\n\n## 功能特性\n- 数据集包含 90 种以室内为主的环境；\n- 输出 RGB 和深度图像；\n- 所有图像和深度图均为真实采集，而非合成数据（提供更丰富的视觉复杂度）；\n- 提供 C++ 和 Python 的 API；\n- 可自定义图像分辨率、相机参数等；\n- 支持离屏渲染（基于 GPU 和 CPU）；\n- 高速渲染（使用 Titan X GPU，在 640x480 分辨率下可实现约 1000 fps 的 RGB-D 离屏渲染）；\n- 包含渲染管线和智能体运动等的单元测试；\n- 未来版本可能支持类别级和实例级目标分割。\n\n## 参考文献\n\nMatterport3D 模拟器和 Room-to-Room (R2R) 导航数据集在以下论文中有所介绍：\n- [视觉与语言导航：在真实环境中理解基于视觉的导航指令](https:\u002F\u002Farxiv.org\u002Fabs\u002F1711.07280)。\n\n如果您使用该模拟器或我们的数据集，请引用我们的论文（CVPR 2018 Spotlight Oral）：\n\n### Bibtex 格式：\n```\n@inproceedings{mattersim,\n  title={{Vision-and-Language Navigation}: Interpreting visually-grounded navigation instructions in real environments},\n  author={Peter Anderson and Qi Wu and Damien Teney and Jake Bruce and Mark Johnson and Niko S{\\\"u}nderhauf and Ian Reid and Stephen Gould and Anton van den Hengel},\n  booktitle={Proceedings of the IEEE Conference on Computer Vision and Pattern 
Recognition (CVPR)},\n  year={2018}\n}\n```\n\n## 模拟器数据\n\nMatterport3D 模拟器基于来自 [Matterport3D 数据集](https:\u002F\u002Fniessner.github.io\u002FMatterport\u002F) 的密集采样的 360 度室内 RGB-D 图像。该数据集包含 90 种不同的室内场景，涵盖住宅、办公室、教堂和酒店等。每个场景包含从 8 到 349 个视角采集的完整 360 度 RGB-D 扫描数据，这些视角平均间隔 2.25 米，覆盖整个可行走的平面布局。\n\n### 行动选项\n\n在每个视角位置，智能体可以平移和俯仰相机，也可以选择在不同视角之间移动。智能体的具体观测和行动细节将在[下方](#simulator-api)及论文中详细说明。\n\n### Room-to-Room (R2R) 导航任务\n\n该模拟器包含了 Room-to-Room (R2R) 导航任务的训练数据和评估指标。此任务要求自主智能体根据自然语言导航指令，前往一个此前未见过的建筑中的目标位置。请参阅[具体说明](tasks\u002FR2R\u002FREADME.md)，以设置并运行此任务。EvalAI 上设有测试服务器和排行榜，地址为 [EvalAI](https:\u002F\u002Fevalai.cloudcv.org\u002Fweb\u002Fchallenges\u002Fchallenge-page\u002F97\u002Foverview)。\n\n## 安装\u002F构建说明\n\n我们建议使用我们的 [Dockerfile](Dockerfile) 来安装模拟器。当然，您也可以[不使用 Docker 进行构建](#building-without-docker)，但满足项目依赖可能会更加困难。\n\n### 先决条件\n\n- NVIDIA GPU，驱动程序版本 ≥ 396.37；\n- 安装 [Docker](https:\u002F\u002Fdocs.docker.com\u002Fengine\u002Finstallation\u002F)；\n- 安装 [NVIDIA Docker 2.0](https:\u002F\u002Fgithub.com\u002Fnvidia\u002Fnvidia-docker\u002Fwiki\u002FInstallation-(version-2.0))；\n- 注意：无需单独安装 CUDA\u002FCuDNN 工具包，这些工具将由 Docker 镜像提供。\n\n### 克隆代码库\n\n克隆 Matterport3DSimulator 仓库：\n```\n# 请务必使用 --recursive 选项进行克隆\ngit clone --recursive https:\u002F\u002Fgithub.com\u002Fpeteanderson80\u002FMatterport3DSimulator.git\ncd Matterport3DSimulator\n```\n\n如果您未使用 `--recursive` 选项克隆，则需要手动从顶级目录克隆 pybind 子模块：\n```\ngit submodule update --init --recursive\n```\n\n### 下载数据集\n\n要使用该模拟器，您首先需要下载 [Matterport3D 数据集](https:\u002F\u002Fniessner.github.io\u002FMatterport\u002F)。申请访问权限后即可下载，下载脚本支持选择性下载特定类型的数据。至少需要下载 `matterport_skybox_images`。如果希望使用深度输出，还需下载 `undistorted_depth_images` 和 `undistorted_camera_parameters`。\n\n设置一个环境变量指向解压后的数据集路径，其中 `\u003CPATH>` 是包含各个 Matterport 扫描目录（17DRP5sb8fy、2t7WUuJeko7 等）的完整绝对路径（不是相对路径或符号链接）：\n```\nexport MATTERPORT_DATA_DIR=\u003CPATH>\n```\n\n请注意，如果 `\u003CPATH>` 是远程的 sshfs 挂载点，您需要使用 `-o allow_root` 选项挂载，否则 Docker 容器将无法访问该目录。\n\n### 使用 Docker 构建\n\n构建 Docker 
镜像：\n```\ndocker build -t mattersim:9.2-devel-ubuntu18.04 .\n```\n\n运行 Docker 容器，并同时挂载 Git 仓库和数据集：\n```\nnvidia-docker run -it --mount type=bind,source=$MATTERPORT_DATA_DIR,target=\u002Froot\u002Fmount\u002FMatterport3DSimulator\u002Fdata\u002Fv1\u002Fscans --volume `pwd`:\u002Froot\u002Fmount\u002FMatterport3DSimulator mattersim:9.2-devel-ubuntu18.04\n```\n\n现在（在 Docker 容器内），构建模拟器代码：\n```\ncd \u002Froot\u002Fmount\u002FMatterport3DSimulator\nmkdir build && cd build\ncmake -DEGL_RENDERING=ON ..\nmake\ncd ..\u002F\n```\n\n#### 渲染选项（GPU、CPU、离屏）\n\n请注意，共有三种渲染方式，可通过构建过程中的 [CMake](https:\u002F\u002Fcmake.org\u002F) 选项进行选择（只需调整上述构建命令中的第 3 行）：\n- 基于 OpenGL 的 GPU 渲染（需要 X 服务器）：`cmake ..`（默认）；\n- 基于 [EGL](https:\u002F\u002Fwww.khronos.org\u002Fegl\u002F) 的离屏 GPU 渲染：`cmake -DEGL_RENDERING=ON ..`；\n- 基于 [OSMesa](https:\u002F\u002Fwww.mesa3d.org\u002Fosmesa.html) 的离屏 CPU 渲染：`cmake -DOSMESA_RENDERING=ON ..`。\n\n对于智能体训练，推荐采用高速的离屏 GPU 渲染（EGL）。\n\n### 数据集预处理\n\n为了加快数据加载速度并减少内存使用，我们通过缩小尺寸并将所有立方体面合并为一张图像来预处理 `matterport_skybox_images`。在 Docker 容器内运行以下脚本：\n```\n.\u002Fscripts\u002Fdownsize_skybox.py\n```\n\n这将花费一些时间，具体取决于使用的进程数量（该设置可在脚本中调整）。\n\n完成之后，数据集中的 `matterport_skybox_images` 子目录将包含文件名为 `\u003CPANO_ID>_skybox_small.jpg` 的图像文件。默认情况下，图像会被缩小 50%，并使用 20 个进程。\n\n#### 深度输出\n\n如果您需要同时获取深度和 RGB 图像（通过调用 `sim.setDepthEnabled(True)`），请运行以下脚本以预计算匹配的深度天空盒图像：\n```\n.\u002Fscripts\u002Fdepth_to_skybox.py\n```\n\n深度天空盒是基于 `undistorted_depth_images` 使用简单的混合方法生成的。由于深度图像中存在大量缺失值（对应于数据集中常见的反光、明亮、透明以及远处的表面），我们应用了一种基于 [NYUv2](https:\u002F\u002Fcs.nyu.edu\u002F~silberman\u002Fdatasets\u002Fnyu_depth_v2.html) 代码的简单交叉双边滤波器，以填充除最大孔洞之外的所有区域。需要注意的是：\n- 我们假设 `undistorted depth images` 已经与 `matterport_skybox_images` 对齐，但实际上这种对齐并不完美。对于某些需要更高对齐精度的应用场景（例如生成 RGB 点云），可能需要将 `matterport_skybox_images` 替换为拼接后的 `undistorted_color_images`（这些图像与 `undistorted_depth_images` 完全对齐）。\n- 在生成的深度天空盒中，深度值表示的是从相机中心到目标点的欧几里得距离，而不是 z 方向上的距离。这一点会在模拟器中进行校正（见下文的模拟器 API）。\n\n### 运行测试\n\n现在（仍然在 Docker 
容器内），运行单元测试：\n```\n.\u002Fbuild\u002Ftests ~Timing\n```\n\n如果所有测试都通过，`sim_imgs` 目录中将包含由模拟器渲染的一些测试图像。您也可以测试渲染帧率。以下命令会尝试将所有 Matterport 环境加载到内存中（大约需要 50 GB 内存），然后在标准输出中打印出关于渲染帧率的信息（分辨率为 640x480，仅输出 RGB）：\n```\n.\u002Fbuild\u002Ftests Timing\n```\n\n为了获得准确的结果，必须单独运行计时测试，而不能与其他测试同时进行。请注意，如果内存不足，计时测试将会失败。只要其他所有测试都通过（即 `.\u002Fbuild\u002Ftests ~Timing`），则说明安装成功。有关单元测试配置选项，请参阅 [Catch](https:\u002F\u002Fgithub.com\u002Fphilsquared\u002FCatch) 文档。\n\n现在退出 Docker 容器：\n```\nexit\n```\n\n## 交互式演示\n\n要运行交互式演示，在完成上述安装和构建步骤后，启动 Docker 容器，并将主机的 X 服务器和 DISPLAY 环境变量共享给容器：\n```\nxhost +\nnvidia-docker run -it -e DISPLAY -v \u002Ftmp\u002F.X11-unix:\u002Ftmp\u002F.X11-unix --mount type=bind,source=$MATTERPORT_DATA_DIR,target=\u002Froot\u002Fmount\u002FMatterport3DSimulator\u002Fdata\u002Fv1\u002Fscans,readonly --volume `pwd`:\u002Froot\u002Fmount\u002FMatterport3DSimulator mattersim:9.2-devel-ubuntu18.04\ncd \u002Froot\u002Fmount\u002FMatterport3DSimulator\n```\n\n如果您遇到类似 `Error: BadShmSeg (invalid shared segment parameter) 128` 的错误，可能还需要在上述 docker run 命令中添加 `-e=\"QT_X11_NO_MITSHM=1\"`。\n\n下面提供了运行 Python 和 C++ 演示的命令。这些是非常简单的演示，旨在展示如何在 Python 和 C++ 中使用模拟器。默认情况下，*这些演示未启用深度渲染*。如果您已经预处理了深度输出并希望查看深度信息，请检查代码并将其打开（参见上文的深度输出部分）。无论构建模拟器时使用了哪种渲染选项，这些演示都应该能够正常运行。\n\nPython 演示：\n```\npython3 src\u002Fdriver\u002Fdriver.py\n```\n\nC++ 演示：\n```\nbuild\u002Fmattersim_main\n```\n\n`web` 目录中的 JavaScript 代码也可以用作交互式演示，或者用于从模拟器中以第一人称视角生成视频，亦或作为 Amazon Mechanical Turk 上的界面来收集自然语言指令数据。\n\n\n### 不使用 Docker 构建\n\n模拟器可以在不使用 Docker 容器的情况下，通过上述的 CMake 构建命令进行构建。然而，这不是推荐的做法，因为所有依赖项都需要在本地安装，且可能会与现有库发生冲突。主要要求如下：\n- Ubuntu >= 14.04\n- 已安装 CUDA 的 Nvidia 驱动程序\n- 支持 C++11 的 C++ 编译器\n- [CMake](https:\u002F\u002Fcmake.org\u002F) >= 3.10\n- [OpenCV](http:\u002F\u002Fopencv.org\u002F) >= 2.4，包括 3.x 版本\n- [OpenGL](https:\u002F\u002Fwww.opengl.org\u002F)\n- [GLM](https:\u002F\u002Fglm.g-truc.net\u002F0.9.8\u002Findex.html)\n- [Numpy](http:\u002F\u002Fwww.numpy.org\u002F)\n\n可选依赖项（取决于 CMake 的渲染选项）：\n- 
[OSMesa](https:\u002F\u002Fwww.mesa3d.org\u002Fosmesa.html) 用于支持 OSMesa 后端\n- [epoxy](https:\u002F\u002Fgithub.com\u002Fanholt\u002Flibepoxy) 用于支持 EGL 后端\n\n提供的 [Dockerfile](Dockerfile) 包含了大多数这些库的安装命令。例如，要安装 OpenGL 及相关库：\n```\nsudo apt-get install libjsoncpp-dev libepoxy-dev libglm-dev libosmesa6 libosmesa6-dev libglew-dev\n```\n\n### 模拟器 API\n\n模拟器的 [Python API](src\u002Flib_python\u002FMatterSimPython.cpp) 与详细注释的 [MatterSim.hpp](include\u002FMatterSim.hpp) C++ 头文件完全一致，只是用 Python 列表代替了 C++ std::vector 等数据结构。通常，以 `set` 开头的函数用于设置智能体和模拟器的配置，例如批量大小、渲染参数、是否启用深度输出等。对于训练智能体，我们建议设置 `setPreloadingEnabled(True)`、`setBatchSize(X)` 和 `setCacheSize(2X)`，其中 X 是所需的批量大小，例如：\n```\nimport MatterSim\nsim = MatterSim.Simulator()\nsim.setCameraResolution(640, 480)\nsim.setPreloadingEnabled(True)\nsim.setDepthEnabled(True)\nsim.setBatchSize(100)\nsim.setCacheSize(200) # cacheSize 200 会占用约 1.2GB 的 GPU 内存来缓存全景纹理\n```\n\n当启用预加载时，所有全景图像将在开始之前被加载到内存中。预加载需要几分钟时间，并且对于 RGB 输出大约需要 50GB 的内存（如果启用深度输出，则需要约 80GB），但这样可以显著提高渲染速度。\n\n要启动模拟器，先调用 `initialize`，然后调用 `newEpisode` 函数，该函数接受扫描 ID 列表、视点 ID 列表、方向角列表（以弧度为单位）以及相机高度角列表（以弧度为单位）作为参数，例如：\n```\nsim.initialize()\n\n# 假设批次大小为 1\nsim.newEpisode(['2t7WUuJeko7'], ['1e6b606b44df4a6086c0f97e826d4d15'], [0], [0])\n```\n\n航向是从 y 轴定义的，z 轴朝上（向右转为正）。相机俯仰角是从由 x-y 平面定义的地平线开始测量的（向上为正）。还有一个 `newRandomEpisode` 函数，它只需要一个扫描 ID 列表，并随机确定一个视点和航向（相机俯仰角为零）。\n\n与模拟器的交互通过 `makeAction` 函数进行，该函数接受可导航位置索引列表、航向变化列表（以弧度为单位）和俯仰角变化列表（以弧度为单位）作为参数。可导航位置索引用于选择智能体应移动到的附近相机视点。*默认情况下，只有位于智能体当前视野范围内的相机视点才被视为可导航，除非禁用了受限导航*（例如，智能体不能后退）。对于智能体 `n`，可导航位置由 `getState()[n].navigableLocations` 给出。索引 0 始终包含当前视点（即智能体始终可以选择停留在原地）。由于导航图是不规则的，其余视点按其与图像中心的角距离排序，因此索引 1（如果可用）将近似于直接向前移动。例如，要在不移动的情况下左转 30 度（保持相机俯仰角不变）：\n```\nsim.makeAction([0], [-0.523599], [0])\n```\n\n在任何时候，都可以通过调用 `getState` 来获取模拟器状态。返回的状态包含一个对象列表（每个批次中的每个智能体对应一个对象），其属性如下例所示：\n```javascript\n[\n  {\n    \"scanId\" : \"2t7WUuJeko7\"  \u002F\u002F 智能体所在的建筑物\n    \"step\" : 5,               \u002F\u002F 自上次 newEpisode() 调用以来的帧数\n    \"rgb\" : 
\u003Cimage>,          \u002F\u002F 8 位图像（BGR 通道顺序），可通过 np.array(rgb, copy=False) 访问\n    \"depth\" : \u003Cimage>,        \u002F\u002F 16 位单通道图像，包含像素在 z 方向相对于相机中心的距离 \n                              \u002F\u002F （不是相对于相机中心的欧几里得距离），每 0.25 毫米对应一个值（除以 4000 即可得到米）。 \n                              \u002F\u002F 零值表示“无读数”。可通过 np.array(depth, copy=False) 访问\n    \"location\" : {            \u002F\u002F 智能体当前的 3D 位置\n        \"viewpointId\" : \"1e6b606b44df4a6086c0f97e826d4d15\",  \u002F\u002F 视点标识符\n        \"ix\" : 5,                                            \u002F\u002F 视点索引，由模拟器使用\n        \"x\" : 3.59775996208,                                 \u002F\u002F 世界坐标系中的 3D 位置\n        \"y\" : -0.837355971336,\n        \"z\" : 1.68884003162,\n        \"rel_heading\" : 0,                                   \u002F\u002F 相对于该位置的机器人相对坐标\n        \"rel_elevation\" : 0,\n        \"rel_distance\" : 0\n    }\n    \"heading\" : 3.141592,     \u002F\u002F 智能体当前的相机航向，以弧度为单位\n    \"elevation\" : 0,          \u002F\u002F 智能体当前的相机俯仰角，以弧度为单位\n    \"viewIndex\" : 0,          \u002F\u002F 智能体当前视角的索引 [0-35]（仅在视角离散化时有效）\n                              \u002F\u002F [0-11] 表示低头，[12-23] 表示平视，[24-35] 表示抬头\n    \"navigableLocations\": [   \u002F\u002F 可移动到的视点列表。索引 0 始终是当前视点，即不动。\n        {                     \u002F\u002F 其余有效视点按其与图像中心的角距离排序。\n            \"viewpointId\" : \"1e6b606b44df4a6086c0f97e826d4d15\",  \u002F\u002F 视点标识符\n            \"ix\" : 5,                                            \u002F\u002F 视点索引，由模拟器使用\n            \"x\" : 3.59775996208,                                 \u002F\u002F 世界坐标系中的 3D 位置\n            \"y\" : -0.837355971336,\n            \"z\" : 1.68884003162,\n            \"rel_heading\" : 0,                                   \u002F\u002F 相对于该位置的机器人相对坐标\n            \"rel_elevation\" : 0,\n            \"rel_distance\" : 0\n        },\n        {\n            \"viewpointId\" : \"1e3a672fa1d24d668866455162e5b58a\",  \u002F\u002F 视点标识符\n            \"ix\" : 14,                 
                          \u002F\u002F 视点索引，由模拟器使用\n            \"x\" : 4.03619003296,                                 \u002F\u002F 世界坐标系中的 3D 位置\n            \"y\" : 1.11550998688,\n            \"z\" : 1.65892004967,\n            \"rel_heading\" : 0.220844170027,                      \u002F\u002F 相对于该位置的机器人相对坐标\n            \"rel_elevation\" : -0.0149478448723,\n            \"rel_distance\" : 2.00169944763\n        },\n        {...}\n    ]\n  }\n]\n```\n\n有关示例用法，请参阅 [src\u002Fdriver\u002Fdriver.py](src\u002Fdriver\u002Fdriver.py)。要为 `doxygen` 目录中的 C++ 类生成 HTML 文档，请运行以下命令，并在浏览器中导航至 `doxygen\u002Fhtml\u002Findex.html`：\n```\ndoxygen\n```\n\n\n### 预计算 ResNet 图像特征\n\n在我们最初使用此模拟器的工作中，我们将航向和俯仰角离散化为 30 度的增量，并为每个视角预先计算了图像特征。现在模拟器的速度已经快得多，这已不再必要，但为了完整性，我们仍将在下面列出该设置的详细信息。\n\n我们使用 Caffe 生成图像特征。要复制我们的方法，首先下载一些 Caffe ResNet-152 的权重并将其保存到 `models` 目录中。我们尝试了在 [ImageNet](https:\u002F\u002Fgithub.com\u002FKaimingHe\u002Fdeep-residual-networks) 数据集上预训练的权重，以及在 [Places365](https:\u002F\u002Fgithub.com\u002FCSAILVision\u002Fplaces365) 数据集上微调后的权重。然后可以使用脚本 `scripts\u002Fprecompute_features.py` 来预先计算 ResNet-152 的特征。特征将以 tsv 格式保存在 `img_features` 目录中。\n\n或者，您可以跳过生成步骤，直接下载并解压我们的 tsv 文件到 `img_features` 目录：\n- [ResNet-152-ImageNet 特征 [38 万\u002F2.9 GB]](https:\u002F\u002Fwww.dropbox.com\u002Fs\u002Fo57kxh2mn5rkx4o\u002FResNet-152-imagenet.zip?dl=1)\n- [ResNet-152-Places365 特征 [38 万\u002F2.9 GB]](https:\u002F\u002Fwww.dropbox.com\u002Fs\u002F85tpa6tc3enl5ud\u002FResNet-152-places365.zip?dl=1)\n\n\n### 目录结构\n\n- `connectivity`: JSON 格式的导航图。\n- `webgl_imgs`: 包含使用 JavaScript 渲染的数据集视图（用于测试比较）。\n- `sim_imgs`: 将存放运行测试后模拟器渲染的图像。\n- `models`: 用于预先计算 ResNet 图像特征的 Caffe 模型。\n- `img_features`: 存储预先计算的图像特征。\n- `data`: Matterport3D 数据集。\n- `tasks`: 目前仅包含 Room-to-Room (R2R) 导航任务。\n- `web`: 用于可视化轨迹并使用 Amazon Mechanical Turk (AMT) 收集标注的 JavaScript 代码。\n\n其他目录大多不言自明。\n\n\n## 许可证\n\nMatterport3D 数据集及其衍生数据根据 [Matterport3D 
使用条款](http:\u002F\u002Fdovahkiin.stanford.edu\u002Fmatterport\u002Fpublic\u002FMP_TOS.pdf)发布。我们的代码采用 MIT 许可证发布。\n\n## 致谢\n\n我们谨向 Matterport 表示感谢，感谢其允许学术界使用 Matterport3D 数据集。本项目得到了 Facebook ParlAI 研究奖以及 [澳大利亚机器人视觉中心](https:\u002F\u002Fwww.roboticvision.org\u002F) 的支持。\n\n## 贡献\n\n我们欢迎社区的贡献。所有提交的内容均需经过评审，在大多数情况下还需要通过测试。","# Matterport3D Simulator 快速上手指南\n\nMatterport3D Simulator 是一个用于强化学习研究的 AI 平台，支持智能体在真实的 3D 室内环境中通过视觉信息（RGB-D 图像）进行交互。它广泛应用于计算机视觉、自然语言处理和机器人领域的深度强化学习研究。\n\n## 环境准备\n\n### 系统要求\n- **操作系统**: Ubuntu >= 14.04 (推荐使用 Ubuntu 18.04)\n- **GPU**: Nvidia GPU，驱动程序版本 >= 396.37\n- **内存**: 运行完整数据集测试建议预留 50GB+ 内存\n\n### 前置依赖\n推荐使用 Docker 部署以避免复杂的本地依赖冲突。需安装以下工具：\n- [Docker](https:\u002F\u002Fdocs.docker.com\u002Fengine\u002Finstallation\u002F)\n- [nvidia-docker2.0](https:\u002F\u002Fgithub.com\u002Fnvidia\u002Fnvidia-docker\u002Fwiki\u002FInstallation-(version-2.0))\n\n> **注意**: 无需手动安装 CUDA 或 CuDNN，Docker 镜像中已包含。\n\n## 安装步骤\n\n### 1. 克隆代码仓库\n确保使用 `--recursive` 参数克隆，以获取必要的子模块：\n```bash\ngit clone --recursive https:\u002F\u002Fgithub.com\u002Fpeteanderson80\u002FMatterport3DSimulator.git\ncd Matterport3DSimulator\n```\n*如果忘记加参数，可手动初始化子模块：*\n```bash\ngit submodule update --init --recursive\n```\n\n### 2. 下载数据集\n访问 [Matterport3D 官网](https:\u002F\u002Fniessner.github.io\u002FMatterport\u002F) 申请权限并下载数据。\n- **必需**: `matterport_skybox_images`\n- **可选 (如需深度图)**: `undistorted_depth_images` 和 `undistorted_camera_parameters`\n\n解压后，设置环境变量指向数据集根目录（必须是绝对路径）：\n```bash\nexport MATTERPORT_DATA_DIR=\u003CPATH>\n```\n*> 注：若路径为远程 sshfs 挂载，请使用 `-o allow_root` 选项挂载。*\n\n### 3. 构建 Docker 镜像\n在项目根目录下构建镜像：\n```bash\ndocker build -t mattersim:9.2-devel-ubuntu18.04 .\n```\n\n### 4. 
启动容器并编译\n运行容器，挂载代码库和数据集目录：\n```bash\nnvidia-docker run -it --mount type=bind,source=$MATTERPORT_DATA_DIR,target=\u002Froot\u002Fmount\u002FMatterport3DSimulator\u002Fdata\u002Fv1\u002Fscans --volume `pwd`:\u002Froot\u002Fmount\u002FMatterport3DSimulator mattersim:9.2-devel-ubuntu18.04\n```\n\n进入容器后，创建构建目录并编译（推荐使用 EGL 进行离屏 GPU 渲染以提升速度）：\n```bash\ncd \u002Froot\u002Fmount\u002FMatterport3DSimulator\nmkdir build && cd build\ncmake -DEGL_RENDERING=ON ..\nmake\ncd ..\u002F\n```\n\n### 5. 数据预处理\n为加速加载并减少内存占用，需对天空盒图像进行下采样处理：\n```bash\n.\u002Fscripts\u002Fdownsize_skybox.py\n```\n*如需使用深度图输出，还需运行：*\n```bash\n.\u002Fscripts\u002Fdepth_to_skybox.py\n```\n\n## 基本使用\n\n编译完成后，可直接在容器内运行示例程序。默认示例仅开启 RGB 渲染，如需深度图请在代码中启用 `sim.setDepthEnabled(True)` 并确保已完成深度数据预处理。\n\n### Python 示例\n运行简单的 Python 驱动演示：\n```bash\npython3 src\u002Fdriver\u002Fdriver.py\n```\n\n### C++ 示例\n运行 C++ 主程序演示：\n```bash\nbuild\u002Fmattersim_main\n```\n\n### 验证安装\n运行单元测试（不含耗时测速测试）以确认安装成功：\n```bash\n.\u002Fbuild\u002Ftests ~Timing\n```\n若测试通过，`sim_imgs` 目录中将生成渲染出的测试图像。","某机器人研发团队正在训练一个能听懂“去厨房拿水杯”这类自然语言指令的室内导航 AI 代理。\n\n### 没有 Matterport3DSimulator 时\n- 只能使用纯合成或简单的虚拟环境进行训练，缺乏真实世界的光影变化和物体纹理细节，导致模型在真实场景中泛化能力极差。\n- 难以获取大规模、带有精确深度信息（Depth）的真实全景图像数据，手动采集和标注成本高昂且效率低下。\n- 单智能体串行训练模式耗时漫长，无法利用批量处理加速深度强化学习算法的收敛过程。\n- 缺乏标准的“房间到房间”（R2R）导航任务基准，难以客观评估算法在未见过的建筑环境中的实际表现。\n\n### 使用 Matterport3DSimulator 后\n- 直接基于 90 个真实扫描的室内环境（如家庭、办公室）进行训练，AI 代理能学习到真实的视觉复杂性，显著提升落地适应性。\n- 原生支持输出真实的 RGB 及深度图，无需额外标注即可让代理理解三维空间结构，大幅降低数据准备门槛。\n- 利用其高效的批量智能体支持和 GPU 离屏渲染能力（Titan X 上可达 1000 fps），将模型训练速度提升了数倍。\n- 内置标准的 R2R 导航任务数据集与评估指标，团队可立即在统一基准下验证并优化指令跟随算法的性能。\n\nMatterport3DSimulator 通过提供高保真的真实世界环境与高效的训练架构，彻底解决了视觉语言导航研究从仿真到现实落地的“鸿沟”难题。","https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Fpeteanderson80_Matterport3DSimulator_8047a4cd.jpg","peteanderson80","Peter Anderson","https:\u002F\u002Foss.gittoolsai.com\u002Favatars\u002Fpeteanderson80_e0bdfd22.jpg",null,"Balyasny Asset Management","Austin, Texas 
USA","panderson_me","http:\u002F\u002Fpanderson.me","https:\u002F\u002Fgithub.com\u002Fpeteanderson80",[86,90,94,98,102,106,110],{"name":87,"color":88,"percentage":89},"C++","#f34b7d",77.3,{"name":91,"color":92,"percentage":93},"Python","#3572A5",10.9,{"name":95,"color":96,"percentage":97},"HTML","#e34c26",5.7,{"name":99,"color":100,"percentage":101},"JavaScript","#f1e05a",4.5,{"name":103,"color":104,"percentage":105},"CMake","#DA3434",1.2,{"name":107,"color":108,"percentage":109},"Dockerfile","#384d54",0.2,{"name":111,"color":112,"percentage":113},"Shell","#89e051",0.1,686,138,"2026-04-13T09:42:24","NOASSERTION","Linux","必需 NVIDIA GPU (驱动版本 >= 396.37)，推荐 Titan X 或更高以达最佳性能；支持基于 EGL 的离屏 GPU 渲染或 OSMesa CPU 渲染","运行全量环境测试需约 50GB，常规使用未明确说明（建议大内存）",{"notes":122,"python":123,"dependencies":124},"强烈建议使用提供的 Docker 镜像进行安装以避免依赖冲突。使用前需申请并下载 Matterport3D 数据集（至少包含 skybox 图像），并设置环境变量指向数据集绝对路径。若需深度图输出，需额外下载并预处理深度数据。支持 C++ 和 Python API。","Python 3 (README 示例使用 python3)",[125,126,127,128,129,130,131],"CMake >= 3.10","OpenCV >= 2.4","OpenGL","GLM","Numpy","Docker","nvidia-docker2",[27,16,62],[134,135,136,137,138,139,140],"rl","simulator","matterport3d-dataset","matterport3d-simulator","reinforcement-learning","vision-and-language","natural-language-processing","2026-03-27T02:49:30.150509","2026-04-19T03:05:10.341784",[144,149,154,159,164,169,173],{"id":145,"question_zh":146,"answer_zh":147,"source_url":148},41189,"运行 sim.initialize() 时出现 EGL error 0x3001 错误怎么办？","这通常是因为缺少显示服务器或显卡驱动问题。可以尝试以下解决方案：\n1. 安装虚拟帧缓冲区并设置 DISPLAY 环境变量：\n   sudo apt install xvfb\n   Xvfb :99 -screen 0 1024x768x24 &\n   export DISPLAY=:99\n2. 
如果上述方法无效，尝试卸载并重新安装 Nvidia 显卡驱动。","https:\u002F\u002Fgithub.com\u002Fpeteanderson80\u002FMatterport3DSimulator\u002Fissues\u002F39",{"id":150,"question_zh":151,"answer_zh":152,"source_url":153},41190,"报错 'Could not open skybox RGB files' 找不到天空盒图片如何解决？","这是因为数据集中缺少生成的深度天空盒文件。需要运行项目中的 `depth_to_skybox.py` 脚本来生成这些文件。请确保已下载完整数据集，并在正确的目录下执行该脚本以生成缺失的 skybox 图片。","https:\u002F\u002Fgithub.com\u002Fpeteanderson80\u002FMatterport3DSimulator\u002Fissues\u002F138",{"id":155,"question_zh":156,"answer_zh":157,"source_url":158},41191,"渲染出的图像全红、全蓝或颜色异常是什么原因？","这通常是由于运行目录不正确导致的。如果在 Jupyter Notebook 或 IPython 中运行，请确保先将工作目录切换到 Matterport3DSimulator 的根目录。例如，在 Notebook 中执行：%cd Matterport3DSimulator\u002F，然后再运行演示脚本。","https:\u002F\u002Fgithub.com\u002Fpeteanderson80\u002FMatterport3DSimulator\u002Fissues\u002F21",{"id":160,"question_zh":161,"answer_zh":162,"source_url":163},41192,"Docker 容器内报错 FileNotFoundError 找不到扫描数据文件怎么办？","这通常不是 Docker 本身的问题，而是数据路径配置或文件解压问题。请检查以下几点：\n1. 确认 `MATTERPORT_DATA_DIR` 环境变量已正确设置为包含扫描文件夹的目录（如 ..\u002FMatterport3D\u002Fdata\u002Fv1\u002Fscans）。\n2. 确保下载的 .zip 文件已完全解压。有时解压会创建多余的嵌套子文件夹。\n3. 在扫描数据目录下运行以下命令确保所有 zip 包被正确解压：\n   for k in $(find .\u002F -type f -name '*.zip') ; do unzip $k ; done","https:\u002F\u002Fgithub.com\u002Fpeteanderson80\u002FMatterport3DSimulator\u002Fissues\u002F123",{"id":165,"question_zh":166,"answer_zh":167,"source_url":168},41193,"运行测试用例 (tests) 时在 RGB Image 环节失败并报 EGL 错误怎么办？","如果在测试中遇到 EGL initialization 失败，通常是因为资源文件未准备好。解决方法是：\n1. 确保所有数据文件已解压。\n2. 
先执行 `.\u002Fscripts\u002Fdownsize_skybox.py` 脚本处理天空盒图像。\n完成这两步后，再次运行测试通常就能通过。","https:\u002F\u002Fgithub.com\u002Fpeteanderson80\u002FMatterport3DSimulator\u002Fissues\u002F83",{"id":170,"question_zh":171,"answer_zh":172,"source_url":148},41194,"远程服务器上运行 Flask 应用或显示图像时如何配置 Matplotlib？","在远程服务器环境下，为了避免图形界面依赖问题，建议将 Matplotlib 的后端设置为 'WebAgg'。在导入 pyplot 之前添加以下代码：\nimport matplotlib\nmatplotlib.use('WebAgg')\nimport matplotlib.pyplot as plt\n这样可以成功在远程环境中显示图像。",{"id":174,"question_zh":175,"answer_zh":176,"source_url":177},41195,"渲染图像全黑（所有像素值为 0）可能是什么原因？","如果动作执行正常但渲染图像全黑，可能是环境状态或相机参数在之前的操作中被意外修改，或者显卡上下文丢失。建议尝试重启 Python 会话或重新初始化模拟器实例。如果问题持续，检查是否最近更新了驱动程序或修改了渲染相关的配置文件。","https:\u002F\u002Fgithub.com\u002Fpeteanderson80\u002FMatterport3DSimulator\u002Fissues\u002F131",[]]