[{"data":1,"prerenderedAt":-1},["ShallowReactive",2],{"similar-HaoranZhuExplorer--World-Models-Autonomous-Driving-Survey":3,"tool-HaoranZhuExplorer--World-Models-Autonomous-Driving-Survey":62},[4,18,26,36,46,54],{"id":5,"name":6,"github_repo":7,"description_zh":8,"stars":9,"difficulty_score":10,"last_commit_at":11,"category_tags":12,"status":17},4358,"openclaw","openclaw\u002Fopenclaw","OpenClaw 是一款专为个人打造的本地化 AI 助手，旨在让你在自己的设备上拥有完全可控的智能伙伴。它打破了传统 AI 助手局限于特定网页或应用的束缚，能够直接接入你日常使用的各类通讯渠道，包括微信、WhatsApp、Telegram、Discord、iMessage 等数十种平台。无论你在哪个聊天软件中发送消息，OpenClaw 都能即时响应，甚至支持在 macOS、iOS 和 Android 设备上进行语音交互，并提供实时的画布渲染功能供你操控。\n\n这款工具主要解决了用户对数据隐私、响应速度以及“始终在线”体验的需求。通过将 AI 部署在本地，用户无需依赖云端服务即可享受快速、私密的智能辅助，真正实现了“你的数据，你做主”。其独特的技术亮点在于强大的网关架构，将控制平面与核心助手分离，确保跨平台通信的流畅性与扩展性。\n\nOpenClaw 非常适合希望构建个性化工作流的技术爱好者、开发者，以及注重隐私保护且不愿被单一生态绑定的普通用户。只要具备基础的终端操作能力（支持 macOS、Linux 及 Windows WSL2），即可通过简单的命令行引导完成部署。如果你渴望拥有一个懂你",349277,3,"2026-04-06T06:32:30",[13,14,15,16],"Agent","开发框架","图像","数据工具","ready",{"id":19,"name":20,"github_repo":21,"description_zh":22,"stars":23,"difficulty_score":10,"last_commit_at":24,"category_tags":25,"status":17},3808,"stable-diffusion-webui","AUTOMATIC1111\u002Fstable-diffusion-webui","stable-diffusion-webui 是一个基于 Gradio 构建的网页版操作界面，旨在让用户能够轻松地在本地运行和使用强大的 Stable Diffusion 图像生成模型。它解决了原始模型依赖命令行、操作门槛高且功能分散的痛点，将复杂的 AI 绘图流程整合进一个直观易用的图形化平台。\n\n无论是希望快速上手的普通创作者、需要精细控制画面细节的设计师，还是想要深入探索模型潜力的开发者与研究人员，都能从中获益。其核心亮点在于极高的功能丰富度：不仅支持文生图、图生图、局部重绘（Inpainting）和外绘（Outpainting）等基础模式，还独创了注意力机制调整、提示词矩阵、负向提示词以及“高清修复”等高级功能。此外，它内置了 GFPGAN 和 CodeFormer 等人脸修复工具，支持多种神经网络放大算法，并允许用户通过插件系统无限扩展能力。即使是显存有限的设备，stable-diffusion-webui 也提供了相应的优化选项，让高质量的 AI 艺术创作变得触手可及。",162132,"2026-04-05T11:01:52",[14,15,13],{"id":27,"name":28,"github_repo":29,"description_zh":30,"stars":31,"difficulty_score":32,"last_commit_at":33,"category_tags":34,"status":17},1381,"everything-claude-code","affaan-m\u002Feverything-claude-code","everything-claude-code 是一套专为 AI 编程助手（如 Claude Code、Codex、Cursor 等）打造的高性能优化系统。它不仅仅是一组配置文件，而是一个经过长期实战打磨的完整框架，旨在解决 AI 代理在实际开发中面临的效率低下、记忆丢失、安全隐患及缺乏持续学习能力等核心痛点。\n\n通过引入技能模块化、直觉增强、记忆持久化机制以及内置的安全扫描功能，everything-claude-code 能显著提升 AI 在复杂任务中的表现，帮助开发者构建更稳定、更智能的生产级 AI 代理。其独特的“研究优先”开发理念和针对 Token 消耗的优化策略，使得模型响应更快、成本更低，同时有效防御潜在的攻击向量。\n\n这套工具特别适合软件开发者、AI 研究人员以及希望深度定制 AI 工作流的技术团队使用。无论您是在构建大型代码库，还是需要 AI 协助进行安全审计与自动化测试，everything-claude-code 都能提供强大的底层支持。作为一个曾荣获 Anthropic 黑客大奖的开源项目，它融合了多语言支持与丰富的实战钩子（hooks），让 AI 真正成长为懂上",158594,2,"2026-04-16T23:34:05",[14,13,35],"语言模型",{"id":37,"name":38,"github_repo":39,"description_zh":40,"stars":41,"difficulty_score":42,"last_commit_at":43,"category_tags":44,"status":17},8272,"opencode","anomalyco\u002Fopencode","OpenCode 是一款开源的 AI 编程助手（Coding Agent），旨在像一位智能搭档一样融入您的开发流程。它不仅仅是一个代码补全插件，而是一个能够理解项目上下文、自主规划任务并执行复杂编码操作的智能体。无论是生成全新功能、重构现有代码，还是排查难以定位的 Bug，OpenCode 都能通过自然语言交互高效完成，显著减少开发者在重复性劳动和上下文切换上的时间消耗。\n\n这款工具专为软件开发者、工程师及技术研究人员设计，特别适合希望利用大模型能力来提升编码效率、加速原型开发或处理遗留代码维护的专业人群。其核心亮点在于完全开源的架构，这意味着用户可以审查代码逻辑、自定义行为策略，甚至私有化部署以保障数据安全，彻底打破了传统闭源 AI 助手的“黑盒”限制。\n\n在技术体验上，OpenCode 提供了灵活的终端界面（Terminal UI）和正在测试中的桌面应用程序，支持 macOS、Windows 及 Linux 全平台。它兼容多种包管理工具，安装便捷，并能无缝集成到现有的开发环境中。无论您是追求极致控制权的资深极客，还是渴望提升产出的独立开发者，OpenCode 都提供了一个透明、可信",144296,1,"2026-04-16T14:50:03",[13,45],"插件",{"id":47,"name":48,"github_repo":49,"description_zh":50,"stars":51,"difficulty_score":32,"last_commit_at":52,"category_tags":53,"status":17},2271,"ComfyUI","Comfy-Org\u002FComfyUI","ComfyUI 是一款功能强大且高度模块化的视觉 AI 引擎，专为设计和执行复杂的 Stable Diffusion 图像生成流程而打造。它摒弃了传统的代码编写模式，采用直观的节点式流程图界面，让用户通过连接不同的功能模块即可构建个性化的生成管线。\n\n这一设计巧妙解决了高级 AI 
绘图工作流配置复杂、灵活性不足的痛点。用户无需具备编程背景，也能自由组合模型、调整参数并实时预览效果，轻松实现从基础文生图到多步骤高清修复等各类复杂任务。ComfyUI 拥有极佳的兼容性，不仅支持 Windows、macOS 和 Linux 全平台，还广泛适配 NVIDIA、AMD、Intel 及苹果 Silicon 等多种硬件架构，并率先支持 SDXL、Flux、SD3 等前沿模型。\n\n无论是希望深入探索算法潜力的研究人员和开发者，还是追求极致创作自由度的设计师与资深 AI 绘画爱好者，ComfyUI 都能提供强大的支持。其独特的模块化架构允许社区不断扩展新功能，使其成为当前最灵活、生态最丰富的开源扩散模型工具之一，帮助用户将创意高效转化为现实。",108322,"2026-04-10T11:39:34",[14,15,13],{"id":55,"name":56,"github_repo":57,"description_zh":58,"stars":59,"difficulty_score":32,"last_commit_at":60,"category_tags":61,"status":17},6121,"gemini-cli","google-gemini\u002Fgemini-cli","gemini-cli 是一款由谷歌推出的开源 AI 命令行工具，它将强大的 Gemini 大模型能力直接集成到用户的终端环境中。对于习惯在命令行工作的开发者而言，它提供了一条从输入提示词到获取模型响应的最短路径，无需切换窗口即可享受智能辅助。\n\n这款工具主要解决了开发过程中频繁上下文切换的痛点，让用户能在熟悉的终端界面内直接完成代码理解、生成、调试以及自动化运维任务。无论是查询大型代码库、根据草图生成应用，还是执行复杂的 Git 操作，gemini-cli 都能通过自然语言指令高效处理。\n\n它特别适合广大软件工程师、DevOps 人员及技术研究人员使用。其核心亮点包括支持高达 100 万 token 的超长上下文窗口，具备出色的逻辑推理能力；内置 Google 搜索、文件操作及 Shell 命令执行等实用工具；更独特的是，它支持 MCP（模型上下文协议），允许用户灵活扩展自定义集成，连接如图像生成等外部能力。此外，个人谷歌账号即可享受免费的额度支持，且项目基于 Apache 2.0 协议完全开源，是提升终端工作效率的理想助手。",100752,"2026-04-10T01:20:03",[45,13,15,14],{"id":63,"github_repo":64,"name":65,"description_en":66,"description_zh":67,"ai_summary_zh":67,"readme_en":68,"readme_zh":69,"quickstart_zh":70,"use_case_zh":71,"hero_image_url":72,"owner_login":73,"owner_name":74,"owner_avatar_url":75,"owner_bio":76,"owner_company":77,"owner_location":78,"owner_email":77,"owner_twitter":77,"owner_website":79,"owner_url":80,"languages":77,"stars":81,"forks":82,"last_commit_at":83,"license":77,"difficulty_score":84,"env_os":85,"env_gpu":86,"env_ram":86,"env_deps":87,"category_tags":90,"github_topics":91,"view_count":32,"oss_zip_url":77,"oss_zip_packed_at":77,"status":17,"created_at":96,"updated_at":97,"faqs":98,"releases":99},8215,"HaoranZhuExplorer\u002FWorld-Models-Autonomous-Driving-Survey","World-Models-Autonomous-Driving-Survey","A curated list of world models for autonomous driving.","World-Models-Autonomous-Driving-Survey 是一份由纽约大学学习系统实验室维护的精选清单，专门收录面向自动驾驶领域的“世界模型”前沿研究与开源项目。它致力于解决自动驾驶系统中环境理解、长时程预测及多模态场景生成等核心难题，通过整理最新的学术论文与代码资源，帮助从业者快速掌握该领域的技术脉络。\n\n这份清单特别适合自动驾驶算法工程师、人工智能研究人员以及相关领域的学生使用。无论是希望复现最新成果的开发人员，还是寻找灵感的学术探索者，都能从中高效获取经过筛选的高质量资料。其独特亮点在于不仅涵盖了基于视觉、激光雷达（LiDAR）及多模态融合的多种世界模型架构，还及时更新了包括 JEPA（联合嵌入预测架构）在内的自监督学习新范式。此外，清单动态追踪了 NeurIPS、ICCV、CVPR 等顶级会议的最新录用论文，如 DINO-Foresight、DriveGPT 等项目，并附带论文链接与代码仓库地址，极大地降低了技术调研的门槛，是进入自动驾驶世界模型研究领域的理想入口。","# World-Models-Autonomous-Driving-Latest-Survey\nA curated list of world models for autonomous driving. Kept updated.\n\n## Announcement\nBesides the wonderful papers we list below, we are very happy to announce that our group, NYU Learning Systems Laboratory, recently released a preprint titled: [AD-L-JEPA: Self-Supervised Spatial World Models with Joint Embedding Predictive Architecture for Autonomous Driving with LiDAR Data](https:\u002F\u002Farxiv.org\u002Fabs\u002F2501.04969), the first joint-embedding predictive architecture (JEPA) based spatial world models for self-supervised representation learning of autonomous driving. Source code is available at [AD-L-JEPA-Release](https:\u002F\u002Fgithub.com\u002FHaoranZhuExplorer\u002FAD-L-JEPA-Release). 
If this paper inspires you, you may consider citing it via:\n```bibtex\n@article{zhu2025ad,\n  title={AD-L-JEPA: Self-Supervised Spatial World Models with Joint Embedding Predictive Architecture for Autonomous Driving with LiDAR Data},\n  author={Zhu, Haoran and Dong, Zhenyuan and Topollai, Kristi and Choromanska, Anna},\n  journal={arXiv preprint arXiv:2501.04969},\n  year={2025}\n}\n```\n\n## Leading Researchers\n[Yann Lecun](https:\u002F\u002Fscholar.google.com\u002Fcitations?hl=en&user=WLN3QrAAAAAJ&view_op=list_works&sortby=pubdate), [Danijar Hafner](https:\u002F\u002Fscholar.google.com\u002Fcitations?hl=en&user=VINmGpYAAAAJ&view_op=list_works&sortby=pubdate), [Chuang Gang](https:\u002F\u002Fscholar.google.com\u002Fcitations?hl=en&user=PTeSCbIAAAAJ&view_op=list_works&sortby=pubdate), [Yilun Du](https:\u002F\u002Fscholar.google.com\u002Fcitations?hl=en&user=GRMMc_MAAAAJ&view_op=list_works&sortby=pubdate), [Nicklas Hansen](https:\u002F\u002Fscholar.google.com\u002Fcitations?hl=en&user=8wGH7wsAAAAJ&view_op=list_works&sortby=pubdate)\n\n\n## Papers\n\n\n### 2025\n#### NeurIPS 2025\n* DINO-Foresight: Looking into the Future with DINO __`NeurIPS 2025`__; __`VFM`__; [Paper](https:\u002F\u002Farxiv.org\u002Fabs\u002F2412.11673), [Code](https:\u002F\u002Fgithub.com\u002FSta8is\u002FDINO-Foresight)\n* FutureSightDrive: Thinking Visually with Spatio-Temporal CoT for Autonomous Driving __`NeurIPS 2025`__; __`VLM`__; [Paper](https:\u002F\u002Farxiv.org\u002Fabs\u002F2505.17685), [Code](https:\u002F\u002Fgithub.com\u002FMIV-XJTU\u002FFSDrive)\n* Raw2Drive: Reinforcement Learning with Aligned World Models for End-to-End Autonomous Driving (in CARLA v2) __`NeurIPS 2025`__;  __`End-to-End AD`__; __`RL`__; [Paper](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2505.16394)\n* Towards foundational LiDAR world models with efficient latent flow matching  __`NeurIPS 2025`__; __`Generative AI`__; __`Transfer Learning`__; [Paper](https:\u002F\u002Fwww.arxiv.org\u002Fabs\u002F2506.23434), [Website](https:\u002F\u002Forbis36.github.io\u002FAdaFlowMatchingWM-Web\u002F)\n* Orbis: Overcoming Challenges of Long-Horizon Prediction in Driving World Models __`NeurIPS 2025`__; __`Generative AI`__; [Paper](https:\u002F\u002Farxiv.org\u002Fabs\u002F2507.13162), [Website](https:\u002F\u002Flmb-freiburg.github.io\u002Forbis.github.io\u002F)\n* Genesis: Multimodal Driving Scene Generation with Spatio-Temporal and Cross-Modal Consistency __`NeurIPS 2025`__; __`Generative AI`__;  __`Multi-Modal`__; [Paper](https:\u002F\u002Farxiv.org\u002Fabs\u002F2506.07497), [website](https:\u002F\u002Farxiv.org\u002Fabs\u002F2506.07497), [Code to be released](https:\u002F\u002Fgithub.com\u002Fxiaomi-research\u002Fgenesis)\n\n\n#### ICCV 2025\n* World4Drive: End-to-End Autonomous Driving via Intention-aware Physical Latent World Model  __`ICCV 2025`__; [Paper](https:\u002F\u002Farxiv.org\u002Fabs\u002F2507.00603), [Code](https:\u002F\u002Fgithub.com\u002Fucaszyp\u002FWorld4Drive)\n \n#### ICML 2025\n* DriveGPT: Scaling Autoregressive Behavior Models for Driving __`ICML 2025`__;  [Paper](https:\u002F\u002Farxiv.org\u002Fabs\u002F2412.14415) [Demo](https:\u002F\u002Fwww.youtube.com\u002Fwatch?v=-hLi44PfY8g)\n\n#### CVPR 2025\n* GEM: A Generalizable Ego-Vision Multimodal World Model for Fine-Grained Ego-Motion, Object Dynamics, and Scene Composition Control __`CVPR 2025`__; __`Generative AI`__; [Paper](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2412.11198), [Code to be released](https:\u002F\u002Fgithub.com\u002Fvita-epfl\u002FGEM)\n* FUTURIST: 
Advancing Semantic Future Prediction through Multimodal Visual Sequence Transformers. __`CVPR 2025`__ [[Paper](https:\u002F\u002Farxiv.org\u002Fabs\u002F2501.08303)] [[Code](https:\u002F\u002Fgithub.com\u002FSta8is\u002FFUTURIST)]\n* DIO: Decomposable Implicit 4D Occupancy-Flow World Model  __`CVPR 2025`__ [Paper](https:\u002F\u002Fopenaccess.thecvf.com\u002Fcontent\u002FCVPR2025\u002Fpapers\u002FDiehl_DIO_Decomposable_Implicit_4D_Occupancy-Flow_World_Model_CVPR_2025_paper.pdf)\n\n#### ICLR 2025\n* LAW: Enhancing End-to-End Autonomous Driving with Latent World Model  __`ICLR 2025`__; __`End-to-End AD`__; [Paper](https:\u002F\u002Fopenreview.net\u002Fpdf?id=fd2u60ryG0), [Code](https:\u002F\u002Fgithub.com\u002FBraveGroup\u002FLAW)\n* PreWorld: Semi-Supervised Vision-Centric 3D Occupancy World Model for Autonomous Driving __`ICLR 2025`__; __`Occupancy Forecasting `__; __`Motion Planning `__; [Paper](https:\u002F\u002Fopenreview.net\u002Fpdf?id=rCX9l4OTCT), [Code](https:\u002F\u002Fgithub.com\u002Fgetterupper\u002FPreWorld)\n* AdaWM: Adaptive World Model based Planning for Autonomous Driving __`ICLR 2025`__; __`RL`__; __`Planning`__; [Paper](https:\u002F\u002Fopenreview.net\u002Fpdf?id=NEu8wgPctU)\n* SSR: Navigation-Guided Sparse Scene Representation for End-to-End Autonomous Driving  __`ICLR 2025`__;  __`End-to-End AD`__; [Paper](https:\u002F\u002Fopenreview.net\u002Fpdf?id=Vv76fCYffN), [Code](https:\u002F\u002Fgithub.com\u002FPeidongLi\u002FSSR)\n* OccProphet: Pushing Efficiency Frontier of Camera-Only 4D Occupancy Forecasting with Observer-Forecaster-Refiner Framework __`ICLR 2025`__; __`Occupancy Forecasting `__; [Paper](https:\u002F\u002Fopenreview.net\u002Fpdf?id=vC7AlY1ytz), [Code to be released](https:\u002F\u002Fgithub.com\u002FJLChen-C\u002FOccProphet)\n\n#### AAAI 2025\n* DriveDreamer-2: LLM-Enhanced World Models for Diverse Driving Video Generation __`AAAI 2025`__; __`Generative AI`__; __`LLM`__; [Paper](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2403.06845), [Website](https:\u002F\u002Fdrivedreamer2.github.io\u002F), [Code](https:\u002F\u002Fgithub.com\u002Ff1yfisher\u002FDriveDreamer2)\n* Drive-OccWorld: Driving in the Occupancy World: Vision-Centric 4D Occupancy Forecasting and Planning via World Models for Autonomous Driving __`AAAI 2025`__; __`Occupancy Forecasting `__; __`Planning `__; [Paper](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2408.14197), [Website](https:\u002F\u002Fdrive-occworld.github.io\u002F), [Code](https:\u002F\u002Fgithub.com\u002Fyuyang-cloud\u002FDrive-OccWorld)\n\n#### RSS 2025\n* LOPR: Self-supervised Multi-future Occupancy Forecasting for Autonomous Driving [Paper](https:\u002F\u002Fwww.roboticsproceedings.org\u002Frss21\u002Fp003.pdf)  __`RSS 2025`__;\n\n#### Others\n* Back to the Features: DINO as a Foundation for Video World Models [Paper](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2507.19468)\n* IntPhys 2: Benchmarking Intuitive Physics Understanding In Complex Synthetic Environments [Paper](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2506.09849), [Code](https:\u002F\u002Fgithub.com\u002Ffacebookresearch\u002FIntPhys2)\n* Genie Envisioner: A Unified World Foundation Platform for Robotic Manipulation [Paper](https:\u002F\u002Farxiv.org\u002Fabs\u002F2508.05635v1), [Website](https:\u002F\u002Fgenie-envisioner.github.io\u002F)\n* Genie 3: A new frontier for world models [Website](https:\u002F\u002Fdeepmind.google\u002Fdiscover\u002Fblog\u002Fgenie-3-a-new-frontier-for-world-models\u002F)\n* DriVerse: Navigation World Model for Driving Simulation via 
Multimodal Trajectory Prompting and Motion Alignment __`arxiv April`__; __`Generative AI`__; [Paper](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2504.18576), [Code](https:\u002F\u002Fgithub.com\u002Fshalfun\u002FDriVerse)\n* Learning to Drive from a World Model __`arxiv April`__; [Paper](https:\u002F\u002Fwww.arxiv.org\u002Fpdf\u002F2504.19077)\n* WoTE: End-to-End Driving with Online Trajectory Evaluation via BEV World Model __`arxiv April`__; [Paper](https:\u002F\u002Farxiv.org\u002Fabs\u002F2504.01941), [Code](https:\u002F\u002Fgithub.com\u002FliyingyanUCAS\u002FWoTE)\n* AETHER: Geometric-Aware Unified World Modeling __`arxiv March`__; [Paper](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2503.18945), [Website](https:\u002F\u002Faether-world.github.io\u002F)\n* GAIA-2: A Controllable Multi-View Generative World Model for Autonomous Driving __`Generative AI`__; [Paper](https:\u002F\u002Fdrive.google.com\u002Ffile\u002Fd\u002F1L_FwiQS0KvrzERaYeG08AA1GO5HpIMfq\u002Fview)\n* Other Vehicle Trajectories Are Also Needed: A Driving World Model Unifies Ego-Other Vehicle Trajectories in Video Latent Space __`arxiv March`__; __`Generative AI`__; [Paper](https:\u002F\u002Farxiv.org\u002Fabs\u002F2503.09215)\n* $T^3$Former: Temporal Triplane Transformers as Occupancy World Models __`arxiv March`__; __`Occupancy Forecasting`__; [Paper](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2503.07338)\n* InDRiVE: Intrinsic Disagreement-based Reinforcement for Vehicle Exploration through Curiosity-Driven Generalized World Model  __`arxiv March`__;  __`RL`__; [Paper](https:\u002F\u002Farxiv.org\u002Fabs\u002F2503.05573)\n* PIWM: Dream to Drive with Predictive Individual World Model __`TIV 2025`__; __`RL`__; [Paper](https:\u002F\u002Farxiv.org\u002Fabs\u002F2501.16733), [Code](https:\u002F\u002Fgithub.com\u002Fgaoyinfeng\u002FPIWM)\n* MaskGWM: A Generalizable Driving World Model with Video Mask Reconstruction  __`arxiv`__; __`Generative AI`__; [Paper](https:\u002F\u002Farxiv.org\u002Fabs\u002F2502.11663) [Code](https:\u002F\u002Fgithub.com\u002FSenseTime-FVG\u002FOpenDWM)\n* Dream to Drive: Model-Based Vehicle Control Using Analytic World Models  __`arxiv`__;  __`Planning`__; [Paper](https:\u002F\u002Farxiv.org\u002Fabs\u002F2502.10012)\n* HERMES: A Unified Self-Driving World Model for Simultaneous 3D Scene Understanding and Generation  __`arxiv`__;  __`Generative AI`__;  __`LLM`__; [Paper](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2501.14729), [Code to be released](https:\u002F\u002Fgithub.com\u002FLMD0311\u002FHERMES)\n* AD-L-JEPA: Self-Supervised Spatial World Models with Joint Embedding Predictive Architecture for Autonomous Driving with LiDAR Data. 
__`arxiv`__; __`Pre-training`__; __`Self-supervised representation learning`__; [Paper](https:\u002F\u002Farxiv.org\u002Fabs\u002F2501.04969), [Code](https:\u002F\u002Fgithub.com\u002FHaoranZhuExplorer\u002FAD-L-JEPA-Release)\n* Cosmos World Foundation Model Platform for Physical AI  __`arxiv`__; __`Foundation Model`__; [Paper](https:\u002F\u002Farxiv.org\u002Fabs\u002F2501.03575), [Code](https:\u002F\u002Fgithub.com\u002FNVIDIA\u002FCosmos)\n\n### 2024\n\n#### NeurIPS 2024\n*  DrivingDojo Dataset: Advancing Interactive and Knowledge-Enriched Driving World Model __`NeurIPS 2024`__; __`Dataset`__; [Paper](https:\u002F\u002Fproceedings.neurips.cc\u002Fpaper_files\u002Fpaper\u002F2024\u002Ffile\u002F178f4666a84ecdd61e3b85145ed56484-Paper-Datasets_and_Benchmarks_Track.pdf), [Website](https:\u002F\u002Fdrivingdojo.github.io\u002F), [Code](https:\u002F\u002Fgithub.com\u002FRobertwyq\u002FDrivingdojo)\n*  Vista: A Generalizable Driving World Model with High Fidelity and Versatile Controllability __`NeurIPS 2024`__; __`from Shanghai AI Lab`__;  __`Generative AI`__; [Paper](https:\u002F\u002Fproceedings.neurips.cc\u002Fpaper_files\u002Fpaper\u002F2024\u002Ffile\u002Fa6a066fb44f2fe0d36cf740c873b8890-Paper-Conference.pdf), [Website](https:\u002F\u002Fopendrivelab.com\u002FVista\u002F), [Code](https:\u002F\u002Fgithub.com\u002FOpenDriveLab\u002FVista)\n\n#### ECCV 2024\n* DriveDreamer: Towards Real-world-driven World Models for Autonomous Driving __`ECCV 2024`__; __`Generative AI`__; [Paper](https:\u002F\u002Fwww.ecva.net\u002Fpapers\u002Feccv_2024\u002Fpapers_ECCV\u002Fpapers\u002F06416.pdf), [Website](https:\u002F\u002Fdrivedreamer.github.io\u002F), [Code](https:\u002F\u002Fgithub.com\u002FJeffWang987\u002FDriveDreamer)\n* Modelling Competitive Behaviors in Autonomous Driving Under Generative World Model __`ECCV 2024`__; __`RL`__; __`Trajectories Simulation`__; [Paper](https:\u002F\u002Fwww.ecva.net\u002Fpapers\u002Feccv_2024\u002Fpapers_ECCV\u002Fpapers\u002F05085.pdf), [Code to be released](https:\u002F\u002Fgithub.com\u002Fqiaoguanren\u002FMARL-CCE)\n* NeMo: Neural Volumetric World Models for Autonomous Driving __`ECCV 2024`__; __`End-to-End AD`__; __`Motion Planning `__; [Paper](https:\u002F\u002Fwww.ecva.net\u002Fpapers\u002Feccv_2024\u002Fpapers_ECCV\u002Fpapers\u002F02571.pdf)\n* OccWorld: Learning a 3D Occupancy World Model for Autonomous Driving __`ECCV 2024`__; __`Occupancy Forecasting`__; __`Motion Planning`__; [Paper](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2311.16038.pdf), [Code](https:\u002F\u002Fgithub.com\u002Fwzzheng\u002FOccWorld)\n* Think2Drive: Efficient Reinforcement Learning by Thinking with Latent World Model for Autonomous Driving (in CARLA-v2)  __`ECCV 2024`__;  __`RL`__; [Paper](https:\u002F\u002Fwww.ecva.net\u002Fpapers\u002Feccv_2024\u002Fpapers_ECCV\u002Fpapers\u002F06129.pdf), [Website](https:\u002F\u002Fthinklab-sjtu.github.io\u002FCornerCaseRepo\u002F)\n* FipTR: A Simple yet Effective Transformer Framework for Future Instance Prediction in Autonomous Driving __`ECCV 2024`__; __`Future Instance Prediction`__; [Paper](https:\u002F\u002Fwww.ecva.net\u002Fpapers\u002Feccv_2024\u002Fpapers_ECCV\u002Fpapers\u002F11758.pdf), [Code](https:\u002F\u002Fgithub.com\u002FTabGuigui\u002FFipTR)\n* DrivingDiffusion: Layout-Guided multi-view driving scene video generation with latent diffusion model __`ECCV 2024`__; __`Generative AI`__ [Paper](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2310.07771.pdf), [Code](https:\u002F\u002Fgithub.com\u002Fshalfun\u002FDrivingDiffusion)\n\n#### 
CVPR 2024\n* Drive-WM: Driving into the Future: Multiview Visual Forecasting and Planning with World Model for Autonomous Driving __`CVPR 2024`__; __`Generative AI`__; __`Planning`__; [Paper](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2311.17918.pdf), [Website](https:\u002F\u002Fdrive-wm.github.io\u002F), [Code](https:\u002F\u002Fgithub.com\u002FBraveGroup\u002FDrive-WM)\n* DriveWorld: 4D Pre-trained Scene Understanding via World Models for Autonomous Driving  __`CVPR 2024`__;  __`Pre-training`__; [Paper](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2405.04390)\n* Cam4DOcc: Benchmark for Camera-Only 4D Occupancy Forecasting in Autonomous Driving Applications  __`CVPR 2024`__;  __`Occupancy Forecasting `__; [Paper](https:\u002F\u002Farxiv.org\u002Fabs\u002F2311.17663), [Code](https:\u002F\u002Fgithub.com\u002Fhaomo-ai\u002FCam4DOcc)\n* GenAD: Generalized Predictive Model for Autonomous Driving __`CVPR 2024`__;  __`from Shanghai AI Lab`__ __`Generative AI`__; [Paper](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2403.09630.pdf), [Code](https:\u002F\u002Fgithub.com\u002FOpenDriveLab\u002FDriveAGI) \n* ViDAR: Visual Point Cloud Forecasting enables Scalable Autonomous Driving  __`CVPR 2024`__; __`Pre-training`__;  __`from Shanghai AI Lab`__; __`NuScenes dataset`__ [Paper](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2312.17655), [Code](https:\u002F\u002Fgithub.com\u002FOpenDriveLab\u002FViDAR)\n* UnO: Unsupervised Occupancy Fields for Perception and Forecasting __`CVPR 2024`__; __`Occupancy Forecasting`__; __`Pre-training`__; [Paper](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2406.08691) \n\n#### ICLR 2024\n* Copilot4D: Learning Unsupervised World Models for Autonomous Driving via Discrete Diffusion __`ICLR 2024`__; __`Future Point Cloud Prediction`__; __`from Waabi`__; [Paper](https:\u002F\u002Farxiv.org\u002Fabs\u002F2311.01017)\n\n#### ICRA 2024\n* Mitigating Covariate Shift in Imitation Learning for Autonomous Vehicles Using Latent Space Generative World Models  __`ICRA 2024`__; __`Planning`__ [Paper](https:\u002F\u002Farxiv.org\u002Fabs\u002F2409.16663) \n\n#### Others\n* InfinityDrive: Breaking Time Limits in Driving World Models __`arxiv 2024`__; __`Generative AI`__; [Paper](https:\u002F\u002Farxiv.org\u002Fabs\u002F2412.01522v1), [Website](https:\u002F\u002Fmetadrivescape.github.io\u002Fpapers_project\u002FInfinityDrive\u002Fpage.html)\n* DriveDreamer4D: World Models Are Effective Data Machines for 4D Driving Scene Representation __`arxiv 2024`__; __`Generative AI`__; __`4D Simulation`__; [Paper](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2410.13571), [Website](https:\u002F\u002Fdrivedreamer4d.github.io\u002F), [Code](https:\u002F\u002Fgithub.com\u002FGigaAI-research\u002FDriveDreamer4D)\n* ReconDreamer: Crafting World Models for Driving Scene Reconstruction via Online Restoration __`arxiv 2024`__; __`Generative AI`__; __`4D Simulation`__; [Paper](https:\u002F\u002Farxiv.org\u002Fabs\u002F2411.19548), [Website](https:\u002F\u002Frecondreamer.github.io\u002F), [Code](https:\u002F\u002Fgithub.com\u002FGigaAI-research\u002FReconDreamer)\n* 2024-DrivingWorld: Constructing World Model for Autonomous Driving via Video GPT. [Paper](https:\u002F\u002Farxiv.org\u002Fabs\u002F2412.19505)  [Project Page](https:\u002F\u002Fhuxiaotaostasy.github.io\u002FDrivingWorld\u002Findex.html) [Code](https:\u002F\u002Fgithub.com\u002FYvanYin\u002FDrivingWorld)\n* 2024-DOME: Taming Diffusion Model into High-Fidelity Controllable Occupancy World Model. 
[Paper](https:\u002F\u002Farxiv.org\u002Fabs\u002F2410.10429)  [Project Page](https:\u002F\u002Fgusongen.github.io\u002FDOME)\n* 2024-OccLLaMA: An Occupancy-Language-Action Generative World Model for Autonomous Driving [Paper](https:\u002F\u002Fwww.arxiv.org\u002Fabs\u002F2409.03272)\n* 2024-BEVWorld: A Multimodal World Model for Autonomous Driving via Unified BEV Latent Space  __`arxiv`__ [Paper](https:\u002F\u002Farxiv.org\u002Fabs\u002F2407.05679)\n* 2024-Planning with Adaptive World Models for Autonomous Driving  __`arxiv`__; __`Planning`__; [Paper](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2406.10714)\n* 2024-OccSora: 4D Occupancy Generation Models as World Simulators for Autonomous Driving [Paper](https:\u002F\u002Farxiv.org\u002Fabs\u002F2405.20337), [Code](https:\u002F\u002Fgithub.com\u002Fwzzheng\u002FOccSora)\n\n### Before 2023\n* 2023-ADriver-I: A General World Model for Autonomous Driving __`arxiv`__; __`Generative AI`__; __`NuScenes & one private dataset`__ [Paper](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2311.13549.pdf) \n* 2023-GAIA-1: A Generative World Model for Autonomous Driving __`arxiv`__; __`Generative AI`__; __`Wayve's private data`__ [Paper](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2309.17080.pdf)\n* 2023-Neural World Models for Computer Vision __`PhD Thesis`__; __`from Wayve`__  [Paper](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2306.09179)\n* 2022-Separating the World and Ego Models for Self-Driving __` ICLR 2022 workshop on Generalizable Policy Learning in the Physical World`__; __`from Yann Lecun's Group`__ [Paper](https:\u002F\u002Farxiv.org\u002Fabs\u002F2204.07184), [Code](https:\u002F\u002Fgithub.com\u002Fvladisai\u002Fpytorch-ppuu)\n* 2022-SEM2: Enhance Sample Efficiency and Robustness of End-to-end Urban Autonomous Driving via Semantic Masked World Model  __`NeurIPS 2022 Deep Reinforcement Learning Workshop`__; __`RL`__; __`CARLA dataset`__ [Paper](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2210.04017.pdf)\n* 2022-MILE: Model-Based Imitation Learning for Urban Driving __`NeurIPS 2022`__; __`RL`__; __`from Wayve`__ [Paper](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2210.07729.pdf), [Code](https:\u002F\u002Fgithub.com\u002Fwayveai\u002Fmile)\n* 2022-Iso-Dream: Isolating and Leveraging Noncontrollable Visual Dynamics in World Models __`NeurIPS 2022`__ [Paper](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2205.13817.pdf), [Code](https:\u002F\u002Fgithub.com\u002Fpanmt\u002Fiso-dream)\n* 2021-FIERY: Future Instance Prediction in Bird's-Eye View from Surround Monocular Cameras __`ICCV 2021`__; __`Future Prediction`__; __`from Wayve`__; __`NuScenes, Lyft datasets`__ [Paper](https:\u002F\u002Fopenaccess.thecvf.com\u002Fcontent\u002FICCV2021\u002Fpapers\u002FHu_FIERY_Future_Instance_Prediction_in_Birds-Eye_View_From_Surround_Monocular_ICCV_2021_paper.pdf), [Code](https:\u002F\u002Fgithub.com\u002Fwayveai\u002Ffiery)\n* 2021-Learning to drive from a world on rails __`CVPR 2021 Oral`__; __`RL`__ [Paper](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2105.00636.pdf), [Project Page](https:\u002F\u002Fdotchen.github.io\u002Fworld_on_rails\u002F), [Code](https:\u002F\u002Fgithub.com\u002Fdotchen\u002FWorldOnRails)\n* 2019-Model-Predictive Policy Learning with Uncertainty Regularization for Driving in Dense Traffic __`ICLR 2019`__; __`Future Prediction`__; __`from Yann Lecun's Group`__ [Paper](https:\u002F\u002Fgithub.com\u002FAtcold\u002Fpytorch-PPUU?tab=readme-ov-file), [Code](https:\u002F\u002Fgithub.com\u002FAtcold\u002Fpytorch-PPUU)\n  \n## Workshops\u002FChallenges\n* 2024-1X 
World Model Challenge  __`Challenges`__ [Link](https:\u002F\u002Fgithub.com\u002F1x-technologies\u002F1xgpt)\n* 2024-CVPR Workshop, Foundation Models for Autonomous Systems, Challenges, Track 4: Predictive World Model __`Challenges`__ [Link](https:\u002F\u002Fopendrivelab.com\u002Fchallenge2024\u002F)\n\n## Tutorials\u002FTalks\u002F\n* 2023 __`from Wayve`__; [Video](https:\u002F\u002Fwww.youtube.com\u002Fwatch?v=lNOs08byOhw)\n* 2022-Neural World Models for Autonomous Driving [Video](https:\u002F\u002Fwww.youtube.com\u002Fwatch?v=wMvYjiv6EpY)\n\n## Surveys that Contain World Models for AD\n* 2025-A Survey of World Models for Autonomous Driving __`arxiv`__ [Paper](https:\u002F\u002Farxiv.org\u002Fabs\u002F2501.11260)\n* 2024-World Models for Autonomous Driving: An Initial Survey __`arxiv`__ [Paper](https:\u002F\u002Farxiv.org\u002Fabs\u002F2403.02622)\n* 2024-Data-Centric Evolution in Autonomous Driving: A Comprehensive Survey of Big\nData System, Data Mining, and Closed-Loop Technologies __`arxiv`__ [Paper](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2401.12888.pdf)\n* 2024-Forging Vision Foundation Models for Autonomous Driving: Challenges, Methodologies, and Opportunities __`arxiv`__ [Paper](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2401.08045.pdf)\n\n## Other General World Model Papers\n* 2025-Dreamer 4: Training Agents Inside of Scalable World Models  __`arxiv`__ [Paper](https:\u002F\u002Fwww.arxiv.org\u002Fabs\u002F2509.24527)\n* 2025-TAWM: Time-Aware World Model for Adaptive Prediction and Control  __`ICML 2025`__  [Paper](https:\u002F\u002Farxiv.org\u002Fabs\u002F2506.08441), [Code](https:\u002F\u002Fgithub.com\u002Fanh-nn01\u002FTime-Aware-World-Model)\n* 2025-What Has a Foundation Model Found? Using Inductive Bias to Probe for World Models  __`ICML 2025`__ [Paper](https:\u002F\u002Fwww.arxiv.org\u002Fabs\u002F2507.06952)\n* 2025-Critiques of World Models [Paper](https:\u002F\u002Fwww.arxiv.org\u002Fabs\u002F2507.05169)\n* 2025-DREAMGEN: Unlocking Generalization in Robot Learning through Video World Models  __`from Nvidia`__  [Paper](https:\u002F\u002Farxiv.org\u002Fabs\u002F2505.12705), [Code](https:\u002F\u002Fgithub.com\u002FNVIDIA\u002FGR00T-Dreams)\n* 2025-V-JEPA 2: Self-Supervised Video Models Enable Understanding, Prediction and Planning __`from Meta`__  [Paper](https:\u002F\u002Fai.meta.com\u002Fresearch\u002Fpublications\u002Fv-jepa-2-self-supervised-video-models-enable-understanding-prediction-and-planning\u002F), [Code](https:\u002F\u002Fgithub.com\u002Ffacebookresearch\u002Fvjepa2)\n* 2025-UniVLA: Learning to Act Anywhere with Task-centric Latent Actions __`arxiv 2025`__  [Paper](https:\u002F\u002Farxiv.org\u002Fabs\u002F2505.06111), [Code](https:\u002F\u002Fgithub.com\u002FOpenDriveLab\u002FUniVLA)\n* 2025-Learning 3D Persistent Embodied World Models __`arxiv 2025`__ [Paper](https:\u002F\u002Fwww.arxiv.org\u002Fabs\u002F2505.05495)\n* 2025-AdaWorld: Learning Adaptable World Models with Latent Actions  __`ICML 2025`__ [Paper](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2503.18938)\n* 2025-DreamerV3: Mastering diverse control tasks through world models __`Nature`__ [Paper](https:\u002F\u002Fwww.nature.com\u002Farticles\u002Fs41586-025-08744-2), [Code](https:\u002F\u002Fgithub.com\u002Fdanijar\u002Fdreamerv3)\n* 2025-PhysTwin: Physics-Informed Reconstruction and Simulation of Deformable Objects from Videos [Paper](https:\u002F\u002Fjianghanxiao.github.io\u002Fphystwin-web\u002Fphystwin.pdf), [Code](https:\u002F\u002Fgithub.com\u002FJianghanxiao\u002FPhysTwin)\n* 
2025-Intuitive physics understanding emerges from self-supervised pretraining on natural videos [Paper](https:\u002F\u002Farxiv.org\u002Fabs\u002F2502.11831v1), [Code](https:\u002F\u002Fgithub.com\u002Ffacebookresearch\u002Fjepa-intuitive-physics)\n* 2025-Do generative video models learn physical principles from watching videos? [Paper](https:\u002F\u002Farxiv.org\u002Fabs\u002F2501.09038), [Code](https:\u002F\u002Fgithub.com\u002Fgoogle-deepmind\u002Fphysics-IQ-benchmark), [Website](https:\u002F\u002Fphysics-iq.github.io\u002F)\n* 2024-PreLAR: World Model Pre-training with Learnable Action Representation __`ECCV 2024`__; __`Pretraining`__; __`RL`__; [Paper](https:\u002F\u002Fwww.ecva.net\u002Fpapers\u002Feccv_2024\u002Fpapers_ECCV\u002Fpapers\u002F03363.pdf), [Code](https:\u002F\u002Fgithub.com\u002Fzhanglixuan0720\u002FPreLAR)\n* 2024-Understanding Physical Dynamics with Counterfactual World Modeling __`ECCV 2024`__; [Paper](https:\u002F\u002Fwww.ecva.net\u002Fpapers\u002Feccv_2024\u002Fpapers_ECCV\u002Fpapers\u002F03523.pdf), [Website](https:\u002F\u002Fneuroailab.github.io\u002Fcwm-physics\u002F), [Code](https:\u002F\u002Fgithub.com\u002Fneuroailab\u002Fcwm_dynamics)\n* 2024-Genie2: [Website](https:\u002F\u002Fdeepmind.google\u002Fdiscover\u002Fblog\u002Fgenie-2-a-large-scale-foundation-world-model\u002F)\n* 2024-WHALE: Towards Generalizable and Scalable World Models for Embodied Decision-making [Paper](https:\u002F\u002Farxiv.org\u002Fabs\u002F2411.05619)\n* 2024-How Far is Video Generation from World Model: A Physical Law Perspective [Paper](https:\u002F\u002Farxiv.org\u002Fabs\u002F2411.02385)\n* 2024-PIVOT-R: Primitive-Driven Waypoint-Aware World Model for Robotic Manipulation  __`NeurIPS 2024`__ [Paper](https:\u002F\u002Farxiv.org\u002Fabs\u002F2410.10394) \n* 2024-RoboDreamer: Learning Compositional World Models for Robot Imagination [Paper](https:\u002F\u002Farxiv.org\u002Fabs\u002F2404.12377)\n* 2024-TD-MPC2: Scalable, Robust World Models for Continuous Control __`ICLR 2024`__ [Paper](https:\u002F\u002Fopenreview.net\u002Fpdf?id=Oxh5CstDJU)\n* 2024-Hierarchical World Models as Visual Whole-Body Humanoid Controllers [Paper](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2405.18418)\n* 2024-Efficient World Models with Time-Aware and Context-Augmented Tokenization __`ICML 2024`__ \n* 2024-3D-VLA: A 3D Vision-Language-Action Generative World Model __`ICML 2024`__ [Paper](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2403.09631.pdf)\n* 2024-Newton from Archetype AI __`website`__ [Link](https:\u002F\u002Fwww.archetypeai.io\u002Fblog\u002Fintroducing-archetype-ai---understand-the-real-world-in-real-time)\n* 2024-MagicTime: Time-lapse Video Generation Models as Metamorphic Simulators __`arxiv`__ [Paper](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2404.05014.pdf), [Code](https:\u002F\u002Fgithub.com\u002FPKU-YuanGroup\u002FMagicTime)\n* 2024-IWM: Learning and Leveraging World Models in Visual Representation Learning  __`arxiv`__, __`from Yann Lecun's Group`__ [Paper](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2403.00504.pdf)\n* 2024-Video as the New Language for Real-World Decision Making __`arxiv`__, __`Deepmind`__ [Paper](https:\u002F\u002Farxiv.org\u002Fabs\u002F2402.17139)\n* 2024-Genie: Generative Interactive Environments __`Deepmind`__ [Paper](https:\u002F\u002Farxiv.org\u002Fabs\u002F2402.15391v1), [Website](https:\u002F\u002Fsites.google.com\u002Fview\u002Fgenie-2024\u002Fhome)\n* 2024-Sora __`OpenAI`__, __`Generative AI`__ [Link](https:\u002F\u002Fopenai.com\u002Fsora), [Technical 
Report](https:\u002F\u002Fopenai.com\u002Fresearch\u002Fvideo-generation-models-as-world-simulators)\n* 2024-LWM: World Model on Million-Length Video And Language With RingAttention __`arxiv`__; __`Generative AI`__ [Paper](https:\u002F\u002Farxiv.org\u002Fabs\u002F2402.08268), [Code](https:\u002F\u002Fgithub.com\u002FLargeWorldModel\u002FLWM)\n* 2024-WorldDreamer: Towards General World Models for Video Generation via Predicting Masked Tokens __`arxiv`__; __`Generative AI`__ [Paper](https:\u002F\u002Farxiv.org\u002Fabs\u002F2401.09985)\n* 2024-Video prediction models as rewards for reinforcement learning __`NeurIPS 2024`__ [Paper](https:\u002F\u002Fproceedings.neurips.cc\u002Fpaper_files\u002Fpaper\u002F2023\u002Ffile\u002Fd9042abf40782fbce28901c1c9c0e8d8-Paper-Conference.pdf), [Code](https:\u002F\u002Fgithub.com\u002FAlescontrela\u002Fviper_rl)\n* 2024-V-JEPA: Revisiting Feature Prediction for Learning Visual Representations from Video __`from Yann Lecun's Group`__ [Paper](https:\u002F\u002Fai.meta.com\u002Fresearch\u002Fpublications\u002Frevisiting-feature-prediction-for-learning-visual-representations-from-video\u002F), [Code](https:\u002F\u002Fgithub.com\u002Ffacebookresearch\u002Fjepa)\n* 2023-STORM: Efficient Stochastic Transformer based World Models for Reinforcement Learning  __`NeurIPS 2023`__ [Paper](https:\u002F\u002Fproceedings.neurips.cc\u002Fpaper_files\u002Fpaper\u002F2023\u002Ffile\u002F5647763d4245b23e6a1cb0a8947b38c9-Paper-Conference.pdf), [Code](https:\u002F\u002Fgithub.com\u002Fweipu-zhang\u002FSTORM)\n* 2023-Facing Off World Model Backbones: RNNs, Transformers, and S4 __`NeurIPS 2023`__ [Paper](https:\u002F\u002Farxiv.org\u002Fabs\u002F2307.02064)\n* 2023-I-JEPA: Self-Supervised Learning from Images with a Joint-Embedding Predictive Architecture __`CVPR 2023`__; __`from Yann Lecun's Group`__ [Paper](https:\u002F\u002Farxiv.org\u002Fabs\u002F2301.08243), [Code](https:\u002F\u002Fgithub.com\u002Ffacebookresearch\u002Fijepa)\n* 2023-Temporally Consistent Transformers for Video Generation __`ICML 2023`__ [Paper](https:\u002F\u002Farxiv.org\u002Fabs\u002F2210.02396), [Code](https:\u002F\u002Fgithub.com\u002Fwilson1yan\u002Fteco)\n* 2023-Learning to Model the World with Language __`arxiv`__ [Paper](https:\u002F\u002Farxiv.org\u002Fabs\u002F2308.01399), [Code](https:\u002F\u002Fgithub.com\u002Fjlin816\u002Fdynalang)\n* 2023-Transformers are sample-efficient world models __`ICLR 2023`__;__`RL`__ [Paper](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2209.00588.pdf), [Code](https:\u002F\u002Fgithub.com\u002Feloialonso\u002Firis)\n* 2023-Gradient-based Planning with World Models __`arxiv`__; __`from Yann Lecun's Group`__; __`Planning`__; [Paper](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2312.17227)\n* 2023-World Models via Policy-Guided Trajectory Diffusion __`arxiv`__; __`RL`__; [Paper](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2312.08533.pdf)\n* 2023-DreamerV3: Mastering diverse domains through world models __`arxiv`__;__`RL`__; [Paper](https:\u002F\u002Farxiv.org\u002Fabs\u002F2301.04104), [Code](https:\u002F\u002Fgithub.com\u002Fdanijar\u002Fdreamerv3)\n* 2022-Daydreamer: World models for physical robot learning __`CoRL 2022`__; __`Robotics`__ [Paper](https:\u002F\u002Farxiv.org\u002Fabs\u002F2206.14176), [Code](https:\u002F\u002Fgithub.com\u002Fdanijar\u002Fdaydreamer)\n* 2022-Masked World Models for Visual Control __`CoRL 2022`__; __`Robotics`__ [Paper](https:\u002F\u002Fproceedings.mlr.press\u002Fv205\u002Fseo23a.html), 
[Code](https:\u002F\u002Fgithub.com\u002Fyounggyoseo\u002FMWM) \n* 2022-A Path Towards Autonomous Machine Intelligence __`openreview`__; __`from Yann Lecun's Group`__; __`General Roadmap for World Models`__; [Paper](https:\u002F\u002Fopenreview.net\u002Fforum?id=BZ5a1r-kVsf); [Slides1](https:\u002F\u002Fleshouches2022.github.io\u002FSLIDES\u002Fcompressed-yann-1.pdf), [Slides2](https:\u002F\u002Fleshouches2022.github.io\u002FSLIDES\u002Flecun-20220720-leshouches-02.pdf), [Slides3](https:\u002F\u002Fleshouches2022.github.io\u002FSLIDES\u002Flecun-20220720-leshouches-03.pdf); [Videos](https:\u002F\u002Fwww.youtube.com\u002Fplaylist?list=PLEIq5bchE3R3Yl5taXdYA04a9kH9yvyGm)\n* 2021-LEXA:Discovering and Achieving Goals via World Models __`NeurIPS 2021`__; [Paper](https:\u002F\u002Fproceedings.neurips.cc\u002Fpaper_files\u002Fpaper\u002F2021\u002Fhash\u002Fcc4af25fa9d2d5c953496579b75f6f6c-Abstract.html), [Website & Code](https:\u002F\u002Forybkin.github.io\u002Flexa\u002F)\n* 2021-DreamerV2: Mastering Atari with Discrete World Models __`ICLR 2021`__; __`RL`__; __`from Google & Deepmind`__ [Paper](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2010.02193.pdf), [Code](https:\u002F\u002Fgithub.com\u002Fdanijar\u002Fdreamerv2)\n* 2020-Dreamer: Dream to Control: Learning Behaviors by Latent Imagination __`ICLR 2020`__ [Paper](https:\u002F\u002Farxiv.org\u002Fabs\u002F1912.01603), [Code](https:\u002F\u002Fgithub.com\u002Fgoogle-research\u002Fdreamer)\n* 2019-Learning Latent Dynamics for Planning from Pixels __`ICML 2019`__ [Paper](https:\u002F\u002Fproceedings.mlr.press\u002Fv97\u002Fhafner19a\u002Fhafner19a.pdf), [Code](https:\u002F\u002Fgithub.com\u002Fgoogle-research\u002Fplanet)\n* 2018-Model-Based Planning with Discrete and Continuous Actions __`arxiv`__; __`RL, Planning`__; __`from Yann Lecun's Group`__;  [Paper](https:\u002F\u002Farxiv.org\u002Fpdf\u002F1705.07177)\n* 2018-Recurrent world models facilitate policy evolution __`NeurIPS 2018`__; [Paper](https:\u002F\u002Fpapers.nips.cc\u002Fpaper_files\u002Fpaper\u002F2018\u002Ffile\u002F2de5d16682c3c35007e4e92982f1a2ba-Paper.pdf), [Code](https:\u002F\u002Fgithub.com\u002Fhardmaru\u002FWorldModelsExperiments)\n\n## Other Related Papers\n* 2023-Occupancy Prediction-Guided Neural Planner for Autonomous Driving __`ITSC 2023`__; __`Planning, Neural Predicted-Guided Planning`__; __`Waymo Open Motion dataset`__ [Paper](https:\u002F\u002Farxiv.org\u002Fabs\u002F2305.03303)\n\n## Other Related Repos\n[Awesome-World-Model](https:\u002F\u002Fgithub.com\u002FLMD0311\u002FAwesome-World-Model),\n[Awesome-World-Models-for-AD ](https:\u002F\u002Fgithub.com\u002Fzhanghm1995\u002Fawesome-world-models-for-AD?tab=readme-ov-file#Table-of-Content),\n[World models paper list from Shanghai AI lab](https:\u002F\u002Fgithub.com\u002FOpenDriveLab\u002FEnd-to-end-Autonomous-Driving\u002Fblob\u002Fmain\u002Fpapers.md#world-model--model-based-rl),\n[Awesome-Papers-World-Models-Autonomous-Driving](https:\u002F\u002Fgithub.com\u002Fchaytonmin\u002FAwesome-Papers-World-Models-Autonomous-Driving).\n    \n","# 世界模型-自动驾驶-最新综述\n一份精选的自动驾驶领域世界模型列表。持续更新中。\n\n## 公告\n除了我们下面列出的优秀论文外，我们非常高兴地宣布，我们所在的纽约大学学习系统实验室最近发布了一篇预印本，题目为：[AD-L-JEPA：基于联合嵌入预测架构的自监督空间世界模型，用于激光雷达数据驱动的自动驾驶](https:\u002F\u002Farxiv.org\u002Fabs\u002F2501.04969)，这是首个基于联合嵌入预测架构（JEPA）的空间世界模型，用于自动驾驶的自监督表征学习。源代码可在[AD-L-JEPA-Release](https:\u002F\u002Fgithub.com\u002FHaoranZhuExplorer\u002FAD-L-JEPA-Release)获取。如果您受到这篇论文的启发，可以考虑通过以下方式引用：\n```bibtex\n@article{zhu2025ad,\n  title={AD-L-JEPA: Self-Supervised Spatial World Models with Joint 
Embedding Predictive Architecture for Autonomous Driving with LiDAR Data},\n  author={Zhu, Haoran and Dong, Zhenyuan and Topollai, Kristi and Choromanska, Anna},\n  journal={arXiv preprint arXiv:2501.04969},\n  year={2025}\n}\n```\n\n## 领军人物\n[Yann Lecun](https:\u002F\u002Fscholar.google.com\u002Fcitations?hl=en&user=WLN3QrAAAAAJ&view_op=list_works&sortby=pubdate), [Danijar Hafner](https:\u002F\u002Fscholar.google.com\u002Fcitations?hl=en&user=VINmGpYAAAAJ&view_op=list_works&sortby=pubdate), [Chuang Gang](https:\u002F\u002Fscholar.google.com\u002Fcitations?hl=en&user=PTeSCbIAAAAJ&view_op=list_works&sortby=pubdate), [Yilun Du](https:\u002F\u002Fscholar.google.com\u002Fcitations?hl=en&user=GRMMc_MAAAAJ&view_op=list_works&sortby=pubdate), [Nicklas Hansen](https:\u002F\u002Fscholar.google.com\u002Fcitations?hl=en&user=8wGH7wsAAAAJ&view_op=list_works&sortby=pubdate)\n\n\n## 论文\n\n\n### 2025年\n#### NeurIPS 2025\n* DINO-Foresight：借助DINO展望未来 __`NeurIPS 2025`__; __`VFM`__; [论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2412.11673), [代码](https:\u002F\u002Fgithub.com\u002FSta8is\u002FDINO-Foresight)\n* FutureSightDrive：利用时空思维链进行视觉化思考，用于自动驾驶 __`NeurIPS 2025`__; __`VLM`__; [论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2505.17685), [代码](https:\u002F\u002Fgithub.com\u002FMIV-XJTU\u002FFSDrive)\n* Raw2Drive：基于对齐世界模型的强化学习，用于端到端自动驾驶（在CARLA v2中） __`NeurIPS 2025`__; __`端到端自动驾驶`__; __`强化学习`__; [论文](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2505.16394)\n* 向高效潜在流匹配的基石级激光雷达世界模型迈进 __`NeurIPS 2025`__; __`生成式AI`__; __`迁移学习`__; [论文](https:\u002F\u002Fwww.arxiv.org\u002Fabs\u002F2506.23434), [网站](https:\u002F\u002Forbis36.github.io\u002FAdaFlowMatchingWM-Web\u002F)\n* Orbis：克服驾驶世界模型中长时程预测的挑战 __`NeurIPS 2025`__; __`生成式AI`__; [论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2507.13162), [网站](https:\u002F\u002Flmb-freiburg.github.io\u002Forbis.github.io\u002F)\n* Genesis：具有时空及跨模态一致性的多模态驾驶场景生成 __`NeurIPS 2025`__; __`生成式AI`__; __`多模态`__; [论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2506.07497), [网站](https:\u002F\u002Farxiv.org\u002Fabs\u002F2506.07497), [代码即将发布](https:\u002F\u002Fgithub.com\u002Fxiaomi-research\u002Fgenesis)\n\n\n#### ICCV 2025\n* World4Drive：通过意图感知的物理潜在世界模型实现端到端自动驾驶 __`ICCV 2025`__; [论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2507.00603), [代码](https:\u002F\u002Fgithub.com\u002Fucaszyp\u002FWorld4Drive)\n \n#### ICML 2025\n* DriveGPT：扩展自动驾驶中的自回归行为模型 __`ICML 2025`__; [论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2412.14415) [演示](https:\u002F\u002Fwww.youtube.com\u002Fwatch?v=-hLi44PfY8g)\n\n#### CVPR 2025\n* GEM：一种可泛化的自我视角多模态世界模型，用于精细控制自我运动、物体动力学和场景构成 __`CVPR 2025`__; __`生成式AI`__; [论文](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2412.11198), [代码即将发布](https:\u002F\u002Fgithub.com\u002Fvita-epfl\u002FGEM)\n* FUTURIST：通过多模态视觉序列Transformer推进语义未来预测 __`CVPR 2025`__ [[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2501.08303)] [[代码](https:\u002F\u002Fgithub.com\u002FSta8is\u002FFUTURIST)]\n* DIO：可分解的隐式四维占用-流世界模型 __`CVPR 2025`__ [论文](https:\u002F\u002Fopenaccess.thecvf.com\u002Fcontent\u002FCVPR2025\u002Fpapers\u002FDiehl_DIO_Decomposable_Implicit_4D_Occupancy-Flow_World_Model_CVPR_2025_paper.pdf)\n\n#### ICLR 2025\n* LAW：利用潜在世界模型提升端到端自动驾驶性能 __`ICLR 2025`__; __`端到端自动驾驶`__; [论文](https:\u002F\u002Fopenreview.net\u002Fpdf?id=fd2u60ryG0), [代码](https:\u002F\u002Fgithub.com\u002FBraveGroup\u002FLAW)\n* PreWorld：半监督的以视觉为中心的三维占用世界模型，用于自动驾驶 __`ICLR 2025`__; __`占用预测`__; __`路径规划`__; [论文](https:\u002F\u002Fopenreview.net\u002Fpdf?id=rCX9l4OTCT), [代码](https:\u002F\u002Fgithub.com\u002Fgetterupper\u002FPreWorld)\n* 
AdaWM：基于自适应世界模型的自动驾驶规划 __`ICLR 2025`__; __`强化学习`__; __`规划`__; [论文](https:\u002F\u002Fopenreview.net\u002Fpdf?id=NEu8wgPctU)\n* SSR：面向端到端自动驾驶的导航引导型稀疏场景表示 __`ICLR 2025`__; __`端到端自动驾驶`__; [论文](https:\u002F\u002Fopenreview.net\u002Fpdf?id=Vv76fCYffN), [代码](https:\u002F\u002Fgithub.com\u002FPeidongLi\u002FSSR)\n* OccProphet：借助观察者-预测者-精炼者框架，推动仅使用摄像头的四维占用预测效率前沿 __`ICLR 2025`__; __`占用预测`__; [论文](https:\u002F\u002Fopenreview.net\u002Fpdf?id=vC7AlY1ytz), [代码即将发布](https:\u002F\u002Fgithub.com\u002FJLChen-C\u002FOccProphet)\n\n#### AAAI 2025\n* DriveDreamer-2：LLM增强的世界模型，用于多样化驾驶视频生成 __`AAAI 2025`__; __`生成式AI`__; __`LLM`__; [论文](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2403.06845), [网站](https:\u002F\u002Fdrivedreamer2.github.io\u002F), [代码](https:\u002F\u002Fgithub.com\u002Ff1yfisher\u002FDriveDreamer2)\n* Drive-OccWorld：在占用世界中驾驶：基于世界模型的以视觉为中心的四维占用预测与规划，用于自动驾驶 __`AAAI 2025`__; __`占用预测`__; __`规划`__; [论文](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2408.14197), [网站](https:\u002F\u002Fdrive-occworld.github.io\u002F), [代码](https:\u002F\u002Fgithub.com\u002Fyuyang-cloud\u002FDrive-OccWorld)\n\n#### RSS 2025\n* LOPR：用于自动驾驶的自监督多未来占用预测 [论文](https:\u002F\u002Fwww.roboticsproceedings.org\u002Frss21\u002Fp003.pdf)  __`RSS 2025`__;\n\n#### 其他\n* 回到特性：DINO作为视频世界模型的基础 [论文](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2507.19468)\n* IntPhys 2：在复杂合成环境中基准测试直观物理理解 [论文](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2506.09849)，[代码](https:\u002F\u002Fgithub.com\u002Ffacebookresearch\u002FIntPhys2)\n* Genie Envisioner：用于机器人操作的统一世界基础平台 [论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2508.05635v1)，[网站](https:\u002F\u002Fgenie-envisioner.github.io\u002F)\n* Genie 3：世界模型的新前沿 [网站](https:\u002F\u002Fdeepmind.google\u002Fdiscover\u002Fblog\u002Fgenie-3-a-new-frontier-for-world-models\u002F)\n* DriVerse：通过多模态轨迹提示和运动对齐实现驾驶模拟的导航世界模型 __`arxiv四月`__；__`生成式AI`__；[论文](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2504.18576)，[代码](https:\u002F\u002Fgithub.com\u002Fshalfun\u002FDriVerse)\n* 从世界模型中学习驾驶 __`arxiv四月`__；[论文](https:\u002F\u002Fwww.arxiv.org\u002Fpdf\u002F2504.19077)\n* WoTE：基于BEV世界模型的在线轨迹评估端到端驾驶 __`arxiv四月`__；[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2504.01941)，[代码](https:\u002F\u002Fgithub.com\u002FliyingyanUCAS\u002FWoTE)\n* AETHER：几何感知的统一世界建模 __`arxiv三月`__；[论文](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2503.18945)，[网站](https:\u002F\u002Faether-world.github.io\u002F)\n* GAIA-2：面向自动驾驶的可控多视角生成式世界模型 __`生成式AI`__；[论文](https:\u002F\u002Fdrive.google.com\u002Ffile\u002Fd\u002F1L_FwiQS0KvrzERaYeG08AA1GO5HpIMfq\u002Fview)\n* 还需要其他车辆的轨迹：一种驾驶世界模型将本车与其他车辆的轨迹统一到视频潜在空间中 __`arxiv三月`__；__`生成式AI`__；[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2503.09215)\n* $T^3$Former：作为占用世界模型的时序三平面Transformer __`arxiv三月`__；__`占用预测`__；[论文](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2503.07338)\n* InDRiVE：基于内在分歧的强化学习，通过好奇心驱动的广义世界模型进行车辆探索 __`arxiv三月`__；__`RL`__；[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2503.05573)\n* PIWM：用预测性个体世界模型实现“梦想成真”的驾驶 __`TIV 2025`__；__`RL`__；[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2501.16733)，[代码](https:\u002F\u002Fgithub.com\u002Fgaoyinfeng\u002FPIWM)\n* MaskGWM：具有视频掩码重建功能的可泛化驾驶世界模型 __`arxiv`__；__`生成式AI`__；[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2502.11663) [代码](https:\u002F\u002Fgithub.com\u002FSenseTime-FVG\u002FOpenDWM)\n* 梦想到驾驶：使用解析型世界模型进行基于模型的车辆控制 __`arxiv`__；__`规划`__；[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2502.10012)\n* HERMES：用于同时进行3D场景理解和生成的统一自动驾驶世界模型 __`arxiv`__；__`生成式AI`__；__`LLM`__；[论文](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2501.14729)，[代码待发布](https:\u002F\u002Fgithub.com\u002FLMD0311\u002FHERMES)\n* 
AD-L-JEPA：结合联合嵌入预测架构的自监督空间世界模型，适用于基于LiDAR数据的自动驾驶。__`arxiv`__；__`预训练`__；__`自监督表征学习`__；[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2501.04969)，[代码](https:\u002F\u002Fgithub.com\u002FHaoranZhuExplorer\u002FAD-L-JEPA-Release)\n* Cosmos世界基础模型平台，用于物理AI __`arxiv`__；__`基础模型`__；[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2501.03575)，[代码](https:\u002F\u002Fgithub.com\u002FNVIDIA\u002FCosmos)\n\n### 2024\n\n#### NeurIPS 2024\n* DrivingDojo 数据集：推动交互式与知识增强的驾驶世界模型 __`NeurIPS 2024`__; __`数据集`__; [论文](https:\u002F\u002Fproceedings.neurips.cc\u002Fpaper_files\u002Fpaper\u002F2024\u002Ffile\u002F178f4666a84ecdd61e3b85145ed56484-Paper-Datasets_and_Benchmarks_Track.pdf), [官网](https:\u002F\u002Fdrivingdojo.github.io\u002F), [代码](https:\u002F\u002Fgithub.com\u002FRobertwyq\u002FDrivingdojo)\n* Vista：具有高保真度和多功能可控性的可泛化驾驶世界模型 __`NeurIPS 2024`__; __`来自上海人工智能实验室`__;  __`生成式AI`__; [论文](https:\u002F\u002Fproceedings.neurips.cc\u002Fpaper_files\u002Fpaper\u002F2024\u002Ffile\u002Fa6a066fb44f2fe0d36cf740c873b8890-Paper-Conference.pdf), [官网](https:\u002F\u002Fopendrivelab.com\u002FVista\u002F), [代码](https:\u002F\u002Fgithub.com\u002FOpenDriveLab\u002FVista)\n\n#### ECCV 2024\n* DriveDreamer：迈向由真实世界驱动的自动驾驶世界模型 __`ECCV 2024`__; __`生成式AI`__; [论文](https:\u002F\u002Fwww.ecva.net\u002Fpapers\u002Feccv_2024\u002Fpapers_ECCV\u002Fpapers\u002F06416.pdf), [官网](https:\u002F\u002Fdrivedreamer.github.io\u002F), [代码](https:\u002F\u002Fgithub.com\u002FJeffWang987\u002FDriveDreamer)\n* 基于生成式世界模型的自动驾驶中竞争行为建模 __`ECCV 2024`__; __`强化学习`__; __`轨迹仿真`__; [论文](https:\u002F\u002Fwww.ecva.net\u002Fpapers\u002Feccv_2024\u002Fpapers_ECCV\u002Fpapers\u002F05085.pdf), [代码待发布](https:\u002F\u002Fgithub.com\u002Fqiaoguanren\u002FMARL-CCE)\n* NeMo：用于自动驾驶的神经体积世界模型 __`ECCV 2024`__; __`端到端自动驾驶`__; __`运动规划`__; [论文](https:\u002F\u002Fwww.ecva.net\u002Fpapers\u002Feccv_2024\u002Fpapers_ECCV\u002Fpapers\u002F02571.pdf)\n* OccWorld：学习用于自动驾驶的3D占用世界模型 __`ECCV 2024`__; __`占用预测`__; __`运动规划`__; [论文](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2311.16038.pdf), [代码](https:\u002F\u002Fgithub.com\u002Fwzzheng\u002FOccWorld)\n* Think2Drive：通过潜在世界模型思考实现高效的自动驾驶强化学习（在CARLA-v2中） __`ECCV 2024`__; __`强化学习`__; [论文](https:\u002F\u002Fwww.ecva.net\u002Fpapers\u002Feccv_2024\u002Fpapers_ECCV\u002Fpapers\u002F06129.pdf), [官网](https:\u002F\u002Fthinklab-sjtu.github.io\u002FCornerCaseRepo\u002F)\n* FipTR：一种简单而有效的Transformer框架，用于自动驾驶中的未来实例预测 __`ECCV 2024`__; __`未来实例预测`__; [论文](https:\u002F\u002Fwww.ecva.net\u002Fpapers\u002Feccv_2024\u002Fpapers_ECCV\u002Fpapers\u002F11758.pdf), [代码](https:\u002F\u002Fgithub.com\u002FTabGuigui\u002FFipTR)\n* DrivingDiffusion：基于潜扩散模型的布局引导多视角驾驶场景视频生成 __`ECCV 2024`__; __`生成式AI`__ [论文](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2310.07771.pdf), [代码](https:\u002F\u002Fgithub.com\u002Fshalfun\u002FDrivingDiffusion)\n\n#### CVPR 2024\n* Drive-WM：驶向未来：基于世界模型的多视角视觉预测与规划用于自动驾驶 __`CVPR 2024`__; __`生成式AI`__; __`规划`__; [论文](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2311.17918.pdf), [官网](https:\u002F\u002Fdrive-wm.github.io\u002F), [代码](https:\u002F\u002Fgithub.com\u002FBraveGroup\u002FDrive-WM)\n* DriveWorld：通过世界模型进行的自动驾驶4D预训练场景理解 __`CVPR 2024`__; __`预训练`__; [论文](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2405.04390)\n* Cam4DOcc：自动驾驶应用中仅使用摄像头的4D占用预测基准测试 __`CVPR 2024`__; __`占用预测`__; [论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2311.17663), [代码](https:\u002F\u002Fgithub.com\u002Fhaomo-ai\u002FCam4DOcc)\n* GenAD：自动驾驶的通用预测模型 __`CVPR 2024`__; __`来自上海人工智能实验室`__ __`生成式AI`__; [论文](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2403.09630.pdf), 
[代码](https:\u002F\u002Fgithub.com\u002FOpenDriveLab\u002FDriveAGI) \n* ViDAR：视觉点云预测助力规模化自动驾驶 __`CVPR 2024`__; __`预训练`__; __`来自上海人工智能实验室`__; __`NuScenes数据集`__ [论文](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2312.17655), [代码](https:\u002F\u002Fgithub.com\u002FOpenDriveLab\u002FViDAR)\n* UnO：用于感知和预测的无监督占用场 __`CVPR 2024`__; __`占用预测`__; __`预训练`__; [论文](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2406.08691)\n\n#### ICLR 2024\n* Copilot4D：通过离散扩散学习自动驾驶的无监督世界模型 __`ICLR 2024`__; __`未来点云预测`__; __`来自Waabi`__; [论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2311.01017)\n\n#### ICRA 2024\n* 利用潜在空间生成式世界模型缓解自动驾驶模仿学习中的协变量偏移 __`ICRA 2024`__; __`规划`__ [论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2409.16663) \n\n#### 其他\n* InfinityDrive：突破驾驶世界模型的时间限制 __`arxiv 2024`__; __`生成式AI`__; [论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2412.01522v1), [官网](https:\u002F\u002Fmetadrivescape.github.io\u002Fpapers_project\u002FInfinityDrive\u002Fpage.html)\n* DriveDreamer4D：世界模型是4D驾驶场景表示的有效数据机器 __`arxiv 2024`__; __`生成式AI`__; __`4D仿真`__; [论文](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2410.13571), [官网](https:\u002F\u002Fdrivedreamer4d.github.io\u002F), [代码](https:\u002F\u002Fgithub.com\u002FGigaAI-research\u002FDriveDreamer4D)\n* ReconDreamer：通过在线修复构建驾驶场景重建的世界模型 __`arxiv 2024`__; __`生成式AI`__; __`4D仿真`__; [论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2411.19548), [官网](https:\u002F\u002Frecondreamer.github.io\u002F), [代码](https:\u002F\u002Fgithub.com\u002FGigaAI-research\u002FReconDreamer)\n* 2024-DrivingWorld：通过Video GPT构建自动驾驶世界模型。[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2412.19505)  [项目页面](https:\u002F\u002Fhuxiaotaostasy.github.io\u002FDrivingWorld\u002Findex.html) [代码](https:\u002F\u002Fgithub.com\u002FYvanYin\u002FDrivingWorld)\n* 2024-DOME：驯服扩散模型成为高保真可控的占用世界模型。[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2410.10429)  [项目页面](https:\u002F\u002Fgusongen.github.io\u002FDOME)\n* 2024-OccLLaMA：一种用于自动驾驶的占用-语言-动作生成式世界模型 [论文](https:\u002F\u002Fwww.arxiv.org\u002Fabs\u002F2409.03272)\n* 2024-BEVWorld：通过统一BEV潜在空间构建的自动驾驶多模态世界模型 __`arxiv`__ [论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2407.05679)\n* 2024-利用适应性世界模型进行自动驾驶规划 __`arxiv`__; __`规划`__; [论文](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2406.10714)\n* 2024-OccSora：作为自动驾驶世界模拟器的4D占用生成模型 [论文](https:\u002F\u002Farxiv.org\u002Fab\u002F2405.20337), [代码](https:\u002F\u002Fgithub.com\u002Fwzzheng\u002FOccSora)\n\n### 2023年之前\n* 2023-ADriver-I：用于自动驾驶的通用世界模型 __`arxiv`__; __`生成式AI`__; __`NuScenes及一个私有数据集`__ [论文](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2311.13549.pdf) \n* 2023-GAIA-1：用于自动驾驶的生成式世界模型 __`arxiv`__; __`生成式AI`__; __`Wayve的私有数据`__ [论文](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2309.17080.pdf)\n* 2023-计算机视觉中的神经世界模型 __'博士论文'__; __`来自Wayve`__  [论文](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2306.09179)\n* 2022-为自动驾驶分离世界模型与自我模型 __`ICLR 2022物理世界中可泛化策略学习研讨会`__; __`来自Yann LeCun团队`__ [论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2204.07184), [代码](https:\u002F\u002Fgithub.com\u002Fvladisai\u002Fpytorch-ppuu)\n* 2022-SEM2：通过语义掩码世界模型提升端到端城市自动驾驶的样本效率和鲁棒性 __`NeurIPS 2022深度强化学习研讨会`__; __`RL`__; __`CARLA数据集`__ [论文](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2210.04017.pdf)\n* 2022-MILE：面向城市驾驶的基于模型的模仿学习 __`NeurIPS 2022`__; __`RL`__; __`来自Wayve`__ [论文](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2210.07729.pdf), [代码](https:\u002F\u002Fgithub.com\u002Fwayveai\u002Fmile)\n* 2022-Iso-Dream：在世界模型中隔离并利用不可控的视觉动态 __`NeurIPS 2022`__ [论文](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2205.13817.pdf), [代码](https:\u002F\u002Fgithub.com\u002Fpanmt\u002Fiso-dream)\n* 2021-FIERY：从环视单目摄像头预测鸟瞰视角下的未来实例 __`ICCV 
2019`__; __`未来预测`__; __`来自Wayve`__; __`NuScenes、Lyft数据集`__ [论文](https:\u002F\u002Fopenaccess.thecvf.com\u002Fcontent\u002FICCV2021\u002Fpapers\u002FHu_FIERY_Future_Instance_Prediction_in_Birds-Eye_View_From_Surround_Monocular_ICCV_2021_paper.pdf), [代码](https:\u002F\u002Fgithub.com\u002Fwayveai\u002Ffiery)\n* 2021-从“轨道上的世界”中学习驾驶 __`CVPR 2021口头报告`__; __`RL`__ [论文](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2105.00636.pdf), [项目页面](https:\u002F\u002Fdotchen.github.io\u002Fworld_on_rails\u002F), [代码](https:\u002F\u002Fgithub.com\u002Fdotchen\u002FWorldOnRails)\n* 2019-带有不确定性正则化的模型预测策略学习，用于密集交通中的驾驶 __`ICLR 2019`__; __`未来预测`__; __`来自Yann LeCun团队`__ [论文](https:\u002F\u002Fgithub.com\u002FAtcold\u002Fpytorch-PPUU?tab=readme-ov-file), [代码](https:\u002F\u002Fgithub.com\u002FAtcold\u002Fpytorch-PPUU)\n  \n## 研讨会\u002F挑战赛\n* 2024-1X世界模型挑战赛 __`挑战赛`__ [链接](https:\u002F\u002Fgithub.com\u002F1x-technologies\u002F1xgpt)\n* 2024-CVPR研讨会，自动驾驶系统的基础模型，挑战赛，赛道4：预测型世界模型 __`挑战赛`__ [链接](https:\u002F\u002Fopendrivelab.com\u002Fchallenge2024\u002F)\n\n## 教程\u002F演讲\n* 2023 __`来自Wayve`__; [视频](https:\u002F\u002Fwww.youtube.com\u002Fwatch?v=lNOs08byOhw)\n* 2022-自动驾驶中的神经世界模型 [视频](https:\u002F\u002Fwww.youtube.com\u002Fwatch?v=wMvYjiv6EpY)\n\n## 包含自动驾驶用世界模型的综述\n* 2025-自动驾驶用世界模型综述 __`arxiv`__ [论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2501.11260)\n* 2024-自动驾驶用世界模型：初步综述 __`arxiv`__ [论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2403.02622)\n* 2024-自动驾驶中的数据驱动演进：大数据系统、数据挖掘和闭环技术的全面综述 __`arxiv`__ [论文](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2401.12888.pdf)\n* 2024-为自动驾驶打造视觉基础模型：挑战、方法论与机遇 __`arxiv`__ [论文](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2401.08045.pdf)\n\n## 其他通用世界模型论文\n* 2025-Dreamer 4：在可扩展的世界模型中训练智能体  __`arxiv`__ [论文](https:\u002F\u002Fwww.arxiv.org\u002Fabs\u002F2509.24527)\n* 2025-TAWM：用于自适应预测与控制的时间感知世界模型  __`ICML 2025`__  [论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2506.08441), [代码](https:\u002F\u002Fgithub.com\u002Fanh-nn01\u002FTime-Aware-World-Model)\n* 2025-基础模型发现了什么？利用归纳偏置探测世界模型  __`ICML 2025`__ [论文](https:\u002F\u002Fwww.arxiv.org\u002Fabs\u002F2507.06952)\n* 2025-对世界模型的批判 [论文](https:\u002F\u002Fwww.arxiv.org\u002Fabs\u002F2507.05169)\n* 2025-DREAMGEN：通过视频世界模型解锁机器人学习中的泛化能力  __`来自Nvidia`__  [论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2505.12705), [代码](https:\u002F\u002Fgithub.com\u002FNVIDIA\u002FGR00T-Dreams)\n* 2025-V-JEPA 2：自监督视频模型实现理解、预测与规划 __`来自Meta`__  [论文](https:\u002F\u002Fai.meta.com\u002Fresearch\u002Fpublications\u002Fv-jepa-2-self-supervised-video-models-enable-understanding-prediction-and-planning\u002F), [代码](https:\u002F\u002Fgithub.com\u002Ffacebookresearch\u002Fvjepa2)\n* 2025-UniVLA：学习以任务为中心的潜在动作，在任何地方执行行动 __`arxiv 2025`__  [论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2505.06111), [代码](https:\u002F\u002Fgithub.com\u002FOpenDriveLab\u002FUniVLA)\n* 2025-学习3D持久性具身世界模型 __`arxiv 2025`__ [论文](https:\u002F\u002Fwww.arxiv.org\u002Fabs\u002F2505.05495)\n* 2025-AdaWorld：利用潜在动作学习可适应的世界模型  __`ICML 2025`__ [论文](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2503.18938)\n* 2025-DreamerV3：通过世界模型掌握多样化的控制任务 __`Nature`__ [论文](https:\u002F\u002Fwww.nature.com\u002Farticles\u002Fs41586-025-08744-2), [代码](https:\u002F\u002Fgithub.com\u002Fdanijar\u002Fdreamerv3)\n* 2025-PhysTwin：基于物理信息的可变形物体视频重建与仿真 [论文](https:\u002F\u002Fjianghanxiao.github.io\u002Fphystwin-web\u002Fphystwin.pdf), [代码](https:\u002F\u002Fgithub.com\u002FJianghanxiao\u002FPhysTwin)\n* 2025-直观的物理理解源于自然视频上的自监督预训练 [论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2502.11831v1), 
[代码](https:\u002F\u002Fgithub.com\u002Ffacebookresearch\u002Fjepa-intuitive-physics)\n* 2025-生成式视频模型是否通过观看视频学习物理规律？ [论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2501.09038), [代码](https:\u002F\u002Fgithub.com\u002Fgoogle-deepmind\u002Fphysics-IQ-benchmark), [网站](https:\u002F\u002Fphysics-iq.github.io\u002F)\n* 2024-PreLAR：使用可学习动作表示进行世界模型预训练 __`ECCV 2024`__；__`预训练`__；__`强化学习`__；[论文](https:\u002F\u002Fwww.ecva.net\u002Fpapers\u002Feccv_2024\u002Fpapers_ECCV\u002Fpapers\u002F03363.pdf), [代码](https:\u002F\u002Fgithub.com\u002Fzhanglixuan0720\u002FPreLAR)\n* 2024-利用反事实世界建模理解物理动力学 __`ECCV 2024`__；[论文](https:\u002F\u002Fwww.ecva.net\u002Fpapers\u002Feccv_2024\u002Fpapers_ECCV\u002Fpapers\u002F03523.pdf), [网站](https:\u002F\u002Fneuroailab.github.io\u002Fcwm-physics\u002F), [代码](https:\u002F\u002Fgithub.com\u002Fneuroailab\u002Fcwm_dynamics)\n* 2024-Genie2：[网站](https:\u002F\u002Fdeepmind.google\u002Fdiscover\u002Fblog\u002Fgenie-2-a-large-scale-foundation-world-model\u002F)\n* 2024-WHALE：迈向具身决策的通用且可扩展的世界模型 [论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2411.05619)\n* 2024-从物理定律角度看，视频生成距离世界模型还有多远？ [论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2411.02385)\n* 2024-PIVOT-R：面向机器人操作的基元驱动、航点感知世界模型  __`NeurIPS 2024`__ [论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2410.10394) \n* 2024-RoboDreamer：学习用于机器人想象的组合式世界模型 [论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2404.12377)\n* 2024-TD-MPC2：适用于连续控制的可扩展、鲁棒世界模型 __`ICLR 2024`__ [论文](https:\u002F\u002Fopenreview.net\u002Fpdf?id=Oxh5CstDJU)\n* 2024-分层世界模型作为视觉全身人形控制器 [论文](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2405.18418)\n* 2024-具有时间感知和上下文增强标记的高效世界模型 __`ICML 2024`__\n* 2024-3D-VLA：一种3D视觉-语言-动作生成式世界模型 __`ICML 2024`__ [论文](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2403.09631.pdf)\n* 2024-Newton，来自Archetype AI __`网站`__ [链接](https:\u002F\u002Fwww.archetypeai.io\u002Fblog\u002Fintroducing-archetype-ai---understand-the-real-world-in-real-time)\n* 2024-MagicTime：延时视频生成模型作为变质模拟器 __`arxiv`__ [论文](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2404.05014.pdf), [代码](https:\u002F\u002Fgithub.com\u002FPKU-YuanGroup\u002FMagicTime)\n* 2024-IWM：在视觉表征学习中学习并利用世界模型  __`arxiv`__, __`来自Yann LeCun团队`__ [论文](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2403.00504.pdf)\n* 2024-视频作为现实世界决策的新语言 __`arxiv`__, __`Deepmind`__ [论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2402.17139)\n* 2024-Genie：生成式交互环境 __`Deepmind`__ [论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2402.15391v1), [网站](https:\u002F\u002Fsites.google.com\u002Fview\u002Fgenie-2024\u002Fhome)\n* 2024-Sora __`OpenAI`__, __`生成式AI`__ [链接](https:\u002F\u002Fopenai.com\u002Fsora), [技术报告](https:\u002F\u002Fopenai.com\u002Fresearch\u002Fvideo-generation-models-as-world-simulators)\n* 2024-LWM：基于RingAttention的百万级视频与语言世界模型 __`arxiv`__; __`生成式AI`__ [论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2402.08268), [代码](https:\u002F\u002Fgithub.com\u002FLargeWorldModel\u002FLWM)\n* 2024-WorldDreamer：通过预测掩码令牌，迈向用于视频生成的通用世界模型 __`arxiv`__; __`生成式AI`__ [论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2401.09985)\n* 2024-视频预测模型作为强化学习的奖励 __`NeurIPS 2024`__ [论文](https:\u002F\u002Fproceedings.neurips.cc\u002Fpaper_files\u002Fpaper\u002F2023\u002Ffile\u002Fd9042abf40782fbce28901c1c9c0e8d8-Paper-Conference.pdf), [代码](https:\u002F\u002Fgithub.com\u002FAlescontrela\u002Fviper_rl)\n* 2024-V-JEPA：重新审视特征预测，用于从视频中学习视觉表征 __`来自Yann LeCun团队`__ [论文](https:\u002F\u002Fai.meta.com\u002Fresearch\u002Fpublications\u002Frevisiting-feature-prediction-for-learning-visual-representations-from-video\u002F), [代码](https:\u002F\u002Fgithub.com\u002Ffacebookresearch\u002Fjepa)\n* 
2023-STORM：用于强化学习的高效随机Transformer世界模型  __`NeurIPS 2023`__ [论文](https:\u002F\u002Fproceedings.neurips.cc\u002Fpaper_files\u002Fpaper\u002F2023\u002Ffile\u002F5647763d4245b23e6a1cb0a8947b38c9-Paper-Conference.pdf), [代码](https:\u002F\u002Fgithub.com\u002Fweipu-zhang\u002FSTORM)\n* 2023-世界模型骨干网络对决：RNN、Transformer和S4 __`NeurIPS 2023`__ [论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2307.02064)\n* 2023-I-JEPA：基于联合嵌入预测架构的图像自监督学习 __`CVPR 2023`__; __`来自Yann LeCun团队`__ [论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2301.08243), [代码](https:\u002F\u002Fgithub.com\u002Ffacebookresearch\u002Fijepa)\n* 2023-用于视频生成的时序一致Transformer __`ICML 2023`__ [论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2210.02396), [代码](https:\u002F\u002Fgithub.com\u002Fwilson1yan\u002Fteco)\n* 2023-用语言学习建模世界 __`arxiv`__ [论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2308.01399), [代码](https:\u002F\u002Fgithub.com\u002Fjlin816\u002Fdynalang)\n* 2023-Transformer是样本高效的世界模型 __`ICLR 2023`__; __`强化学习`__ [论文](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2209.00588.pdf), [代码](https:\u002F\u002Fgithub.com\u002Feloialonso\u002Firis)\n* 2023-基于梯度的世界模型规划 __`arxiv`__; __`来自Yann LeCun团队`__; __`规划`__; [论文](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2312.17227)\n* 2023-通过策略引导的轨迹扩散构建世界模型 __`arxiv`__; __`强化学习`__; [论文](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2312.08533.pdf)\n* 2023-DreamerV3：通过世界模型掌握多个领域 __`arxiv`__; __`强化学习`__; [论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2301.04104), [代码](https:\u002F\u002Fgithub.com\u002Fdanijar\u002Fdreamerv3)\n* 2022-Daydreamer：用于物理机器人学习的世界模型 __`CoRL 2022`__; __`机器人技术`__ [论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2206.14176), [代码](https:\u002F\u002Fgithub.com\u002Fdanijar\u002Fdaydreamer)\n* 2022-用于视觉控制的掩码世界模型 __`CoRL 2022`__; __`机器人技术`__ [论文](https:\u002F\u002Fproceedings.mlr.press\u002Fv205\u002Fseo23a.html), [代码](https:\u002F\u002Fgithub.com\u002Fyounggyoseo\u002FMWM) \n* 2022-通往自主机器智能之路 __`openreview`__; __`来自Yann LeCun团队`__; __`世界模型总体路线图`__; [论文](https:\u002F\u002Fopenreview.net\u002Fforum?id=BZ5a1r-kVsf); [幻灯片1](https:\u002F\u002Fleshouches2022.github.io\u002FSLIDES\u002Fcompressed-yann-1.pdf), [幻灯片2](https:\u002F\u002Fleshouches2022.github.io\u002FSLIDES\u002Flecun-20220720-leshouches-02.pdf), [幻灯片3](https:\u002F\u002Fleshouches2022.github.io\u002FSLIDES\u002Flecun-20220720-leshouches-03.pdf); [视频](https:\u002F\u002Fwww.youtube.com\u002Fplaylist?list=PLEIq5bchE3R3Yl5taXdYA04a9kH9yvyGm)\n* 2021-LEXA：通过世界模型发现并实现目标 __`NeurIPS 2021`__; [论文](https:\u002F\u002Fproceedings.neurips.cc\u002Fpaper_files\u002Fpaper\u002F2021\u002Fhash\u002Fcc4af25fa9d2d5c953496579b75f6f6c-Abstract.html), [网站与代码](https:\u002F\u002Forybkin.github.io\u002Flexa\u002F)\n* 2021-DreamerV2：用离散世界模型征服Atari __`ICLR 2021`__; __`强化学习`__; __`来自Google与Deepmind`__ [论文](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2010.02193.pdf), [代码](https:\u002F\u002Fgithub.com\u002Fdanijar\u002Fdreamerv2)\n* 2020-Dreamer：从梦想到控制：通过潜在想象学习行为 __`ICLR 2020`__ [论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F1912.01603), [代码](https:\u002F\u002Fgithub.com\u002Fgoogle-research\u002Fdreamer)\n* 2019-从像素中学习潜在动力学以进行规划 __`ICML 2019`__ [论文](https:\u002F\u002Fproceedings.mlr.press\u002Fv97\u002Fhafner19a\u002Fhafner19a.pdf), [代码](https:\u002F\u002Fgithub.com\u002Fgoogle-research\u002Fplanet)\n* 2018-基于模型的规划，支持离散与连续动作 __`arxiv`__; __`强化学习、规划`__; __`来自Yann LeCun团队`__;  [论文](https:\u002F\u002Farxiv.org\u002Fpdf\u002F1705.07177)\n* 2018-循环世界模型促进策略演化 __`NeurIPS 2018`__; 
[论文](https:\u002F\u002Fpapers.nips.cc\u002Fpaper_files\u002Fpaper\u002F2018\u002Ffile\u002F2de5d16682c3c35007e4e92982f1a2ba-Paper.pdf), [代码](https:\u002F\u002Fgithub.com\u002Fhardmaru\u002FWorldModelsExperiments)\n\n## 其他相关论文\n* 2023-基于 occupancy 预测的自动驾驶神经规划器 __`ITSC 2023`__; __`规划，神经预测引导的规划`__; __`Waymo 开放运动数据集`__ [论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2305.03303)\n\n## 其他相关仓库\n[Awesome-World-Model](https:\u002F\u002Fgithub.com\u002FLMD0311\u002FAwesome-World-Model),\n[Awesome-World-Models-for-AD ](https:\u002F\u002Fgithub.com\u002Fzhanghm1995\u002Fawesome-world-models-for-AD?tab=readme-ov-file#Table-of-Content),\n[来自上海人工智能实验室的世界模型论文列表](https:\u002F\u002Fgithub.com\u002FOpenDriveLab\u002FEnd-to-end-Autonomous-Driving\u002Fblob\u002Fmain\u002Fpapers.md#world-model--model-based-rl),\n[Awesome-Papers-World-Models-Autonomous-Driving](https:\u002F\u002Fgithub.com\u002Fchaytonmin\u002FAwesome-Papers-World-Models-Autonomous-Driving).","# World-Models-Autonomous-Driving-Survey 快速上手指南\n\n**工具简介**：\n`World-Models-Autonomous-Driving-Survey` 并非一个单一的可执行软件包，而是一个由纽约大学学习系统实验室（NYU Learning Systems Laboratory）维护的**精选论文与代码资源列表**。它汇集了自动驾驶领域最新的世界模型（World Models）研究成果，涵盖生成式 AI、端到端驾驶、占据栅格预测等方向。\n\n本指南将指导开发者如何利用该列表快速定位、克隆并运行其中具有代表性的开源项目（以列表中提到的 `AD-L-JEPA` 及通用世界模型项目为例）。\n\n---\n\n## 1. 环境准备\n\n由于列表中的项目多涉及深度学习、3D 感知及大模型训练，对硬件和软件环境有较高要求。\n\n### 系统要求\n*   **操作系统**: Linux (推荐 Ubuntu 20.04\u002F22.04) 或 macOS (部分项目支持)。\n*   **GPU**: NVIDIA GPU (推荐显存 ≥ 24GB，如 RTX 3090\u002F4090 或 A100)，需支持 CUDA 11.8+。\n*   **内存**: ≥ 32GB RAM。\n*   **存储**: ≥ 100GB 可用空间（用于存放数据集和模型权重）。\n\n### 前置依赖\n在开始之前，请确保系统已安装以下基础工具：\n*   **Python**: 3.9 或 3.10 (大多数最新论文代码对此版本支持最好)。\n*   **Git**: 用于克隆仓库。\n*   **CUDA Toolkit**: 与 PyTorch 版本匹配。\n*   **FFmpeg**: 用于视频数据处理（部分生成式模型需要）。\n\n> **国内加速建议**：\n> *   推荐使用 **清华 (TUNA)** 或 **阿里 (Aliyun)** 镜像源加速 Python 包下载。\n> *   代码托管若遇 GitHub 访问缓慢，可使用 **Gitee** 镜像（若作者提供）或配置本地代理。\n\n---\n\n## 2. 安装步骤\n\n由于这是一个资源列表，你需要先选择一个具体的项目进行安装。以下以列表中高亮推荐的 **AD-L-JEPA** (基于 JEPA 架构的激光雷达世界模型) 为例，其他项目流程类似。\n\n### 第一步：克隆目标项目仓库\n从列表中获取目标项目的 GitHub 地址并克隆。\n\n```bash\n# 示例：克隆 AD-L-JEPA 项目\ngit clone https:\u002F\u002Fgithub.com\u002FHaoranZhuExplorer\u002FAD-L-JEPA-Release.git\ncd AD-L-JEPA-Release\n```\n\n### 第二步：创建虚拟环境\n建议使用 `conda` 管理环境，避免依赖冲突。\n\n```bash\n# 创建名为 'world-model' 的环境，指定 Python 3.10\nconda create -n world-model python=3.10 -y\nconda activate world-model\n```\n\n### 第三步：安装依赖\n根据项目根目录下的 `requirements.txt` 安装依赖。**强烈建议使用国内镜像源加速**。\n\n```bash\n# 使用清华镜像源安装依赖\npip install -r requirements.txt -i https:\u002F\u002Fpypi.tuna.tsinghua.edu.cn\u002Fsimple\n```\n\n*注：若项目依赖特定的 PyTorch 版本（如带有 CUDA 支持），请优先参照该项目 README 中的具体安装命令，通常格式如下：*\n```bash\n# 示例：安装特定版本的 PyTorch (根据实际项目要求调整)\npip install torch torchvision torchaudio --index-url https:\u002F\u002Fdownload.pytorch.org\u002Fwhl\u002Fcu118\n```\n\n### 第四步：准备数据与预训练模型\n大多数世界模型需要大规模数据集（如 nuScenes, Waymo, CARLA）或预训练权重。\n1.  **下载数据**：参照项目文档下载数据集，并放置于指定目录（通常为 `data\u002F` 或 `datasets\u002F`）。\n2.  **下载权重**：部分项目提供预训练模型链接，需下载 `.pth` 或 `.ckpt` 文件至 `checkpoints\u002F` 目录。\n\n---\n\n## 3. 
基本使用\n\n安装完成后，你可以运行提供的脚本进行推理、训练或可视化。以下以最简单的**推理\u002F演示**模式为例。\n\n### 运行推理示例\n大多数项目会在 `scripts\u002F` 目录或根目录提供测试脚本。\n\n```bash\n# 示例：运行 AD-L-JEPA 的评估脚本\n# 请根据具体项目的实际脚本名称调整 (如 test.py, infer.py, demo.py)\npython scripts\u002Fevaluate.py --config configs\u002Fjepa_config.yaml --checkpoint checkpoints\u002Fpretrained.pth\n```\n\n### 查看其他热门项目\n你可以在该 Survey 列表中查找其他感兴趣的项目，例如：\n*   **Vista** (NeurIPS 2024): 高保真驾驶世界模型。\n    *   仓库: `https:\u002F\u002Fgithub.com\u002FOpenDriveLab\u002FVista`\n*   **DriveDreamer** (ECCV 2024): 真实世界驱动的生成模型。\n    *   仓库: `https:\u002F\u002Fgithub.com\u002FJeffWang987\u002FDriveDreamer`\n*   **OccWorld** (ECCV 2024): 3D 占据栅格世界模型。\n    *   仓库: `https:\u002F\u002Fgithub.com\u002Fwzzheng\u002FOccWorld`\n\n只需重复上述“克隆 -> 安装 -> 运行”的步骤即可探索这些前沿技术。\n\n### 引用支持\n如果你在研究中使用了列表中的资源或受其启发（特别是 NYU 团队的 AD-L-JEPA），请考虑引用相关论文：\n\n```bibtex\n@article{zhu2025ad,\n  title={AD-L-JEPA: Self-Supervised Spatial World Models with Joint Embedding Predictive Architecture for Autonomous Driving with LiDAR Data},\n  author={Zhu, Haoran and Dong, Zhenyuan and Topollai, Kristi and Choromanska, Anna},\n  journal={arXiv preprint arXiv:2501.04969},\n  year={2025}\n}\n```","某自动驾驶初创公司的算法团队正致力于研发基于世界模型（World Models）的端到端预测系统，以应对复杂城市路况下的长时程决策挑战。\n\n### 没有 World-Models-Autonomous-Driving-Survey 时\n- **文献检索如大海捞针**：研究人员需在 arXiv、Google Scholar 等多个平台手动筛选关键词，极易遗漏如 AD-L-JEPA 或 DINO-Foresight 等最新预印本和顶会论文。\n- **技术路线评估困难**：面对生成式 AI、多模态融合或强化学习等不同技术流派，缺乏统一的分类索引，难以快速判断哪种架构（如 JEPA 或 VLM）最适合当前的激光雷达数据场景。\n- **复现成本高昂**：许多论文未明确标注代码开源状态，团队常花费数周时间寻找可运行代码，甚至因找不到官方实现而被迫放弃某些前沿思路。\n- **错失社区动态**：无法及时追踪 Yann LeCun、Danijar Hafner 等领军人物的最新动向，导致技术选型滞后于行业顶尖水平。\n\n### 使用 World-Models-Autonomous-Driving-Survey 后\n- **一站式获取前沿成果**：团队直接通过该清单锁定了 NeurIPS 2025 和 CVPR 2025 的最新论文，迅速将 AD-L-JEPA 纳入自监督表示学习的候选方案。\n- **精准匹配技术需求**：利用清单中详细的标签（如 `End-to-End AD`、`LiDAR Data`、`Long-Horizon Prediction`），快速对比 Orbis 与 Genesis 等模型特性，确定了针对长时程预测的最优解。\n- **加速原型验证**：借助清单提供的直达链接，直接获取 World4Drive 和 FUTURIST 的官方代码库，将算法复现周期从数周缩短至几天。\n- **紧跟权威风向标**：通过关注清单推荐的顶尖学者及其最新工作，确保团队的技术路线图始终与全球最前沿的研究保持同步。\n\nWorld-Models-Autonomous-Driving-Survey 将分散的学术资源转化为结构化的决策引擎，极大提升了自动驾驶研发团队在前沿技术探索中的效率与准确性。","https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002FHaoranZhuExplorer_World-Models-Autonomous-Driving-Survey_ff2cb51e.png","HaoranZhuExplorer","Haoran Zhu","https:\u002F\u002Foss.gittoolsai.com\u002Favatars\u002FHaoranZhuExplorer_461c08a2.png","PhD Student at NYU ECE, I work on building next-generation world models for autonomous driving.",null,"New York City","email: hz1922@nyu.edu","https:\u002F\u002Fgithub.com\u002FHaoranZhuExplorer",512,23,"2026-04-16T07:41:14",5,"","未说明",{"notes":88,"python":86,"dependencies":89},"该仓库是一个自动驾驶世界模型（World Models）的论文和代码资源列表（Survey），本身不是一个可独立运行的单一软件工具。列表中包含了数十个不同的研究项目（如 AD-L-JEPA, Vista, DriveDreamer 等），每个项目都有各自独立的运行环境需求和依赖库。用户需点击列表中具体项目的代码链接（Code），前往其对应的独立仓库查看具体的安装和环境配置说明。",[],[14,13],[92,93,94,95],"autonomous-driving","deep-learning","self-supervised-learning","world-models","2026-03-27T02:49:30.150509","2026-04-17T09:52:33.303923",[],[]]