[{"data":1,"prerenderedAt":-1},["ShallowReactive",2],{"similar-opendilab--awesome-exploration-rl":3,"tool-opendilab--awesome-exploration-rl":65},[4,23,32,40,49,57],{"id":5,"name":6,"github_repo":7,"description_zh":8,"stars":9,"difficulty_score":10,"last_commit_at":11,"category_tags":12,"status":22},2268,"ML-For-Beginners","microsoft\u002FML-For-Beginners","ML-For-Beginners 是由微软推出的一套系统化机器学习入门课程，旨在帮助零基础用户轻松掌握经典机器学习知识。这套课程将学习路径规划为 12 周，包含 26 节精炼课程和 52 道配套测验，内容涵盖从基础概念到实际应用的完整流程，有效解决了初学者面对庞大知识体系时无从下手、缺乏结构化指导的痛点。\n\n无论是希望转型的开发者、需要补充算法背景的研究人员，还是对人工智能充满好奇的普通爱好者，都能从中受益。课程不仅提供了清晰的理论讲解，还强调动手实践，让用户在循序渐进中建立扎实的技能基础。其独特的亮点在于强大的多语言支持，通过自动化机制提供了包括简体中文在内的 50 多种语言版本，极大地降低了全球不同背景用户的学习门槛。此外，项目采用开源协作模式，社区活跃且内容持续更新，确保学习者能获取前沿且准确的技术资讯。如果你正寻找一条清晰、友好且专业的机器学习入门之路，ML-For-Beginners 将是理想的起点。",84991,2,"2026-04-05T10:45:23",[13,14,15,16,17,18,19,20,21],"图像","数据工具","视频","插件","Agent","其他","语言模型","开发框架","音频","ready",{"id":24,"name":25,"github_repo":26,"description_zh":27,"stars":28,"difficulty_score":29,"last_commit_at":30,"category_tags":31,"status":22},3128,"ragflow","infiniflow\u002Fragflow","RAGFlow 是一款领先的开源检索增强生成（RAG）引擎，旨在为大语言模型构建更精准、可靠的上下文层。它巧妙地将前沿的 RAG 技术与智能体（Agent）能力相结合，不仅支持从各类文档中高效提取知识，还能让模型基于这些知识进行逻辑推理和任务执行。\n\n在大模型应用中，幻觉问题和知识滞后是常见痛点。RAGFlow 通过深度解析复杂文档结构（如表格、图表及混合排版），显著提升了信息检索的准确度，从而有效减少模型“胡编乱造”的现象，确保回答既有据可依又具备时效性。其内置的智能体机制更进一步，使系统不仅能回答问题，还能自主规划步骤解决复杂问题。\n\n这款工具特别适合开发者、企业技术团队以及 AI 研究人员使用。无论是希望快速搭建私有知识库问答系统，还是致力于探索大模型在垂直领域落地的创新者，都能从中受益。RAGFlow 提供了可视化的工作流编排界面和灵活的 API 接口，既降低了非算法背景用户的上手门槛，也满足了专业开发者对系统深度定制的需求。作为基于 Apache 2.0 协议开源的项目，它正成为连接通用大模型与行业专有知识之间的重要桥梁。",77062,3,"2026-04-04T04:44:48",[17,13,20,19,18],{"id":33,"name":34,"github_repo":35,"description_zh":36,"stars":37,"difficulty_score":29,"last_commit_at":38,"category_tags":39,"status":22},519,"PaddleOCR","PaddlePaddle\u002FPaddleOCR","PaddleOCR 是一款基于百度飞桨框架开发的高性能开源光学字符识别工具包。它的核心能力是将图片、PDF 等文档中的文字提取出来，转换成计算机可读取的结构化数据，让机器真正“看懂”图文内容。\n\n面对海量纸质或电子文档，PaddleOCR 解决了人工录入效率低、数字化成本高的问题。尤其在人工智能领域，它扮演着连接图像与大型语言模型（LLM）的桥梁角色，能将视觉信息直接转化为文本输入，助力智能问答、文档分析等应用场景落地。\n\nPaddleOCR 适合开发者、算法研究人员以及有文档自动化需求的普通用户。其技术优势十分明显：不仅支持全球 100 多种语言的识别，还能在 Windows、Linux、macOS 等多个系统上运行，并灵活适配 CPU、GPU、NPU 等各类硬件。作为一个轻量级且社区活跃的开源项目，PaddleOCR 既能满足快速集成的需求，也能支撑前沿的视觉语言研究，是处理文字识别任务的理想选择。",74913,"2026-04-05T10:44:17",[19,13,20,18],{"id":41,"name":42,"github_repo":43,"description_zh":44,"stars":45,"difficulty_score":46,"last_commit_at":47,"category_tags":48,"status":22},3215,"awesome-machine-learning","josephmisiti\u002Fawesome-machine-learning","awesome-machine-learning 是一份精心整理的机器学习资源清单，汇集了全球优秀的机器学习框架、库和软件工具。面对机器学习领域技术迭代快、资源分散且难以甄选的痛点，这份清单按编程语言（如 Python、C++、Go 等）和应用场景（如计算机视觉、自然语言处理、深度学习等）进行了系统化分类，帮助使用者快速定位高质量项目。\n\n它特别适合开发者、数据科学家及研究人员使用。无论是初学者寻找入门库，还是资深工程师对比不同语言的技术选型，都能从中获得极具价值的参考。此外，清单还延伸提供了免费书籍、在线课程、行业会议、技术博客及线下聚会等丰富资源，构建了从学习到实践的全链路支持体系。\n\n其独特亮点在于严格的维护标准：明确标记已停止维护或长期未更新的项目，确保推荐内容的时效性与可靠性。作为机器学习领域的“导航图”，awesome-machine-learning 以开源协作的方式持续更新，旨在降低技术探索门槛，让每一位从业者都能高效地站在巨人的肩膀上创新。",72149,1,"2026-04-03T21:50:24",[20,18],{"id":50,"name":51,"github_repo":52,"description_zh":53,"stars":54,"difficulty_score":46,"last_commit_at":55,"category_tags":56,"status":22},2234,"scikit-learn","scikit-learn\u002Fscikit-learn","scikit-learn 是一个基于 Python 构建的开源机器学习库，依托于 SciPy、NumPy 等科学计算生态，旨在让机器学习变得简单高效。它提供了一套统一且简洁的接口，涵盖了从数据预处理、特征工程到模型训练、评估及选择的全流程工具，内置了包括线性回归、支持向量机、随机森林、聚类等在内的丰富经典算法。\n\n对于希望快速验证想法或构建原型的数据科学家、研究人员以及 Python 开发者而言，scikit-learn 是不可或缺的基础设施。它有效解决了机器学习入门门槛高、算法实现复杂以及不同模型间调用方式不统一的痛点，让用户无需重复造轮子，只需几行代码即可调用成熟的算法解决分类、回归、聚类等实际问题。\n\n其核心技术亮点在于高度一致的 API 
# opendilab/awesome-exploration-rl

A curated list of awesome exploration RL resources (continually updated).

awesome-exploration-rl is a curated resource collection focused on exploration methods in reinforcement learning (RL), maintained as a continuously updated list of high-quality literature for researchers and developers. In RL, an agent must balance exploring the unknown against exploiting what it already knows, and this trade-off is one of the central problems that decides whether an algorithm succeeds. Exploration is especially challenging in environments where reaching the goal takes dozens or even hundreds of steps. awesome-exploration-rl addresses this pain point by systematically organizing the frontier of the field, helping users quickly grasp how to improve an agent's exploration efficiency.

The collection is well suited to AI researchers, algorithm engineers, and students studying reinforcement learning in depth. Whether you want to track the latest papers from top venues such as NeurIPS, ICML, and ICLR, or are looking for classic foundational references, it offers clear guidance. Its distinctive strength is a clear taxonomy of methods: exploration strategies are divided into augmented collecting strategies (e.g., action selection perturbation and state selection guidance) and augmented training strategies (e.g., count-based, prediction-based, information-theoretic, and entropy-augmented methods). This structured organization makes it easy to see at which stage each algorithm applies and how it works, and visual examples in environments such as MiniGrid lower the barrier to understanding hard-exploration tasks. By following awesome-exploration-rl, you can stay at the frontier of the ERL field and draw solid theoretical grounding and inspiration for algorithm development or academic research.

---

<div id="top"></div>

# Awesome Exploration Methods in Reinforcement Learning

`Updated on 2025.12.02`

- Here is a collection of research papers for **Exploration methods in Reinforcement Learning (ERL)**. The repository will be continuously updated to track the frontier of ERL. Welcome to follow and star!

- The balance of **exploration and exploitation** is one of the most central problems in reinforcement learning. In order to give readers an intuitive feeling for exploration, we provide a visualization of a typical hard-exploration environment in [MiniGrid](https://github.com/Farama-Foundation/Minigrid) below. In this task, the sequence of actions needed to achieve the goal often requires dozens or even hundreds of steps, over which the agent needs to fully explore different state-action spaces in order to learn the skills required to achieve the goal.

<p align="center">
  <img src="https://oss.gittoolsai.com/images/opendilab_awesome-exploration-rl_readme_e42b34f4f95a.png" alt="minigrid_hard_exploration" width="40%" height="40%" /><br>
  <em style="display: inline-block;">A typical hard-exploration environment: MiniGrid-ObstructedMaze-Full-v0.</em>
</p>
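If you want to poke at this task yourself, here is a minimal sketch, assuming the Farama `gymnasium` and `minigrid` packages (not part of this repository); the environment ID comes from the figure caption above, and the random policy is only meant to show why the task counts as hard exploration.

```python
# Minimal sketch (assumed setup: `pip install gymnasium minigrid`).
import gymnasium as gym
import minigrid  # noqa: F401 -- importing registers the MiniGrid environment IDs

env = gym.make("MiniGrid-ObstructedMaze-Full-v0")
obs, info = env.reset(seed=0)

total_reward, steps = 0.0, 0
terminated = truncated = False
while not (terminated or truncated):
    action = env.action_space.sample()  # undirected random exploration
    obs, reward, terminated, truncated, info = env.step(action)
    total_reward += reward
    steps += 1

# With purely random actions the sparse goal reward is almost never found,
# which is exactly what makes this a standard hard-exploration benchmark.
print(f"episode length: {steps}, return: {total_reward}")
```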
## Table of Contents

- [Awesome Exploration Methods in Reinforcement Learning](#awesome-exploration-methods-in-reinforcement-learning)
  - [Table of Contents](#table-of-contents)
  - [A Taxonomy of Exploration RL Methods](#a-taxonomy-of-exploration-rl-methods)
  - [Papers](#papers)
    - [NeurIPS 2025](#neurips-2025)
    - [ICML 2025](#icml-2025)
    - [ICLR 2025](#iclr-2025)
    - [NeurIPS 2024](#neurips-2024)
    - [ICML 2024](#icml-2024)
    - [ICLR 2024](#iclr-2024)
    - [NeurIPS 2023](#neurips-2023)
    - [ICML 2023](#icml-2023)
    - [ICLR 2023](#iclr-2023)
    - [NeurIPS 2022](#neurips-2022)
    - [ICML 2022](#icml-2022)
    - [ICLR 2022](#iclr-2022)
    - [NeurIPS 2021](#neurips-2021)
    - [Classic Exploration RL Papers](#classic-exploration-rl-papers)
  - [Contributing](#contributing)
  - [License](#license)


## A Taxonomy of Exploration RL Methods

<details open>
<summary>(Click to Collapse)</summary>

In general, we can divide the reinforcement learning process into two phases: the *collect* phase and the *train* phase. In the *collect* phase, the agent chooses actions based on the current policy and then interacts with the environment to collect useful experience. In the *train* phase, the agent uses the collected experience to update the current policy to obtain a better-performing policy (see the illustrative sketch below the category lists).

According to the phase in which the exploration component is explicitly applied, we simply divide the methods in `Exploration RL` into two main categories: `Augmented Collecting Strategy` and `Augmented Training Strategy`:

- `Augmented Collecting Strategy` represents the variety of exploration strategies commonly used in the *collect* phase, which we further divide into *four* categories:
  - `Action Selection Perturbation`
  - `Action Selection Guidance`
  - `State Selection Guidance`
  - `Parameter Space Perturbation`

- `Augmented Training Strategy` represents the variety of exploration strategies commonly used in the *train* phase, which we further divide into *seven* categories:
  - `Count Based`
  - `Prediction Based`
  - `Information Theory Based`
  - `Entropy Augmented`
  - `Bayesian Posterior Based`
  - `Goal Based`
  - `(Expert) Demo Data`

> Note that there may be overlap between these categories, and an algorithm may belong to several of them.
> For other detailed surveys on exploration methods in RL, you can refer to [Tianpei Yang et al](https://arxiv.org/abs/2109.06668) and [Susan Amin et al](https://arxiv.org/abs/2109.00157).
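To make the two phases concrete, the following illustrative sketch (not taken from any paper above) shows where each family plugs into a tabular Q-learning loop: an epsilon-greedy perturbation of action selection in the *collect* phase (`Action Selection Perturbation`) and a count-based bonus added to the reward in the *train* phase (`Count Based`, in the spirit of #Exploration [4]). The environment handle, hyperparameters, and the beta / sqrt(N) bonus form are illustrative assumptions; states and actions are assumed discrete and hashable.

```python
import random
from collections import defaultdict

def q_learning_with_exploration(env, episodes=500, alpha=0.1, gamma=0.99,
                                epsilon=0.1, beta=0.05):
    """Tabular Q-learning with exploration hooks in both phases (illustrative)."""
    Q = defaultdict(float)      # Q[(state, action)] value table
    counts = defaultdict(int)   # visit counts N(state, action)
    n_actions = env.action_space.n

    for _ in range(episodes):
        state, _ = env.reset()
        terminated = truncated = False
        while not (terminated or truncated):
            # --- collect phase: Action Selection Perturbation (epsilon-greedy) ---
            if random.random() < epsilon:
                action = env.action_space.sample()
            else:
                action = max(range(n_actions), key=lambda a: Q[(state, a)])
            next_state, reward, terminated, truncated, _ = env.step(action)

            # --- train phase: Count Based intrinsic bonus, r + beta / sqrt(N) ---
            counts[(state, action)] += 1
            bonus = beta / counts[(state, action)] ** 0.5
            best_next = max(Q[(next_state, a)] for a in range(n_actions))
            target = reward + bonus + gamma * best_next
            Q[(state, action)] += alpha * (target - Q[(state, action)])
            state = next_state
    return Q
```

Most methods in the taxonomy can be read as swapping out one of these two hooks: e.g., NoisyNet [2] replaces epsilon-greedy with parameter-space perturbation, while RND [7] replaces the visit-count bonus with a prediction-error bonus.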
<center>
<figure>
    <img style="border-radius: 0.3125em;
    box-shadow: 0 2px 4px 0 rgba(34,36,38,.12),0 2px 10px 0 rgba(34,36,38,.08);"
    src="https://oss.gittoolsai.com/images/opendilab_awesome-exploration-rl_readme_058f19e503fb.png" width=100% height=100%>
    <br>
    <figcaption align="center"><b>A non-exhaustive, but useful taxonomy of methods in Exploration RL.
    We provide some example methods for each of the different categories, shown in the blue area above.</b></figcaption>
</figure>
</center>

Here are the links to the papers that appeared in the taxonomy:
>[1] [Go-Explore](https://www.nature.com/articles/s41586-020-03157-9): Adrien Ecoffet et al, 2021  
[2] [NoisyNet](https://openreview.net/pdf?id=rywHCPkAW): Meire Fortunato et al, 2018  
[3] [DQN-PixelCNN](https://arxiv.org/abs/1606.01868): Marc G. Bellemare et al, 2016  
[4] [#Exploration](http://papers.neurips.cc/paper/6868-exploration-a-study-of-count-based-exploration-for-deep-reinforcement-learning.pdf): Haoran Tang et al, 2017  
[5] [EX2](https://papers.nips.cc/paper/2017/file/1baff70e2669e8376347efd3a874a341-Paper.pdf): Justin Fu et al, 2017  
[6] [ICM](https://arxiv.org/abs/1705.05363): Deepak Pathak et al, 2018  
[7] [RND](https://arxiv.org/abs/1810.12894): Yuri Burda et al, 2018  
[8] [NGU](https://arxiv.org/abs/2002.06038): Adrià Puigdomènech Badia et al, 2020  
[9] [Agent57](https://arxiv.org/abs/2003.13350): Adrià Puigdomènech Badia et al, 2020  
[10] [VIME](https://arxiv.org/abs/1605.09674): Rein Houthooft et al, 2016  
[11] [EMI](https://openreview.net/forum?id=H1exf64KwH): Wang et al, 2019  
[12] [DIAYN](https://arxiv.org/abs/1802.06070): Benjamin Eysenbach et al, 2019  
[13] [SAC](https://arxiv.org/abs/1801.01290): Tuomas Haarnoja et al, 2018  
[14] [BootstrappedDQN](https://arxiv.org/abs/1602.04621): Ian Osband et al, 2016  
[15] [PSRL](https://arxiv.org/pdf/1306.0940.pdf): Ian Osband et al, 2013  
[16] [HER](https://arxiv.org/pdf/1707.01495.pdf): Marcin Andrychowicz et al, 2017  
[17] [DQfD](https://arxiv.org/abs/1704.03732): Todd Hester et al, 2018  
[18] [R2D3](https://arxiv.org/abs/1909.01387): Caglar Gulcehre et al, 2019  

</details>


## Papers

```
format:
- [title](paper link) (presentation type, openreview score [if the score is public])
  - author1, author2, author3, ...
  - Key: key problems and insights
  - ExpEnv: experiment environments
```

### NeurIPS 2025

<details open>
<summary>(Click to Collapse)</summary>

- [State Entropy Regularization for Robust Reinforcement Learning](https://openreview.net/forum?id=rtG7n93Ru8)
  - Yonatan Ashlag, Uri Koren, Mirco Mutti, Esther Derman, Pierre-Luc Bacon, Shie Mannor
  - Key: Robust Reinforcement Learning, Risk-Averse Reinforcement Learning, Regularized Reinforcement Learning
  - ExpEnv: MiniGrid, MuJoCo
- [Geometry Meets Incentives: Sample-Efficient Incentivized Exploration with Linear Contexts](https://openreview.net/forum?id=nwlX15Wnr9)
  - Benjamin Schiffer, Mark Sellke
  - Key: Multi-armed bandits, Bayesian Incentive Compatible, Exploration
  - ExpEnv: Multi-Armed Bandit

- [LLM-Explorer: A Plug-in Reinforcement Learning Policy Exploration Enhancement Driven by Large Language Models](https://openreview.net/forum?id=VA5P0rUZPx)
  - Qianyue Hao, Yiwen Song, Qingmin Liao, Jian Yuan, Yong Li
  - Key: Reinforcement learning, large language model, policy exploration
  - ExpEnv: Atari, MuJoCo

- [Exploration via Feature Perturbation in Contextual Bandits](https://openreview.net/forum?id=gAddPMjmUc)
  - Seouh-won Yi, Min-hwan Oh
  - Key: Generalized Linear Bandits, Contextual Bandits, Thompson Sampling, Feature Perturbation
  - ExpEnv: Synthetic Data, UCI Datasets, MNIST

- [REINFORCE Converges to Optimal Policies with Any Learning Rate](https://openreview.net/forum?id=YzriuQGaNX)
  - Samuel McLaughlin Robertson, Thang D. Chu, Bo Dai, Dale Schuurmans, Csaba Szepesvari, Jincheng Mei
  - Key: Reinforcement learning, Policy gradient, Convergence, Bandits
  - ExpEnv: Multi-armed Bandits, ChainMDP, DeepSea, CartPole

- [Asymmetric REINFORCE for off-Policy Reinforcement Learning: Balancing positive and negative rewards](https://openreview.net/forum?id=Ql3sENn0mi)
  - Charles Arnal, Gaëtan Narozniak, Vivien Cabannes, Yunhao Tang, Julia Kempe, Remi Munos
  - Key: reinforcement learning, off-policy RL, LLM finetuning, bandits
  - ExpEnv: Stochastic Bandits, MATH dataset

- [Off-policy Reinforcement Learning with Model-based Exploration Augmentation](http://openreview.net/forum?id=JGkZgEEjiM)
  - Likun Wang, Xiangteng Zhang, Yinuo Wang, Guojian Zhan, Wenxuan Wang, Haoyu Gao, Jingliang Duan, Shengbo Eben Li
  - Key: Model-based reinforcement learning, Model-based Exploration, Generative model, World model
  - ExpEnv: OpenAI Gym, DMC

- [Cognitive Predictive Processing: A Human-inspired Framework for Adaptive Exploration in Open-World Reinforcement Learning](https://openreview.net/forum?id=2fFRIIwau6)
  - Boheng Liu, Ziyu Li, Chenghua Duan, YuTian Liu, Zhuo Wang, Xiuxing Li, Qing Li, Xia Wu
  - Key: Open-World Reinforcement Learning, Human-inspired Artificial Intelligence, Cognitive Architectures
  - ExpEnv: MineDojo, Minecraft

- [Novel Exploration via Orthogonality](https://openreview.net/forum?id=yJS1eZSNUv)
  - Andreas Theophilou, Özgür Şimşek
  - Key: Laplacian, Novelty, Reinforcement Learning, Exploration, Eigenvectors, Spectral Methods
  - ExpEnv: GridWorld

- [Infrequent Exploration in Linear Bandits](https://openreview.net/forum?id=STLolzI6q1)
  - Harin Lee, Min-hwan Oh
  - Key: linear bandits, greedy selection
  - ExpEnv: Multi-Armed Bandit

- [Deployment Efficient Reward-Free Exploration with Linear Function Approximation](https://openreview.net/forum?id=ByzRO25Bjr)
  - Zihan Zhang, Yuxin Chen, Jason D. Lee, Simon Shaolei Du, Lin Yang, Ruosong Wang
  - Key: Reinforcement learning, linear MDP, deployment efficiency
  - ExpEnv: None
- [Wonder Wins Ways: Curiosity-Driven Exploration through Multi-Agent Contextual Calibration](https://openreview.net/forum?id=1fOGTbO5Sx)
  - Yiyuan Pan, Zhe Liu, Hesheng Wang
  - Key: Multi-Agent Reinforcement Learning, Intrinsic Reward, Artificial Curiosity
  - ExpEnv: VMAS, Meltingpot, SMACv2

- [Scalable Exploration via Ensemble++](https://openreview.net/forum?id=M226WElHp5)
  - Yingru Li, Jiawei Xu, Baoxiang Wang, Zhi-Quan Luo
  - Key: Reinforcement Learning, Ensemble Sampling, Thompson Sampling, Exploration, Posterior Approximation, Scalable Computation
  - ExpEnv: linear bandits, quadratic bandits, neural bandits, GPT-based contextual bandits

- [Uncertainty-Guided Exploration for Efficient AlphaZero Training](https://openreview.net/forum?id=3q6lJTN45T)
  - Scott Cheng, Meng-Yu Tsai, Ding-Yong Hong, Mahmut Kandemir
  - Key: AlphaZero, uncertainty, exploration
  - ExpEnv: Go

- [Exploration from a Primal-Dual Lens: Value-Incentivized Actor-Critic Methods for Sample-Efficient Online RL](https://openreview.net/forum?id=A5Y8Uh5Szl)
  - Tong Yang, Bo Dai, Lin Xiao, Yuejie Chi
  - Key: exploration-exploitation trade-off, actor-critic, RL theory
  - ExpEnv: MuJoCo

- [Reinforcement Learning with Action Chunking](https://openreview.net/forum?id=XUks1Y96NR)
  - Qiyang Li, Zhiyuan Zhou, Sergey Levine
  - Key: reinforcement learning, offline-to-online RL, exploration
  - ExpEnv: OGBench, robomimic

- [DISCOVER: Automated Curricula for Sparse-Reward Reinforcement Learning](https://openreview.net/forum?id=guZBnsKPsw)
  - Leander Diaz-Bone, Marco Bagatella, Jonas Hübotter, Andreas Krause
  - Key: reinforcement learning, test-time training, test-time reinforcement learning, sparse-reward reinforcement learning, goal selection, goal-conditioned reinforcement learning, exploration, exploration-exploitation, upper confidence bound
  - ExpEnv: antmaze, arm, pointmaze

- [Meta-learning how to Share Credit among Macro-Actions](https://openreview.net/forum?id=cJlgdpEFx9)
  - Ionel Hosu, Traian Rebedea, Razvan Pascanu
  - Key: deep reinforcement learning, macro-actions, exploration
  - ExpEnv: Atari, StreetFighter II

- [ExPO: Unlocking Hard Reasoning with Self-Explanation-Guided Reinforcement Learning](https://openreview.net/forum?id=D1PeGJtVEu)
  - Ruiyang Zhou, Shuozhe Li, Amy Zhang, Liu Leqi
  - Key: Large Language Model, Self-improvement, Guided exploration, Reasoning, Reinforcement Learning with Verifiable Reward, Bootstrapping
  - ExpEnv: MATH, GSM8K, MATH-500


</details>

### ICML 2025

<details open>
<summary>(Click to Collapse)</summary>

- [Controlling Underestimation Bias in Constrained Reinforcement Learning for Safe Exploration](https://openreview.net/forum?id=nq5bt0mRTC)
  - Shiqing Gao, Jiaxin Ding, Luoyi Fu, Xinbing Wang
  - Key: constrained RL, safe exploration, underestimation, intrinsic cost, Reinforcement Learning
  - ExpEnv: safety-gymnasium, MuJoCo
- [Training a Generally Curious Agent](https://openreview.net/forum?id=UeB3Hdrhda)
  - Fahim Tajwar, Yiding Jiang, Abitha Thankaraj, Sumaita Sadia Rahman, J Zico Kolter, Jeff Schneider, Russ Salakhutdinov
  - Key: LLM Agent, Synthetic Data, Multiturn finetuning
  - ExpEnv: Twenty questions, Guess my city, Wordle, Cellular automata, Customer service, Murder mystery, Mastermind, Battleship, Minesweeper, Bandit best arm selection

- [Monte Carlo Tree Diffusion for System 2 Planning](https://openreview.net/forum?id=XrCbBdycDc)
  - Jaesik Yoon, Hyeonseo Cho, Doojin Baek, Yoshua Bengio, Sungjin Ahn
  - Key: Diffusion, MCTS, Long-term Planning, Offline RL, Goal-conditioned RL, Inference-Time Scaling
  - ExpEnv: PointMaze, AntMaze, Robot Arm Cube Manipulation, Visual PointMaze

- [Soft Reasoning: Navigating Solution Spaces in Large Language Models through Controlled Embedding Exploration](https://openreview.net/forum?id=4gWE7CMOlH)
  - Qinglin Zhu, Runcong Zhao, Hanqi Yan, Yulan He, Yudong Chen, Lin Gui
  - Key: Large Language Models, Reasoning, Embedding Perturbation, Bayesian Optimisation
  - ExpEnv: None

- [Hyper: Hyperparameter Robust Efficient Exploration in Reinforcement Learning](https://openreview.net/forum?id=n1CVVzBSjQ)
  - Yiran Wang, Chenshu Liu, Yunfan Li, Sanae Amani, Bolei Zhou, Lin F. Yang
  - Key: Reinforcement Learning, Exploration, Provably Efficient, Hyperparameter Robustness
  - ExpEnv: PointMaze, MuJoCo, MiniGrid

- [Provably Efficient Exploration in Inverse Constrained Reinforcement Learning](https://openreview.net/forum?id=eLTPkGGHum)
  - Bo Yue, Jian Li, Guiliang Liu
  - Key: Inverse Constrained Reinforcement Learning, Exploration Algorithm, Sample Efficiency
  - ExpEnv: PointMaze, GridWorld

- [Explicit Exploration for High-Welfare Equilibria in Game-Theoretic Multiagent Reinforcement Learning](https://openreview.net/forum?id=AxqgpcL90a)
  - Austin A. Nguyen, Anri Gu, Michael P. Wellman
  - Key: Empirical game theoretic analysis, equilibrium selection, game solving, strategy exploration
  - ExpEnv: Harvest, Bargaining
- [Maximum Entropy Reinforcement Learning with Diffusion Policy](https://openreview.net/forum?id=CpjKXe9rY7)
  - Xiaoyi Dong, Jian Cheng, Xi Sheryl Zhang
  - Key: Diffusion models, online reinforcement learning, maximum entropy reinforcement learning, soft actor-critic
  - ExpEnv: Mujoco, AntMaze, DeepMind Control Suite

- [Towards Efficient Online Tuning of VLM Agents via Counterfactual Soft Reinforcement Learning](https://openreview.net/forum?id=H76PMm7hf2)
  - Lang Feng, Weihao Tan, Zhiyi Lyu, Longtao Zheng, Haiyang Xu, Ming Yan, Fei Huang, Bo An
  - Key: vision-language model, agent, reinforcement learning, online fine-tuning, counterfactual
  - ExpEnv: Android-in-the-Wild, Gym Cards, ALFWorld

- [DIME: Diffusion-Based Maximum Entropy Reinforcement Learning](https://openreview.net/forum?id=Aw6dBR7Vxj)
  - Onur Celik, Zechu Li, Denis Blessing, Ge Li, Daniel Palenicek, Jan Peters, Georgia Chalvatzaki, Gerhard Neumann
  - Key: Reinforcement Learning, Diffusion Models, Diffusion Based Reinforcement Learning, Maximum Entropy Reinforcement Learning
  - ExpEnv: Mujoco, DeepMind Control Suite, Myo Suite

- [Knowledge Retention in Continual Model-Based Reinforcement Learning](https://openreview.net/forum?id=DiqeZY27XK)
  - Haotian Fu, Yixiang Sun, Michael Littman, George Konidaris
  - Key: Deep Reinforcement learning, Model-based Reinforcement Learning, Continual Learning, World Models
  - ExpEnv: MiniGrid, DeepMind Control Suite

- [KEA: Keeping Exploration Alive by Proactively Coordinating Exploration Strategies](https://openreview.net/pdf?id=XIyrotmBSJ)
  - Shih-Min Yang, Martin Magnusson, Johannes A. Stork, Todor Stoyanov
  - Key: Reinforcement Learning, Novelty-based Exploration, Soft Actor-Critic, Sparse reward
  - ExpEnv: 2D Navigation, DeepMind Control Suite

- [EVOLvE: Evaluating and Optimizing LLMs For In-Context Exploration](https://openreview.net/forum?id=ck7dvZFbRW)
  - Allen Nie, Yi Su, Bo Chang, Jonathan Lee, Ed H. Chi, Quoc V Le, Minmin Chen
  - Key: Exploration, In-Context Reinforcement Learning, Bandit
  - ExpEnv: Multi-Armed Bandit, Contextual Bandits
- [Catching Two Birds with One Stone: Reward Shaping with Dual Random Networks for Balancing Exploration and Exploitation](https://openreview.net/forum?id=YqtgKdW9dD)
  - Haozhe Ma, Fangling Li, Jing Yu Lim, Zhengding Luo, Thanh Vinh Vo, Tze-Yun Leong
  - Key: Reinforcement Learning, Reward Shaping, Exploration-Exploitation Balance
  - ExpEnv: Atari, VizDoom, MiniWorld

- [SENSEI: Semantic Exploration Guided by Foundation Models to Learn Versatile World Models](https://openreview.net/forum?id=ZDPNmihkMR)
  - Cansu Sancaktar, Christian Gumbsch, Andrii Zadaianchuk, Pavel Kolev, Georg Martius
  - Key: intrinsic motivation, exploration, foundation models, model-based RL
  - ExpEnv: MiniHack, Robodesk, Pokémon Red

- [Behavioral Exploration: Learning to Explore via In-Context Adaptation](https://openreview.net/forum?id=tlLkY9E2bZ)
  - Andrew Wagenmaker, Zhiyuan Zhou, Sergey Levine
  - Key: in-context learning, exploration, adaptive agents, behavior cloning
  - ExpEnv: D4RL AntMaze, D4RL Kitchen

- [Leveraging Skills from Unlabeled Prior Data for Efficient Online Exploration](https://openreview.net/forum?id=perpuTFEF7)
  - Max Wilcoxson, Qiyang Li, Kevin Frans, Sergey Levine
  - Key: Offline-to-online RL, Unsupervised Pre-training, Exploration
  - ExpEnv: D4RL, OGBench, Visual-AntMaze


</details>

### ICLR 2025

<details open>
<summary>(Click to Collapse)</summary>

- [Episodic Novelty Through Temporal Distance](https://openreview.net/forum?id=I7DeajDEx7)
  - Yuhua Jiang, Qihan Liu, Yiqin Yang, Xiaoteng Ma, Dianyu Zhong, Hao Hu, Jun Yang, Bin Liang, Bo XU, Chongjie Zhang, Qianchuan Zhao
  - Key: episodic novelty, temporal distance, exploration, reinforcement learning
  - ExpEnv: MiniGrid, MiniWorld, Craft, Maze, DMControl, etc.

- [Brain Bandit: A Biologically Grounded Neural Network for Efficient Control of Exploration](https://openreview.net/forum?id=RWJX5F5I9g)
  - Chen Jiang, Jiahui An, Yating Liu, Ni Ji
  - Key: explore-exploit, stochastic Hopfield network, Thompson sampling, decision under uncertainty, brain-inspired algorithm, reinforcement learning
  - ExpEnv: multi-armed bandit (MAB) tasks, MDP tasks

- [TOP-ERL: Transformer-based Off-Policy Episodic Reinforcement Learning](https://openreview.net/forum?id=N4NhVN30ph)
  - Ge Li, Dong Tian, Hongyi Zhou, Xinkai Jiang, Rudolf Lioutikov, Gerhard Neumann
  - Key: Value of sequences of actions, Reinforcement Learning, Transformer, Robot Manipulation, Movement Primitives
  - ExpEnv: Robot Learning Environments

- [Efficient Exploration and Discriminative World Model Learning with an Object-Centric Abstraction](https://openreview.net/forum?id=hgwGi81ndj)
  - Anthony GX-Chen, Kenneth Marino, Rob Fergus
  - Key: reinforcement learning, model based reinforcement learning, world model, exploration, hierarchy
  - ExpEnv: 2D crafting, MiniHack environments

- [MaxInfoRL: Boosting exploration in reinforcement learning through information gain maximization](https://openreview.net/forum?id=R4q3cY3kQf)
  - Bhavya Sukhija, Stelian Coros, Andreas Krause, Pieter Abbeel, Carmelo Sferrazza
  - Key: Reinforcement learning, Exploration in off-policy methods, Continuous control
  - ExpEnv: Continuous and visual control tasks
- [ActSafe: Active Exploration with Safety Constraints for Reinforcement Learning](https://openreview.net/forum?id=aKRADWBJ1I)
  - Yarden As, Bhavya Sukhija, Lenart Treven, Carmelo Sferrazza, Stelian Coros, Andreas Krause
  - Key: Safe Exploration, Constrained Markov Decision Processes, Safe Reinforcement Learning
  - ExpEnv: Standard safe deep RL benchmarks

- [A Single Goal is All You Need: Skills and Exploration Emerge from Contrastive RL without Rewards, Demonstrations, or Subgoals](https://openreview.net/forum?id=xCkgX4Xfu0)
  - Grace Liu, Michael Tang, Benjamin Eysenbach
  - Key: exploration, emergent skills, contrastive reinforcement learning, open-ended learning
  - ExpEnv: 2D maze navigation task

- [Risk Informed Policy Learning for Safer Exploration](https://openreview.net/forum?id=gJG4IPwg6l)
  - Kaustubh Mani, Vincent Mai, Charlie Gauthier, Annie S Chen, Samer B. Nashed, Liam Paull
  - Key: Reinforcement Learning, Safe Exploration, Representation Learning, Inductive Bias
  - ExpEnv: AdroitHandPen, PointGoal1 and PointButton1

- [Toward Efficient Multi-Agent Exploration With Trajectory Entropy Maximization](https://openreview.net/forum?id=YvKJGYL4j7)
  - Tianxu Li, Kun Zhu
  - Key: Multi-Agent Reinforcement Learning, Exploration, Cooperation, Trajectory Entropy Maximization
  - ExpEnv: Several MARL benchmarks

- [Beyond-Expert Performance with Limited Demonstrations: Efficient Imitation Learning with Double Exploration](https://openreview.net/forum?id=FviefuxmeW)
  - Heyang Zhao, Xingrui Yu, David Mark Bossens, Ivor Tsang, Quanquan Gu
  - Key: Reinforcement learning, imitation learning, Double Exploration
  - ExpEnv: Atari, MuJoCo

- [EgoSim: Egocentric Exploration in Virtual Worlds with Multi-modal Conditioning](https://openreview.net/forum?id=zAyS5aRKV8)
  - Wei Yu, Songheng Yin, Steve Easterbrook, Animesh Garg
  - Key: Controllable video generation, Egocentric video prediction, World model
  - ExpEnv: RealEstate, Epic-Field

</details>

### NeurIPS 2024

<details open>
<summary>(Click to Collapse)</summary>

- [PEAC: Unsupervised Pre-training for Cross-Embodiment Reinforcement Learning](https://openreview.net/forum?id=LyAFfdx8YF)
  - Chengyang Ying, Zhongkai Hao, Xinning Zhou, Xuezhou Xu, Hang Su, Xingxing Zhang, Jun Zhu
  - Key: Cross-embodiment Reinforcement Learning, Unsupervised Exploration, Skill Discovery, Intrinsic Reward
  - ExpEnv: DeepMind Control Suite, Robosuite, Isaacgym, Real-world locomotion

- [SeeA*: Efficient Exploration-Enhanced A* Search by Selective Sampling](https://openreview.net/forum?id=mSaqxZVZW8)
  - Dengwei Zhao, Shikui Tu, Lei Xu
  - Key: Enhances A* search by constructing a dynamic OPEN subset through selective sampling, enabling exploration of promising branches; theoretical and empirical efficiency improvements.
  - ExpEnv: Retrosynthetic planning (organic chemistry), logic synthesis (IC design), and Sokoban game.
- [Learning Formal Mathematics From Intrinsic Motivation](https://openreview.net/forum?id=uNKlTQ8mBD)
  - Gabriel Poesia, David Broman, Nick Haber, Noah Goodman
  - Key: Jointly learns to prove formal mathematical theorems and propose harder provable conjectures in a self-improving loop; utilizes dependent type theory and hindsight relabeling to improve sample efficiency.
  - ExpEnv: Propositional logic, arithmetic, and group theory.

- [RL-GPT: Integrating Reinforcement Learning and Code-as-policy](https://openreview.net/forum?id=LEzx6QRkRH)
  - Shaoteng Liu, Haoqi Yuan, Minda Hu, Yanwei Li, Yukang Chen, Shu Liu, Zongqing Lu, Jiaya Jia
  - Key: Two-level hierarchical framework combining reinforcement learning and large language models (LLMs); achieves high efficiency by integrating coding for high-level planning with RL for low-level actions.
  - ExpEnv: Minecraft and MineDojo tasks.

- [Beyond Optimism: Exploration With Partially Observable Rewards](https://arxiv.org/abs/2406.13909)
  - Simone Parisi, Alireza Kazemipour, Michael Bowling
  - Key: Reinforcement Learning, Partial Observability, Optimism, Exploration
  - ExpEnv: Tabular Environments (with and without unobservable rewards)

- [Exploring the Edges of Latent State Clusters for Goal-Conditioned Reinforcement Learning](https://arxiv.org/abs/2411.01396)
  - Yuanlin Duan, Guofeng Cui, He Zhu
  - Key: Goal-Conditioned Reinforcement Learning, Exploration, Latent Space Clustering
  - ExpEnv: Multi-legged Ant Maze, Robotic Arm Manipulation (cluttered tabletop), Anthropomorphic Hand Object Rotation

- [Subwords as Skills: Tokenization for Sparse-Reward Reinforcement Learning](https://arxiv.org/abs/2309.04459)
  - David Yunis, Justin Jung, Falcon Dai, Matthew Walter
  - Key: Sparse-Reward Reinforcement Learning, Skill Generation, Tokenization, Continuous Action Spaces
  - ExpEnv: Challenging Sparse-Reward Domains

- [Adaptive Exploration for Data-Efficient General Value Function Evaluations](https://arxiv.org/abs/2405.07838)
  - Arushi Jain, Josiah P. Hanna, Doina Precup
  - Key: General Value Functions, Reinforcement Learning, Data Efficiency
  - ExpEnv: Tabular settings, Nonlinear function approximation, Mujoco environments (stationary and non-stationary reward signals)
- [Leveraging Separated World Model for Exploration in Visually Distracted Environments](https://openreview.net/pdf/6972a2683764073195f725a7d18b19d8e88711da.pdf)
  - Kaichen Huang, Shenghua Wan, Minghao Shao, Hai-Hang Sun, Le Gan, Shuai Feng, De-Chuan Zhan
  - Key: Model-based Reinforcement Learning, Unsupervised Reinforcement Learning (URL), Visual Distractors, Bi-level Optimization
  - ExpEnv: Locomotion Tasks, Manipulation Tasks

- [Effective Exploration Based on the Structural Information Principles](https://arxiv.org/abs/2410.06621)
  - Xianghua Zeng, Hao Peng, Angsheng Li
  - Key: Reinforcement Learning, Structural Information, Effective Exploration, Intrinsic Reward
  - ExpEnv: MiniGrid, MetaWorld, DeepMind Control Suite

- [The Ladder in Chaos: Improving Policy Learning by Harnessing the Parameter Evolving Path in A Low-dimensional Space](https://openreview.net/pdf?id=3vHfwL2stG)
  - Hongyao Tang, Min Zhang, Chen Chen, Jianye Hao
  - Key: Reinforcement Learning, Policy Learning Dynamics, Temporal SVD, Low-dimensional Space
  - ExpEnv: MuJoCo, DeepMind Control Suite (DMC), MinAtar

- [Rethinking Exploration in Reinforcement Learning with Effective Metric-Based Exploration Bonus](https://openreview.net/pdf?id=QpKWFLtZKi)
  - Yiming Wang, Kaiyan Zhao, Furui Liu, Leong Hou U
  - Key: Reinforcement Learning, Exploration, Intrinsic Rewards, Metric-Based State Discrepancy
  - ExpEnv: Atari, Minigrid, Robosuite, Habitat
  - [code](https://github.com/YimingWangMingle/EME)

- [Offline Oracle-Efficient Learning for Contextual MDPs via Layerwise Exploration-Exploitation Tradeoff](https://openreview.net/pdf?id=848vuK2cKp)
  - Jian Qian, Haichen Hu, David Simchi-Levi
  - Key: Contextual Markov Decision Processes (CMDPs), Offline Density Estimation, Layerwise Exploration-Exploitation Tradeoff
  - ExpEnv: Reward-Free Reinforcement Learning

</details>


### ICML 2024

<details open>
<summary>(Click to Collapse)</summary>

- [Q-Star Meets Scalable Posterior Sampling: Bridging Theory and Practice via HyperAgent](https://proceedings.mlr.press/v235/li24by.html)
  - Yingru Li, Jiawei Xu, Lei Han, Zhi-Quan Luo
  - Key: ensembles, Thompson sampling, scalable exploration, regret analysis, complexity theory
  - ExpEnv: Atari, DeepSea

- [ACE: Off-Policy Actor-Critic with Causality-Aware Entropy Regularization](https://openreview.net/pdf?id=OwtMhMSybu)
  - Tianying Ji, Yongyuan Liang, Yan Zeng, Yu Luo, Guowei Xu, Jiawei Guo, Ruijie Zheng, Furong Huang, Fuchun Sun, Huazhe Xu
  - Key: Off-policy Actor-critic with Causality-aware Entropy regularization, exploration, causality-aware entropy regularization
  - ExpEnv: MetaWorld, DeepMind Control Suite, Dexterous Hand, Sparse Reward

- [Random Latent Exploration for Deep Reinforcement Learning](https://openreview.net/forum?id=Y9qzwNlKVU)
  - Srinath V. Mahankali, Zhang-Wei Hong, Ayush Sekhari, Alexander Rakhlin, Pulkit Agrawal
  - Key: Random Latent Exploration, perturbing rewards by adding structured random rewards to the original task rewards
  - ExpEnv: ATARI, ISAACGYM
- [Exploration and Anti-Exploration with Distributional Random Network Distillation](https://openreview.net/forum?id=rIrpzmqRBk)
  - Kai Yang, Jian Tao, Jiafei Lyu, Xiu Li
  - Key: bonus inconsistency, Distributional Random Network Distillation, exploration and anti-exploration
  - ExpEnv: Atari, Adroit, Fetch Manipulation Tasks

- [Breadth-First Exploration on Adaptive Grid for Reinforcement Learning](https://openreview.net/pdf?id=59MYoLghyk)
  - Youngsik Yoon, Gangbok Lee, Sungsoo Ahn, Jungseul Ok
  - Key: Breadth-First Exploration, Adaptive Grid, Exploration Efficiency
  - ExpEnv: GridWorld, Atari, Procgen

- [Just Cluster It: An Approach for Exploration in High-Dimensions using Clustering and Pre-Trained Representations](https://openreview.net/forum?id=cXBPPfNUZJ)
  - Stefan Sylvius Wagner, Stefan Harmeling
  - Key: representation-centric perspective on exploration, clustering, pre-trained representations
  - ExpEnv: VizDoom and Habitat

- [Provably Efficient Long-Horizon Exploration in Monte Carlo Tree Search through State Occupancy Regularization](https://openreview.net/forum?id=UCKFhc9SFC)
  - Liam Schramm, Abdeslam Boularias
  - Key: Monte Carlo Tree Search, Long-Horizon Exploration, State Occupancy Regularization
  - ExpEnv: robot navigation problems

- [Efficient Exploration for LLMs](https://openreview.net/forum?id=PpPZ6W7rxy)
  - Vikranth Dwaracherla, Seyed Mohammad Asghari, Botao Hao, Benjamin Van Roy
  - Key: exploration, large language models, efficient exploration
  - ExpEnv: Language Tasks

- [Scalable Online Exploration via Coverability](https://openreview.net/forum?id=C64clssMVU)
  - Philip Amortila, Dylan J Foster, Akshay Krishnamurthy
  - Key: L1-Coverage, Intrinsic complexity control, Efficient planning, Efficient exploration
  - ExpEnv: MountainCar

- [Uncertainty-Aware Reward-Free Exploration with General Function Approximation](https://openreview.net/forum?id=BvBdYSIkpb)
  - Junkai Zhang, Weitong Zhang, Dongruo Zhou, Quanquan Gu
  - Key: uncertainty-aware intrinsic reward, reward-free exploration, general function approximation
  - ExpEnv: DeepMind Control Suite

- [Constrained Ensemble Exploration for Unsupervised Skill Discovery](https://openreview.net/forum?id=AOJCCFTlfJ)
  - Chenjia Bai, Rushuai Yang, Qiaosheng Zhang, Kang Xu, Yi Chen, Ting Xiao, Xuelong Li
  - Key: constrained ensemble exploration, unsupervised skill discovery, partition exploration based on the state prototypes
  - ExpEnv: URLB tasks, maze

- [Bayesian Exploration Networks](https://openreview.net/forum?id=OYw6sS8QmL)
  - Mattie Fellows, Brandon Gary Kaplowitz, Christian Schroeder de Witt, Shimon Whiteson
  - Key: Bayesian Exploration Networks, exploration, uncertainty estimation
  - ExpEnv: a novel search and rescue gridworld MDP

- [Geometric Active Exploration in Markov Decision Processes: the Benefit of Abstraction](https://openreview.net/pdf?id=2JYOxcGlRe)
  - Riccardo De Santi, Federico Arangath Joseph, Noah Liniger, Mirco Mutti, Andreas Krause
  - Key: geometric active exploration, abstraction, exploration efficiency
  - ExpEnv: environments motivated by problems in scientific discovery
- [Fast Peer Adaptation with Context-aware Exploration](https://openreview.net/forum?id=CBcNl5Eo32)
  - Long Ma, Yuanfei Wang, Fangwei Zhong, Song-Chun Zhu, Yizhou Wang
  - Key: peer adaptation, context-aware exploration, fast adaptation
  - ExpEnv: competitive (Kuhn Poker), cooperative (PO-Overcooked), or mixed (Predator-Prey-W) games

- [Individual Contributions as Intrinsic Exploration Scaffolds for Multi-agent Reinforcement Learning](https://openreview.net/pdf?id=zCmMkWK4Ly)
  - Xinran Li, Zifan LIU, Shibo Chen, Jun Zhang
  - Key: individual contributions, intrinsic exploration, multi-agent reinforcement learning
  - ExpEnv: Google Research Football, SMAC

</details>

### ICLR 2024

<details open>
<summary>(Click to Collapse)</summary>

- [Unlocking the Power of Representations in Long-term Novelty-based Exploration](https://openreview.net/pdf?id=OwtMhMSybu)
  - Alaa Saade, Steven Kapturowski, Daniele Calandriello, Charles Blundell, Pablo Sprechmann, Leopoldo Sarra, Oliver Groth, Michal Valko, Bilal Piot
  - Key: Robust Exploration via Clustering-based Online Density Estimation
  - ExpEnv: Atari, DM-HARD-8

- [A Theoretical Explanation of Deep RL Performance in Stochastic Environments](https://openreview.net/forum?id=5ES5Hdlbxw)
  - Cassidy Laidlaw, Banghua Zhu, Stuart Russell, Anca Dragan
  - Key: Stochastic Environments, effective horizon, RL theory, instance-dependent bounds, empirical validation of theory
  - ExpEnv: BRIDGE

- [DrM: Mastering Visual Reinforcement Learning through Dormant Ratio Minimization](https://openreview.net/forum?id=MSe8YFbhUE)
  - Guowei Xu, Ruijie Zheng, Yongyuan Liang, Xiyao Wang, Zhecheng Yuan, Tianying Ji, Yu Luo, Xiaoyu Liu, Jiaxin Yuan, Pu Hua, Shuzhen Li, Yanjie Ze, Hal Daumé III, Furong Huang, Huazhe Xu
  - Key: Visual RL, Dormant Ratio Minimization, Exploration
  - ExpEnv: DeepMind Control Suite, MetaWorld, and Adroit

- [METRA: Scalable Unsupervised RL with Metric-Aware Abstraction](https://openreview.net/forum?id=c5pwL0Soay)
  - Seohong Park, Oleh Rybkin, Sergey Levine
  - Key: unsupervised RL, metric-aware abstraction, scalable exploration
  - ExpEnv: state-based Ant and HalfCheetah, Kitchen

- [Text2Reward: Reward Shaping with Language Models for Reinforcement Learning](https://openreview.net/forum?id=tUM39YTRxH)
  - Tianbao Xie, Siheng Zhao, Chen Henry Wu, Yitao Liu, Qian Luo, Victor Zhong, Yanchao Yang, Tao Yu
  - Key: reward shaping, language models, text-based reward shaping
  - ExpEnv: MUJOCO, MANISKILL2, METAWORLD

- [Pre-Training Goal-based Models for Sample-Efficient Reinforcement Learning](https://openreview.net/forum?id=o2IEmeLL9r)
  - Haoqi Yuan, Zhancun Mu, Feiyang Xie, Zongqing Lu
  - Key: goal-based models, pre-training, sample efficiency
  - ExpEnv: Kitchen, Minecraft.

- [Efficient Episodic Memory Utilization of Cooperative Multi-Agent Reinforcement Learning](https://openreview.net/forum?id=LjivA1SLZ6)
  - Hyungho Na, Yunkyeong Seo, Il-chul Moon
  - Key: episodic memory, cooperative multi-agent, efficient utilization
  - ExpEnv: StarCraft II and Google Research Football

- [Simple Hierarchical Planning with Diffusion](https://openreview.net/forum?id=kXHEBK9uAY)
  - Chang Chen, Fei Deng, Kenji Kawaguchi, Caglar Gulcehre, Sungjin Ahn
  - Key: hierarchical planning, diffusion, exploration
  - ExpEnv: Maze2D and AntMaze
- [Sample Efficient Myopic Exploration Through Multitask Reinforcement Learning with Diverse Tasks](https://openreview.net/forum?id=YZrg56G0JV)
  - Ziping Xu, Zifan Xu, Runxuan Jiang, Peter Stone, Ambuj Tewari
  - Key: myopic exploration, multitask reinforcement learning, diverse tasks
  - ExpEnv: synthetic robotic control environment

- [PAE: Reinforcement Learning from External Knowledge for Efficient Exploration](https://openreview.net/forum?id=R7rZUSGOPD)
  - Zhe Wu, Haofei Lu, Junliang Xing, You Wu, Renye Yan, Yaozhong Gan, Yuanchun Shi
  - Key: external knowledge, efficient exploration, reinforcement learning
  - ExpEnv: BabyAI and MiniHack

- [In-context Exploration-Exploitation for Reinforcement Learning](https://openreview.net/forum?id=uIKZSStON3)
  - Zhenwen Dai, Federico Tomasi, Sina Ghiassian
  - Key: in-context exploration-exploitation, reinforcement learning, exploration-exploitation trade-off
  - ExpEnv: Dark Room, Dark Key-to-Door, Dark Room (Biased).

- [Transformers as Decision Makers: Provable In-Context Reinforcement Learning via Supervised Pretraining](https://openreview.net/pdf?id=yN4Wv17ss3)
  - Licong Lin, Yu Bai, Song Mei
  - Key: transformers, decision makers, in-context reinforcement learning
  - ExpEnv: Linear bandit, Bernoulli bandits.

- [Learning to Act without Actions](https://openreview.net/pdf?id=rvUq3cxpDF)
  - Dominik Schmidt, Minqi Jiang
  - Key: recovering latent action information, video, pre-training
  - ExpEnv: Procgen

- [Consciousness-Inspired Spatio-Temporal Abstractions for Better Generalization in Reinforcement Learning](https://openreview.net/pdf?id=eo9dHwtTFt)
  - Mingde Zhao, Safa Alver, Harm van Seijen, Romain Laroche, Doina Precup, Yoshua Bengio
  - Key: spatio-temporal abstractions, hierarchical planning, task/goal decomposition
  - ExpEnv: MiniGrid-BabyAI

</details>

### NeurIPS 2023

<details open>
<summary>(Click to Collapse)</summary>

- [Maximize to Explore: One Objective Function Fusing Estimation, Planning, and Exploration](https://openreview.net/pdf?id=A57UMlUJdc)
  - Zhihan Liu, Miao Lu, Wei Xiong, Han Zhong, Hao Hu, Shenao Zhang, Sirui Zheng, Zhuoran Yang, Zhaoran Wang
  - Key: a single objective that integrates the estimation and planning components, balancing exploration and exploitation automatically, sublinear regret
  - ExpEnv: MuJoCo with sparse reward

- [On the Importance of Exploration for Generalization in Reinforcement Learning](https://openreview.net/pdf?id=y5duN2j9s6)
  - Yiding Jiang, J Zico Kolter, Roberta Raileanu
  - Key: exploration, generalization, Exploration via Distributional Ensemble
  - ExpEnv: tabular contextual MDP, Procgen and Crafter

- [Monte Carlo Tree Search with Boltzmann Exploration](https://openreview.net/pdf?id=NG4DaApavi)
  - Michael Painter, Mohamed Baioumy, Nick Hawes, Bruno Lacerda
  - Key: Boltzmann exploration with MCTS, optimal actions for the maximum entropy objective do not necessarily correspond to optimal actions for the original objective, two improved algorithms.
  - ExpEnv: the Frozen Lake environment, the Sailing Problem, Go

- [Breadcrumbs to the Goal: Supervised Goal Selection from Human-in-the-Loop Feedback](https://arxiv.org/abs/2307.11049)
  - Marcel Torne Villasevil, Max Balsells I Pamies, Zihan Wang, Samedh Desai, Tao Chen, Pulkit Agrawal, Abhishek Gupta
  - Key: human-in-the-loop feedback, bifurcating human feedback and policy learning
  - ExpEnv: Bandu, Block Stacking, Kitchen, and Pusher, Four rooms and Maze
- [MIMEx: Intrinsic Rewards from Masked Input Modeling](https://openreview.net/pdf?id=g1dMYenhe4)
  - Toru Lin, Allan Jabri
  - Key: pseudo-likelihood estimation with different mask distributions
  - ExpEnv: PixMC-Sparse, DeepMind Control suite

- [Accelerating Exploration with Unlabeled Prior Data](https://openreview.net/pdf?id=Itorzn4Kwf)
  - Qiyang Li, Jason Zhang, Dibya Ghosh, Amy Zhang, Sergey Levine
  - Key: prior data without reward labels, learns a reward model from online experience, labels the unlabeled prior data with optimistic rewards
  - ExpEnv: AntMaze domain, Adroit hand manipulation domain, and a visual simulated robotic manipulation domain.

- [On the Convergence and Sample Complexity Analysis of Deep Q-Networks with ε-Greedy Exploration](https://openreview.net/pdf?id=HWGWeaN76q)
  - Shuai Zhang, Hongkang Li, Meng Wang, Miao Liu, Pin-Yu Chen, Songtao Lu, Sijia Liu, Keerthiram Murugesan, Subhajit Chaudhury
  - Key: ε-greedy exploration, convergence, sample complexity
  - ExpEnv: Numerical Experiments

- [Pitfall of Optimism: Distributional Reinforcement Learning by Randomizing Risk Criterion](https://openreview.net/pdf?id=v8u3EFAyW9)
  - Taehyun Cho, Seungyub Han, Heesoo Lee, Kyungjae Lee, Jungwoo Lee
  - Key: distributional reinforcement learning, randomizing risk criterion, optimistic exploration
  - ExpEnv: Atari 55 games.

- [CQM: Curriculum Reinforcement Learning with a Quantized World Model](https://openreview.net/pdf?id=tcotyjon2a)
  - Seungjae Lee, Daesol Cho, Jonghae Park, H. Jin Kim
  - Key: curriculum reinforcement learning, quantized world model
  - ExpEnv: PointNMaze

- [Safe Exploration in Reinforcement Learning: A Generalized Formulation and Algorithms](https://openreview.net/pdf?id=dQLsvKNwZC)
  - Akifumi Wachi, Wataru Hashimoto, Xun Shen, Kazumune Hashimoto
  - Key: safe exploration, generalized formulation, safe exploration algorithms, Meta-Algorithm for Safe Exploration
  - ExpEnv: grid-world and Safety Gym

- [Successor-Predecessor Intrinsic Exploration](https://openreview.net/pdf?id=ohKbQp0jIY)
  - Changmin Yu, Neil Burgess, Maneesh Sahani, Samuel J. Gershman
  - Key: retrospective structure of transition sequences, combining prospective and retrospective information
  - ExpEnv: grid worlds, MountainCar, Atari
- [Accelerating Reinforcement Learning with Value-Conditional State Entropy Exploration](https://arxiv.org/abs/2305.19476)
  - Dongyoung Kim, Jinwoo Shin, Pieter Abbeel, Younggyo Seo
  - Key: value-conditional state entropy exploration
  - ExpEnv: MiniGrid, DeepMind Control Suite, and Meta-World

- [ELDEN: Exploration via Local Dependencies](https://openreview.net/pdf?id=sL4pJBXkxu)
  - Zizhao Wang, Jiaheng Hu, Peter Stone, Roberto Martín-Martín
  - Key: local dependencies, exploration bonus, intrinsic motivation, encourages the discovery of new interactions between entities
  - ExpEnv: 2D grid worlds to 3D robotic tasks

</details>

### ICML 2023

<details open>
<summary>(Click to Collapse)</summary>

- [A Study of Global and Episodic Bonuses for Exploration in Contextual MDPs](https://openreview.net/pdf?id=1CqtvwHTKQ)
  - Mikael Henaff, Minqi Jiang, Roberta Raileanu
  - Key: global novelty bonuses, episodic novelty bonuses, shared structure
  - ExpEnv: Mini-Hack suite, Habitat and Montezuma’s Revenge

- [Curiosity in Hindsight: Intrinsic Exploration in Stochastic Environments](https://openreview.net/pdf?id=fIH2G4fnSy)
  - Daniel Jarrett, Corentin Tallec, Florent Altché, Thomas Mesnard, Rémi Munos, Michal Valko
  - Key: stochastic environments, disentangle “noise” from “novelty”, BYOL-Hindsight
  - ExpEnv: Pycolab Maze, Atari, Bank Heist

- [Representations and Exploration for Deep Reinforcement Learning using Singular Value Decomposition](https://openreview.net/pdf?id=p9wFuLpp0O)
  - Yash Chandak, Shantanu Thakoor, Zhaohan Daniel Guo, Yunhao Tang, Remi Munos, Will Dabney, Diana Borsa
  - Key: singular value decomposition, relative frequency of state visitations, scale this decomposition method to large-scale domains
  - ExpEnv: DMLab-30, DM-Hard-8

- [Reparameterized Policy Learning for Multimodal Trajectory Optimization](https://openreview.net/pdf?id=5Akrk9Ln6N)
  - Zhiao Huang, Litian Liang, Zhan Ling, Xuanlin Li, Chuang Gan, Hao Su
  - Key: multimodal policy parameterization, a generative model of optimal trajectories
  - ExpEnv: bandit, MetaWorld, 2D maze

- [Flipping Coins to Estimate Pseudocounts for Exploration in Reinforcement Learning](https://openreview.net/pdf?id=4RvcXByvnR)
  - Sam Lobel, Akhil Bagaria, George Konidaris
  - Key: count-based exploration, averaging samples from the Rademacher distribution (or coin flips)
  - ExpEnv: Atari, D4RL, FETCH

- [Fast Rates for Maximum Entropy Exploration](https://openreview.net/pdf?id=wcUppxYfLH)
  - Daniil Tiapkin, Denis Belomestny, Daniele Calandriello, Éric Moulines, Rémi Munos, Alexey Naumov, Pierre Perrault, Yunhao Tang, Michal Valko, Pierre Ménard
  - Key: visitation entropy maximization, game-theoretic algorithm, trajectory entropy
  - ExpEnv: Double Chain MDP

- [Guiding Pretraining in Reinforcement Learning with Large Language Models](https://openreview.net/pdf?id=63704LH4v5)
  - Yuqing Du, Olivia Watkins, Zihan Wang, Cédric Colas, Trevor Darrell, Pieter Abbeel, Abhishek Gupta, Jacob Andreas
  - Key: uses background knowledge from text corpora to shape exploration, rewards an agent for achieving goals suggested by a language model prompted with a description of the agent’s current state.
  - ExpEnv: Crafter, Housekeep
- [Do Embodied Agents Dream of Pixelated Sheep?: Embodied Decision Making using Language Guided World Modelling](https://openreview.net/pdf?id=Rm5Qi57C5I)
  - Kolby Nottingham, Prithviraj Ammanabrolu, Alane Suhr, Yejin Choi, Hannaneh Hajishirzi, Sameer Singh, Roy Fox
  - Key: Abstract World Model (AWM) for planning and exploration, LLM-guided exploration, Dream phase and Wake phase
  - ExpEnv: Minecraft

- [Cell-Free Latent Go-Explore](https://openreview.net/pdf?id=4TtG42xJvC)
  - Quentin Gallouédec, Emmanuel Dellandréa
  - Key: Latent Go-Explore, a learned latent representation
  - ExpEnv: 2D maze, panda-gym, Atari

- [Go Beyond Imagination: Maximizing Episodic Reachability with World Models](https://openreview.net/pdf?id=JsAMuzA9o2)
  - Yao Fu, Run Peng, Honglak Lee
  - Key: an episodic intrinsic reward that is designed to maximize the stepwise reachability expansion
  - ExpEnv: Minigrid, DeepMind Control Suite

- [Efficient Online Reinforcement Learning with Offline Data](https://openreview.net/pdf?id=h11j9w1ucU)
  - Philip J. Ball, Laura Smith, Ilya Kostrikov, Sergey Levine
  - Key: Sample efficiency and exploration, simply apply existing off-policy methods to leverage offline data when learning online, key factors that most affect performance, a set of recommendations
  - ExpEnv: D4RL AntMaze, Locomotion, Adroit

- [Anti-Exploration by Random Network Distillation](https://openreview.net/pdf?id=NRQ5lC8Dit)
  - Alexander Nikulin, Vladislav Kurenkov, Denis Tarasov, Sergey Kolesnikov
  - Key: an uncertainty estimator, anti-exploration bonus, Feature-wise Linear Modulation
  - ExpEnv: D4RL

- [The Impact of Exploration on Convergence and Performance of Multi-Agent Q-Learning Dynamics](https://openreview.net/pdf?id=agPrVQdnxT)
  - Aamal Hussain, Francesco Belardinelli, Dario Paccagnan
  - Key: How does exploration affect reinforcement learning dynamics in arbitrary games, even if convergence to an equilibrium cannot be guaranteed?
  - ExpEnv: Network Shapley Game, Network Chakraborty Game, Arbitrary Games

- [An Adaptive Entropy-Regularization Framework for Multi-Agent Reinforcement Learning](https://openreview.net/pdf?id=MP7HOGfLf3)
  - Woojun Kim, Youngchul Sung
  - Key: adaptive entropy-regularization framework, proper level of exploration entropy, disentangled value function
  - ExpEnv: SMAC, multi-agent HalfCheetah

- [Lazy Agents: A New Perspective on Solving Sparse Reward Problem in Multi-agent Reinforcement Learning](https://openreview.net/pdf?id=DRu5BlRqrn)
  - Boyin Liu, Zhiqiang Pu, Yi Pan, Jianqiang Yi, Yanyan Liang, Du Zhang
  - Key: Lazy Agents Avoidance through Influencing External States, individual diligence intrinsic motivation (IDI) and collaborative diligence intrinsic motivation (CDI), external states transition model
  - ExpEnv: SMAC, Google Research Football

- [Automatic Intrinsic Reward Shaping for Exploration in Deep Reinforcement Learning](https://openreview.net/pdf?id=UyJJ1pnb0y)
  - Mingqi Yuan, Bo Li, Xin Jin, Wenjun Zeng
  - Key: selects shaping function from a predefined set, an intrinsic reward toolkit
  - ExpEnv: MiniGrid, Procgen, and DeepMind Control Suite

- [LESSON: Learning to Integrate Exploration Strategies for Reinforcement Learning via an Option Framework](https://openreview.net/pdf?id=vXcvrYJlVm)
  - Woojun Kim, Jeonghye Kim, Youngchul Sung
  - Key: option-critic model, adaptively select the most effective exploration strategy
  - ExpEnv: MiniGrid and Atari

</details>
### ICLR 2023

<details open>
<summary>(Click to Collapse)</summary>

- [Learnable Behavior Control: Breaking Atari Human World Records via Sample-Efficient Behavior Selection](https://openreview.net/pdf?id=FeWvD0L_a4) (Oral: 10, 8, 8)
  - Jiajun Fan, Yuzheng Zhuang, Yuecheng Liu, Jianye HAO, Bin Wang, Jiangcheng Zhu, Hao Wang, Shu-Tao Xia
  - Key: Learnable Behavioral Control, hybrid behavior mapping, a unified learnable process for behavior selection, bandit-based metacontrollers
  - ExpEnv: [Atari](https://github.com/openai/gym)

- [The Role of Coverage in Online Reinforcement Learning](https://openreview.net/pdf?id=LQIjzPdDt3q) (Oral: 8, 8, 5)
  - Tengyang Xie, Dylan J Foster, Yu Bai, Nan Jiang, Sham M. Kakade
  - Key: coverage conditions, data logging distribution, sample-efficient exploration, sequential extrapolation coefficient
  - ExpEnv: None

- [Near-optimal Policy Identification in Active Reinforcement Learning](https://openreview.net/forum?id=3OR2tbtnYC-) (Oral: 8, 8, 8)
  - Xiang Li, Viraj Mehta, Johannes Kirschner, Ian Char, Willie Neiswanger, Jeff Schneider, Andreas Krause, Ilija Bogunovic
  - Key: kernelized least-squares value iteration, combines optimism with pessimism for active exploration
  - ExpEnv: Cartpole, Navigation, Tracking, Rotation, Branin-Hoo, Hartmann

- [Planning Goals for Exploration](https://openreview.net/pdf?id=6qeBuZSo7Pr) (Spotlight: 8, 8, 8, 8, 6)
  - Edward S. Hu, Richard Chang, Oleh Rybkin, Dinesh Jayaraman
  - Key: goal-conditioned, planning exploratory goals, world models, sampling-based planning algorithms
  - ExpEnv: Point Maze, Walker, Ant Maze, 3-Block Stacking
Hu, Richard Chang, Oleh Rybkin, Dinesh Jayaraman\n  - Key: goal-conditioned, planning exploratory goals, world models, sampling-based planning algorithms\n  - ExpEnv: Point Maze, Walker, Ant Maze, 3-Block Stacking\n\n- [Pink Noise Is All You Need: Colored Noise Exploration in Deep Reinforcement Learning](https:\u002F\u002Fopenreview.net\u002Fpdf?id=hQ9V5QN27eS) (Spotlight: 8, 8, 8)\n  - Onno Eberhard, Jakob Hollenstein, Cristina Pinneri, Georg Martius\n  - Key: continuous action spaces, temporally correlated noise, colored noise\n  - ExpEnv: [DeepMind Control Suite](https:\u002F\u002Fgithub.com\u002Fdeepmind\u002Fdm_control), [Atari](https:\u002F\u002Fgithub.com\u002Fopenai\u002Fgym), Adroit hand suite\n\n
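A minimal sketch of the colored-noise idea above (ours, not the authors' code; the spectral shaping and the unit-variance normalization are illustrative choices) draws temporally correlated exploration noise by shaping white noise in the frequency domain, with beta = 1 giving pink noise:\n\n```python\nimport numpy as np\n\ndef colored_noise(beta, n_steps, rng):\n    # Noise with power spectrum proportional to f**(-beta):\n    # beta = 0 is white, beta = 1 pink, beta = 2 red (OU-like) noise.\n    freqs = np.fft.rfftfreq(n_steps)\n    freqs[0] = freqs[1]                  # avoid a zero frequency at DC\n    amplitudes = freqs ** (-beta * 0.5)  # shape the amplitude spectrum\n    phases = rng.uniform(0, 2 * np.pi, size=freqs.shape)\n    noise = np.fft.irfft(amplitudes * np.exp(1j * phases), n=n_steps)\n    return noise \u002F noise.std()        # normalize to unit variance\n\nrng = np.random.default_rng(0)\neps = colored_noise(beta=1.0, n_steps=1000, rng=rng)  # one episode of pink noise\n# a_t = policy_mean(s_t) + sigma * eps[t]  # hypothetical use with a deterministic policy\n```\n\n- [Learning About Progress From Experts](https:\u002F\u002Fopenreview.net\u002Fpdf?id=sKc6fgce1zs) (Spotlight: 8, 8, 6)\n  - Jake Bruce, Ankit Anand, Bogdan Mazoure, Rob Fergus\n  - Key: the use of expert demonstrations, long-horizon tasks, learn a monotonically increasing function that summarizes progress.\n  - ExpEnv: [NetHack](https:\u002F\u002Fgithub.com\u002FNetHack\u002FNetHack)\n\n- [DEP-RL: Embodied Exploration for Reinforcement Learning in Overactuated and Musculoskeletal Systems](https:\u002F\u002Fopenreview.net\u002Fpdf?id=C-xa_D3oTj6) (Spotlight: 10, 8, 8, 8)\n  - Pierre Schumacher, Daniel Haeufle, Dieter Büchler, Syn Schmitt, Georg Martius\n  - Key: large overactuated action spaces, differential extrinsic plasticity, state-space covering exploration.\n  - ExpEnv: musculoskeletal systems: torquearm, arm26, humanreacher, ostrich-foraging, ostrich-run, human-run, human-hop\n\n- [Does Zero-Shot Reinforcement Learning Exist?](https:\u002F\u002Fopenreview.net\u002Fpdf?id=MYEap_OcQI) (Spotlight: 10, 8, 8, 3)\n  - Ahmed Touati, Jérémy Rapin, Yann Ollivier\n  - Key: zero-shot RL agent, disentangle universal representation learning from exploration, SFs with Laplacian eigenfunctions.\n  - ExpEnv: Unsupervised RL and ExORL benchmarks\n\n- [Human-level Atari 200x faster](https:\u002F\u002Fopenreview.net\u002Fpdf?id=JtC6yOHRoJJ) (Poster: 8, 8, 3)\n  - Steven Kapturowski, Víctor Campos, Ray Jiang, Nemanja Rakicevic, Hado van Hasselt, Charles Blundell, Adria Puigdomenech Badia\n  - Key: 200-fold reduction of experience, a more robust and efficient agent\n  - ExpEnv: Atari 57\n\n- [Learning Achievement Structure for Structured Exploration in Domains with Sparse Reward](https:\u002F\u002Fopenreview.net\u002Fpdf?id=NDWl9qcUpvy) (Poster: 8, 8, 5, 5)\n  - Zihan Zhou, Animesh Garg\n  - Key: achievement-based environments, recovered dependency graph\n  - ExpEnv: Crafter, TreeMaze\n\n- [Safe Exploration Incurs Nearly No Additional Sample Complexity for Reward-Free RL](https:\u002F\u002Fopenreview.net\u002Fpdf?id=wNUgn1n6esQ) (Poster: 8, 8, 6, 6)\n  - Ruiquan Huang, Jing Yang, Yingbin Liang\n  - Key: reward-free reinforcement learning, reduce the uncertainty in the estimated model with minimum number of trajectories.\n  - ExpEnv: tabular MDPs, Low-rank MDP\n\n- [Latent State Marginalization as a Low-cost Approach to Improving Exploration](https:\u002F\u002Fopenreview.net\u002Fpdf?id=b0UksKFcTOL) (Poster: 6, 6, 6)\n  - Dinghuai Zhang, Aaron Courville, Yoshua Bengio, Qinqing Zheng, Amy Zhang, Ricky T. Q. 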
Chen\n  - Key: adoption of latent variable policies within the MaxEnt framework, low-cost marginalization of the latent state\n  - ExpEnv: [DeepMind Control Suite](https:\u002F\u002Fgithub.com\u002Fdeepmind\u002Fdm_control)\n\n- [Revisiting Curiosity for Exploration in Procedurally Generated Environments](https:\u002F\u002Fopenreview.net\u002Fpdf?id=j3GK3_xZydY) (Poster: 8, 8, 5, 3, 3)\n  - Kaixin Wang, Kuangqi Zhou, Bingyi Kang, Jiashi Feng, Shuicheng YAN\n  - Key: lifelong intrinsic rewards and episodic intrinsic rewards, the performance of all lifelong-episodic combinations\n  - ExpEnv: [MiniGrid](https:\u002F\u002Fgithub.com\u002FFarama-Foundation\u002FMinigrid)\n\n- [MoDem: Accelerating Visual Model-Based Reinforcement Learning with Demonstrations](https:\u002F\u002Fopenreview.net\u002Fpdf?id=JdTnc9gjVfJ) (Poster: 8, 6, 6, 6)\n  - Nicklas Hansen, Yixin Lin, Hao Su, Xiaolong Wang, Vikash Kumar, Aravind Rajeswaran\n  - Key: Key ingredients for leveraging demonstrations in model learning\n  - ExpEnv: Adroit, Meta-World, [DeepMind Control Suite](https:\u002F\u002Fgithub.com\u002Fdeepmind\u002Fdm_control)\n\n- [Simplifying Model-based RL: Learning Representations, Latent-space Models, and Policies with One Objective](https:\u002F\u002Fopenreview.net\u002Fpdf?id=MQcmfgRxf7a) (Poster: 8, 6, 6, 6, 6)\n  - Raj Ghugare, Homanga Bharadhwaj, Benjamin Eysenbach, Sergey Levine, Russ Salakhutdinov\n  - Key: alignment between these auxiliary objectives and the RL objective, a lower bound on expected returns\n  - ExpEnv: model-based benchmark\n\n- [EUCLID: Towards Efficient Unsupervised Reinforcement Learning with Multi-choice Dynamics Model](https:\u002F\u002Fopenreview.net\u002Fpdf?id=9-tjK93-rP) (Poster: 6, 6, 6, 6)\n  - Yifu Yuan, Jianye HAO, Fei Ni, Yao Mu, YAN ZHENG, Yujing Hu, Jinyi Liu, Yingfeng Chen, Changjie Fan\n  - Key: transition dynamics modeling, multi-choice dynamics model, sampling efficiency\n  - ExpEnv: URLB\n\n- [Guarded Policy Optimization with Imperfect Online Demonstrations](https:\u002F\u002Fopenreview.net\u002Fpdf?id=O5rKg7IRQIO) (Oral: 8, 8, 6, 5)\n  - Zhenghai Xue, Zhenghao Peng, Quanyi Li, Zhihan Liu, Bolei Zhou\n  - Key: teacher-student shared control, safety guarantee and exploration guidance, trajectory-based value estimation\n  - ExpEnv: MetaDrive\n\n\u003C\u002Fdetails>\n\n### NeurIPS 2022\n\n\u003Cdetails open>\n\u003Csummary>(Click to Collapse)\u003C\u002Fsummary>\n\n- [Redeeming Intrinsic Rewards via Constrained Optimization](https:\u002F\u002Fwilliamd4112.github.io\u002Fpubs\u002Fneurips22_eipo.pdf) (Poster: 8, 7, 7)\n  - Eric Chen, Zhang-Wei Hong, Joni Pajarinen, Pulkit Agrawal\n  - Key: automatically tunes the importance of the intrinsic reward, principled constrained policy optimization procedure\n  - ExpEnv: Atari\n\n- [You Only Live Once: Single-Life Reinforcement Learning via Learned Reward Shaping](https:\u002F\u002Fopenreview.net\u002Fpdf?id=303XqIQ5c_d) (Poster: 6, 6, 5, 5)\n  - Annie S. 
Chen, Archit Sharma, Sergey Levine, Chelsea Finn\n  - Key: single-life reinforcement learning, Q-weighted adversarial learning (QWALE), distribution matching strategy\n  - ExpEnv: Tabletop-Organization, Pointmass, modified HalfCheetah, modified Franka-Kitchen\n\n- [Curious Exploration via Structured World Models Yields Zero-Shot Object Manipulation](https:\u002F\u002Fopenreview.net\u002Fpdf?id=NnuYZ1el24C) (Poster: 8, 7, 6)\n  - Cansu Sancaktar, Sebastian Blaes, Georg Martius\n  - Key: self-reinforcing cycle between good models and good exploration, zero-shot generalization to downstream tasks via model-based planning\n  - ExpEnv: Playground, Fetch Pick & Place Construction\n\n- [Model-based Lifelong Reinforcement Learning with Bayesian Exploration](https:\u002F\u002Fopenreview.net\u002Fpdf?id=6I3zJn9Slsb) (Poster: 7, 6, 6)\n  - Haotian Fu, Shangqun Yu, Michael Littman, George Konidaris\n  - Key: hierarchical Bayesian posterior\n  - ExpEnv:  HiP-MDP versions of Mujoco, Meta-world\n  \n- [On the Statistical Efficiency of Reward-Free Exploration in Non-Linear RL](https:\u002F\u002Fopenreview.net\u002Fpdf?id=65eqtvEShR8) (Poster: 7, 6, 5, 5)\n  - Jinglin Chen, Aditya Modi, Akshay Krishnamurthy, Nan Jiang, Alekh Agarwal\n  - Key: sample-efficient reward-free exploration, explorability or reachability assumptions\n  - ExpEnv: None\n\n- [DOPE: Doubly Optimistic and Pessimistic Exploration for Safe Reinforcement Learning](https:\u002F\u002Fopenreview.net\u002Fforum?id=U4BUMoVTrB2) (Poster: 8, 7, 4)\n  - Archana Bura, Aria Hasanzadezonuzy, Dileep Kalathil, Srinivas Shakkottai, Jean-Francois Chamberland\n  - Key: model-based safe RL, finite-horizon Constrained Markov Decision Process, reward bonus for exploration (optimism) with a conservative constraint (pessimism)\n  - ExpEnv: Factored CMDP environment\n\n- [Bayesian Optimistic Optimization: Optimistic Exploration for Model-based Reinforcement Learning](https:\u002F\u002Fopenreview.net\u002Fpdf?id=GdHVClGh9N)\n  - Chenyang Wu, Tianci Li, Zongzhang Zhang, Yang Yu\n  - Key: Optimism in the face of uncertainty (OFU), Bayesian optimistic optimization\n  - ExpEnv:  RiverSwim, Chain, Random MDPs.\n\n- [Active Exploration for Inverse Reinforcement Learning](https:\u002F\u002Fopenreview.net\u002Fforum?id=TPOJzwv2pc) (Poster: 7, 7, 7, 7)\n  - David Lindner, Andreas Krause, Giorgia Ramponi \n  - Key: actively explores an unknown environment and expert policy,  does not require a generative model of the environment\n  - ExpEnv: Four Paths, Random MDPs, Double Chain, Chain, Gridworld\n\n- [Exploration-Guided Reward Shaping for Reinforcement Learning under Sparse Rewards](https:\u002F\u002Fopenreview.net\u002Fpdf?id=W7HvKO1erY) (Poster: 6, 6, 4)\n  - Rati Devidze, Parameswaran Kamalaruban, Adish Singla\n  - Key: reward shaping, intrinsic reward function, exploration-based bonuses.\n  - ExpEnv: Chain, Room, Linek\n\n- [Monte Carlo Augmented Actor-Critic for Sparse Reward Deep Reinforcement Learning from Suboptimal Demonstrations](https:\u002F\u002Fopenreview.net\u002Fpdf?id=FLzTj4ia8BN) (Poster: 6, 6, 5, 5)\n  - Albert Wilcox, Ashwin Balakrishna, Jules Dedieu, Wyame Benslimane, Daniel S. 
Brown, Ken Goldberg\n  - Key: parameter free, the maximum of the standard TD target and a Monte Carlo estimate of the reward-to-go.\n  - ExpEnv: Pointmass Navigation, Block Extraction, Sequential Pushing, Door Opening, Block Lifting\n  \n- [Incentivizing Combinatorial Bandit Exploration](https:\u002F\u002Fopenreview.net\u002Fpdf?id=ITXgYOFi8b) (Poster: 7, 6, 5, 3)\n  - Xinyan Hu, Dung Daniel Ngo, Aleksandrs Slivkins, Zhiwei Steven Wu\n  - Key: incentivized exploration, large, structured action sets and highly correlated beliefs, combinatorial semi-bandits.\n  - ExpEnv: None\n  \n\u003C\u002Fdetails>\n\n### ICML 2022\n\n\u003Cdetails open>\n\u003Csummary>(Click to Collapse)\u003C\u002Fsummary>\n\n\n- [From Dirichlet to Rubin: Optimistic Exploration in RL without Bonuses](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2205.07704) (Oral)\n  - Daniil Tiapkin, Denis Belomestny, Eric Moulines, Alexey Naumov, Sergey Samsonov, Yunhao Tang, Michal Valko, Pierre Menard\n  - Key: Bayes-UCBVI, regret bound, quantile of a Q-value function posterior, anticoncentration inequality for a Dirichlet weighted sum\n  - ExpEnv: simple tabular grid-world env, [Atari](https:\u002F\u002Fgithub.com\u002Fopenai\u002Fgym)\n\n- [The Importance of Non-Markovianity in Maximum State Entropy Exploration](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2202.03060) (Oral)\n  - Mirco Mutti, Riccardo De Santi, Marcello Restelli\n  - Key: maximum state entropy exploration, non-Markovianity, finite-sample regime\n  - ExpEnv: 3State, River Swim\n\n- [Phasic Self-Imitative Reduction for Sparse-Reward Goal-Conditioned Reinforcement Learning](https:\u002F\u002Farxiv.org\u002Fabs\u002F2206.12030) (Spotlight) \n  - Yunfei Li, Tian Gao, Jiaqi Yang, Huazhe Xu, Yi Wu\n  - Key: sparse-reward goal-conditioned, RL\u002FSL phasic, task reduction\n  - ExpEnv: Sawyer Push, [Ant Maze](https:\u002F\u002Fgithub.com\u002Fdeepmind\u002Fdm_control), Stacking\n\n- [Thompson Sampling for (Combinatorial) Pure Exploration](https:\u002F\u002Farxiv.org\u002Fabs\u002F2206.09150) (Spotlight)\n  - Siwei Wang, Jun Zhu\n  - Key: combinatorial pure exploration, Thompson Sampling, lower complexity\n  - ExpEnv: combinatorial [multi-armed bandit](https:\u002F\u002Fen.wikipedia.org\u002Fwiki\u002FMulti-armed_bandit)\n\n
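Since several entries here build on Thompson Sampling, a minimal Beta-Bernoulli sketch may help orientation (the classic algorithm, not the combinatorial variant above; names and the toy arms are ours):\n\n```python\nimport numpy as np\n\ndef thompson_sampling(pull, n_arms, n_rounds, seed=0):\n    # Beta-Bernoulli Thompson Sampling: keep a Beta(a, b) posterior per arm,\n    # draw one posterior sample per arm each round, and play the argmax.\n    rng = np.random.default_rng(seed)\n    a = np.ones(n_arms)  # pseudo-counts of successes\n    b = np.ones(n_arms)  # pseudo-counts of failures\n    for _ in range(n_rounds):\n        arm = int(np.argmax(rng.beta(a, b)))\n        reward = pull(arm)  # environment returns 0 or 1\n        a[arm] += reward\n        b[arm] += 1 - reward\n    return a, b\n\n# toy usage: three Bernoulli arms with hidden success rates\nenv_rng = np.random.default_rng(1)\nprobs = [0.2, 0.5, 0.8]\npull = lambda arm: int(probs[arm] > env_rng.random())\nposterior_a, posterior_b = thompson_sampling(pull, n_arms=3, n_rounds=500)\n```\n\n- [Near-Optimal Algorithms for Autonomous Exploration and Multi-Goal Stochastic Shortest Path](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2205.10729.pdf) (Spotlight)\n  - Haoyuan Cai, Tengyu Ma, Simon Du\n  - Key: incremental autonomous exploration, stronger sample complexity bounds, multi-goal stochastic shortest path\n  - ExpEnv: hard MDP\n\n- [Safe Exploration for Efficient Policy Evaluation and Comparison](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2202.13234.pdf) (Spotlight)\n  - Runzhe Wan, Branislav Kveton, Rui Song\n  - Key: efficient and safe data collection for bandit policy evaluation.\n  - ExpEnv: [multi-armed bandit](https:\u002F\u002Fen.wikipedia.org\u002Fwiki\u002FMulti-armed_bandit), contextual multi-armed bandit, linear bandits\n  \n\u003C\u002Fdetails>\n\n### ICLR 2022\n\n\u003Cdetails open>\n\u003Csummary>(Click to Collapse)\u003C\u002Fsummary>\n\n- [The Information Geometry of Unsupervised Reinforcement Learning](https:\u002F\u002Fopenreview.net\u002Fpdf?id=3wU2UX0voE) (Oral: 8, 8, 8)\n   - Benjamin Eysenbach, Ruslan Salakhutdinov, Sergey Levine\n   - Key: unsupervised skill discovery, mutual information objective, adversarially-chosen reward functions\n   - ExpEnv: None\n   \n- [When should agents 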
explore?](https:\u002F\u002Farxiv.org\u002Fabs\u002F2108.11811) (Spotlight: 8, 8, 6, 6)\n  - Miruna Pislar, David Szepesvari, Georg Ostrovski, Diana Borsa, Tom Schaul\n  - Key: mode-switching, non-monolithic exploration, intra-episodic exploration\n  - ExpEnv: [Atari](https:\u002F\u002Fgithub.com\u002Fopenai\u002Fgym)\n\n- [Learning more skills through optimistic exploration](https:\u002F\u002Fopenreview.net\u002Fpdf?id=cU8rknuhxcDJ) (Spotlight: 8, 8, 8, 6)\n  - DJ Strouse, Kate Baumli, David Warde-Farley, Vlad Mnih, Steven Hansen\n  - Key: discriminator disagreement intrinsic reward, information gain auxiliary objective\n  - ExpEnv: tabular grid world, [Atari](https:\u002F\u002Fgithub.com\u002Fopenai\u002Fgym)\n\n- [Learning Long-Term Reward Redistribution via Randomized Return Decomposition](https:\u002F\u002Farxiv.org\u002Fabs\u002F2111.13485) (Spotlight: 8, 8, 8, 5)\n  - Zhizhou Ren, Ruihan Guo, Yuan Zhou, Jian Peng\n  - Key: sparse and delayed rewards, randomized return decomposition\n  - ExpEnv: [MuJoCo](https:\u002F\u002Fgithub.com\u002Fopenai\u002Fmujoco-py)\n\n- [Reinforcement Learning with Sparse Rewards using Guidance from Offline Demonstration](https:\u002F\u002Fopenreview.net\u002Fpdf?id=YJ1WzgMVsMt) (Spotlight: 8, 8, 8, 6, 6)\n  - Desik Rengarajan, Gargi Vaidya, Akshay Sarvesh, Dileep Kalathil, Srinivas Shakkottai\n  - Key: learning online with guidance offline\n  - ExpEnv: [MuJoCo](https:\u002F\u002Fgithub.com\u002Fopenai\u002Fmujoco-py), [TurtleBot](https:\u002F\u002Fwww.turtlebot.com\u002F) (Waypoint tracking, Obstacle avoidance)\n\n- [Generative Planning for Temporally Coordinated Exploration in Reinforcement Learning](https:\u002F\u002Fopenreview.net\u002Fpdf?id=YZHES8wIdE) (Spotlight: 8, 8, 8, 6)\n  - Haichao Zhang, Wei Xu, Haonan Yu\n  - Key: generative planning method, temporally coordinated exploration, crude initial plan\n  - ExpEnv: [classic continuous control env](https:\u002F\u002Fgithub.com\u002Fopenai\u002Fgym), [CARLA](https:\u002F\u002Fgithub.com\u002Fcarla-simulator\u002Fcarla)\n\n- [Learning Altruistic Behaviours in Reinforcement Learning without External Rewards](https:\u002F\u002Farxiv.org\u002Fabs\u002F2107.09598) (Spotlight: 8, 8, 6, 6)\n  - Tim Franzmeyer, Mateusz Malinowski, João F. Henriques\n  - Key: altruistic behaviour, task-agnostic\n  - ExpEnv: grid world env, [foraging](https:\u002F\u002Fgithub.com\u002Fsemitable\u002Flb-foraging), [multi-agent tag](https:\u002F\u002Fgithub.com\u002FFarama-Foundation\u002FPettingZoo\u002Ftree\u002Fmaster\u002Fpettingzoo\u002Fmpe\u002Fsimple_tag)\n\n- [Anti-Concentrated Confidence Bonuses for Scalable Exploration](https:\u002F\u002Farxiv.org\u002Fabs\u002F2110.11202) (Poster: 8, 6, 5)\n  - Jordan T. 
Ash, Cyril Zhang, Surbhi Goel, Akshay Krishnamurthy, Sham Kakade\n  - Key: anti-concentrated confidence bounds, elliptical bonus\n  - ExpEnv: [multi-armed bandit](https:\u002F\u002Fen.wikipedia.org\u002Fwiki\u002FMulti-armed_bandit), [Atari](https:\u002F\u002Fgithub.com\u002Fopenai\u002Fgym)\n\n- [Lipschitz-constrained Unsupervised Skill Discovery](https:\u002F\u002Farxiv.org\u002Fabs\u002F2202.00914) (Poster: 8, 6, 6, 6)\n  - Seohong Park, Jongwook Choi, Jaekyeom Kim, Honglak Lee, Gunhee Kim\n  - Key: unsupervised skill discovery, Lipschitz-constrained\n  - ExpEnv: [MuJoCo](https:\u002F\u002Fgithub.com\u002Fopenai\u002Fmujoco-py)\n  \n- [LIGS: Learnable Intrinsic-Reward Generation Selection for Multi-Agent Learning](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2112.02618.pdf) (Poster: 8, 6, 5, 5)\n  - David Henry Mguni, Taher Jafferjee, Jianhong Wang, Nicolas Perez-Nieves, Oliver Slumbers, Feifei Tong, Yang Li, Jiangcheng Zhu, Yaodong Yang, Jun Wang\n  - Key: multi-agent, coordinated exploration and behaviour, learnable intrinsic-reward generation selection, switching controls\n  - ExpEnv: [foraging](https:\u002F\u002Fgithub.com\u002Fsemitable\u002Flb-foraging), [StarCraft II](https:\u002F\u002Fgithub.com\u002Foxwhirl\u002Fsmac)\n\n- [Multi-Stage Episodic Control for Strategic Exploration in Text Games](https:\u002F\u002Fopenreview.net\u002Fforum?id=Ek7PSN7Y77z) (Spotlight: 8, 8, 6, 6)\n  - Jens Tuyls, Shunyu Yao, Sham M. Kakade, Karthik R. Narasimhan\n  - Key: multi-stage approach, policy decomposition\n  - ExpEnv: [Jericho](https:\u002F\u002Fgithub.com\u002Fmicrosoft\u002Fjericho)\n\n- [On the Convergence of the Monte Carlo Exploring Starts Algorithm for Reinforcement Learning](https:\u002F\u002Fopenreview.net\u002Fforum?id=JzNB0eA2-M4) (Poster: 8, 8, 5, 5)\n  - Che Wang, Shuhan Yuan, Kai Shao, Keith Ross\n  - Key: Monte Carlo exploring starts, optimal policy feed-forward MDPs\n  - ExpEnv: [blackjack](https:\u002F\u002Fgithub.com\u002Ftopics\u002Fblackjack-game), Cliff Walking\n  \n\u003C\u002Fdetails>\n\n### NeurIPS 2021\n\n\u003Cdetails open>\n\u003Csummary>(Click to Collapse)\u003C\u002Fsummary>\n\n- [Interesting Object, Curious Agent: Learning Task-Agnostic Exploration](https:\u002F\u002Farxiv.org\u002Fabs\u002F2111.13119) (Oral: 9, 8, 8, 8)\n  - Simone Parisi, Victoria Dean, Deepak Pathak, Abhinav Gupta\n  - Key: task-agnostic exploration, agent-centric component, environment-centric component\n  - ExpEnv: [MiniGrid](https:\u002F\u002Fgithub.com\u002FFarama-Foundation\u002FMinigrid), [Habitat](https:\u002F\u002Fgithub.com\u002Ffacebookresearch\u002Fhabitat-sim)\n\n- [Tactical Optimism and Pessimism for Deep Reinforcement Learning](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2102.03765.pdf) (Poster: 9, 7, 6, 6) \n  - Ted Moskovitz, Jack Parker-Holder, Aldo Pacchiano, Michael Arbel, Michael Jordan\n  - Key: Tactical Optimistic and Pessimistic estimation, multi-arm bandit problem\n  - ExpEnv: [MuJoCo](https:\u002F\u002Fgithub.com\u002Fopenai\u002Fmujoco-py)\n\n- [Which Mutual-Information Representation Learning Objectives are Sufficient for Control?](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2106.07278.pdf) (Poster: 7, 6, 6, 5) \n  - Kate Rakelly, Abhishek Gupta, Carlos Florensa, Sergey Levine\n  - Key: mutual information objectives, sufficiency of a state representation\n  - ExpEnv: catcher, catcher-grip\n\n- [On the Theory of Reinforcement Learning with Once-per-Episode Feedback](https:\u002F\u002Fopenreview.net\u002Fpdf?id=-uFBxNwRHa2) (Poster: 6, 5, 5, 4) \n  - Niladri S. 
Chatterji, Aldo Pacchiano, Peter L. Bartlett, Michael I. Jordan\n  - Key: binary feedback, sublinear regret\n  - ExpEnv: None\n\n- [MADE: Exploration via Maximizing Deviation from Explored Regions](https:\u002F\u002Fopenreview.net\u002Fpdf?id=DTVfEJIL3DB) (Poster: 7, 7, 6, 5)\n  - Tianjun Zhang, Paria Rashidinejad, Jiantao Jiao, Yuandong Tian, Joseph Gonzalez, Stuart Russell\n  - Key: maximizing deviation from the explored regions, intrinsic reward\n  - ExpEnv: [MiniGrid](https:\u002F\u002Fgithub.com\u002FFarama-Foundation\u002FMinigrid), [DeepMind Control Suite](https:\u002F\u002Fgithub.com\u002Fdeepmind\u002Fdm_control)\n\n- [Adversarial Intrinsic Motivation for Reinforcement Learning](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2105.13345.pdf) (Poster: 7, 7, 6) \n  - Ishan Durugkar, Mauricio Tec, Scott Niekum, Peter Stone\n  - Key: the Wasserstein-1 distance, goal-conditioned, quasimetric, adversarial intrinsic motivation\n  - ExpEnv: Grid World, Fetch Robot (based on [MuJoCo](https:\u002F\u002Fgithub.com\u002Fopenai\u002Fmujoco-py))\n\n- [Information Directed Reward Learning for Reinforcement Learning](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2102.12466.pdf) (Poster: 9, 8, 7, 6) \n  - David Lindner, Matteo Turchetta, Sebastian Tschiatschek, Kamil Ciosek, Andreas Krause\n  - Key: expert queries, Bayesian model of the reward, maximize the information gain\n  - ExpEnv: [MuJoCo](https:\u002F\u002Fgithub.com\u002Fopenai\u002Fmujoco-py)\n\n- [Dynamic Bottleneck for Robust Self-Supervised Exploration](https:\u002F\u002Fopenreview.net\u002Fpdf?id=-t6TeG3A6Do) (Poster: 8, 6, 6, 6)\n  - Chenjia Bai, Lingxiao Wang, Lei Han, Animesh Garg, Jianye Hao, Peng Liu, Zhaoran Wang\n  - Key: Dynamic Bottleneck, information gain\n  - ExpEnv: [Atari](https:\u002F\u002Fgithub.com\u002Fopenai\u002Fgym)\n\n- [Hierarchical Skills for Efficient Exploration](https:\u002F\u002Farxiv.org\u002Fabs\u002F2110.10809) (Poster: 7, 6, 6, 6)\n  - Jonas Gehring, Gabriel Synnaeve, Andreas Krause, Nicolas Usunier\n  - Key: hierarchical skill learning, balance between generality and specificity, skills of varying complexity\n  - ExpEnv: Hurdles, Limbo, Stairs, GoalWall PoleBalance (based on [MuJoCo](https:\u002F\u002Fgithub.com\u002Fopenai\u002Fmujoco-py))\n\n- [Exploration-Exploitation in Multi-Agent Competition: Convergence with Bounded Rationality](https:\u002F\u002Fopenreview.net\u002Fpdf?id=OSLVL-tIBei) (spotlight: 8, 6, 6)\n  - Stefanos Leonardos, Georgios Piliouras, Kelly Spendlove\n  - Key: competitive multi-agent, balance between game rewards and exploration costs, unique quantal-response equilibrium\n  - ExpEnv: Two-Agent Weighted Zero-Sum Games\n\n- [NovelD: A Simple yet Effective Exploration Criterion](https:\u002F\u002Fproceedings.neurips.cc\u002Fpaper\u002F2021\u002Ffile\u002Fd428d070622e0f4363fceae11f4a3576-Paper.pdf) (Poster: 7, 6, 6, 6)\n  - Tianjun Zhang, Huazhe Xu, Xiaolong Wang, Yi Wu, Kurt Keutzer, Joseph E. 
Gonzalez, Yuandong Tian\n  - Key: weighting every novel area approximately equally\n  - ExpEnv: [MiniGrid](https:\u002F\u002Fgithub.com\u002FFarama-Foundation\u002FMinigrid), [NetHack](https:\u002F\u002Fgithub.com\u002FNetHack\u002FNetHack), [Atari](https:\u002F\u002Fgithub.com\u002Fopenai\u002Fgym)\n\n- [Episodic Multi-agent Reinforcement Learning with Curiosity-driven Exploration](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2111.11032.pdf) (Poster: 7, 6, 6, 5)\n  - Lulu Zheng, Jiarui Chen, Jianhao Wang, Jiamin He, Yujing Hu, Yingfeng Chen, Changjie Fan, Yang Gao, Chongjie Zhang\n  - Key: episodic Multi-agent, curiosity-driven exploration, prediction errors, episodic memory\n  - ExpEnv: [Predator-Prey](https:\u002F\u002Fgithub.com\u002FFarama-Foundation\u002FPettingZoo), [StarCraft II](https:\u002F\u002Fgithub.com\u002Foxwhirl\u002Fsmac)\n\n- [Learning Diverse Policies in MOBA Games via Macro-Goals](https:\u002F\u002Fproceedings.neurips.cc\u002Fpaper\u002F2021\u002Ffile\u002F86dba86754c0ad93997a11fa947d97b2-Paper.pdf) (Poster: 7, 6, 5, 5)\n  - Yiming Gao, Bei Shi, Xueying Du, Liang Wang, Guangwei Chen, Zhenjie Lian, Fuhao Qiu, Guonan Han, Weixuan Wang, Deheng Ye, Qiang Fu, Wei Yang, Lanxiao Huang\n  - Key: MOBA-game, policy diversity, Macro-Goals Guided framework, Meta-Controller, human demonstrations\n  - ExpEnv: [honor of kings](https:\u002F\u002Fwww.honorofkings.com\u002F)\n\n- [CIC: Contrastive Intrinsic Control for Unsupervised Skill Discovery](https:\u002F\u002Fopenreview.net\u002Fpdf\u002Ff6d399de0d94e1c67c8b556e4ab6c0ee5b77a10f.pdf) (not accepted at the time: 8, 8, 6, 3)\n    - Michael Laskin, Hao Liu, Xue Bin Peng, Denis Yarats, Aravind Rajeswaran, Pieter Abbeel\n    - Key: decomposition of the mutual information, particle estimator, contrastive learning\n    - ExpEnv: [URLB](https:\u002F\u002Fopenreview.net\u002Fpdf?id=lwrPkQP_is)\n    \n\u003C\u002Fdetails>\n\n### Classic Exploration RL Papers\n\n\u003Cdetails open>\n\u003Csummary>(Click to Collapse)\u003C\u002Fsummary>\n\n- [Using Confidence Bounds for Exploitation-Exploration Trade-offs](https:\u002F\u002Fwww.jmlr.org\u002Fpapers\u002Fvolume3\u002Fauer02a\u002Fauer02a.pdf) *Journal of Machine Learning Research, 2002*\n  - Peter Auer\n  - Key: linear contextual bandits\n  - ExpEnv: None\n\n\u003C!-- \n- [How can we define intrinsic motivation?](https:\u002F\u002Fciteseerx.ist.psu.edu\u002Fviewdoc\u002Fdownload?doi=10.1.1.567.6524&rep=rep1&type=pdf) *Conf. on Epigenetic Robotics, 2008*\n  - Pierre-Yves Oudeyer, Frederic Kaplan. \n  - Key: intrinsic motivation\n  - ExpEnv: None\n-->\n\n- [A Contextual-Bandit Approach to Personalized News Article Recommendation](https:\u002F\u002Farxiv.org\u002Fabs\u002F1003.0146) *WWW 2010*\n  - Lihong Li, Wei Chu, John Langford, Robert E. Schapire\n  - Key: LinUCB\n  - ExpEnv: Yahoo! Front Page Today Module dataset\n\n
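Because LinUCB recurs throughout the bandit entries in this list, a minimal sketch of the disjoint variant may help (a didactic reimplementation, not the paper's code; the class name and the alpha default are illustrative):\n\n```python\nimport numpy as np\n\nclass LinUCB:\n    # Disjoint LinUCB: one ridge-regression model per arm, scored with an\n    # upper-confidence bonus alpha * sqrt(x.T A_inv x) on top of the estimate.\n    def __init__(self, n_arms, dim, alpha=1.0):\n        self.alpha = alpha\n        self.A = [np.eye(dim) for _ in range(n_arms)]    # X.T X + I per arm\n        self.b = [np.zeros(dim) for _ in range(n_arms)]  # X.T r per arm\n\n    def select(self, x):\n        scores = []\n        for A, b in zip(self.A, self.b):\n            A_inv = np.linalg.inv(A)\n            theta = A_inv @ b  # ridge estimate of the arm weights\n            scores.append(theta @ x + self.alpha * np.sqrt(x @ A_inv @ x))\n        return int(np.argmax(scores))\n\n    def update(self, arm, x, reward):\n        self.A[arm] += np.outer(x, x)\n        self.b[arm] += reward * x\n```\n\n- [(More) Efficient Reinforcement Learning via Posterior Sampling](https:\u002F\u002Farxiv.org\u002Fpdf\u002F1306.0940.pdf) *NeurIPS 2013*\n  - Ian Osband, Benjamin Van Roy, Daniel Russo\n  - Key: prior distribution, posterior sampling\n  - ExpEnv: RiverSwim\n\n- [An Empirical Evaluation of Thompson Sampling](http:\u002F\u002Fpapers.nips.cc\u002Fpaper\u002F4321-an-empirical-evaluation-of-thompson-sampling.pdf) *NeurIPS 2011*\n  - Olivier Chapelle, Lihong Li\n  - Key: Thompson sampling, empirical results\n  - ExpEnv: None\n\n- [A Tutorial on Thompson Sampling](https:\u002F\u002Farxiv.org\u002Fpdf\u002F1707.02038.pdf) *arXiv 2017*\n  - Daniel J. 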
Russo, Benjamin Van Roy, Abbas Kazerouni, Ian Osband, Zheng Wen\n  - Key: Thompson sampling\n  - ExpEnv: None\n\n- [Unifying Count-Based Exploration and Intrinsic Motivation](https:\u002F\u002Farxiv.org\u002Fabs\u002F1606.01868) *NeurIPS 2016*\n  - Marc G. Bellemare, Sriram Srinivasan, Georg Ostrovski, Tom Schaul, David Saxton, Remi Munos\n  - Key: intrinsic motivation, density models, pseudo-count\n  - ExpEnv: [Atari](https:\u002F\u002Fgithub.com\u002Fopenai\u002Fgym)\n\n- [Deep Exploration via Bootstrapped DQN](https:\u002F\u002Farxiv.org\u002Fabs\u002F1602.04621) *NeurIPS 2016*\n  - Ian Osband, Charles Blundell, Alexander Pritzel, Benjamin Van Roy\n  - Key: temporally-extended (or deep) exploration, randomized value functions, bootstrapped DQN\n  - ExpEnv: [Atari](https:\u002F\u002Fgithub.com\u002Fopenai\u002Fgym)\n\n- [VIME: Variational information maximizing exploration](https:\u002F\u002Farxiv.org\u002Fabs\u002F1605.09674) *NeurIPS 2016*\n  - Rein Houthooft, Xi Chen, Yan Duan, John Schulman, Filip De Turck, Pieter Abbeel\n  - Key: maximization of information gain, belief of environment dynamics, variational inference in Bayesian neural networks\n  - ExpEnv: [rllab](https:\u002F\u002Fgithub.com\u002Frll\u002Frllab)\n\n- [\\#Exploration: A Study of Count-Based Exploration for Deep Reinforcement Learning](http:\u002F\u002Fpapers.neurips.cc\u002Fpaper\u002F6868-exploration-a-study-of-count-based-exploration-for-deep-reinforcement-learning.pdf) *NeurIPS 2017*\n  - Haoran Tang, Rein Houthooft, Davis Foote, Adam Stooke, Xi Chen, Yan Duan, John Schulman, Filip De Turck, Pieter Abbeel\n  - Key: hash count, intrinsic motivation\n  - ExpEnv: [rllab](https:\u002F\u002Fgithub.com\u002Frll\u002Frllab), [Atari](https:\u002F\u002Fgithub.com\u002Fopenai\u002Fgym)\n\n
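The hash-count recipe above is short enough to sketch end to end; this SimHash-style toy (the k and beta defaults and the class name are ours) pays an intrinsic bonus of beta \u002F sqrt(n(phi(s))) per visit:\n\n```python\nimport numpy as np\nfrom collections import defaultdict\n\nclass HashCountBonus:\n    # SimHash-style count-based bonus: hash a continuous state to a k-bit\n    # sign code, count visits per code, and pay beta * n(code) ** -0.5.\n    def __init__(self, state_dim, k=16, beta=0.01, seed=0):\n        rng = np.random.default_rng(seed)\n        self.proj = rng.normal(size=(k, state_dim))  # fixed random projection\n        self.beta = beta\n        self.counts = defaultdict(int)\n\n    def bonus(self, state):\n        code = tuple((self.proj @ state > 0).astype(int))  # k-bit sign hash\n        self.counts[code] += 1\n        return self.beta * self.counts[code] ** -0.5\n```\n\n- [EX2: Exploration with Exemplar Models for Deep Reinforcement Learning](https:\u002F\u002Fpapers.nips.cc\u002Fpaper\u002F2017\u002Ffile\u002F1baff70e2669e8376347efd3a874a341-Paper.pdf) *NeurIPS 2017*\n   - Justin Fu, John D. Co-Reyes, Sergey Levine\n   - Key: novelty detection, discriminatively trained exemplar models, implicit density estimation\n   - ExpEnv: [VizDoom](https:\u002F\u002Fgithub.com\u002Fmwydmuch\u002FViZDoom), [Atari](https:\u002F\u002Fgithub.com\u002Fopenai\u002Fgym)\n\n- [Hindsight Experience Replay](https:\u002F\u002Farxiv.org\u002Fpdf\u002F1707.01495.pdf) *NeurIPS 2017*\n  - Marcin Andrychowicz, Filip Wolski, Alex Ray, Jonas Schneider, Rachel Fong, Peter Welinder, Bob McGrew, Josh Tobin, Pieter Abbeel, Wojciech Zaremba\n  - Key: hindsight experience replay, implicit curriculum\n  - ExpEnv: pushing, sliding, pick-and-place, physical robot\n\n- [Curiosity-driven exploration by self-supervised prediction](https:\u002F\u002Farxiv.org\u002Fabs\u002F1705.05363) *ICML 2017*\n  - Deepak Pathak, Pulkit Agrawal, Alexei A. Efros, Trevor Darrell\n  - Key: curiosity, self-supervised inverse dynamics model\n  - ExpEnv: [VizDoom](https:\u002F\u002Fgithub.com\u002Fmwydmuch\u002FViZDoom), [Super Mario Bros](https:\u002F\u002Fsupermario-game.com\u002F)\n\n- [Deep Q-learning from Demonstrations](https:\u002F\u002Farxiv.org\u002Fabs\u002F1704.03732) *AAAI 2018*\n  - Todd Hester, Matej Vecerik, Olivier Pietquin, Marc Lanctot, Tom Schaul, Bilal Piot, Dan Horgan, John Quan, Andrew Sendonaris, Gabriel Dulac-Arnold, Ian Osband, John Agapiou, Joel Z. 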
Leibo, Audrunas Gruslys\n  - Key: combining temporal difference updates with supervised classification of the demonstrator’s actions\n  - ExpEnv: [Atari](https:\u002F\u002Fgithub.com\u002Fopenai\u002Fgym)\n\n- [Noisy Networks For Exploration](https:\u002F\u002Fopenreview.net\u002Fpdf?id=rywHCPkAW) *ICLR 2018*\n    - Meire Fortunato, Mohammad Gheshlaghi Azar, Bilal Piot, Jacob Menick, Matteo Hessel, Ian Osband, Alex Graves, Volodymyr Mnih, Remi Munos, Demis Hassabis, Olivier Pietquin, Charles Blundell, Shane Legg\n    - Key: learned parametric noise\n    - ExpEnv: [Atari](https:\u002F\u002Fgithub.com\u002Fopenai\u002Fgym)\n\n- [Exploration by random network distillation](https:\u002F\u002Farxiv.org\u002Fabs\u002F1810.12894) *ICLR 2019*\n  - Yuri Burda, Harrison Edwards, Amos Storkey, Oleg Klimov\n  - Key: random network distillation\n  - ExpEnv: [Atari](https:\u002F\u002Fgithub.com\u002Fopenai\u002Fgym)\n\n
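The RND bonus fits in a few lines of numpy; the sketch below is a didactic toy (layer sizes are arbitrary, and training the predictor by gradient descent on the same squared error is omitted):\n\n```python\nimport numpy as np\n\ndef make_net(rng, dims):\n    # A tiny random MLP: a list of (W, b) layers applied with tanh.\n    return [(rng.normal(size=(m, n)) * m ** -0.5, np.zeros(n)) for m, n in zip(dims[:-1], dims[1:])]\n\ndef forward(net, x):\n    for W, b in net:\n        x = np.tanh(x @ W + b)\n    return x\n\nrng = np.random.default_rng(0)\ntarget = make_net(rng, [8, 64, 16])     # fixed random network, never trained\npredictor = make_net(rng, [8, 64, 16])  # would be trained to imitate the target\n\ndef rnd_bonus(state):\n    # Intrinsic reward: the predictor's error against the frozen target;\n    # once trained, the error shrinks on familiar states and stays high on novel ones.\n    err = forward(predictor, state) - forward(target, state)\n    return float(np.mean(err ** 2))\n\nnovelty = rnd_bonus(np.ones(8))\n```\n\n- [Soft Actor-Critic: Off-Policy Maximum Entropy Deep Reinforcement Learning with a Stochastic Actor](https:\u002F\u002Farxiv.org\u002Fabs\u002F1801.01290) *ICML 2018*\n  - Tuomas Haarnoja, Aurick Zhou, Pieter Abbeel, Sergey Levine\n  - Key: soft actor critic, maximum entropy, policy iteration\n  - ExpEnv: [MuJoCo](https:\u002F\u002Fgithub.com\u002Fopenai\u002Fmujoco-py)\n  \n- [Large-Scale Study of Curiosity-Driven Learning](https:\u002F\u002Farxiv.org\u002Fabs\u002F1808.04355) *ICLR 2019*\n  - Yuri Burda, Harri Edwards, Deepak Pathak, Amos Storkey, Trevor Darrell, Alexei A. Efros\n  - Key: curiosity, prediction error, purely curiosity-driven learning, feature spaces\n  - ExpEnv: [Atari](https:\u002F\u002Fgithub.com\u002Fopenai\u002Fgym), [Super Mario Bros](https:\u002F\u002Fsupermario-game.com\u002F)\n\n- [Diversity is all you need: Learning skills without a reward function](https:\u002F\u002Farxiv.org\u002Fabs\u002F1802.06070) *ICLR 2019*\n  - Benjamin Eysenbach, Abhishek Gupta, Julian Ibarz, Sergey Levine\n  - Key: maximizing an information theoretic objective, unsupervised emergence of diverse skills\n  - ExpEnv: [MuJoCo](https:\u002F\u002Fgithub.com\u002Fopenai\u002Fmujoco-py)\n  \n- [Episodic Curiosity through Reachability](https:\u002F\u002Farxiv.org\u002Fabs\u002F1810.02274) *ICLR 2019*\n  - Nikolay Savinov, Anton Raichuk, Raphaël Marinier, Damien Vincent, Marc Pollefeys, Timothy Lillicrap, Sylvain Gelly\n  - Key: curiosity, episodic memory, how many environment steps it takes to reach the current observation\n  - ExpEnv: [VizDoom](https:\u002F\u002Fgithub.com\u002Fmwydmuch\u002FViZDoom), [DMLab](https:\u002F\u002Fgithub.com\u002Fdeepmind\u002Flab), [MuJoCo](https:\u002F\u002Fgithub.com\u002Fopenai\u002Fmujoco-py)\n\n- [Self-Supervised Exploration via Disagreement](https:\u002F\u002Farxiv.org\u002Fabs\u002F1906.04161) *ICML 2019*\n  - Deepak Pathak, Dhiraj Gandhi, Abhinav Gupta\n  - Key: ensemble of dynamics models, maximize the disagreement of those ensembles, differentiable manner\n  - ExpEnv: Noisy MNIST, [Atari](https:\u002F\u002Fgithub.com\u002Fopenai\u002Fgym), [MuJoCo](https:\u002F\u002Fgithub.com\u002Fopenai\u002Fmujoco-py), [Unity](https:\u002F\u002Funity.com\u002Fproducts\u002Fmachine-learning-agents), real robot\n\n- [EMI: Exploration with Mutual Information](https:\u002F\u002Farxiv.org\u002Fabs\u002F1810.01176) *ICML 2019*\n  - Hyoungseok Kim, Jaekyeom Kim, Yeonwoo Jeong, Sergey Levine, Hyun Oh Song\n  - Key: embedding representation of states and actions, forward prediction, mutual information\n  - ExpEnv: 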
[Atari](https:\u002F\u002Fgithub.com\u002Fopenai\u002Fgym), [MuJoCo](https:\u002F\u002Fgithub.com\u002Fopenai\u002Fmujoco-py)\n\n- [Making Efficient Use of Demonstrations to Solve Hard Exploration Problems](https:\u002F\u002Farxiv.org\u002Fabs\u002F1909.01387) *arXiv 2019*\n    - Caglar Gulcehre, Tom Le Paine, Bobak Shahriari, Misha Denil, Matt Hoffman, Hubert Soyer, Richard Tanburn, Steven Kapturowski, Neil Rabinowitz, Duncan Williams, Gabriel Barth-Maron, Ziyu Wang, Nando de Freitas\n    - Key: R2D2, makes efficient use of demonstrations, hard exploration problems\n    - ExpEnv: [Atari](https:\u002F\u002Fgithub.com\u002Fopenai\u002Fgym)\n\n- [Optimistic Exploration even with a Pessimistic Initialisation](http:\u002F\u002Fwww.cs.ox.ac.uk\u002Fpeople\u002Fshimon.whiteson\u002Fpubs\u002Frashidiclr20.pdf) *ICLR 2020*\n    - Tabish Rashid, Bei Peng, Wendelin Böhmer, Shimon Whiteson\n    - Key: pessimistically initialised Q-values, count-derived bonuses, optimism during both action selection and bootstrapping\n    - ExpEnv: randomised chain, Maze, [Montezuma’s Revenge](https:\u002F\u002Fgithub.com\u002Fopenai\u002Fgym)\n\n- [RIDE: Rewarding Impact-Driven Exploration for Procedurally-Generated Environments](https:\u002F\u002Fopenreview.net\u002Fpdf?id=rkg-TJBFPB) *ICLR 2020*\n    - Roberta Raileanu, Tim Rocktäschel\n    - Key: rewarding actions that lead to significant changes in the learned state representation\n    - ExpEnv: [MiniGrid](https:\u002F\u002Fgithub.com\u002FFarama-Foundation\u002FMinigrid)\n\n
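A minimal sketch of the impact-driven bonus above (our paraphrase of the idea: the encoder is assumed given, and the rounding-based episodic key stands in for the paper's exact state counting):\n\n```python\nimport numpy as np\nfrom collections import defaultdict\n\nclass RIDEBonus:\n    # Impact-driven bonus: the L2 change in a learned state embedding,\n    # discounted by the square root of an episodic visit count.\n    def __init__(self, embed):\n        self.embed = embed  # learned encoder phi(s), assumed given\n        self.episodic_counts = defaultdict(int)\n\n    def reset_episode(self):\n        self.episodic_counts.clear()\n\n    def bonus(self, s, s_next):\n        impact = np.linalg.norm(self.embed(s_next) - self.embed(s))\n        key = tuple(np.round(self.embed(s_next), 2))  # coarse episodic key\n        self.episodic_counts[key] += 1\n        return impact * self.episodic_counts[key] ** -0.5\n```\n\n- [Never give up: Learning directed exploration strategies](https:\u002F\u002Farxiv.org\u002Fabs\u002F2002.06038) *ICLR 2020*\n  - Adrià Puigdomènech Badia, Pablo Sprechmann, Alex Vitvitskyi, Daniel Guo, Bilal Piot, Steven Kapturowski, Olivier Tieleman, Martín Arjovsky, Alexander Pritzel, Andrew Bolt, Charles Blundell\n  - Key: ICM+RND, different degrees of exploration\u002Fexploitation\n  - ExpEnv: [Atari](https:\u002F\u002Fgithub.com\u002Fopenai\u002Fgym)\n  \n- [Agent57: Outperforming the atari human benchmark](https:\u002F\u002Farxiv.org\u002Fabs\u002F2003.13350) *ICML 2020*\n  - Adrià Puigdomènech Badia, Bilal Piot, Steven Kapturowski, Pablo Sprechmann, Alex Vitvitskyi, Daniel Guo, Charles Blundell\n  - Key: parameterizes a family of policies, adaptive mechanism, state-action value function parameterization\n  - ExpEnv: [Atari](https:\u002F\u002Fgithub.com\u002Fopenai\u002Fgym), [roboschool](https:\u002F\u002Fgithub.com\u002Fopenai\u002Froboschool)\n\n- [Neural Contextual Bandits with UCB-based Exploration](https:\u002F\u002Farxiv.org\u002Fpdf\u002F1911.04462.pdf) *ICML 2020*\n  - Dongruo Zhou, Lihong Li, Quanquan Gu\n  - Key: stochastic contextual bandit, neural network-based random feature, near-optimal regret guarantee\n  - ExpEnv: contextual bandits, UCI Machine Learning Repository, [MNIST](http:\u002F\u002Fyann.lecun.com\u002Fexdb\u002Fmnist\u002F)\n\n- [Rank the Episodes: A Simple Approach for Exploration in Procedurally-Generated Environments](https:\u002F\u002Farxiv.org\u002Fabs\u002F2101.08152) *ICLR 2021*\n  - Daochen Zha, Wenye Ma, Lei Yuan, Xia Hu, Ji Liu\n  - Key: procedurally-generated environments, episodic exploration score from both per-episode and long-term views\n  - ExpEnv: [MiniGrid](https:\u002F\u002Fgithub.com\u002FFarama-Foundation\u002FMinigrid), MiniWorld, [MuJoCo](https:\u002F\u002Fgithub.com\u002Fopenai\u002Fmujoco-py)\n\n- [First return then explore](https:\u002F\u002Fwww.nature.com\u002Farticles\u002Fs41586-020-03157-9) *Nature 2021*\n  - Adrien Ecoffet, Joost Huizinga, Joel 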
Lehman, Kenneth O. Stanley, Jeff Clune\n  - Key: detachment and derailment, remembering states, returning to them, and exploring from them\n  - ExpEnv: [Atari](https:\u002F\u002Fgithub.com\u002Fopenai\u002Fgym), pick-and-place robotics task\n\n
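To make the return-then-explore loop concrete, here is a toy sketch of the archive mechanics (illustrative only: `env` follows the old 4-tuple gym API, `cell_fn` is a user-supplied state discretizer, and the cell-selection weights are placeholders):\n\n```python\nimport numpy as np\n\ndef go_explore(env, cell_fn, n_iters, explore_steps=20, seed=0):\n    # Archive of cells: for each cell keep the shortest action sequence\n    # reaching it; repeatedly return to a stored cell, then explore from it.\n    rng = np.random.default_rng(seed)\n    obs = env.reset()\n    archive = {cell_fn(obs): []}  # cell -> actions that reach it\n    for _ in range(n_iters):\n        cells = list(archive)\n        weights = np.array([1.0 \u002F (1 + len(archive[c])) for c in cells])\n        cell = cells[int(rng.choice(len(cells), p=weights \u002F weights.sum()))]\n        env.reset()\n        traj = list(archive[cell])\n        for a in traj:  # return phase: replay the stored actions\n            obs, _, _, _ = env.step(a)\n        for _ in range(explore_steps):  # explore phase: act randomly\n            a = env.action_space.sample()\n            obs, _, done, _ = env.step(a)\n            traj.append(a)\n            c = cell_fn(obs)\n            if c not in archive or len(archive[c]) > len(traj):\n                archive[c] = list(traj)  # new cell or a shorter route\n            if done:\n                break\n    return archive\n```\n\n\u003C\u002Fdetails>\n\n\n## Contributing\nOur purpose is to provide a starting paper guide for those who are interested in exploration methods in RL.\nIf you are interested in contributing, please refer to [HERE](CONTRIBUTING.md) for contribution instructions.\n\n\n## License\nAwesome Exploration RL is released under the Apache 2.0 license.\n\n\u003Cp align=\"right\">(\u003Ca href=\"#top\">Back to top\u003C\u002Fa>)\u003C\u002Fp>\n","\u003Cdiv id=\"top\">\u003C\u002Fdiv>\n\n# 强化学习中的优秀探索方法\n\n`更新于 2025年12月2日`\n\n- 这里收集了关于**强化学习中的探索方法（ERL）**的研究论文。\n该仓库将持续更新，以跟踪ERL领域的前沿进展。欢迎关注并点赞！\n\n- **探索与利用**之间的平衡是强化学习中最核心的问题之一。\n为了帮助读者更直观地理解探索问题，我们在下方提供了一个来自[MiniGrid](https:\u002F\u002Fgithub.com\u002FFarama-Foundation\u002FMinigrid)的典型高难度探索环境的可视化示例。\n在这个任务中，完成目标往往需要执行数十甚至数百步的动作序列，智能体必须充分探索不同的状态-动作空间，\n才能学会达成目标所需的技能。\n\n\u003Cp align=\"center\">\n  \u003Cimg src=\"https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Fopendilab_awesome-exploration-rl_readme_e42b34f4f95a.png\" alt=\"minigrid_hard_exploration\" width=\"40%\" height=\"40%\" \u002F>\u003Cbr>\n  \u003Cem style=\"display: inline-block;\">一个典型的高难度探索环境：MiniGrid-ObstructedMaze-Full-v0。\u003C\u002Fem>\n\u003C\u002Fp>\n\n## 目录\n\n- [强化学习中的优秀探索方法](#awesome-exploration-methods-in-reinforcement-learning)\n  - [目录](#table-of-contents)\n  - [探索型强化学习方法分类](#a-taxonomy-of-exploration-rl-methods)\n  - [论文](#papers)\n    - [NeurIPS 2025](#neurips-2025)\n    - [ICML 2025](#icml-2025)\n    - [ICLR 2025](#iclr-2025)\n    - [NeurIPS 2024](#neurips-2024)\n    - [ICML 2024](#icml-2024)\n    - [ICLR 2024](#iclr-2024)\n    - [NeurIPS 2023](#neurips-2023)\n    - [ICML 2023](#icml-2023)\n    - [ICLR 2023](#iclr-2023)\n    - [NeurIPS 2022](#neurips-2022)\n    - [ICML 2022](#icml-2022)\n    - [ICLR 2022](#iclr-2022)\n    - [NeurIPS 2021](#neurips-2021)\n    - [经典探索型强化学习论文](#classic-exploration-rl-papers)\n  - [贡献](#contributing)\n  - [许可证](#license)\n\n\n## 探索型强化学习方法分类\n\n\u003Cdetails open>\n\u003Csummary>(点击收起)\u003C\u002Fsummary>\n\n一般来说，我们可以将强化学习过程分为两个阶段：*收集*阶段和*训练*阶段。\n在*收集*阶段，智能体根据当前策略选择动作，并与环境交互以收集有用的经验。\n在*训练*阶段，智能体利用收集到的经验来更新当前策略，从而获得性能更好的策略。\n\n根据探索组件被显式应用的阶段，我们将“探索型强化学习”方法简单地分为两大类：`增强型收集策略`和`增强型训练策略`：\n\n- `增强型收集策略`代表了在*收集*阶段常用的多种不同探索策略，我们进一步将其划分为*四类*：\n  - `动作选择扰动`\n  - `动作选择引导`\n  - `状态选择引导`\n  - `参数空间扰动`\n\n- `增强型训练策略`代表了在*训练*阶段常用的多种不同探索策略，我们进一步将其划分为*七类*：\n  - `基于计数`\n  - `基于预测`\n  - `基于信息论`\n  - `熵增强`\n  - `基于贝叶斯后验`\n  - `基于目标`\n  - `(专家)演示数据`\n\n> 需要注意的是，这些类别之间可能存在重叠，一种算法可能同时属于多个类别。\n> 关于RL中探索方法的其他详细综述，您可以参考[Tianpei Yang等](https:\u002F\u002Farxiv.org\u002Fabs\u002F2109.06668)和[Susan Amin等](https:\u002F\u002Farxiv.org\u002Fabs\u002F2109.00157)的相关研究。\n\n\n\u003Ccenter>\n\u003Cfigure>\n    \u003Cimg style=\"border-radius: 0.3125em;\n    box-shadow: 0 2px 4px 0 rgba(34,36,38,.12),0 2px 10px 0 rgba(34,36,38,.08);\" \n    src=\"https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Fopendilab_awesome-exploration-rl_readme_058f19e503fb.png\" width=100% height=100%>\n    \u003Cbr>\n    \u003Cfigcaption align = \"center\">\u003Cb>一份不完全但有用的探索型强化学习方法分类。\n    我们为每个类别提供了一些示例方法，如上方蓝色区域所示。\u003C\u002Fb>\u003C\u002Ffigcaption>\n\u003C\u002Ffigure>\n\u003C\u002Fcenter>\n\n以下是分类中提到的论文链接：\n>[1] [Go-Explore](https:\u002F\u002Fwww.nature.com\u002Farticles\u002Fs41586-020-03157-9): Adrien Ecoffet 等，2021年  \n[2] 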
[NoisyNet](https:\u002F\u002Fopenreview.net\u002Fpdf?id=rywHCPkAW): Meire Fortunato 等，2018年  \n[3] [DQN-PixelCNN](https:\u002F\u002Farxiv.org\u002Fabs\u002F1606.01868): Marc G. Bellemare 等，2016年  \n[4] [#Exploration](http:\u002F\u002Fpapers.neurips.cc\u002Fpaper\u002F6868-exploration-a-study-of-count-based-exploration-for-deep-reinforcement-learning.pdf): Haoran Tang 等，2017年  \n[5] [EX2](https:\u002F\u002Fpapers.nips.cc\u002Fpaper\u002F2017\u002Ffile\u002F1baff70e2669e8376347efd3a874a341-Paper.pdf): Justin Fu 等，2017年  \n[6] [ICM](https:\u002F\u002Farxiv.org\u002Fabs\u002F1705.05363): Deepak Pathak 等，2018年  \n[7] [RND](https:\u002F\u002Farxiv.org\u002Fabs\u002F1810.12894): Yuri Burda 等，2018年  \n[8] [NGU](https:\u002F\u002Farxiv.org\u002Fabs\u002F2002.06038): Adrià Puigdomènech Badia 等，2020年  \n[9] [Agent57](https:\u002F\u002Farxiv.org\u002Fabs\u002F2003.13350): Adrià Puigdomènech Badia 等，2020年  \n[10] [VIME](https:\u002F\u002Farxiv.org\u002Fabs\u002F1605.09674): Rein Houthooft 等，2016年    \n[11] [EMI](https:\u002F\u002Fopenreview.net\u002Fforum?id=H1exf64KwH): Wang 等，2019年  \n[12] [DIAYN](https:\u002F\u002Farxiv.org\u002Fabs\u002F1802.06070): Benjamin Eysenbach 等，2019年  \n[13] [SAC](https:\u002F\u002Farxiv.org\u002Fabs\u002F1801.01290): Tuomas Haarnoja 等，2018年  \n[14] [BootstrappedDQN](https:\u002F\u002Farxiv.org\u002Fabs\u002F1602.04621): Ian Osband 等，2016年  \n[15] [PSRL](https:\u002F\u002Farxiv.org\u002Fpdf\u002F1306.0940.pdf): Ian Osband 等，2013年  \n[16] [HER](https:\u002F\u002Farxiv.org\u002Fpdf\u002F1707.01495.pdf): Marcin Andrychowicz 等，2017年  \n[17] [DQfD](https:\u002F\u002Farxiv.org\u002Fabs\u002F1704.03732): Todd Hester 等，2018年  \n[18] [R2D3](https:\u002F\u002Farxiv.org\u002Fabs\u002F1909.01387): Caglar Gulcehre 等，2019年  \n\n\u003C\u002Fdetails>\n\n\n## 论文\n\n```\n格式：\n- [标题](论文链接) (发表形式，若公开则附上OpenReview评分)\n  - 作者1，作者2，作者3，...\n  - 关键：关键问题及见解\n  - 实验环境：实验使用的环境\n```\n\n### NeurIPS 2025\n\n\u003Cdetails open>\n\u003Csummary>(点击收起)\u003C\u002Fsummary>\n\n- [用于鲁棒强化学习的状态熵正则化](https:\u002F\u002Fopenreview.net\u002Fforum?id=rtG7n93Ru8)\n  - 约纳坦·阿什拉格、乌里·科伦、米尔科·穆蒂、埃丝特·德尔曼、皮埃尔-吕克·培康、希·曼诺尔\n  - 关键词：鲁棒强化学习、风险规避型强化学习、正则化强化学习\n  - 实验环境：MiniGrid、MuJoCo\n\n- [几何与激励机制的结合：基于线性上下文的高效激励式探索](https:\u002F\u002Fopenreview.net\u002Fforum?id=nwlX15Wnr9)\n  - 本杰明·席弗、马克·塞尔克\n  - 关键词：多臂老虎机、贝叶斯激励相容、探索\n  - 实验环境：多臂老虎机\n\n- [LLM-Explorer：由大型语言模型驱动的插件式强化学习策略探索增强方法](https:\u002F\u002Fopenreview.net\u002Fforum?id=VA5P0rUZPx)\n  - 郝千越、宋艺文、廖庆敏、袁健、李勇\n  - 关键词：强化学习、大型语言模型、策略探索\n  - 实验环境：Atari、MuJoCo\n\n- [基于特征扰动的上下文老虎机探索方法](https:\u002F\u002Fopenreview.net\u002Fforum?id=gAddPMjmUc)\n  - 李秀源、吴珉焕\n  - 关键词：广义线性老虎机、上下文老虎机、汤普森采样、特征扰动\n  - 实验环境：合成数据、UCI数据集、MNIST\n\n- [REINFORCE算法在任意学习率下均能收敛至最优策略](https:\u002F\u002Fopenreview.net\u002Fforum?id=YzriuQGaNX)\n  - 塞缪尔·麦克劳林·罗伯逊、陈唐德、戴博、戴尔·舒尔曼斯、查巴·塞佩斯瓦里、梅锦程\n  - 关键词：强化学习、策略梯度、收敛、老虎机\n  - 实验环境：多臂老虎机、ChainMDP、DeepSea、CartPole\n\n- [非策略强化学习中的不对称REINFORCE：平衡正负奖励](https:\u002F\u002Fopenreview.net\u002Fforum?id=Ql3sENn0mi)\n  - 查尔斯·阿尔纳尔、盖坦·纳罗兹尼亚克、维维安·卡班内斯、唐云浩、朱莉娅·肯佩、雷米·穆诺斯\n  - 关键词：强化学习、非策略RL、LLM微调、老虎机\n  - 实验环境：随机老虎机、MATH数据集\n\n- [基于模型的探索增强的非策略强化学习](http:\u002F\u002Fopenreview.net\u002Fforum?id=JGkZgEEjiM)\n  - 王立坤、张翔腾、王依诺、詹国建、王文轩、高浩宇、段景亮、李圣波\n  - 关键词：基于模型的强化学习、基于模型的探索、生成模型、世界模型\n  - 实验环境：OpenAI Gym、DMC\n\n- [认知预测处理：一种受人类启发的开放式强化学习自适应探索框架](https:\u002F\u002Fopenreview.net\u002Fforum?id=2fFRIIwau6)\n  - 刘博恒、李子宇、段成华、刘宇田、王卓、李秀兴、李青、吴霞\n  - 关键词：开放式强化学习、受人脑启发的人工智能、认知架构\n  - 实验环境：MineDojo、Minecraft\n\n- [基于正交性的新颖探索方法](https:\u002F\u002Fopenreview.net\u002Fforum?id=yJS1eZSNUv)\n  - 
安德烈亚斯·西奥菲卢、厄兹居尔·辛姆谢克\n  - 关键词：拉普拉斯算子、新颖性、强化学习、探索、特征向量、谱方法\n  - 实验环境：GridWorld\n\n- [线性老虎机中的低频探索](https:\u002F\u002Fopenreview.net\u002Fforum?id=STLolzI6q1)\n  - 李哈林、吴珉焕\n  - 关键词：线性老虎机、贪婪选择\n  - 实验环境：多臂老虎机\n\n- [具有线性函数近似的部署高效无奖赏探索](https:\u002F\u002Fopenreview.net\u002Fforum?id=ByzRO25Bjr)\n  - 张子涵、陈宇欣、李Jason D.、杜Simon Shaolei、杨琳、王若松\n  - 关键词：强化学习、线性MDP、部署效率\n  - 实验环境：无\n\n- [奇思妙想赢天下：通过多智能体上下文校准实现的好奇心驱动探索](https:\u002F\u002Fopenreview.net\u002Fforum?id=1fOGTbO5Sx)\n  - 潘一元、刘哲、王和生\n  - 关键词：多智能体强化学习、内在奖励、人工好奇心\n  - 实验环境：VMAS、Meltingpot、SMACv2\n\n- [通过Ensemble++实现可扩展探索](https:\u002F\u002Fopenreview.net\u002Fforum?id=M226WElHp5)\n  - 李英茹、徐嘉伟、王宝祥、罗志全\n  - 关键词：强化学习、集成采样、汤普森采样、探索、后验近似、可扩展计算\n  - 实验环境：线性老虎机、二次型老虎机、神经网络老虎机、基于GPT的上下文老虎机\n\n- [不确定性引导的探索以高效训练AlphaZero](https:\u002F\u002Fopenreview.net\u002Fforum?id=3q6lJTN45T)\n  - 斯科特·程、蔡孟瑜、洪定勇、马赫穆特·坎德米尔\n  - 关键词：AlphaZero、不确定性、探索\n  - 实验环境：围棋\n\n- [从对偶视角看探索：价值激励的演员-评论家方法用于样本高效的在线强化学习](https:\u002F\u002Fopenreview.net\u002Fforum?id=A5Y8Uh5Szl)\n  - 杨彤、戴博、肖林、迟跃杰\n  - 关键词：探索-利用权衡、演员-评论家、强化学习理论\n  - 实验环境：MuJoCo\n\n- [带有动作分块的强化学习](https:\u002F\u002Fopenreview.net\u002Fforum?id=XUks1Y96NR)\n  - 李启阳、周志远、谢尔盖·莱文\n  - 关键词：强化学习、离线到在线RL、探索\n  - 实验环境：OGBench、robomimic\n\n- [DISCOVER：稀疏奖励强化学习的自动化课程设置](https:\u002F\u002Fopenreview.net\u002Fforum?id=guZBnsKPsw)\n  - 利安德·迪亚斯-博恩、马可·巴加泰拉、乔纳斯·休博特、安德烈亚斯·克劳斯\n  - 关键词：强化学习、测试时训练、测试时强化学习、稀疏奖励强化学习、目标选择、目标条件强化学习、探索、探索-利用权衡、上界置信区间\n  - 实验环境：antmaze、arm、pointmaze\n\n- [元学习如何在宏动作间分配功劳](https:\u002F\u002Fopenreview.net\u002Fforum?id=cJlgdpEFx9)\n  - 伊奥内尔·霍苏、特赖安·雷贝迪亚、拉兹万·帕斯卡努\n  - 关键词：深度强化学习、宏动作、探索\n  - 实验环境：Atari、街头霸王II\n\n- [ExPO：借助自我解释引导的强化学习解锁困难推理](https:\u002F\u002Fopenreview.net\u002Fforum?id=D1PeGJtVEu)\n  - 周瑞阳、李硕哲、张艾米、刘乐琪\n  - 关键词：大型语言模型、自我提升、引导式探索、推理、具有可验证奖励的强化学习、自举法\n  - 实验环境：MATH、GSM8K、MATH-500\n\n\u003C\u002Fdetails>\n\n### ICML 2025\n\n\u003Cdetails open>\n\u003Csummary>(点击收起)\u003C\u002Fsummary>\n\n- [约束强化学习中安全探索的低估偏差控制](https:\u002F\u002Fopenreview.net\u002Fforum?id=nq5bt0mRTC)\n  - 高世清、丁嘉鑫、傅罗毅、王新兵\n  - 关键词：约束强化学习、安全探索、低估偏差、内在成本、强化学习。\n  - 实验环境：safety-gymnasium、MuJoCo\n\n- [训练一个具有通用好奇心的智能体](https:\u002F\u002Fopenreview.net\u002Fforum?id=UeB3Hdrhda)\n  - 法希姆·塔兹瓦尔、姜一丁、阿比塔·桑卡拉杰、苏迈塔·萨迪亚·拉赫曼、J·齐科·科尔特、杰夫·施奈德、罗斯·萨拉胡丁诺夫\n  - 关键词：LLM智能体、合成数据、多轮微调。\n  - 实验环境：二十问、猜城市、Wordle、元胞自动机、客户服务、谋杀之谜、Mastermind、海战棋、扫雷、老虎机最佳臂选择\n\n- [用于系统2规划的蒙特卡洛树扩散算法](https:\u002F\u002Fopenreview.net\u002Fforum?id=XrCbBdycDc)\n  - 尹在锡、赵贤书、白斗镇、约书亚·本吉奥、安成镇\n  - 关键词：扩散模型、MCTS、长期规划、离线强化学习、目标条件强化学习、推理时缩放。\n  - 实验环境：PointMaze、AntMaze、机械臂操作立方体、视觉PointMaze\n\n- [软推理：通过受控嵌入空间探索在大型语言模型中导航解空间](https:\u002F\u002Fopenreview.net\u002Fforum?id=4gWE7CMOlH)\n  - 朱庆林、赵润聪、严汉奇、何玉兰、陈宇东、桂琳\n  - 关键词：大型语言模型、推理、嵌入扰动、贝叶斯优化。\n  - 实验环境：无\n\n- [Hyper：强化学习中鲁棒高效的超参数探索](https:\u002F\u002Fopenreview.net\u002Fforum?id=n1CVVzBSjQ)\n  - 王怡然、刘晨舒、李云帆、萨娜·阿马尼、周博磊、杨林峰\n  - 关键词：强化学习、探索、可证明高效、超参数鲁棒性。\n  - 实验环境：PointMaze、MuJoCo、MiniGrid\n\n- [逆向约束强化学习中的可证明高效探索](https:\u002F\u002Fopenreview.net\u002Fforum?id=eLTPkGGHum)\n  - 岳博、李健、刘贵良\n  - 关键词：逆向约束强化学习、探索算法、样本效率。\n  - 实验环境：PointMaze、GridWorld\n\n- [博弈论多智能体强化学习中面向高福利均衡的显式探索](https:\u002F\u002Fopenreview.net\u002Fforum?id=AxqgpcL90a)\n  - 澳森·A·阮、安里·古、迈克尔·P·韦尔曼\n  - 关键词：经验博弈论分析、均衡选择、博弈求解、策略探索。\n  - 实验环境：Harvest、讨价还价\n\n- [基于扩散策略的最大熵强化学习](https:\u002F\u002Fopenreview.net\u002Fforum?id=CpjKXe9rY7)\n  - 董晓义、程健、张曦雪莉\n  - 关键词：扩散模型、在线强化学习、最大熵强化学习、软演员-评论家。\n  - 实验环境：Mujoco、AntMaze、DeepMind Control Suite\n\n- [通过反事实软强化学习实现VLM智能体的高效在线调优](https:\u002F\u002Fopenreview.net\u002Fforum?id=H76PMm7hf2)\n  - 冯朗、谭伟豪、吕志毅、郑龙涛、徐海洋、颜明、黄飞、安博\n  - 
关键词：视觉-语言模型、智能体、强化学习、在线微调、反事实。\n  - 实验环境：Android-in-the-Wild、Gym Cards、ALFWorld\n\n- [DIME：基于扩散的最大熵强化学习](https:\u002F\u002Fopenreview.net\u002Fforum?id=Aw6dBR7Vxj)\n  - 奥努尔·切利克、李泽楚、丹尼斯·布莱辛、李戈、丹尼尔·帕列尼切克、扬·彼得斯、乔治娅·查尔瓦察基、格哈德·诺伊曼\n  - 关键词：强化学习、扩散模型、基于扩散的强化学习、最大熵强化学习。\n  - 实验环境：Mujoco、DeepMind Control Suite、Myo Suite\n\n- [持续模型基强化学习中的知识保持](https:\u002F\u002Fopenreview.net\u002Fforum?id=DiqeZY27XK)\n  - 傅浩天、孙一翔、迈克尔·利特曼、乔治·科尼达里斯\n  - 关键词：深度强化学习、模型基强化学习、持续学习、世界模型。\n  - 实验环境：MiniGrid、DeepMind Control Suite\n\n- [KEA：通过主动协调探索策略保持探索活力](https:\u002F\u002Fopenreview.net\u002Fpdf?id=XIyrotmBSJ)\n  - 杨士敏、马丁·马格努松、约翰内斯·A·斯托克、托多尔·斯托亚诺夫\n  - 关键词：强化学习、基于新颖性的探索、软演员-评论家、稀疏奖励。\n  - 实验环境：2D导航、DeepMind Control Suite\n\n- [EVOLvE：评估与优化LLM以进行上下文探索](https:\u002F\u002Fopenreview.net\u002Fforum?id=ck7dvZFbRW)\n  - 艾伦·聂、苏易、常博、李乔纳森、奇埃德·H、黎光国、陈敏敏\n  - 关键词：探索、上下文强化学习、老虎机问题。\n  - 实验环境：多臂老虎机、上下文老虎机\n\n- [一石二鸟：利用双重随机网络进行奖励塑造以平衡探索与利用](https:\u002F\u002Fopenreview.net\u002Fforum?id=YqtgKdW9dD)\n  - 马浩哲、李芳玲、林静宇、罗正定、武青荣、梁泽云\n  - 关键词：强化学习、奖励塑造、探索-利用平衡。\n  - 实验环境：Atari、VizDoom、MiniWorld\n\n- [SENSEI：由基础模型引导的语义探索以学习多功能世界模型](https:\u002F\u002Fopenreview.net\u002Fforum?id=ZDPNmihkMR)\n  - 詹苏·桑恰克塔尔、克里斯蒂安·贡布施、安德烈·扎达扬丘克、帕维尔·科列夫、格奥尔格·马提乌斯\n  - 关键词：内在动机、探索、基础模型、模型基强化学习。\n  - 实验环境：MiniHack、Robodesk、宝可梦红版\n\n- [行为探索：通过上下文适应学习探索](https:\u002F\u002Fopenreview.net\u002Fforum?id=tlLkY9E2bZ)\n  - 安德鲁·瓦根梅克、周志远、谢尔盖·列维纳\n  - 关键词：上下文学习、探索、适应性智能体、行为模仿。\n  - 实验环境：D4RL AntMaze、D4RL Kitchen\n\n- [利用未标注先验数据中的技能实现高效在线探索](https:\u002F\u002Fopenreview.net\u002Fforum?id=perpuTFEF7)\n  - 马克斯·威尔科克森、李启阳、凯文·弗兰斯、谢尔盖·列维纳\n  - 关键词：离线转在线强化学习、无监督预训练、探索。\n  - 实验环境：D4RL、OGBench、视觉AntMaze\n\n  \n\u003C\u002Fdetails>\n\n### ICLR 2025\n\n\u003Cdetails open>\n\u003Csummary>(点击收起)\u003C\u002Fsummary>\n\n- [通过时间距离实现的情节新颖性](https:\u002F\u002Fopenreview.net\u002Fforum?id=I7DeajDEx7)\n  - 姜宇华、刘启涵、杨一钦、马晓腾、钟典宇、胡浩、杨俊、梁斌、徐博、张冲杰、赵千川\n  - 关键词：情节新颖性、时间距离、探索、强化学习\n  - 实验环境：MiniGrid、MiniWorld、Craft、Maze、DMControl等。\n\n- [Brain Bandit：一种基于生物学原理的神经网络，用于高效控制探索行为](https:\u002F\u002Fopenreview.net\u002Fforum?id=RWJX5F5I9g)  \n  - 蒋晨、安佳慧、刘雅婷、季妮  \n  - 关键词：探索-利用、随机霍普菲尔德网络、汤普森采样、不确定性下的决策、脑启发算法、强化学习  \n  - 实验环境：多臂老虎机（MAB）任务、MDP任务。\n\n- [TOP-ERL：基于Transformer的离策略情节式强化学习](https:\u002F\u002Fopenreview.net\u002Fforum?id=N4NhVN30ph)  \n  - 李戈、田东、周宏毅、蒋欣凯、鲁道夫·柳蒂科夫、格哈德·诺伊曼  \n  - 关键词：动作序列的价值、强化学习、Transformer、机器人操作、运动基元  \n  - 实验环境：机器人学习环境。\n\n- [基于以物体为中心的抽象表示的高效探索与判别型世界模型学习](https:\u002F\u002Fopenreview.net\u002Fforum?id=hgwGi81ndj)  \n  - 安东尼·GX-陈、肯尼思·马里诺、罗布·费格斯  \n  - 关键词：强化学习、基于模型的强化学习、世界模型、探索、层次结构  \n  - 实验环境：2D工艺制作、MiniHack环境。\n\n- [MaxInfoRL：通过最大化信息增益提升强化学习中的探索能力](https:\u002F\u002Fopenreview.net\u002Fforum?id=R4q3cY3kQf)  \n  - 巴维亚·苏基贾、斯特利安·科罗斯、安德烈亚斯·克劳塞、皮特·阿贝尔、卡梅洛·斯费拉扎  \n  - 关键词：强化学习、离策略方法中的探索、连续控制  \n  - 实验环境：连续控制和视觉控制任务。\n\n- [ActSafe：带有安全约束的强化学习主动探索](https:\u002F\u002Fopenreview.net\u002Fforum?id=aKRADWBJ1I)  \n  - 亚尔登·阿斯、巴维亚·苏基贾、莱纳特·特雷文、卡梅洛·斯费拉扎、斯特利安·科罗斯、安德烈亚斯·克劳塞  \n  - 关键词：安全探索、约束马尔可夫决策过程、安全强化学习  \n  - 实验环境：标准的安全深度强化学习基准。\n\n- [只需一个目标就够了：无需奖励、示范或子目标，对比强化学习即可涌现技能与探索行为](https:\u002F\u002Fopenreview.net\u002Fforum?id=xCkgX4Xfu0)  \n  - 格蕾丝·刘、迈克尔·唐、本杰明·艾森巴赫  \n  - 关键词：探索、涌现技能、对比强化学习、开放性学习  \n  - 实验环境：2D迷宫导航任务。\n\n- [面向更安全探索的风险感知策略学习](https:\u002F\u002Fopenreview.net\u002Fforum?id=gJG4IPwg6l)  \n  - 考斯图布·马尼、文森特·迈、查理·戈蒂耶、安妮·S·陈、萨默·B·纳什德、利亚姆·保尔  \n  - 关键词：强化学习、安全探索、表征学习、归纳偏置  \n  - 实验环境：AdroitHandPen、PointGoal1和PointButton1。\n\n- [迈向高效的多智能体探索：通过轨迹熵最大化](https:\u002F\u002Fopenreview.net\u002Fforum?id=YvKJGYL4j7)  \n  - 李天旭、朱坤  \n  - 关键词：多智能体强化学习、探索、合作、轨迹熵最大化  \n  - 实验环境：多个MARL基准。\n\n- 
[仅需少量示范即可超越专家表现：基于双重探索的高效模仿学习](https:\u002F\u002Fopenreview.net\u002Fforum?id=FviefuxmeW)  \n  - 赵海阳、于兴睿、大卫·马克·博森斯、伊沃·曾、关权权  \n  - 关键词：强化学习、模仿学习、双重探索  \n  - 实验环境：Atari、MuJoCo。\n\n- [EgoSim：在虚拟世界中基于多模态条件的自我中心式探索](https:\u002F\u002Fopenreview.net\u002Fforum?id=zAyS5aRKV8)  \n  - 余伟、尹松恒、史蒂夫·伊斯特布鲁克、阿尼梅什·加格  \n  - 关键词：可控视频生成、自我中心视频预测、世界模型  \n  - 实验环境：RealEstate、Epic-Field。\n\n\u003C\u002Fdetails>\n\n### NeurIPS 2024\n\n\u003Cdetails open>\n\u003Csummary>(点击收起)\u003C\u002Fsummary>\n\n- [PEAC：面向跨化身强化学习的无监督预训练](https:\u002F\u002Fopenreview.net\u002Fforum?id=LyAFfdx8YF)  \n  - Chengyang Ying, Zhongkai Hao, Xinning Zhou, Xuezhou Xu, Hang Su, Xingxing Zhang, Jun Zhu \n  - 关键词：跨化身强化学习、无监督探索、技能发现、内在奖励  \n  - 实验环境：DeepMind Control Suite、Robosuite、Isaacgym，以及真实世界的运动任务\n\n- [SeeA*：通过选择性采样实现高效的探索增强型A*搜索](https:\u002F\u002Fopenreview.net\u002Fforum?id=mSaqxZVZW8)  \n  - Dengwei Zhao, Shikui Tu, Lei Xu  \n  - 关键词：通过选择性采样构建动态OPEN集合，以探索有潜力的分支；理论与实证层面均提升了效率。  \n  - 实验环境：逆合成规划（有机化学）、逻辑综合（IC设计）以及推箱子游戏。\n\n- [基于内在动机学习形式数学](https:\u002F\u002Fopenreview.net\u002Fforum?id=uNKlTQ8mBD)  \n  - Gabriel Poesia, David Broman, Nick Haber, Noah Goodman  \n  - 关键词：在自我改进的循环中联合学习证明形式化数学定理，并提出更难的可证猜想；利用依赖类型理论和事后重标记技术提升样本效率。  \n  - 实验环境：命题逻辑、算术和群论。\n\n- [RL-GPT：将强化学习与代码即策略相结合](https:\u002F\u002Fopenreview.net\u002Fforum?id=LEzx6QRkRH)  \n  - Shaoteng Liu, Haoqi Yuan, Minda Hu, Yanwei Li, Yukang Chen, Shu Liu, Zongqing Lu, Jiaya Jia  \n  - 关键词：结合强化学习与大型语言模型的两层层次化框架；通过将高层规划的代码编写与低层动作的强化学习相结合，实现高效执行。  \n  - 实验环境：Minecraft及MineDojo任务。\n\n- [超越乐观主义：具有部分可观测奖励的探索](https:\u002F\u002Farxiv.org\u002Fabs\u002F2406.13909)\n  - Simone Parisi, Alireza Kazemipour, Michael Bowling\n  - 关键词：强化学习、部分可观性、乐观主义、探索\n  - 实验环境：表格型环境（含不可观测奖励与不含不可观测奖励）\n\n- [为目标条件强化学习探索潜在状态聚类的边界](https:\u002F\u002Farxiv.org\u002Fabs\u002F2411.01396)  \n  - Yuanlin Duan, Guofeng Cui, He Zhu  \n  - 关键词：目标条件强化学习、探索、潜在空间聚类  \n  - 实验环境：多足蚂蚁迷宫、机械臂操作（杂乱桌面）、拟人化手部物体旋转\n\n- [子词作为技能：针对稀疏奖励强化学习的分词方法](https:\u002F\u002Farxiv.org\u002Fabs\u002F2309.04459)\n  - David Yunis, Justin Jung, Falcon Dai, Matthew Walter\n  - 关键词：稀疏奖励强化学习、技能生成、分词、连续动作空间\n  - 实验环境：具有挑战性的稀疏奖励任务\n\n- [用于数据高效通用价值函数评估的自适应探索](https:\u002F\u002Farxiv.org\u002Fabs\u002F2405.07838)  \n  - Arushi Jain, Josiah P. 
Hanna, Doina Precup  \n  - 关键词：通用价值函数、强化学习、数据效率  \n  - 实验环境：表格型设置、非线性函数近似、Mujoco环境（平稳与非平稳奖励信号）\n\n- [利用分离的世界模型在视觉干扰环境中进行探索](https:\u002F\u002Fopenreview.net\u002Fpdf\u002F6972a2683764073195f725a7d18b19d8e88711da.pdf)\n  - Kaichen Huang, Shenghua Wan, Minghao Shao, Hai-Hang Sun, Le Gan, Shuai Feng, De-Chuan Zhan\n  - 关键词：基于模型的强化学习、无监督强化学习（URL）、视觉干扰、双层优化  \n  - 实验环境：运动任务、操纵任务\n\n- [基于结构信息原理的有效探索](https:\u002F\u002Farxiv.org\u002Fabs\u002F2410.06621)  \n  - Xianghua Zeng, Hao Peng, Angsheng Li  \n  - 关键词：强化学习、结构信息、有效探索、内在奖励  \n  - 实验环境：MiniGrid、MetaWorld、DeepMind Control Suite\n\n- [混沌中的阶梯：通过低维空间中的参数演化路径提升策略学习](https:\u002F\u002Fopenreview.net\u002Fpdf?id=3vHfwL2stG)\n  - Hongyao Tang, Min Zhang, Chen Chen, Jianye Hao\n  - 关键词：强化学习、策略学习动力学、时间序列奇异值分解、低维空间  \n  - 实验环境：MuJoCo、DeepMind Control Suite (DMC)、MinAtar\n\n- [重新思考强化学习中的探索：基于度量的有效探索奖励](https:\u002F\u002Fopenreview.net\u002Fpdf?id=QpKWFLtZKi)\n  - Yiming Wang, Kaiyan Zhao, Furui Liu, Leong Hou U\n  - 关键词：强化学习、探索、内在奖励、基于度量的状态差异  \n  - 实验环境：Atari、Minigrid、Robosuite、Habitat  \n  - [代码](https:\u002F\u002Fgithub.com\u002FYimingWangMingle\u002FEME)\n\n- [通过分层探索-利用权衡实现上下文MDP的离线Oracle高效学习](https:\u002F\u002Fopenreview.net\u002Fpdf?id=848vuK2cKp)  \n  - Jian Qian, Haichen Hu, David Simchi-Levi  \n  - 关键词：上下文马尔可夫决策过程（CMDPs）、离线密度估计、分层探索-利用权衡  \n  - 实验环境：免奖励强化学习\n\n\u003C\u002Fdetails>\n\n### ICML 2024\n\n\u003Cdetails open>\n\u003Csummary>(点击收起)\u003C\u002Fsummary>\n\n- [Q-Star 遇见可扩展后验采样：通过超智能体桥接理论与实践](https:\u002F\u002Fproceedings.mlr.press\u002Fv235\u002Fli24by.html)\n  - 李英茹、徐嘉伟、韩磊、罗志全\n  - 关键词：集成方法、汤普森采样、可扩展探索、后悔分析、复杂度理论\n  - 实验环境：Atari、DeepSea\n\n- [ACE：具有因果感知熵正则化的离策略Actor-Critic 算法](https:\u002F\u002Fopenreview.net\u002Fpdf?id=OwtMhMSybu)\n  - 季天颖、梁勇远、曾燕、罗宇、徐国伟、郭嘉伟、郑睿杰、黄福荣、孙富春、许华哲\n  - 关键词：具有因果感知熵正则化的离策略Actor-Critic、探索、因果感知熵正则化\n  - 实验环境：MetaWorld、DeepMind Control Suite、Dexterous Hand、稀疏奖励\n\n- [深度强化学习中的随机潜在探索](https:\u002F\u002Fopenreview.net\u002Fforum?id=Y9qzwNlKVU)\n  - 斯里纳特·V·马汉卡利、洪章伟、阿尤什·塞卡里、亚历山大·拉克林、普尔基特·阿格拉瓦尔\n  - 关键词：随机潜在探索、通过向原始任务奖励中添加结构化随机奖励来扰动奖励\n  - 实验环境：ATARI、ISAACGYM\n\n- [分布式随机网络蒸馏下的探索与反探索](https:\u002F\u002Fopenreview.net\u002Fforum?id=rIrpzmqRBk)\n  - 杨凯、陶健、吕佳飞、李秀\n  - 关键词：奖励不一致性、分布式随机网络蒸馏、探索与反探索\n  - 实验环境：Atari、Adroit、Fetch 操控任务\n\n- [强化学习中基于自适应网格的广度优先探索](https:\u002F\u002Fopenreview.net\u002Fpdf?id=59MYoLghyk)\n  - 尹英植、李刚博、安成洙、奥贞雪\n  - 关键词：广度优先探索、自适应网格、探索效率\n  - 实验环境：GridWorld、Atari、Procgen\n\n- [直接聚类：利用聚类和预训练表征进行高维空间探索的方法](https:\u002F\u002Fopenreview.net\u002Fforum?id=cXBPPfNUZJ)\n  - 斯特凡·西尔维乌斯·瓦格纳、斯特凡·哈梅林\n  - 关键词：以表征为中心的探索视角、聚类、预训练表征\n  - 实验环境：VizDoom 和 Habitat\n\n- [通过状态占用正则化实现蒙特卡洛树搜索中可证明高效的长时域探索](https:\u002F\u002Fopenreview.net\u002Fforum?id=UCKFhc9SFC)\n  - 利亚姆·施拉姆、阿卜德斯拉姆·布拉里亚斯\n  - 关键词：蒙特卡洛树搜索、长时域探索、状态占用正则化\n  - 实验环境：机器人导航问题\n\n- [大型语言模型的有效探索](https:\u002F\u002Fopenreview.net\u002Fforum?id=PpPZ6W7rxy)\n  - 维克拉恩特·德瓦拉切拉、赛义德·穆罕默德·阿斯加里、郝博涛、本杰明·范·罗伊\n  - 关键词：探索、大型语言模型、高效探索\n  - 实验环境：语言任务\n\n- [基于覆盖性的可扩展在线探索](https:\u002F\u002Fopenreview.net\u002Fforum?id=C64clssMVU)\n  - 菲利普·阿莫蒂拉、迪伦·J·福斯特、阿克谢·克里希纳穆提\n  - 关键词：L1 覆盖、内在复杂度控制、高效规划、高效探索\n  - 实验环境：MountainCar\n\n- [具有通用函数逼近的不确定性感知无奖励探索](https:\u002F\u002Fopenreview.net\u002Fforum?id=BvBdYSIkpb)\n  - 张俊凯、张伟彤、周东若、关权权\n  - 关键词：不确定性感知的内在奖励、无奖励探索、通用函数逼近\n  - 实验环境：DeepMind Control Suite\n\n- [约束型集成探索用于无监督技能发现](https:\u002F\u002Fopenreview.net\u002Fforum?id=AOJCCFTlfJ)\n  - 白晨嘉、杨如帅、张乔生、徐康、陈毅、肖婷、李学龙\n  - 关键词：约束型集成探索、无监督技能发现、基于状态原型的分区探索\n  - 实验环境：URLB 任务、迷宫\n\n- 
[贝叶斯探索网络](https:\u002F\u002Fopenreview.net\u002Fforum?id=OYw6sS8QmL)\n  - 马蒂·费洛斯、布兰登·加里·卡普洛维茨、克里斯蒂安·施罗德·德·维特、希蒙·怀特森\n  - 关键词：贝叶斯探索网络、探索、不确定性估计\n  - 实验环境：一种新颖的搜救网格 MDP\n\n- [马尔可夫决策过程中的几何主动探索：抽象的优势](https:\u002F\u002Fopenreview.net\u002Fpdf?id=2JYOxcGlRe)\n  - 里卡多·德·桑蒂、费德里科·阿兰加特·约瑟夫、诺亚·利尼格、米尔科·穆蒂、安德烈亚斯·克劳斯\n  - 关键词：几何主动探索、抽象、探索效率\n  - 实验环境：受科学发现问题启发的环境\n\n- [具有上下文感知探索的快速同伴适应](https:\u002F\u002Fopenreview.net\u002Fforum?id=CBcNl5Eo32)\n  - 马龙、王元飞、钟方伟、朱松纯、王一舟\n  - 关键词：同伴适应、上下文感知探索、快速适应\n  - 实验环境：竞争性（库恩扑克）、合作性（PO-Overcooked）或混合性（Predator-Prey-W）游戏\n\n- [个体贡献作为多智能体强化学习的内在探索支架](https:\u002F\u002Fopenreview.net\u002Fpdf?id=zCmMkWK4Ly)\n  - 李欣然、刘子凡、陈世博、张军\n  - 关键词：个体贡献、内在探索、多智能体强化学习\n  - 实验环境：Google Research Football、SMAC\n\n\u003C\u002Fdetails>\n\n### ICLR 2024\n\n\u003Cdetails open>\n\u003Csummary>(点击收起)\u003C\u002Fsummary>\n\n- [解锁长期新颖性驱动探索中表征的力量](https:\u002F\u002Fopenreview.net\u002Fpdf?id=OwtMhMSybu)\n  - Alaa Saade, Steven Kapturowski, Daniele Calandriello, Charles Blundell, Pablo Sprechmann, Leopoldo Sarra, Oliver Groth, Michal Valko, Bilal Piot\n  - 关键点：基于聚类的在线密度估计实现稳健探索\n  - 实验环境：Atari、DM-HARD-8\n\n- [深度强化学习在随机环境中的性能理论解释](https:\u002F\u002Fopenreview.net\u002Fforum?id=5ES5Hdlbxw)\n  - Cassidy Laidlaw, Banghua Zhu, Stuart Russell, Anca Dragan\n  - 关键点：随机环境、有效时域（effective horizon）、强化学习理论、实例相关界、理论的实证验证\n  - 实验环境：BRIDGE\n\n- [DrM：通过休眠比率最小化掌握视觉强化学习](https:\u002F\u002Fopenreview.net\u002Fforum?id=MSe8YFbhUE)\n  - Guowei Xu, Ruijie Zheng, Yongyuan Liang, Xiyao Wang, Zhecheng Yuan, Tianying Ji, Yu Luo, Xiaoyu Liu, Jiaxin Yuan, Pu Hua, Shuzhen Li, Yanjie Ze, Hal Daumé III, Furong Huang, Huazhe Xu\n  - 关键点：视觉强化学习、休眠比率最小化、探索\n  - 实验环境：DeepMind Control Suite、MetaWorld 和 Adroit\n\n- [METRA：基于度量感知抽象的可扩展无监督强化学习](https:\u002F\u002Fopenreview.net\u002Fforum?id=c5pwL0Soay)\n  - Seohong Park, Oleh Rybkin, Sergey Levine\n  - 关键点：无监督强化学习、度量感知抽象、可扩展探索\n  - 实验环境：基于状态的 Ant 和 HalfCheetah、Kitchen\n\n- [Text2Reward：利用语言模型进行强化学习奖励塑造](https:\u002F\u002Fopenreview.net\u002Fforum?id=tUM39YTRxH)\n  - Tianbao Xie, Siheng Zhao, Chen Henry Wu, Yitao Liu, Qian Luo, Victor Zhong, Yanchao Yang, Tao Yu\n  - 关键点：奖励塑造、语言模型、基于文本的奖励塑造\n  - 实验环境：MUJOCO、MANISKILL2、METAWORLD\n\n- [面向样本高效强化学习的目标导向模型预训练](https:\u002F\u002Fopenreview.net\u002Fforum?id=o2IEmeLL9r)\n  - Haoqi Yuan, Zhancun Mu, Feiyang Xie, Zongqing Lu\n  - 关键点：目标导向模型、预训练、样本效率\n  - 实验环境：Kitchen、Minecraft。\n\n- [合作式多智能体强化学习中周期性记忆的高效利用](https:\u002F\u002Fopenreview.net\u002Fforum?id=LjivA1SLZ6)\n  - Hyungho Na, Yunkyeong Seo, Il-chul Moon\n  - 关键点：周期性记忆、合作式多智能体、高效利用\n  - 实验环境：StarCraft II 和 Google Research Football\n\n- [基于扩散的简单层次化规划](https:\u002F\u002Fopenreview.net\u002Fforum?id=kXHEBK9uAY)\n  - Chang Chen, Fei Deng, Kenji Kawaguchi, Caglar Gulcehre, Sungjin Ahn\n  - 关键点：层次化规划、扩散、探索\n  - 实验环境：Maze2D 和 AntMaze\n\n- [通过多样化任务的多任务强化学习实现样本高效的短视探索](https:\u002F\u002Fopenreview.net\u002Fforum?id=YZrg56G0JV)\n  - Ziping Xu, Zifan Xu, Runxuan Jiang, Peter Stone, Ambuj Tewari\n  - 关键点：短视探索、多任务强化学习、多样化任务\n  - 实验环境：合成机器人控制环境\n\n- [PAE：利用外部知识进行强化学习以实现高效探索](https:\u002F\u002Fopenreview.net\u002Fforum?id=R7rZUSGOPD)\n  - Zhe Wu, Haofei Lu, Junliang Xing, You Wu, Renye Yan, Yaozhong Gan, Yuanchun Shi\n  - 关键点：外部知识、高效探索、强化学习\n  - 实验环境：BabyAI 和 MiniHack\n\n- [上下文情境下的强化学习探索-利用权衡](https:\u002F\u002Fopenreview.net\u002Fforum?id=uIKZSStON3)\n  - Zhenwen Dai, Federico Tomasi, Sina Ghiassian\n  - 关键点：上下文情境下的探索-利用权衡、强化学习、探索-利用权衡\n  - 实验环境：Dark Room、Dark Key-to-Door、偏置版 Dark Room。\n\n- [Transformer 作为决策者：通过监督预训练实现可证明的上下文情境强化学习](https:\u002F\u002Fopenreview.net\u002Fpdf?id=yN4Wv17ss3)\n  - Licong Lin, Yu Bai, 
Song Mei\n  - 关键点：Transformer、决策者、上下文情境强化学习\n  - 实验环境：线性 bandit、伯努利 bandits。\n\n- [无需动作即可学会行动](https:\u002F\u002Fopenreview.net\u002Fpdf?id=rvUq3cxpDF)\n  - Dominik Schmidt, Minqi Jiang\n  - 关键点：从视频中恢复潜在动作信息、预训练\n  - 实验环境：Procgen\n\n- [受意识启发的时空抽象用于强化学习中的更好泛化](https:\u002F\u002Fopenreview.net\u002Fpdf?id=eo9dHwtTFt)\n  - Mingde Zhao, Safa Alver, Harm van Seijen, Romain Laroche, Doina Precup, Yoshua Bengio\n  - 关键点：时空抽象、层次化规划、任务\u002F目标分解\n  - 实验环境：MiniGrid-BabyAI\n\n\u003C\u002Fdetails>\n\n### NeurIPS 2023\n\n\u003Cdetails open>\n\u003Csummary>(点击收起)\u003C\u002Fsummary>\n\n- [最大化探索：融合估计、规划与探索的单一目标函数](https:\u002F\u002Fopenreview.net\u002Fpdf?id=A57UMlUJdc)\n  - 刘志涵、陆淼、熊伟、钟翰、胡浩、张申奥、郑思睿、杨竹然、王兆然\n  - 关键点：一个整合了估计和规划组件的单一目标函数，能够自动平衡探索与利用，实现次线性遗憾。\n  - 实验环境：稀疏奖励的MuJoCo\n\n- [强化学习中探索对泛化的重要性研究](https:\u002F\u002Fopenreview.net\u002Fpdf?id=y5duN2j9s6)\n  - 蒋一丁、J·齐科·科尔特、罗伯塔·赖莱阿努\n  - 关键点：探索、泛化、基于分布集成的探索方法。\n  - 实验环境：表格型上下文MDP、Procgen和Crafter\n\n- [基于玻尔兹曼探索的蒙特卡洛树搜索](https:\u002F\u002Fopenreview.net\u002Fpdf?id=NG4DaApavi) \n  - 迈克尔·潘特、穆罕默德·巴尤米、尼克·霍斯、布鲁诺·拉塞尔达\n  - 关键点：结合MCTS的玻尔兹曼探索，最大熵目标下的最优动作并不一定对应于原始目标的最优动作，提出了两种改进算法。\n  - 实验环境：Frozen Lake环境、航海问题、围棋\n\n- [通往目标的面包屑：来自人机交互反馈的监督目标选择](https:\u002F\u002Farxiv.org\u002Fabs\u002F2307.11049)\n  - 马塞尔·托尔内·维拉塞维尔、马克斯·巴尔塞尔斯·伊·帕米耶斯、王子涵、萨梅德·戴赛、陈涛、普尔基特·阿格拉瓦尔、阿比谢克·古普塔\n  - 关键点：人机交互反馈、人类反馈与策略学习的分叉机制。\n  - 实验环境：Bandu、积木堆叠、厨房和推杆任务，以及四个房间和迷宫环境。\n\n- [MIMEx：基于掩码输入建模的内在奖励](https:\u002F\u002Fopenreview.net\u002Fpdf?id=g1dMYenhe4)\n  - 林拓、艾伦·贾布里\n  - 关键点：采用不同掩码分布进行伪似然估计。\n  - 实验环境：PixMC-Sparse、DeepMind Control Suite\n\n- [利用未标注先验数据加速探索](https:\u002F\u002Fopenreview.net\u002Fpdf?id=Itorzn4Kwf)\n  - 李启阳、张杰森、迪比亚·戈什、张艾米、谢尔盖·列文\n  - 关键点：使用无奖励标签的先验数据，通过在线经验学习奖励模型，并以乐观奖励为未标注的先验数据打上标签。\n  - 实验环境：AntMaze领域、Adroit手部操作领域，以及一个视觉模拟的机器人操作领域。\n\n- [ε-贪心探索下深度Q网络的收敛性与样本复杂度分析](https:\u002F\u002Fopenreview.net\u002Fpdf?id=HWGWeaN76q)\n  - 张帅、李洪康、王萌、刘淼、陈品宇、卢松涛、刘思佳、穆鲁格桑、乔杜里\n  - 关键点：ε-贪心探索、收敛性、样本复杂度。\n  - 实验环境：数值实验\n\n- [乐观主义的陷阱：通过随机化风险准则进行分布强化学习](https:\u002F\u002Fopenreview.net\u002Fpdf?id=v8u3EFAyW9)\n  - 曹泰贤、韩承烨、李熙洙、李京宰、李正宇\n  - 关键点：分布强化学习、随机化风险准则、乐观探索。\n  - 实验环境：Atari 55款游戏。\n\n- [CQM：基于量化世界模型的课程式强化学习](https:\u002F\u002Fopenreview.net\u002Fpdf?id=tcotyjon2a)\n  - 李承宰、曹大瑟、朴宗海、金亨镇\n  - 关键点：课程式强化学习、量化世界模型。\n  - 实验环境：PointNMaze\n\n- [强化学习中的安全探索：广义形式与算法](https:\u002F\u002Fopenreview.net\u002Fpdf?id=dQLsvKNwZC)\n  - 渡边彰史、桥本亘、沈勋、桥本一宗\n  - 关键点：安全探索、广义形式、安全探索算法、安全探索元算法。\n  - 实验环境：网格世界和Safety Gym\n\n- [后继者-前驱者内在探索](https:\u002F\u002Fopenreview.net\u002Fpdf?id=ohKbQp0jIY)\n  - 俞昌珉、尼尔·伯吉斯、马尼什·萨哈尼、塞缪尔·J·格什曼\n  - 关键点：基于转移序列的回溯结构，结合前瞻与回溯信息。\n  - 实验环境：网格世界、MountainCar、Atari\n\n- [基于价值条件状态熵的强化学习加速探索](https:\u002F\u002Farxiv.org\u002Fabs\u002F2305.19476)\n  - 金东英、申振宇、彼得·阿贝尔、徐永教\n  - 关键点：价值条件状态熵探索。\n  - 实验环境：MiniGrid、DeepMind Control Suite和Meta-World\n\n- [ELDEN：基于局部依赖关系的探索](https:\u002F\u002Fopenreview.net\u002Fpdf?id=sL4pJBXkxu)\n  - 王子昭、胡家恒、彼得·斯通、罗伯托·马丁-马丁\n  - 关键点：局部依赖关系、探索奖励、内在动机，鼓励发现实体间的新型交互。\n  - 实验环境：从二维网格世界到三维机器人任务。\n\n\u003C\u002Fdetails>\n\n### ICML 2023\n\n\u003Cdetails open>\n\u003Csummary>(点击收起)\u003C\u002Fsummary>\n\n- [情境马尔可夫决策过程中用于探索的全局与 episodic 奖励研究](https:\u002F\u002Fopenreview.net\u002Fpdf?id=1CqtvwHTKQ)\n  - 米卡埃尔·亨纳夫、蒋敏琪、罗伯塔·赖莱阿努\n  - 关键词：全局新颖性奖励、episodic 新颖性奖励、共享结构\n  - 实验环境：Mini-Hack 套件、Habitat 和蒙特祖玛的复仇\n\n- [事后好奇心：随机环境中的内在探索](https:\u002F\u002Fopenreview.net\u002Fpdf?id=fIH2G4fnSy)\n  - 丹尼尔·贾雷特、科伦坦·塔莱克、弗洛朗·阿尔切、托马斯·梅斯纳、雷米·穆诺斯、米哈尔·瓦尔科\n  - 关键词：随机环境、将“噪声”与“新颖性”分离、BYOL-Hindsight\n  - 实验环境：Pycolab 迷宫、Atari、Bank Heist\n\n- 
[基于奇异值分解的深度强化学习表示与探索](https:\u002F\u002Fopenreview.net\u002Fpdf?id=p9wFuLpp0O)\n  - 雅什·钱达克、桑塔努·塔库尔、赵汉·丹尼尔·郭、唐云浩、雷米·穆诺斯、威尔·达布尼、黛安娜·博尔萨\n  - 关键词：奇异值分解、状态访问频率的相对大小、将该分解方法扩展到大规模领域\n  - 实验环境：DMLab-30、DM-Hard-8\n\n- [多模态轨迹优化的重参数化策略学习](https:\u002F\u002Fopenreview.net\u002Fpdf?id=5Akrk9Ln6N)\n   - 黄志傲、梁立天、凌展、李轩林、甘创、苏浩\n   - 关键词：多模态策略参数化、最优轨迹的生成模型\n   - 实验环境： bandit、MetaWorld、2D 迷宫\n\n- [通过抛硬币估计强化学习中探索的伪计数](https:\u002F\u002Fopenreview.net\u002Fpdf?id=4RvcXByvnR)\n   - 萨姆·洛贝尔、阿希尔·巴加里亚、乔治·科尼达里斯\n   - 关键词：基于计数的探索、对 Rademacher 分布样本（或抛硬币）取平均\n   - 实验环境：Atari、D4RL、FETCH\n\n- [最大熵探索的快速收敛率](https:\u002F\u002Fopenreview.net\u002Fpdf?id=wcUppxYfLH)\n  - 达尼尔·蒂亚普金、丹尼斯·贝洛梅斯特尼、达尼埃莱·卡兰德里埃洛、埃里克·穆兰、雷米·穆诺斯、阿列克谢·瑙莫夫、皮埃尔·佩罗、唐云浩、米哈尔·瓦尔科、皮埃尔·梅纳尔\n  - 关键词：最大化访问熵、博弈论算法、轨迹熵\n  - 实验环境：Double Chain MDP\n\n- [利用大型语言模型引导强化学习中的预训练](https:\u002F\u002Fopenreview.net\u002Fpdf?id=63704LH4v5)\n  - 杜宇青、奥利维亚·沃特金斯、王子涵、克莱德里克·科拉斯、特雷弗·达雷尔、皮特·阿比尔、阿比舍克·古普塔、雅各布·安德烈亚斯\n  - 关键词：利用文本语料库中的背景知识来塑造探索行为，奖励智能体完成由语言模型根据当前状态描述所建议的目标。\n  - 实验环境：Crafter、Housekeep\n\n- [具身智能体是否梦见像素化的羊？：基于语言指导的世界建模的具身决策](https:\u002F\u002Fopenreview.net\u002Fpdf?id=Rm5Qi57C5I)\n  - 科尔比·诺丁汉、普里特维拉吉·阿曼纳布鲁卢、阿拉恩·苏尔、叶津·崔、汉娜内·哈吉希日齐、萨米尔·辛格、罗伊·福克斯\n  - 关键词：用于规划和探索的抽象世界模型（AWM）、LLM 引导的探索、梦阶段和醒阶段\n  - 实验环境：Minecraft\n\n- [无小区的潜在 Go-Explore](https:\u002F\u002Fopenreview.net\u002Fpdf?id=4TtG42xJvC)\n  - 凯文·加卢勒德克、埃马纽埃尔·德尔安德拉莱亚\n  - 关键词：潜在 Go-Explore、学习得到的潜在表征\n  - 实验环境：2D 迷宫、panda-gym、Atari\n\n- [超越想象：利用世界模型最大化 episodic 可达性](https:\u002F\u002Fopenreview.net\u002Fpdf?id=JsAMuzA9o2)\n  - 傅瑶、彭润、李洪洛克\n  - 关键词：一种旨在最大化每步可达性扩展的 episodic 内在奖励\n  - 实验环境：Minigrid、DeepMind Control Suite\n\n- [利用离线数据实现高效的在线强化学习](https:\u002F\u002Fopenreview.net\u002Fpdf?id=h11j9w1ucU)\n  - 菲利普·J·鲍尔、劳拉·史密斯、伊利亚·科斯特里科夫、谢尔盖·列文\n  - 关键词：样本效率与探索、简单地将现有的 off-policy 方法应用于在线学习时的离线数据、影响性能的关键因素、一组建议\n  - 实验环境：D4RL AntMaze、Locomotion、Adroit\n\n- [通过随机网络蒸馏进行反探索](https:\u002F\u002Fopenreview.net\u002Fpdf?id=NRQ5lC8Dit)\n  - 亚历山大·尼库林、弗拉季斯拉夫·库伦科夫、丹尼斯·塔拉索夫、谢尔盖·科列斯尼科夫\n  - 关键词：不确定性估计器、反探索奖励、特征线性调制\n  - 实验环境：D4RL\n\n- [探索对多智能体 Q 学习动态收敛与性能的影响](https:\u002F\u002Fopenreview.net\u002Fpdf?id=agPrVQdnxT)\n  - 阿马尔·侯赛因、弗朗切斯科·贝拉尔迪内利、达里奥·帕卡尼亚\n  - 关键词：在任意游戏中，即使无法保证收敛到均衡点，探索如何影响强化学习动态？\n  - 实验环境：Network Shapley Game、Network Chakraborty Game、任意游戏\n\n- [多智能体强化学习的自适应熵正则化框架](https:\u002F\u002Fopenreview.net\u002Fpdf?id=MP7HOGfLf3)\n  - 金宇俊、成英哲\n  - 关键词：自适应熵正则化框架、适当的探索熵水平、解耦的价值函数\n  - 实验环境：SMAC、多智能体 HalfCheetah\n\n- [懒惰智能体：解决多智能体强化学习中稀疏奖励问题的新视角](https:\u002F\u002Fopenreview.net\u002Fpdf?id=DRu5BlRqrn)\n  - 刘博尹、蒲志强、潘毅、易建强、梁艳艳、张杜\n  - 关键词：通过影响外部状态避免懒惰智能体、个体勤勉内在动机（IDI）与协作勤勉内在动机（CDI）、外部状态转移模型\n  - 实验环境：SMAC、Google Research Football\n\n- [深度强化学习中探索的自动内在奖励塑造](https:\u002F\u002Fopenreview.net\u002Fpdf?id=UyJJ1pnb0y)\n  - 袁明琦、李博、金鑫、曾文俊\n  - 关键词：从预定义集合中选择塑造函数、一个内在奖励工具包\n  - 实验环境：MiniGrid、Procgen 和 DeepMind Control Suite\n\n- [LESSON：通过选项框架学习整合强化学习中的探索策略](https:\u002F\u002Fopenreview.net\u002Fpdf?id=vXcvrYJlVm)\n  - 金宇俊、金正惠、成英哲\n  - 关键词：option-critic 模型、自适应选择最有效的探索策略\n  - 实验环境：MiniGrid 和 Atari\n\n\u003C\u002Fdetails>\n\n### ICLR 2023\n\n\u003Cdetails open>\n\u003Csummary>(点击收起)\u003C\u002Fsummary>\n\n- [可学习的行为控制：通过样本高效的行为选择打破雅达利人类世界纪录](https:\u002F\u002Fopenreview.net\u002Fpdf?id=FeWvD0L_a4) (口头报告：10, 8, 8)\n  - 范家俊、庄宇正、刘岳成、郝建业、王斌、朱江成、王浩、夏树涛\n  - 关键词：可学习的行为控制、混合行为映射、统一的可学习行为选择流程、基于多臂老虎机的元控制器\n  - 实验环境：[Atari](https:\u002F\u002Fgithub.com\u002Fopenai\u002Fgym)\n  \n- [覆盖在在线强化学习中的作用](https:\u002F\u002Fopenreview.net\u002Fpdf?id=LQIjzPdDt3q) (口头报告：8, 8, 5)\n  - 谢腾阳、迪伦·J·福斯特、白宇、蒋楠、沙姆·M·卡卡德\n  - 关键词：覆盖条件、数据记录分布、样本高效探索、序列外推系数\n  - 实验环境：无\n\n- 
[主动强化学习中的近最优策略识别](https:\u002F\u002Fopenreview.net\u002Fforum?id=3OR2tbtnYC-) (口头报告：8,8,8)\n   - 李翔、维拉吉·梅塔、约翰内斯·基尔希纳、伊恩·查尔、威利·奈斯万格、杰夫·施奈德、安德烈亚斯·克劳塞、伊利雅·博古诺维奇\n  - 关键词：核化最小二乘值迭代，结合乐观与悲观策略进行主动探索\n  - 实验环境：Cartpole、导航、跟踪、旋转、Branin-Hoo、Hartmann\n\n- [为探索规划目标](https:\u002F\u002Fopenreview.net\u002Fpdf?id=6qeBuZSo7Pr) (亮点论文：8, 8, 8, 8, 6)\n  - 爱德华·S·胡、理查德·张、奥列·雷布金、迪内什·贾亚拉曼\n  - 关键词：目标条件化、规划探索目标、世界模型、基于采样的规划算法\n  - 实验环境：点迷宫、Walker、蚂蚁迷宫、三块堆叠\n\n- [粉红噪声就够了：深度强化学习中的彩色噪声探索](https:\u002F\u002Fopenreview.net\u002Fpdf?id=hQ9V5QN27eS) (亮点论文：8, 8, 8)\n  - 欧诺·埃伯哈德、雅各布·霍伦斯坦、克里斯蒂娜·皮内里、格奥尔格·马提乌斯\n  - 关键词：连续动作空间、时间相关噪声、彩色噪声\n  - 实验环境：[DeepMind Control Suite](https:\u002F\u002Fgithub.com\u002Fdeepmind\u002Fdm_control)、[Atari](https:\u002F\u002Fgithub.com\u002Fopenai\u002Fgym)、Adroit手部套件\n\n- [从专家那里学习进度](https:\u002F\u002Fopenreview.net\u002Fpdf?id=sKc6fgce1zs) (亮点论文：8, 8, 6)\n  - 杰克·布鲁斯、安基特·阿南德、博格丹·马祖雷、罗布·费格斯\n  - 关键词：利用专家演示、长 horizon 任务、学习一个单调递增的函数来总结进度\n  - 实验环境：[NetHack](https:\u002F\u002Fgithub.com\u002FNetHack\u002FNetHack)\n\n- [DEP-RL：面向过度驱动和肌肉骨骼系统的具身强化学习探索](https:\u002F\u002Fopenreview.net\u002Fpdf?id=C-xa_D3oTj6) (亮点论文：10, 8, 8, 8)\n  - 皮埃尔·舒马赫、丹尼尔·豪夫勒、迪特·比希勒、辛·施密特、格奥尔格·马提乌斯\n  - 关键词：大型过度驱动动作空间、差分外在可塑性、状态空间覆盖式探索\n  - 实验环境：肌肉骨骼系统：torquearm、arm26、humanreacher、鸵鸟觅食、鸵鸟奔跑、人类奔跑、人类跳跃\n\n- [是否存在零样本强化学习？](https:\u002F\u002Fopenreview.net\u002Fpdf?id=MYEap_OcQI) (亮点论文：10, 8, 8,3)\n  - 阿迈德·图阿蒂、热雷米·拉潘、扬·奥利维耶\n  - 关键词：零样本强化学习智能体、将通用表征学习与探索解耦、使用拉普拉斯特征函数的SFs\n  - 实验环境：无监督RL和ExORL基准测试\n\n- [以200倍更快的速度达到人类水平的雅达利游戏](https:\u002F\u002Fopenreview.net\u002Fpdf?id=JtC6yOHRoJJ) (海报展示：8, 8, 3)\n  - 史蒂文·卡普图罗夫斯基、维克托·坎波斯、雷·姜、涅曼亚·拉基切维奇、哈多·范哈塞尔特、查尔斯·布伦德尔、阿德里亚·普伊格多梅内奇·巴迪亚\n  - 关键词：经验需求减少200倍、更鲁棒高效的智能体\n  - 实验环境：Atari 57款游戏\n\n- [针对稀疏奖励领域的结构化探索：学习成就结构](https:\u002F\u002Fopenreview.net\u002Fpdf?id=NDWl9qcUpvy) (海报展示：8, 8, 5, 5)\n  - 周子涵、阿尼梅什·加格\n  - 关键词：基于成就的环境、恢复依赖图\n  - 实验环境：Crafter、TreeMaze\n\n- [对于免奖励强化学习而言，安全探索几乎不会增加额外的样本复杂度](https:\u002F\u002Fopenreview.net\u002Fpdf?id=wNUgn1n6esQ) (海报展示：8, 8, 6, 6)\n  - 黄瑞泉、杨静、梁英斌\n  - 关键词：免奖励强化学习、通过最少轨迹数降低对估计模型的不确定性\n  - 实验环境：表格MDP、低秩MDP\n\n- [潜在状态边缘化作为提升探索效率的低成本方法](https:\u002F\u002Fopenreview.net\u002Fpdf?id=b0UksKFcTOL) (海报展示：6, 6, 6)\n  - 张丁怀、阿隆·库维尔、约书亚·本吉奥、郑钦青、艾米·张、陈奕廷\n  - 关键词：在MaxEnt框架下采用潜在变量策略、对潜在状态进行低成本边缘化\n  - 实验环境：[DeepMind Control Suite](https:\u002F\u002Fgithub.com\u002Fdeepmind\u002Fdm_control)\n\n- [重新审视程序化生成环境中的好奇心驱动探索](https:\u002F\u002Fopenreview.net\u002Fpdf?id=j3GK3_xZydY) (海报展示：8, 8, 5, 3, 3)\n  - 王凯欣、周匡齐、康炳义、冯嘉仕、颜水成\n  - 关键词：终身内在奖励与周期性内在奖励，所有终身-周期性组合的表现\n  - 实验环境：[MiniGrid](https:\u002F\u002Fgithub.com\u002FFarama-Foundation\u002FMinigrid)\n\n- [MoDem：利用示范加速基于视觉模型的强化学习](https:\u002F\u002Fopenreview.net\u002Fpdf?id=JdTnc9gjVfJ) (海报展示：8, 6, 6, 6)\n  - 尼克拉斯·汉森、林一新、苏浩、王小龙、维卡什·库马尔、阿拉文德·拉杰斯瓦兰\n  - 关键词：在模型学习中利用示范的关键要素\n  - 实验环境：Adroit、Meta-World、[DeepMind Control Suite](https:\u002F\u002Fgithub.com\u002Fdeepmind\u002Fdm_control)\n\n- [简化基于模型的强化学习：用单一目标同时学习表征、隐空间模型和策略](https:\u002F\u002Fopenreview.net\u002Fpdf?id=MQcmfgRxf7a) (海报展示：8, 6, 6, 6, 6)\n  - 拉杰·古加雷、霍芒加·巴拉德瓦杰、本杰明·艾森巴赫、谢尔盖·莱文、罗斯·萨拉胡丁诺夫\n  - 关键词：这些辅助目标与强化学习目标的一致性、预期回报的下界\n  - 实验环境：基于模型的基准测试\n\n- [EUCLID：迈向高效的多选动态模型无监督强化学习](https:\u002F\u002Fopenreview.net\u002Fpdf?id=9-tjK93-rP) (海报展示：6, 6, 6, 6)\n  - 袁益夫、郝建业、倪飞、穆瑶、郑彦、胡玉京、刘金毅、陈英峰、范昌杰\n  - 关键词：转移动态建模、多选动态模型、采样效率\n  - 实验环境：URLB\n\n- [带有不完美在线示范的受保护策略优化](https:\u002F\u002Fopenreview.net\u002Fpdf?id=O5rKg7IRQIO) (口头报告：8, 8, 6, 5)\n  - 薛正海、彭正浩、李全义、刘志翰、周博磊\n  - 关键词：师生共享控制、安全保证与探索引导、基于轨迹的价值估计\n  - 实验环境：MetaDrive\n\n\u003C\u002Fdetails>\n\n### NeurIPS 2022\n\n\u003Cdetails 
open>\n\u003Csummary>(点击收起)\u003C\u002Fsummary>\n\n- [通过约束优化兑现内在奖励](https:\u002F\u002Fwilliamd4112.github.io\u002Fpubs\u002Fneurips22_eipo.pdf)（海报：8, 7, 7）\n  - Eric Chen、Zhang-Wei Hong、Joni Pajarinen、Pulkit Agrawal\n  - 关键点：自动调整内在奖励的重要性，基于原则的约束策略优化方法\n  - 实验环境：Atari\n\n- [你只活一次：基于学习奖励塑造的单次强化学习](https:\u002F\u002Fopenreview.net\u002Fpdf?id=303XqIQ5c_d)（海报：6, 6, 5, 5）\n  - Annie S. Chen、Archit Sharma、Sergey Levine、Chelsea Finn\n  - 关键点：单次强化学习，Q加权对抗性学习（QWALE），分布匹配策略\n  - 实验环境：桌面整理、Pointmass、修改版HalfCheetah、修改版Franka-Kitchen\n\n- [通过结构化世界模型进行好奇探索实现零样本目标操作](https:\u002F\u002Fopenreview.net\u002Fpdf?id=NnuYZ1el24C)（海报：8, 7, 6）\n  - Cansu Sancaktar、Sebastian Blaes、Georg Martius\n  - 关键点：良好模型与良好探索之间的自我强化循环，通过基于模型的规划实现对下游任务的零样本泛化\n  - 实验环境：游乐场、Fetch Pick & Place Construction\n\n- [基于模型的贝叶斯探索终身强化学习](https:\u002F\u002Fopenreview.net\u002Fpdf?id=6I3zJn9Slsb)（海报：7, 6, 6）\n  - Haotian Fu、Shangqun Yu、Michael Littman、George Konidaris\n  - 关键点：层次化贝叶斯后验\n  - 实验环境：Mujoco和Meta-world的HiP-MDP版本\n\n- [关于非线性强化学习中无奖励探索的统计效率](https:\u002F\u002Fopenreview.net\u002Fpdf?id=65eqtvEShR8)（海报：7, 6, 5, 5）\n  - Jinglin Chen、Aditya Modi、Akshay Krishnamurthy、Nan Jiang、Alekh Agarwal\n  - 关键点：样本高效的无奖励探索，可探索性或可达性假设\n  - 实验环境：无\n\n- [DOPE：双重乐观与悲观探索用于安全强化学习](https:\u002F\u002Fopenreview.net\u002Fforum?id=U4BUMoVTrB2)（海报：8, 7, 4）\n  - Archana Bura、Aria Hasanzadezonuzy、Dileep Kalathil、Srinivas Shakkottai、Jean-Francois Chamberland\n  - 关键点：基于模型的安全强化学习，有限 horizon 约束马尔可夫决策过程，带有保守约束的探索奖励（乐观与悲观结合）\n  - 实验环境：分解型 CMDP 环境\n\n- [贝叶斯乐观优化：基于模型的强化学习中的乐观探索](https:\u002F\u002Fopenreview.net\u002Fpdf?id=GdHVClGh9N)\n  - Chenyang Wu、Tianci Li、Zongzhang Zhang、Yang Yu\n  - 关键点：面对不确定性时的乐观态度（OFU），贝叶斯乐观优化\n  - 实验环境：RiverSwim、Chain、随机 MDPs。\n\n- [逆向强化学习中的主动探索](https:\u002F\u002Fopenreview.net\u002Fforum?id=TPOJzwv2pc)（海报：7, 7, 7, 7）\n  - David Lindner、Andreas Krause、Giorgia Ramponi\n  - 关键点：主动探索未知环境和专家策略，无需环境的生成模型\n  - 实验环境：四条路径、随机 MDPs、双链、链、Gridworld\n\n- [稀疏奖励下的探索引导奖励塑造强化学习](https:\u002F\u002Fopenreview.net\u002Fpdf?id=W7HvKO1erY)（海报：6, 6, 4）\n  - Rati Devidze、Parameswaran Kamalaruban、Adish Singla\n  - 关键点：奖励塑造、内在奖励函数、基于探索的奖励加成。\n  - 实验环境：Chain、Room、Linek\n\n- [蒙特卡洛增强演员-评论家算法：从次优示范中学习稀疏奖励深度强化学习](https:\u002F\u002Fopenreview.net\u002Fpdf?id=FLzTj4ia8BN)（海报：6, 6, 5, 5）\n  - Albert Wilcox、Ashwin Balakrishna、Jules Dedieu、Wyame Benslimane、Daniel S. 
Brown、Ken Goldberg\n  - 关键点：无需额外超参数，取标准 TD 目标与未来奖励蒙特卡洛估计值中的较大者。\n  - 实验环境：Pointmass 导航、方块提取、序列推动、开门、方块提升\n\n- [激励组合多臂赌博机探索](https:\u002F\u002Fopenreview.net\u002Fpdf?id=ITXgYOFi8b)（海报：7, 6, 5, 3）\n  - Xinyan Hu、Dung Daniel Ngo、Aleksandrs Slivkins、Zhiwei Steven Wu\n  - 关键点：激励探索，大规模结构化动作集和高度相关的信念，组合半赌博机。\n  - 实验环境：无\n\n\u003C\u002Fdetails>\n\n### ICML 2022\n\n\u003Cdetails open>\n\u003Csummary>(点击收起)\u003C\u002Fsummary>\n\n\n- [从狄利克雷到鲁宾：无需奖励的强化学习中的乐观探索](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2205.07704)（口头报告）\n  - Daniil Tiapkin、Denis Belomestny、Eric Moulines、Alexey Naumov、Sergey Samsonov、Yunhao Tang、Michal Valko、Pierre Menard\n  - 关键点：Bayes-UCBVI、后悔界、Q值函数后验分布的分位数、狄利克雷加权和的反集中不等式\n  - 实验环境：简单的表格型网格世界环境，[Atari](https:\u002F\u002Fgithub.com\u002Fopenai\u002Fgym)\n\n- [最大状态熵探索中非马尔可夫性的重要性](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2202.03060)（口头报告）\n  - Mirco Mutti、Riccardo De Santi、Marcello Restelli\n  - 关键点：最大状态熵探索，非马尔可夫性，有限样本 regime\n  - 实验环境：3State、River Swim\n\n- [稀疏奖励目标条件强化学习中的阶段式自模仿约简](https:\u002F\u002Farxiv.org\u002Fabs\u002F2206.12030)（亮点展示）\n  - Yunfei Li、Tian Gao、Jiaqi Yang、Huazhe Xu、Yi Wu\n  - 关键点：稀疏奖励目标条件设定、在 RL 与 SL 两个阶段间交替的阶段式训练、任务约简\n  - 实验环境：Sawyer Push、[Ant Maze](https:\u002F\u002Fgithub.com\u002Fdeepmind\u002Fdm_control)、堆叠\n\n- [汤普森采样用于（组合）纯探索](https:\u002F\u002Farxiv.org\u002Fabs\u002F2206.09150)（亮点展示）\n  - Siwei Wang、Jun Zhu\n  - 关键点：组合纯探索，汤普森采样，更低的复杂度\n  - 实验环境：组合式[多臂赌博机](https:\u002F\u002Fen.wikipedia.org\u002Fwiki\u002FMulti-armed_bandit)\n\n- [自主探索与多目标随机最短路径问题的近最优算法](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2205.10729.pdf)（亮点展示）\n  - Haoyuan Cai、Tengyu Ma、Simon Du\n  - 关键点：增量式自主探索，更强的样本复杂度界，多目标随机最短路径问题\n  - 实验环境：困难 MDP\n\n- [安全探索用于高效策略评估和比较](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2202.13234.pdf)（亮点展示）\n  - Runzhe Wan、Branislav Kveton、Rui Song\n  - 关键点：为赌博机策略评估进行高效且安全的数据收集。\n  - 实验环境：[多臂赌博机](https:\u002F\u002Fen.wikipedia.org\u002Fwiki\u002FMulti-armed_bandit)、上下文多臂赌博机、线性赌博机\n\n\u003C\u002Fdetails>\n\n### ICLR 2022\n\n\u003Cdetails open>\n\u003Csummary>(点击收起)\u003C\u002Fsummary>\n\n- [无监督强化学习的信息几何](https:\u002F\u002Fopenreview.net\u002Fpdf?id=3wU2UX0voE) (口头报告：8, 8, 8)\n  - Benjamin Eysenbach、Ruslan Salakhutdinov、Sergey Levine\n  - 关键词：无监督技能发现、互信息目标、对抗性选择的奖励函数\n  - 实验环境：无\n\n- [智能体何时应该探索？](https:\u002F\u002Farxiv.org\u002Fabs\u002F2108.11811) (亮点论文：8, 8, 6, 6)\n  - Miruna Pislar、David Szepesvari、Georg Ostrovski、Diana Borsa、Tom Schaul\n  - 关键词：模式切换、非单体式探索、情节内的探索\n  - 实验环境：[Atari](https:\u002F\u002Fgithub.com\u002Fopenai\u002Fgym)\n\n- [通过乐观探索学习更多技能](https:\u002F\u002Fopenreview.net\u002Fpdf?id=cU8rknuhxcDJ) (亮点论文：8, 8, 8, 6)\n  - DJ Strouse、Kate Baumli、David Warde-Farley、Vlad Mnih、Steven Hansen\n  - 关键词：判别器分歧内在奖励、信息增益辅助目标\n  - 实验环境：表格型网格世界、[Atari](https:\u002F\u002Fgithub.com\u002Fopenai\u002Fgym)\n\n- [通过随机化回报分解学习长期奖励再分配](https:\u002F\u002Farxiv.org\u002Fabs\u002F2111.13485) (亮点论文：8, 8, 8, 5)\n  - Zhizhou Ren、Ruihan Guo、Yuan Zhou、Jian Peng\n  - 关键词：稀疏且延迟的奖励、随机化回报分解\n  - 实验环境：[MuJoCo](https:\u002F\u002Fgithub.com\u002Fopenai\u002Fmujoco-py)\n\n- [利用离线示范指导进行稀疏奖励强化学习](https:\u002F\u002Fopenreview.net\u002Fpdf?id=YJ1WzgMVsMt) (亮点论文：8, 8, 8, 6, 6)\n  - Desik Rengarajan、Gargi Vaidya、Akshay Sarvesh、Dileep Kalathil、Srinivas Shakkottai\n  - 关键词：在线学习结合离线指导\n  - 实验环境：[MuJoCo](https:\u002F\u002Fgithub.com\u002Fopenai\u002Fmujoco-py)、[TurtleBot](https:\u002F\u002Fwww.turtlebot.com\u002F)（航点跟踪、避障）\n\n- [强化学习中基于生成式规划的时间协调探索](https:\u002F\u002Fopenreview.net\u002Fpdf?id=YZHES8wIdE) (亮点论文：8, 8, 8, 6)\n  - Haichao Zhang、Wei Xu、Haonan Yu\n  - 关键词：生成式规划方法、时间协调的探索、粗略的初始计划\n  - 
实验环境：[经典连续控制环境](https:\u002F\u002Fgithub.com\u002Fopenai\u002Fgym)、[CARLA](https:\u002F\u002Fgithub.com\u002Fcarla-simulator\u002Fcarla)\n\n- [在没有外部奖励的情况下学习强化学习中的利他行为](https:\u002F\u002Farxiv.org\u002Fabs\u002F2107.09598) (亮点论文：8, 8, 6, 6)\n  - Tim Franzmeyer、Mateusz Malinowski、João F. Henriques\n  - 关键词：利他行为、任务无关性\n  - 实验环境：网格世界环境、[觅食](https:\u002F\u002Fgithub.com\u002Fsemitable\u002Flb-foraging)、[多智能体捉迷藏](https:\u002F\u002Fgithub.com\u002FFarama-Foundation\u002FPettingZoo\u002Ftree\u002Fmaster\u002Fpettingzoo\u002Fmpe\u002Fsimple_tag)\n\n- [用于可扩展探索的反集中置信奖金](https:\u002F\u002Farxiv.org\u002Fabs\u002F2110.11202) (海报展示：8, 6, 5)\n  - Jordan T. Ash、Cyril Zhang、Surbhi Goel、Akshay Krishnamurthy、Sham Kakade\n  - 关键词：反集中置信边界、椭圆型奖金\n  - 实验环境：[多臂老虎机](https:\u002F\u002Fen.wikipedia.org\u002Fwiki\u002FMulti-armed_bandit)、[Atari](https:\u002F\u002Fgithub.com\u002Fopenai\u002Fgym)\n\n- [Lipschitz约束下的无监督技能发现](https:\u002F\u002Farxiv.org\u002Fabs\u002F2202.00914) (海报展示：8, 6, 6, 6)\n  - Seohong Park、Jongwook Choi、Jaekyeom Kim、Honglak Lee、Gunhee Kim\n  - 关键词：无监督技能发现、Lipschitz约束\n  - 实验环境：[MuJoCo](https:\u002F\u002Fgithub.com\u002Fopenai\u002Fmujoco-py)\n  \n- [LIGS：用于多智能体学习的可学习内在奖励生成选择](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2112.02618.pdf) (海报展示：8, 6, 5, 5)\n  - David Henry Mguni、Taher Jafferjee、Jianhong Wang、Nicolas Perez-Nieves、Oliver Slumbers、Feifei Tong、Yang Li、Jiangcheng Zhu、Yaodong Yang、Jun Wang\n  - 关键词：多智能体、协调的探索与行为、可学习的内在奖励生成选择、切换控制\n  - 实验环境：[觅食](https:\u002F\u002Fgithub.com\u002Fsemitable\u002Flb-foraging)、[星际争霸II](https:\u002F\u002Fgithub.com\u002Foxwhirl\u002Fsmac)\n\n- [文本游戏中用于战略探索的多阶段情节控制](https:\u002F\u002Fopenreview.net\u002Fforum?id=Ek7PSN7Y77z) (亮点论文：8, 8, 6, 6)\n  - Jens Tuyls、Shunyu Yao、Sham M. Kakade、Karthik R Narasimhan\n  - 关键词：多阶段方法、策略分解\n  - 实验环境：[Jericho](https:\u002F\u002Fgithub.com\u002Fmicrosoft\u002Fjericho)\n\n- [关于强化学习中蒙特卡洛探索起点算法的收敛性](https:\u002F\u002Fopenreview.net\u002Fforum?id=JzNB0eA2-M4) (海报展示：8, 8, 5, 5)\n  - Che Wang、Shuhan Yuan、Kai Shao、Keith Ross\n  - 关键词：蒙特卡洛探索起点、最优策略前馈MDP\n  - 实验环境：[二十一点](https:\u002F\u002Fgithub.com\u002Ftopics\u002Fblackjack-game)、悬崖漫步\n  \n\u003C\u002Fdetails>\n\n### NeurIPS 2021\n\n\u003Cdetails open>\n\u003Csummary>(点击收起)\u003C\u002Fsummary>\n\n- [有趣对象、好奇智能体：学习任务无关的探索](https:\u002F\u002Farxiv.org\u002Fabs\u002F2111.13119) (口头报告：9, 8, 8, 8)\n  - Simone Parisi, Victoria Dean, Deepak Pathak, Abhinav Gupta\n  - 关键词：任务无关的探索、以智能体为中心的组件、以环境为中心的组件\n  - 实验环境：[MiniGrid](https:\u002F\u002Fgithub.com\u002FFarama-Foundation\u002FMinigrid), [Habitat](https:\u002F\u002Fgithub.com\u002Ffacebookresearch\u002Fhabitat-sim)\n\n- [深度强化学习中的战术性乐观与悲观](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2102.03765.pdf) (海报展示：9, 7, 6, 6) \n  - Ted Moskovitz, Jack Parker-Holder, Aldo Pacchiano, Michael Arbel, Michael Jordan\n  - 关键词：战术性乐观与悲观估计、多臂赌博机问题\n  - 实验环境：[MuJoCo](https:\u002F\u002Fgithub.com\u002Fopenai\u002Fmujoco-py)\n\n- [哪些互信息表征学习目标足以用于控制？](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2106.07278.pdf) (海报展示：7, 6, 6, 5) \n  - Kate Rakelly, Abhishek Gupta, Carlos Florensa, Sergey Levine\n  - 关键词：互信息目标、状态表征的充分性\n  - 实验环境：捕手、带抓取功能的捕手\n\n- [关于每轮次一次反馈的强化学习理论](https:\u002F\u002Fopenreview.net\u002Fpdf?id=-uFBxNwRHa2) (海报展示：6, 5, 5, 4) \n  - Niladri S. Chatterji, Aldo Pacchiano, Peter L. Bartlett, Michael I. 
Jordan\n  - 关键词：二元反馈、亚线性遗憾\n  - 实验环境：无\n\n- [MADE：通过最大化与已探索区域的偏离进行探索](https:\u002F\u002Fopenreview.net\u002Fpdf?id=DTVfEJIL3DB) (海报展示：7, 7, 6, 5)\n  - Tianjun Zhang, Paria Rashidinejad, Jiantao Jiao, Yuandong Tian, Joseph Gonzalez, Stuart Russell\n  - 关键词：最大化与已探索区域的偏离、内在奖励\n  - 实验环境：[MiniGrid](https:\u002F\u002Fgithub.com\u002FFarama-Foundation\u002FMinigrid), [DeepMind Control Suite](https:\u002F\u002Fgithub.com\u002Fdeepmind\u002Fdm_control)\n\n- [强化学习中的对抗性内在动机](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2105.13345.pdf) (海报展示：7, 7, 6) \n  - Ishan Durugkar, Mauricio Tec, Scott Niekum, Peter Stone\n  - 关键词：Wasserstein-1距离、目标条件化、拟度量、对抗性内在动机\n  - 实验环境：网格世界、Fetch机器人（基于[MuJoCo](https:\u002F\u002Fgithub.com\u002Fopenai\u002Fmujoco-py)）\n\n- [面向强化学习的信息导向奖励学习](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2102.12466.pdf) (海报展示：9, 8, 7, 6) \n  - David Lindner, Matteo Turchetta, Sebastian Tschiatschek, Kamil Ciosek, Andreas Krause\n  - 关键词：专家查询、奖励的贝叶斯模型、最大化信息增益\n  - 实验环境：[MuJoCo](https:\u002F\u002Fgithub.com\u002Fopenai\u002Fmujoco-py)\n\n- [用于鲁棒自监督探索的动态瓶颈](https:\u002F\u002Fopenreview.net\u002Fpdf?id=-t6TeG3A6Do) (海报展示：8, 6, 6, 6)\n  - Chenjia Bai, Lingxiao Wang, Lei Han, Animesh Garg, Jianye Hao, Peng Liu, Zhaoran Wang\n  - 关键词：动态瓶颈、信息增益\n  - 实验环境：[Atari](https:\u002F\u002Fgithub.com\u002Fopenai\u002Fgym)\n\n- [用于高效探索的层次化技能](https:\u002F\u002Farxiv.org\u002Fabs\u002F2110.10809) (海报展示：7, 6, 6, 6)\n  - Jonas Gehring, Gabriel Synnaeve, Andreas Krause, Nicolas Usunier\n  - 关键词：层次化技能学习、通用性与特异性的平衡、不同复杂度的技能\n  - 实验环境：障碍赛、林博舞、楼梯、GoalWall、PoleBalance（基于[MuJoCo](https:\u002F\u002Fgithub.com\u002Fopenai\u002Fmujoco-py)）\n\n- [多智能体竞争中的探索-利用：有限理性下的收敛](https:\u002F\u002Fopenreview.net\u002Fpdf?id=OSLVL-tIBei) (亮点论文：8, 6, 6)\n  - Stefanos Leonardos, Georgios Piliouras, Kelly Spendlove\n  - 关键词：竞争性多智能体、游戏奖励与探索成本之间的平衡、独特的量化反应均衡\n  - 实验环境：双智能体重加权零和博弈\n\n- [NovelD：简单而有效的探索准则](https:\u002F\u002Fproceedings.neurips.cc\u002Fpaper\u002F2021\u002Ffile\u002Fd428d070622e0f4363fceae11f4a3576-Paper.pdf) (海报展示：7, 6, 6, 6)\n  - Tianjun Zhang, Huazhe Xu, Xiaolong Wang, Yi Wu, Kurt Keutzer, Joseph E. 
Gonzalez, Yuandong Tian\n  - 关键词：对每个新颖区域给予大致相等的权重\n  - 实验环境：[MiniGrid](https:\u002F\u002Fgithub.com\u002FFarama-Foundation\u002FMinigrid), [NetHack](https:\u002F\u002Fgithub.com\u002FNetHack\u002FNetHack), [Atari](https:\u002F\u002Fgithub.com\u002Fopenai\u002Fgym)\n\n- [基于好奇心驱动探索的轮次式多智能体强化学习](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2111.11032.pdf) (海报展示：7, 6, 6, 5)\n  - Lulu Zheng, Jiarui Chen, Jianhao Wang, Jiamin He, Yujing Hu, Yingfeng Chen, Changjie Fan, Yang Gao, Chongjie Zhang\n  - 关键词：轮次式多智能体、好奇心驱动的探索、预测误差、轮次记忆\n  - 实验环境：[捕食者-猎物](https:\u002F\u002Fgithub.com\u002FFarama-Foundation\u002FPettingZoo), [StarCraft II](https:\u002F\u002Fgithub.com\u002Foxwhirl\u002Fsmac)\n\n- [通过宏观目标在MOBA游戏中学习多样化策略](https:\u002F\u002Fproceedings.neurips.cc\u002Fpaper\u002F2021\u002Ffile\u002F86dba86754c0ad93997a11fa947d97b2-Paper.pdf) (海报展示：7, 6, 5, 5)\n  - Yiming Gao, Bei Shi, Xueying Du, Liang Wang, Guangwei Chen, Zhenjie Lian, Fuhao Qiu, Guonan Han, Weixuan Wang, Deheng Ye, Qiang Fu, Wei Yang, Lanxiao Huang\n  - 关键词：MOBA游戏、策略多样性、宏观目标引导框架、元控制器、人类示范\n  - 实验环境：[王者荣耀](https:\u002F\u002Fwww.honorofkings.com\u002F)\n\n- [CIC：对比式内在控制用于无监督技能发现](https:\u002F\u002Fopenreview.net\u002Fpdf\u002Ff6d399de0d94e1c67c8b556e4ab6c0ee5b77a10f.pdf) (目前未被接受：8, 8, 6, 3)\n    - Michael Laskin, Hao Liu, Xue Bin Peng, Denis Yarats, Aravind Rajeswaran, Pieter Abbeel\n    - 关键词：互信息分解、粒子估计器、对比学习\n    - 实验环境：[URLB](https:\u002F\u002Fopenreview.net\u002Fpdf?id=lwrPkQP_is)\n    \n\u003C\u002Fdetails>\n\n### 经典探索型强化学习论文\n\n\u003Cdetails open>\n\u003Csummary>(点击收起)\u003C\u002Fsummary>\n\n- [利用置信区间进行探索-利用权衡](https:\u002F\u002Fwww.jmlr.org\u002Fpapers\u002Fvolume3\u002Fauer02a\u002Fauer02a.pdf) *机器学习研究期刊，2002年*\n  - Peter Auer\n  - 关键词：线性上下文赌博机\n  - 实验环境：无\n\n\u003C!--\n- [如何定义内在动机？](https:\u002F\u002Fciteseerx.ist.psu.edu\u002Fviewdoc\u002Fdownload?doi=10.1.1.567.6524&rep=rep1&type=pdf) *表观遗传机器人学会议，2008年*\n  - Pierre-Yves Oudeyer, Frederic Kaplan.\n  - 关键词：内在动机\n  - 实验环境：无\n-->\n\n- [基于上下文赌博机的个性化新闻文章推荐](https:\u002F\u002Farxiv.org\u002Fabs\u002F1003.0146) *WWW 2010*\n  - Lihong Li, Wei Chu, John Langford, Robert E. Schapire\n  - 关键词：LinUCB\n  - 实验环境：雅虎首页今日模块数据集\n\n- [(更)高效的强化学习：后验采样法](https:\u002F\u002Farxiv.org\u002Fpdf\u002F1306.0940.pdf) *NeurIPS 2013*\n  - Ian Osband, Benjamin Van Roy, Daniel Russo\n  - 关键词：先验分布、后验采样\n  - 实验环境：RiverSwim\n\n- [汤普森采样的经验评估](http:\u002F\u002Fpapers.nips.cc\u002Fpaper\u002F4321-an-empirical-evaluation-of-thompson-sampling.pdf) *NeurIPS 2011*\n  - Olivier Chapelle, Lihong Li\n  - 关键词：汤普森采样、实验结果\n  - 实验环境：无\n\n- [汤普森采样教程](https:\u002F\u002Farxiv.org\u002Fpdf\u002F1707.02038.pdf) *arXiv 2017*\n  - Daniel J. Russo, Benjamin Van Roy, Abbas Kazerouni, Ian Osband, Zheng Wen\n  - 关键词：汤普森采样\n  - 实验环境：无\n\n- [统一基于计数的探索与内在动机](https:\u002F\u002Farxiv.org\u002Fabs\u002F1606.01868) *NeurIPS 2016*\n  - Marc G. 
Bellemare, Sriram Srinivasan, Georg Ostrovski, Tom Schaul, David Saxton, Remi Munos\n  - 关键词：内在动机、密度模型、伪计数\n  - 实验环境：[Atari](https:\u002F\u002Fgithub.com\u002Fopenai\u002Fgym)\n\n- [通过自举DQN进行深度探索](https:\u002F\u002Farxiv.org\u002Fabs\u002F1602.04621) *NeurIPS 2016*\n  - Ian Osband, Charles Blundell, Alexander Pritzel, Benjamin Van Roy\n  - 关键词：时序扩展（或深度）探索、随机化价值函数、自举DQN\n  - 实验环境：[Atari](https:\u002F\u002Fgithub.com\u002Fopenai\u002Fgym)\n\n- [VIME：变分信息最大化探索](https:\u002F\u002Farxiv.org\u002Fabs\u002F1605.09674) *NeurIPS 2016*\n  - Rein Houthooft, Xi Chen, Yan Duan, John Schulman, Filip De Turck, Pieter Abbeel\n  - 关键词：信息增益最大化、环境动态信念、贝叶斯神经网络中的变分推断\n  - 实验环境：[rllab](https:\u002F\u002Fgithub.com\u002Frll\u002Frllab)\n\n- [\\#Exploration：深度强化学习中基于计数的探索研究](http:\u002F\u002Fpapers.neurips.cc\u002Fpaper\u002F6868-exploration-a-study-of-count-based-exploration-for-deep-reinforcement-learning.pdf) *NeurIPS 2017*\n  - Haoran Tang, Rein Houthooft, Davis Foote, Adam Stooke, Xi Chen, Yan Duan, John Schulman, Filip De Turck, Pieter Abbeel\n  - 关键词：哈希计数、内在动机\n  - 实验环境：[rllab](https:\u002F\u002Fgithub.com\u002Frll\u002Frllab)、[Atari](https:\u002F\u002Fgithub.com\u002Fopenai\u002Fgym)\n\n- [EX2：用于深度强化学习的示例模型探索](https:\u002F\u002Fpapers.nips.cc\u002Fpaper\u002F2017\u002Ffile\u002F1baff70e2669e8376347efd3a874a341-Paper.pdf) *NeurIPS 2017*\n  - Justin Fu, John D. Co-Reyes, Sergey Levine\n  - 关键词：新颖性检测、判别式训练的示例模型、隐式密度估计\n  - 实验环境：[VizDoom](https:\u002F\u002Fgithub.com\u002Fmwydmuch\u002FViZDoom)、[Atari](https:\u002F\u002Fgithub.com\u002Fopenai\u002Fgym)\n\n- [事后经验回放](https:\u002F\u002Farxiv.org\u002Fpdf\u002F1707.01495.pdf) *NeurIPS 2017*\n  - Marcin Andrychowicz, Filip Wolski, Alex Ray, Jonas Schneider, Rachel Fong, Peter Welinder, Bob McGrew, Josh Tobin, Pieter Abbeel, Wojciech Zaremba\n  - 关键词：事后经验回放、隐式课程\n  - 实验环境：推、滑动、拾取放置等物理机器人任务\n\n- [由自监督预测驱动的 curiosity 探索](https:\u002F\u002Farxiv.org\u002Fabs\u002F1705.05363) *ICML 2017*\n  - Deepak Pathak, Pulkit Agrawal, Alexei A. Efros, Trevor Darrell\n  - 关键词：curiosity、自监督逆动力学模型\n  - 实验环境：[VizDoom](https:\u002F\u002Fgithub.com\u002Fmwydmuch\u002FViZDoom)、[超级马里奥兄弟](https:\u002F\u002Fsupermario-game.com\u002F)\n\n- [从示范中学习深度Q学习](https:\u002F\u002Farxiv.org\u002Fabs\u002F1704.03732) *AAAI 2018*\n  - Todd Hester, Matej Vecerik, Olivier Pietquin, Marc Lanctot, Tom Schaul, Bilal Piot, Dan Horgan, John Quan, Andrew Sendonaris, Gabriel Dulac-Arnold, Ian Osband, John Agapiou, Joel Z. 
Leibo, Audrunas Gruslys\n  - 关键词：将时序差分更新与对示范者行为的监督分类相结合\n  - 实验环境：[Atari](https:\u002F\u002Fgithub.com\u002Fopenai\u002Fgym)\n\n- [用于探索的噪声网络](https:\u002F\u002Fopenreview.net\u002Fpdf?id=rywHCPkAW) *ICLR 2018*\n  - Meire Fortunato, Mohammad Gheshlaghi Azar, Bilal Piot, Jacob Menick, Matteo Hessel, Ian Osband, Alex Graves, Volodymyr Mnih, Remi Munos, Demis Hassabis, Olivier Pietquin, Charles Blundell, Shane Legg\n  - 关键词：学习到的参数化噪声\n  - 实验环境：[Atari](https:\u002F\u002Fgithub.com\u002Fopenai\u002Fgym)\n\n- [通过随机网络蒸馏进行探索](https:\u002F\u002Farxiv.org\u002Fabs\u002F1810.12894) *ICLR 2019*\n  - Yuri Burda, Harrison Edwards, Amos Storkey, Oleg Klimov\n  - 关键词：随机网络蒸馏\n  - 实验环境：[Atari](https:\u002F\u002Fgithub.com\u002Fopenai\u002Fgym)\n\n- [软演员-评论家：带有随机演员的离策略最大熵深度强化学习](https:\u002F\u002Farxiv.org\u002Fabs\u002F1801.01290) *ICML 2018*\n  - Tuomas Haarnoja, Aurick Zhou, Pieter Abbeel, Sergey Levine\n  - 关键词：软演员-评论家、最大熵、策略迭代\n  - 实验环境：[MuJoCo](https:\u002F\u002Fgithub.com\u002Fopenai\u002Fmujoco-py)\n\n- [大规模好奇心驱动学习研究](https:\u002F\u002Farxiv.org\u002Fabs\u002F1808.04355) *ICLR 2019*\n  - Yuri Burda、Harri Edwards、Deepak Pathak、Amos Storkey、Trevor Darrell、Alexei A. Efros\n  - 关键词：curiosity、预测误差、纯好奇心驱动学习、特征空间\n  - 实验环境：[Atari](https:\u002F\u002Fgithub.com\u002Fopenai\u002Fgym)、[超级马里奥兄弟](https:\u002F\u002Fsupermario-game.com\u002F)\n\n- [多样性就是一切：无需奖励函数即可学习技能](https:\u002F\u002Farxiv.org\u002Fabs\u002F1802.06070) *ICLR 2019*\n  - Benjamin Eysenbach、Abhishek Gupta、Julian Ibarz、Sergey Levine\n  - 关键词：最大化信息论目标、无监督地涌现多样化技能\n  - 实验环境：[MuJoCo](https:\u002F\u002Fgithub.com\u002Fopenai\u002Fmujoco-py)\n\n- [通过可达性实现的情节式好奇心](https:\u002F\u002Farxiv.org\u002Fabs\u002F1810.02274) *ICLR 2019*\n  - Nikolay Savinov、Anton Raichuk、Raphaël Marinier、Damien Vincent、Marc Pollefeys、Timothy Lillicrap、Sylvain Gelly\n  - 关键词：curiosity、情节记忆、到达当前观测所需经过的环境步数\n  - 实验环境：[VizDoom](https:\u002F\u002Fgithub.com\u002Fmwydmuch\u002FViZDoom)、[DMLab](https:\u002F\u002Fgithub.com\u002Fdeepmind\u002Flab)、[MuJoCo](https:\u002F\u002Fgithub.com\u002Fopenai\u002Fmujoco-py)\n\n- [通过分歧进行自监督探索](https:\u002F\u002Farxiv.org\u002Fabs\u002F1906.04161) *ICML 2019*\n  - Deepak Pathak、Dhiraj Gandhi、Abhinav Gupta\n  - 关键词：动力学模型集成、最大化这些集成之间的分歧、可微分方式\n  - 实验环境：噪声MNIST、[Atari](https:\u002F\u002Fgithub.com\u002Fopenai\u002Fgym)、[MuJoCo](https:\u002F\u002Fgithub.com\u002Fopenai\u002Fmujoco-py)、[Unity](https:\u002F\u002Funity.com\u002Fproducts\u002Fmachine-learning-agents)、真实机器人\n\n- [EMI：基于互信息的探索](https:\u002F\u002Farxiv.org\u002Fabs\u002F1810.01176) *ICML 2019*\n  - Hyoungseok Kim、Jaekyeom Kim、Yeonwoo Jeong、Sergey Levine、Hyun Oh Song\n  - 关键词：状态和动作的嵌入表示、前向预测、互信息\n  - 实验环境：[Atari](https:\u002F\u002Fgithub.com\u002Fopenai\u002Fgym)、[MuJoCo](https:\u002F\u002Fgithub.com\u002Fopenai\u002Fmujoco-py)\n\n- [高效利用示范解决困难探索问题](https:\u002F\u002Farxiv.org\u002Fabs\u002F1909.01387) *arXiv 2019*\n  - Caglar Gulcehre、Tom Le Paine、Bobak Shahriari、Misha Denil、Matt Hoffman、Hubert Soyer、Richard Tanburn、Steven Kapturowski、Neil Rabinowitz、Duncan Williams、Gabriel Barth-Maron、Ziyu Wang、Nando de Freitas\n  - 关键词：R2D2、高效利用示范、困难探索问题\n  - 实验环境：[Atari](https:\u002F\u002Fgithub.com\u002Fopenai\u002Fgym)\n\n- [即使在悲观初始化下也能进行乐观探索](http:\u002F\u002Fwww.cs.ox.ac.uk\u002Fpeople\u002Fshimon.whiteson\u002Fpubs\u002Frashidiclr20.pdf) *ICLR 2020*\n  - Tabish Rashid、Bei Peng、Wendelin Böhmer、Shimon Whiteson\n  - 关键点：悲观初始化的Q值、基于计数的奖励项、在动作选择和自举过程中保持乐观\n  - 实验环境：随机链、迷宫、[蒙特祖玛的复仇](https:\u002F\u002Fgithub.com\u002Fopenai\u002Fgym)\n\n- 
[RIDE：针对程序化生成环境的影响驱动探索奖励](https:\u002F\u002Fopenreview.net\u002Fpdf?id=rkg-TJBFPB) *ICLR 2020*\n  - Roberta Raileanu、Tim Rocktäschel\n  - 关键点：引导学习到的状态表示发生显著变化\n  - 实验环境：[MiniGrid](https:\u002F\u002Fgithub.com\u002FFarama-Foundation\u002FMinigrid)\n\n- [永不放弃：学习有方向性的探索策略](https:\u002F\u002Farxiv.org\u002Fabs\u002F2002.06038) *ICLR 2020*\n  - Adrià Puigdomènech Badia、Pablo Sprechmann、Alex Vitvitskyi、Daniel Guo、Bilal Piot、Steven Kapturowski、Olivier Tieleman、Martín Arjovsky、Alexander Pritzel、Andrew Bolt、Charles Blundell\n  - 关键点：ICM+RND，不同强度的探索与利用平衡\n  - 实验环境：[Atari](https:\u002F\u002Fgithub.com\u002Fopenai\u002Fgym)\n  \n- [Agent57：超越Atari人类基准](https:\u002F\u002Farxiv.org\u002Fabs\u002F2003.13350) *ICML 2020* \n  - Adrià Puigdomènech Badia、Bilal Piot、Steven Kapturowski、Pablo Sprechmann、Alex Vitvitskyi、Daniel Guo、Charles Blundell\n  - 关键点：参数化一组策略、自适应机制、状态-动作价值函数的参数化表示\n  - 实验环境：[Atari](https:\u002F\u002Fgithub.com\u002Fopenai\u002Fgym)、[roboschool](https:\u002F\u002Fgithub.com\u002Fopenai\u002Froboschool)\n\n- [具有基于 UCB 探索的神经上下文多臂老虎机](https:\u002F\u002Farxiv.org\u002Fpdf\u002F1911.04462.pdf) *ICML 2020*\n  - Dongruo Zhou、Lihong Li、Quanquan Gu\n  - 关键点：随机上下文多臂老虎机、基于神经网络的随机特征、近似最优的遗憾保证\n  - 实验环境：上下文多臂老虎机、UCI机器学习库、[MNIST](http:\u002F\u002Fyann.lecun.com\u002Fexdb\u002Fmnist\u002F)\n\n- [对情节进行排序：一种针对程序化生成环境的简单探索方法](https:\u002F\u002Farxiv.org\u002Fabs\u002F2101.08152) *ICLR 2021*\n  - Daochen Zha、Wenye Ma、Lei Yuan、Xia Hu、Ji Liu\n  - 关键点：程序化生成环境、结合单个情节和长期视角的情节探索得分\n  - 实验环境：[MiniGrid](https:\u002F\u002Fgithub.com\u002FFarama-Foundation\u002FMinigrid)、MiniWorld、[MuJoCo](https:\u002F\u002Fgithub.com\u002Fopenai\u002Fmujoco-py)\n\n- [先返回再探索](https:\u002F\u002Fwww.nature.com\u002Farticles\u002Fs41586-020-03157-9) *Nature 2021*\n  - Adrien Ecoffet、Joost Huizinga、Joel Lehman、Kenneth O. Stanley、Jeff Clune\n  - 关键点：脱离与偏离、记忆状态、返回并在此基础上进行探索\n  - 实验环境：[Atari](https:\u002F\u002Fgithub.com\u002Fopenai\u002Fgym)、抓取放置机器人任务\n\n\u003C\u002Fdetails>\n\n\n\n\n## 贡献\n我们的目标是为对强化学习中探索方法感兴趣的人提供一份入门级论文指南。\n如果您有兴趣参与贡献，请参阅[此处](CONTRIBUTING.md)以获取贡献说明。\n\n\n## 许可证\nAwesome Exploration RL 采用 Apache 2.0 许可证发布。\n\n\u003Cp align=\"right\">(\u003Ca href=\"#top\">回到顶部\u003C\u002Fa>)\u003C\u002Fp>","# awesome-exploration-rl 快速上手指南\n\n`awesome-exploration-rl` 并非一个可直接运行的软件库或框架，而是一个**curated list（精选列表）**，用于收集和整理强化学习（Reinforcement Learning, RL）中**探索方法（Exploration Methods）**的相关研究论文。\n\n本指南旨在帮助开发者高效利用该资源进行文献调研、算法选型及复现参考。\n\n## 环境准备\n\n由于本项目本质为文献索引仓库，无需安装特定的运行时环境，但建议具备以下基础条件以便阅读和验证相关论文代码：\n\n### 系统要求\n- **操作系统**：Windows \u002F macOS \u002F Linux\n- **浏览器**：现代浏览器（Chrome, Edge, Firefox 等），用于访问 GitHub 及论文链接。\n\n### 前置依赖与知识储备\n- **Git**：用于克隆仓库或在本地更新。\n- **强化学习基础**：理解“探索与利用（Exploration vs. Exploitation）”平衡问题。\n- **常用 RL 环境**：部分论文涉及的环境包括 `MiniGrid`, `MuJoCo`, `Atari`, `OpenAI Gym`, `DMC` 等。若需复现论文，需自行配置对应环境。\n- **深度学习框架**：大多数关联论文的代码实现基于 `PyTorch` 或 `TensorFlow`。\n\n## 安装步骤\n\n### 1. 克隆仓库\n在终端中执行以下命令，将仓库下载到本地：\n\n```bash\ngit clone https:\u002F\u002Fgithub.com\u002Fopendilab\u002Fawesome-exploration-rl.git\ncd awesome-exploration-rl\n```\n\n### 2. 保持更新\n由于该仓库持续更新（最新更新于 2025.12.02），建议定期拉取最新内容：\n\n```bash\ngit pull origin main\n```\n\n### 3. 访问在线版本\n推荐直接访问 GitHub 页面浏览，以获得最佳的目录导航和链接跳转体验：\n- 访问项目主页查看最新的 [Table of Contents](#table-of-contents) 和分类 taxonomy。\n\n## 基本使用\n\n本仓库的核心价值在于其**分类体系（Taxonomy）**和**按年份\u002F会议整理的论文列表**。以下是高效使用指南：\n\n### 1. 
理解探索方法分类\n在阅读论文前，建议先查看 `A Taxonomy of Exploration RL Methods` 部分，了解探索策略的两个主要阶段及细分领域：\n\n*   **增强收集策略 (Augmented Collecting Strategy)** - 作用于数据收集阶段：\n    *   `Action Selection Perturbation` (动作选择扰动，如 NoisyNet)\n    *   `Action Selection Guidance` (动作选择引导)\n    *   `State Selection Guidance` (状态选择引导)\n    *   `Parameter Space Perturbation` (参数空间扰动)\n\n*   **增强训练策略 (Augmented Training Strategy)** - 作用于模型训练阶段：\n    *   `Count Based` (基于计数，如 #Exploration)\n    *   `Prediction Based` (基于预测，如 ICM, RND)\n    *   `Information Theory Based` (基于信息论，如 VIME)\n    *   `Entropy Augmented` (熵增强，如 SAC)\n    *   `Bayesian Posterior Based` (基于贝叶斯后验，如 BootstrappedDQN)\n    *   `Goal Based` (基于目标，如 HER)\n    *   `(Expert) Demo Data` (专家演示数据，如 DQfD)
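\n\n为直观区分上述两个阶段，下面给出一段极简的 Python 示意（注意：这是假设性示例，并非本仓库或任一论文的官方实现，`epsilon_greedy_action`、`CountBonus` 等名称均为本文虚构）：用 ε-greedy 示意收集阶段的“动作选择扰动”，用简化的伪计数奖励示意训练阶段的“基于计数”内在奖励。\n\n```python\nimport random\nfrom collections import defaultdict\n\n\ndef epsilon_greedy_action(q_values, epsilon=0.1):\n    # 增强收集策略示意：动作选择扰动（ε-greedy 思路）。\n    # 以概率 epsilon 随机探索，否则对当前 Q 值贪心。\n    if random.random() < epsilon:\n        return random.randrange(len(q_values))\n    return max(range(len(q_values)), key=lambda a: q_values[a])\n\n\nclass CountBonus:\n    # 增强训练策略示意：基于计数的内在奖励（伪计数思路）。\n    # 状态被访问得越少，附加的探索奖励越大。\n    def __init__(self, beta=0.1):\n        self.counts = defaultdict(int)\n        self.beta = beta\n\n    def intrinsic(self, state):\n        self.counts[state] += 1\n        return self.beta \u002F self.counts[state] ** 0.5\n\n\n# 用法示意：收集阶段扰动动作选择；训练阶段在外在奖励上叠加内在奖励\nbonus = CountBonus()\naction = epsilon_greedy_action([0.0, 0.5, 0.2])\nshaped_reward = 1.0 + bonus.intrinsic('s0')\nprint(action, round(shaped_reward, 3))\n```\n\n真实方法（如 NoisyNet、#Exploration、RND、SAC 等）在此骨架上有大量关键改进，请以上文链接的论文及其官方实现为准。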
\n\n### 2. 查找特定论文\n根据需求，在 `Papers` 部分按会议年份查找。例如，查找 **NeurIPS 2025** 中关于 **LLM 结合 RL 探索** 的论文：\n\n1.  展开 `NeurIPS 2025` 章节。\n2.  浏览标题，找到 `[LLM-Explorer: A Plug-in Reinforcement Learning Policy Exploration Enhancement Driven by Large Language Models]`。\n3.  点击链接访问 OpenReview 页面获取全文。\n4.  查看摘要中的关键信息：\n    *   **Key**: Reinforcement learning, large language model, policy exploration\n    *   **ExpEnv**: Atari, MuJoCo\n\n### 3. 追踪经典算法\n若需了解基础算法，可滚动至 `Classic Exploration RL Papers` 或查看 Taxonomy 中的示例链接，如：\n*   **[RND](https:\u002F\u002Farxiv.org\u002Fabs\u002F1810.12894)**: 随机网络蒸馏，常用于硬探索环境。\n*   **[HER](https:\u002F\u002Farxiv.org\u002Fpdf\u002F1707.01495.pdf)**: Hindsight Experience Replay（事后经验回放），适用于稀疏奖励环境。\n\n### 4. 贡献与反馈\n若发现遗漏的重要论文，可通过 GitHub Issues 或 Pull Request 向仓库贡献者提交新增条目，格式需遵循 README 中定义的规范：\n\n```markdown\n- [title](paper link) (presentation type, openreview score [if the score is public])\n  - author1, author2, author3, ...\n  - Key: key problems and insights\n  - ExpEnv: experiment environments\n```\n\n---\n**提示**：本仓库仅包含论文索引和链接，不包含可执行的代码实现。如需运行相关算法，请点击论文链接查找作者提供的官方代码仓库（通常在论文首页或 arXiv 页面注明）。","某AI实验室的研究员正在开发一款用于复杂迷宫导航的强化学习智能体，该任务属于典型的“硬探索”场景（如 MiniGrid-ObstructedMaze），要求智能体在数百步的长序列动作中找到唯一正确路径，这对探索与利用的平衡提出了极高要求。\n\n### 没有 awesome-exploration-rl 时\n- **文献检索效率低下**：研究员需手动在 arXiv、NeurIPS 等各大顶会中筛选“探索策略”相关论文，面对海量且分散的资源，难以快速锁定最新前沿成果，耗费大量时间在信息搜集而非算法设计上。\n- **技术选型缺乏体系**：由于缺乏统一的分类标准，研究员难以厘清“基于计数的探索”与“基于信息论的探索”等方法的具体适用边界，容易盲目尝试不匹配当前场景的 SOTA 模型，导致实验方向偏差。\n- **复现与对比困难**：找不到权威的资源汇总，难以系统性地对比不同探索机制（如动作选择扰动 vs 状态选择引导）在特定环境下的表现，导致基线实验设计不完整，论文论证力度不足。\n\n### 使用 awesome-exploration-rl 后\n- **一站式获取前沿资源**：通过该工具持续更新的列表，研究员能直接获取按年份（如 ICLR 2025、NeurIPS 2024）整理的精选论文，迅速掌握领域最新动态，将文献调研时间从数周缩短至数天。\n- **清晰的技术路线指引**：借助其提供的详细分类 taxonomy，研究员能明确区分“增强收集策略”与“增强训练策略”，根据迷宫导航的稀疏奖励特性，精准定位到“基于目标（Goal Based）”或“基于计数（Count Based）”的有效方法。\n- **高效实验设计与验证**：参考列表中提供的经典与最新算法示例，研究员能快速构建全面的基线对比实验，科学评估不同探索机制在长 horizon 任务中的有效性，显著提升研发迭代速度与论文质量。\n\nawesome-exploration-rl 通过结构化的知识整理，帮助研究者从繁琐的信息噪音中解脱，专注于解决强化学习中核心的探索难题，极大提升了科研与工程落地的效率。","https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Fopendilab_awesome-exploration-rl_e42b34f4.png","opendilab","OpenDILab","https:\u002F\u002Foss.gittoolsai.com\u002Favatars\u002Fopendilab_83f31d72.png","Open-source Decision Intelligence (DI) Platform",null,"opendilab@pjlab.org.cn","https:\u002F\u002Fgithub.com\u002Fopendilab",663,24,"2026-04-03T03:45:07","Apache-2.0","","未说明",{"notes":91,"python":89,"dependencies":92},"该仓库是一个强化学习探索方法（ERL）的论文列表集合（Awesome List），并非可执行的软件代码库，因此 README 中未包含具体的运行环境、依赖库或硬件需求信息。",[],[18],[95,96,97,98,99,100,101,102,103,104],"exploration-exploitation","reinforcement-learning","awesome-list","hard-exploration","awesome","delayed-rewards","exploration","exploratory","reinforcement-learning-algorithms","sparse-reward-algorithms","2026-03-27T02:49:30.150509","2026-04-06T07:13:54.209797",[108,113,117,121,125,129],{"id":109,"question_zh":110,"answer_zh":111,"source_url":112},11772,"KlearReasoner 在探索（exploration）方面的算法贡献是什么？","KlearReasoner 改进了 GRPO 算法并提出了 GPPO 算法。后续深入研究显示，通过裁剪高 token 的梯度可以促进探索（exploration），而裁剪低 token 的梯度则促进利用（exploitation）。通过调节这两者之间的梯度大小，可以实现探索与利用之间的权衡（trade-off）。","https:\u002F\u002Fgithub.com\u002Fopendilab\u002Fawesome-exploration-rl\u002Fissues\u002F5",{"id":114,"question_zh":115,"answer_zh":116,"source_url":112},11773,"如何向 awesome-exploration-rl 仓库推荐新的工作或项目？","您可以直接提交一个 Pull Request (PR) 到该仓库。维护者在 review（审查）通过后，会将您的内容合并到 main 分支。",{"id":118,"question_zh":119,"answer_zh":120,"source_url":112},11774,"KlearReasoner 模型的性能表现如何？","KlearReasoner 基于 Qwen3-8B-Base，通过长思维链（Long CoT）监督微调（SFT）链路以及强化学习（RL），在 7\u002F8B 尺寸模型中达到了最先进（SOTA）性能。具体得分为：AIME 2024 为 90.5%，AIME 2025 为 83.2%，LiveCodeBench V5 为 66.0%，LiveCodeBench V6 为 58.1%。",{"id":122,"question_zh":123,"answer_zh":124,"source_url":112},11775,"GPPO 算法相比 GRPO 和 CISPO 有什么优势？","GPPO 是 KlearReasoner 中提出的改进算法，旨在解决 GRPO 的局限性。它通过更精细地控制梯度裁剪（针对高\u002F低 token 分别处理），能够更好地平衡探索与利用，从而取得比 GRPO 和 CISPO 更好的效果。",{"id":126,"question_zh":127,"answer_zh":128,"source_url":112},11776,"我的项目更适合收录在哪个 Awesome 列表中？","如果您的工作主要关注基于可验证奖励的强化学习（RLVR，Reinforcement Learning with Verifiable Rewards），维护者建议将其收录到 awesome-RLVR 仓库中，而不是通用的探索 RL 列表。您可以根据项目的具体侧重点选择合适的仓库提交 PR。",{"id":130,"question_zh":131,"answer_zh":132,"source_url":112},11777,"哪里可以获取 KlearReasoner 的代码和论文？","论文地址为：https:\u002F\u002Farxiv.org\u002Fabs\u002F2508.07629\n代码地址为：https:\u002F\u002Fgithub.com\u002FKwai-Klear\u002FKlearReasoner",[]]