[{"data":1,"prerenderedAt":-1},["ShallowReactive",2],{"similar-TJU-DRL-LAB--AI-Optimizer":3,"tool-TJU-DRL-LAB--AI-Optimizer":61},[4,18,26,36,44,52],{"id":5,"name":6,"github_repo":7,"description_zh":8,"stars":9,"difficulty_score":10,"last_commit_at":11,"category_tags":12,"status":17},4358,"openclaw","openclaw\u002Fopenclaw","OpenClaw 是一款专为个人打造的本地化 AI 助手，旨在让你在自己的设备上拥有完全可控的智能伙伴。它打破了传统 AI 助手局限于特定网页或应用的束缚，能够直接接入你日常使用的各类通讯渠道，包括微信、WhatsApp、Telegram、Discord、iMessage 等数十种平台。无论你在哪个聊天软件中发送消息，OpenClaw 都能即时响应，甚至支持在 macOS、iOS 和 Android 设备上进行语音交互，并提供实时的画布渲染功能供你操控。\n\n这款工具主要解决了用户对数据隐私、响应速度以及“始终在线”体验的需求。通过将 AI 部署在本地，用户无需依赖云端服务即可享受快速、私密的智能辅助，真正实现了“你的数据，你做主”。其独特的技术亮点在于强大的网关架构，将控制平面与核心助手分离，确保跨平台通信的流畅性与扩展性。\n\nOpenClaw 非常适合希望构建个性化工作流的技术爱好者、开发者，以及注重隐私保护且不愿被单一生态绑定的普通用户。只要具备基础的终端操作能力（支持 macOS、Linux 及 Windows WSL2），即可通过简单的命令行引导完成部署。如果你渴望拥有一个懂你",349277,3,"2026-04-06T06:32:30",[13,14,15,16],"Agent","开发框架","图像","数据工具","ready",{"id":19,"name":20,"github_repo":21,"description_zh":22,"stars":23,"difficulty_score":10,"last_commit_at":24,"category_tags":25,"status":17},3808,"stable-diffusion-webui","AUTOMATIC1111\u002Fstable-diffusion-webui","stable-diffusion-webui 是一个基于 Gradio 构建的网页版操作界面，旨在让用户能够轻松地在本地运行和使用强大的 Stable Diffusion 图像生成模型。它解决了原始模型依赖命令行、操作门槛高且功能分散的痛点，将复杂的 AI 绘图流程整合进一个直观易用的图形化平台。\n\n无论是希望快速上手的普通创作者、需要精细控制画面细节的设计师，还是想要深入探索模型潜力的开发者与研究人员，都能从中获益。其核心亮点在于极高的功能丰富度：不仅支持文生图、图生图、局部重绘（Inpainting）和外绘（Outpainting）等基础模式，还独创了注意力机制调整、提示词矩阵、负向提示词以及“高清修复”等高级功能。此外，它内置了 GFPGAN 和 CodeFormer 等人脸修复工具，支持多种神经网络放大算法，并允许用户通过插件系统无限扩展能力。即使是显存有限的设备，stable-diffusion-webui 也提供了相应的优化选项，让高质量的 AI 艺术创作变得触手可及。",162132,"2026-04-05T11:01:52",[14,15,13],{"id":27,"name":28,"github_repo":29,"description_zh":30,"stars":31,"difficulty_score":32,"last_commit_at":33,"category_tags":34,"status":17},1381,"everything-claude-code","affaan-m\u002Feverything-claude-code","everything-claude-code 是一套专为 AI 编程助手（如 Claude Code、Codex、Cursor 等）打造的高性能优化系统。它不仅仅是一组配置文件，而是一个经过长期实战打磨的完整框架，旨在解决 AI 代理在实际开发中面临的效率低下、记忆丢失、安全隐患及缺乏持续学习能力等核心痛点。\n\n通过引入技能模块化、直觉增强、记忆持久化机制以及内置的安全扫描功能，everything-claude-code 能显著提升 AI 在复杂任务中的表现，帮助开发者构建更稳定、更智能的生产级 AI 代理。其独特的“研究优先”开发理念和针对 Token 消耗的优化策略，使得模型响应更快、成本更低，同时有效防御潜在的攻击向量。\n\n这套工具特别适合软件开发者、AI 研究人员以及希望深度定制 AI 工作流的技术团队使用。无论您是在构建大型代码库，还是需要 AI 协助进行安全审计与自动化测试，everything-claude-code 都能提供强大的底层支持。作为一个曾荣获 Anthropic 黑客大奖的开源项目，它融合了多语言支持与丰富的实战钩子（hooks），让 AI 真正成长为懂上",141543,2,"2026-04-06T11:32:54",[14,13,35],"语言模型",{"id":37,"name":38,"github_repo":39,"description_zh":40,"stars":41,"difficulty_score":32,"last_commit_at":42,"category_tags":43,"status":17},2271,"ComfyUI","Comfy-Org\u002FComfyUI","ComfyUI 是一款功能强大且高度模块化的视觉 AI 引擎，专为设计和执行复杂的 Stable Diffusion 图像生成流程而打造。它摒弃了传统的代码编写模式，采用直观的节点式流程图界面，让用户通过连接不同的功能模块即可构建个性化的生成管线。\n\n这一设计巧妙解决了高级 AI 绘图工作流配置复杂、灵活性不足的痛点。用户无需具备编程背景，也能自由组合模型、调整参数并实时预览效果，轻松实现从基础文生图到多步骤高清修复等各类复杂任务。ComfyUI 拥有极佳的兼容性，不仅支持 Windows、macOS 和 Linux 全平台，还广泛适配 NVIDIA、AMD、Intel 及苹果 Silicon 等多种硬件架构，并率先支持 SDXL、Flux、SD3 等前沿模型。\n\n无论是希望深入探索算法潜力的研究人员和开发者，还是追求极致创作自由度的设计师与资深 AI 绘画爱好者，ComfyUI 都能提供强大的支持。其独特的模块化架构允许社区不断扩展新功能，使其成为当前最灵活、生态最丰富的开源扩散模型工具之一，帮助用户将创意高效转化为现实。",107888,"2026-04-06T11:32:50",[14,15,13],{"id":45,"name":46,"github_repo":47,"description_zh":48,"stars":49,"difficulty_score":10,"last_commit_at":50,"category_tags":51,"status":17},4487,"LLMs-from-scratch","rasbt\u002FLLMs-from-scratch","LLMs-from-scratch 是一个基于 PyTorch 的开源教育项目，旨在引导用户从零开始一步步构建一个类似 ChatGPT 的大型语言模型（LLM）。它不仅是同名技术著作的官方代码库，更提供了一套完整的实践方案，涵盖模型开发、预训练及微调的全过程。\n\n该项目主要解决了大模型领域“黑盒化”的学习痛点。许多开发者虽能调用现成模型，却难以深入理解其内部架构与训练机制。通过亲手编写每一行核心代码，用户能够透彻掌握 Transformer 
架构、注意力机制等关键原理，从而真正理解大模型是如何“思考”的。此外，项目还包含了加载大型预训练权重进行微调的代码，帮助用户将理论知识延伸至实际应用。\n\nLLMs-from-scratch 特别适合希望深入底层原理的 AI 开发者、研究人员以及计算机专业的学生。对于不满足于仅使用 API，而是渴望探究模型构建细节的技术人员而言，这是极佳的学习资源。其独特的技术亮点在于“循序渐进”的教学设计：将复杂的系统工程拆解为清晰的步骤，配合详细的图表与示例，让构建一个虽小但功能完备的大模型变得触手可及。无论你是想夯实理论基础，还是为未来研发更大规模的模型做准备，它都能为你提供一个坚实的起点。",90106,"2026-04-06T11:19:32",[35,15,13,14],{"id":53,"name":54,"github_repo":55,"description_zh":56,"stars":57,"difficulty_score":10,"last_commit_at":58,"category_tags":59,"status":17},4292,"Deep-Live-Cam","hacksider\u002FDeep-Live-Cam","Deep-Live-Cam 是一款专注于实时换脸与视频生成的开源工具，用户仅需一张静态照片，即可通过“一键操作”实现摄像头画面的即时变脸或制作深度伪造视频。它有效解决了传统换脸技术流程繁琐、对硬件配置要求极高以及难以实时预览的痛点，让高质量的数字内容创作变得触手可及。\n\n这款工具不仅适合开发者和技术研究人员探索算法边界，更因其极简的操作逻辑（仅需三步：选脸、选摄像头、启动），广泛适用于普通用户、内容创作者、设计师及直播主播。无论是为了动画角色定制、服装展示模特替换，还是制作趣味短视频和直播互动，Deep-Live-Cam 都能提供流畅的支持。\n\n其核心技术亮点在于强大的实时处理能力，支持口型遮罩（Mouth Mask）以保留使用者原始的嘴部动作，确保表情自然精准；同时具备“人脸映射”功能，可同时对画面中的多个主体应用不同面孔。此外，项目内置了严格的内容安全过滤机制，自动拦截涉及裸露、暴力等不当素材，并倡导用户在获得授权及明确标注的前提下合规使用，体现了技术发展与伦理责任的平衡。",88924,"2026-04-06T03:28:53",[14,15,13,60],"视频",{"id":62,"github_repo":63,"name":64,"description_en":65,"description_zh":66,"ai_summary_zh":66,"readme_en":67,"readme_zh":68,"quickstart_zh":69,"use_case_zh":70,"hero_image_url":71,"owner_login":72,"owner_name":73,"owner_avatar_url":74,"owner_bio":75,"owner_company":76,"owner_location":76,"owner_email":77,"owner_twitter":76,"owner_website":78,"owner_url":79,"languages":80,"stars":96,"forks":97,"last_commit_at":98,"license":76,"difficulty_score":99,"env_os":100,"env_gpu":101,"env_ram":101,"env_deps":102,"category_tags":106,"github_topics":107,"view_count":32,"oss_zip_url":76,"oss_zip_packed_at":76,"status":17,"created_at":111,"updated_at":112,"faqs":113,"releases":124},4523,"TJU-DRL-LAB\u002FAI-Optimizer","AI-Optimizer","The next generation deep reinforcement learning toolkit","AI-Optimizer 是一款新一代深度强化学习工具套件，旨在为从单智能体到多智能体、从无模型到基于模型的各类算法提供全面支持。它内置了丰富的算法库，涵盖多智能体协作、自监督表示学习、离线强化学习以及迁移学习等前沿领域，并配备了灵活高效的分布式训练框架，帮助用户轻松完成策略训练。\n\n针对现实世界中如无人驾驶、即时战略游戏及机器人控制等复杂场景，传统方法常面临维度灾难、环境非平稳性、探索与利用难以平衡及信用分配困难等挑战。AI-Optimizer 通过设计具备排列不变性的可扩展神经网络，有效压缩搜索空间；同时引入渐进式互信息协作机制，显著提升了多智能体间的合作探索效率，致力于推动强化学习理论在实际应用中的落地。\n\n这款工具特别适合人工智能研究人员、算法工程师及开发者使用。无论是希望复现顶尖实验室（如 TJU-DRL-LAB）研究成果的学者，还是需要将强化学习技术应用于复杂决策系统的开发团队，AI-Optimizer 都能提供坚实的底层支持与丰富的代码实现，助力用户高效开展创新研究与应用开发。","# AI-Optimizer\nAI-Optimizer is a next-generation deep reinforcement learning suite, providing rich algorithm libraries ranging from model-free to model-based RL algorithms, and from single-agent to multi-agent algorithms. Moreover, AI-Optimizer contains a flexible and easy-to-use distributed training framework for efficient policy training.\n![](https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002FTJU-DRL-LAB_AI-Optimizer_readme_32e2c52b4934.png)\n\nAI-Optimizer now provides the following built-in libraries, and more libraries and implementations are coming soon.\n- [Multiagent Reinforcement Learning](multiagent-rl)\n- [Self-supervised Representation Reinforcement Learning](https:\u002F\u002Fgithub.com\u002FTJU-DRL-LAB\u002Fself-supervised-rl)\n- [Offline Reinforcement Learning](offline-rl-algorithms)\n- [Transfer and Multi-task Reinforcement Learning](https:\u002F\u002Fgithub.com\u002FTJU-DRL-LAB\u002Ftransfer-and-multi-task-reinforcement-learning)\n- [Model-based Reinforcement Learning](modelbased-rl)\n\n## Multiagent Reinforcement Learning (MARL)\nThe Multiagent RL repo contains the released codes of representative research works of TJU-RL-Lab on Multiagent Reinforcement Learning (MARL). 
\n\n### ❓ Problem to Solve\n\n\u003Cp align=\"center\">\u003Cimg align=\"center\" src=\"https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002FTJU-DRL-LAB_AI-Optimizer_readme_0b1c5633a7ba.png\" alt=\"Four representative applications of recent successes of MARL: unmanned aerial vehicles, game of Go, Poker games, and team-battle video games.\"\u002F>\u003C\u002Fp>\n\nMulti-agent reinforcement learning (MARL) has successfully addressed many complex real-world problems, such as playing the game of Go ([AlphaGo](https:\u002F\u002Fidp.nature.com\u002Fauthorize\u002Fcasa?redirect_uri=https:\u002F\u002Fwww.nature.com\u002Farticles\u002Fnature16961%257D&casa_token=JKjRDdaog1cAAAAA:cwpvaRtkWOQi-K-NGT2AT9bNM1kcA5NgXWU7MVIdrI6poJ8FwDxyunnDEpGaOuoUxfA4RzOumao3MqS-8mU), [AlphaGo Zero](https:\u002F\u002Fwww.nature.com\u002Farticles\u002Fnature24270?sf123103138=1)), playing real-time multi-player strategy games ([StarCraft II](https:\u002F\u002Fwww.nature.com\u002Farticles\u002Fs41586-019-1724-z?), [Dota 2](https:\u002F\u002Farxiv.org\u002Fabs\u002F1912.06680), [Honor of Kings](https:\u002F\u002Farxiv.org\u002Fabs\u002F2011.12895)), playing card games ([Poker](https:\u002F\u002Fwww.science.org\u002Fdoi\u002Fabs\u002F10.1126\u002Fscience.aay2400?casa_token=YpsKCNt7LNwAAAAA:POhK0ufRyfzaHXWiywSHPk6nvzugQVsTNYdSZyteYTkRRPA4zccUvmnOBYC2DBFcIytHN9FPqZ-s6SUY), [no-limit Poker](https:\u002F\u002Fwww.onlinecasinoground.nl\u002Fwp-content\u002Fuploads\u002F2018\u002F10\u002FLibratus-super-human-no-limit-poker-Sandholm-Brown.pdf)), [robotic control](https:\u002F\u002Farxiv.org\u002Fabs\u002F1709.06011) and autonomous driving ([Smarts](https:\u002F\u002Farxiv.org\u002Fabs\u002F2010.09776)). However, MARL suffers from several challenges in theoretical analysis, in addition to those that arise in single-agent RL. We summarize below the challenges that we regard as fundamental in developing theories for MARL.\n\n- **The curse of dimensionality (scalability) issue**\n- **Non-stationarity**\n- **Non-unique learning goals**\n- **Exploration–exploitation tradeoff**\n- **Multiagent credit assignment problem**\n- **Partial observability**\n- **Hybrid action**\n\nOur target is to design MARL algorithms which can solve or alleviate the problems mentioned above and promote the real-world deployment of MARL.\n\n### ⭐️ Core Directions\n\nWe carry out our studies according to the challenges mentioned above. To solve the curse of dimensionality issue, we design a series of scalable multiagent neural networks which efficiently reduce the size of the search space by leveraging the [permutation invariance and permutation equivariance properties](https:\u002F\u002Farxiv.org\u002Fabs\u002F2203.05285), explicitly taking the [action semantics](https:\u002F\u002Farxiv.org\u002Fabs\u002F1907.11461) into consideration, etc. To better balance the exploration–exploitation tradeoff, we propose Progressive Mutual Information Collaboration to achieve more efficient cooperative exploration. 
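\n\nAs a minimal illustration of the permutation-invariance idea (our own sketch for this overview, not the actual API network from the paper; class and dimension names are illustrative):\n\n```python\n# Sketch only: a permutation-invariant joint-observation encoder.\n# Embedding each agent with shared weights and mean-pooling over the agent\n# axis makes the output independent of agent ordering, the idea behind\n# shrinking the multiagent search space.\nimport torch\nimport torch.nn as nn\n\nclass PermutationInvariantEncoder(nn.Module):\n    def __init__(self, obs_dim, hidden_dim):\n        super().__init__()\n        self.embed = nn.Sequential(nn.Linear(obs_dim, hidden_dim), nn.ReLU())\n\n    def forward(self, obs):\n        # obs: (batch, n_agents, obs_dim); pooling over dim 1 guarantees\n        # f(obs[:, perm]) == f(obs) for any permutation perm.\n        return self.embed(obs).mean(dim=1)\n\nenc = PermutationInvariantEncoder(obs_dim=8, hidden_dim=32)\nx = torch.randn(4, 5, 8)  # 4 samples, 5 agents\nperm = torch.randperm(5)\nassert torch.allclose(enc(x), enc(x[:, perm]), atol=1e-5)\n```\n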
An overall picture of the proposed methods is shown below.\n\n\u003Cp align=\"center\">\u003Cimg align=\"center\" src=\"https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002FTJU-DRL-LAB_AI-Optimizer_readme_028d61952839.png\" alt=\"our solutions\"  \u002F>\u003C\u002Fp>\n\n### 💦 Contribution\n\nThe main contributions of this repository are:\n\n- **For beginners** who are interested in MARL, our [easy-marl](https:\u002F\u002Fgithub.com\u002FTJU-DRL-LAB\u002FAI-Optimizer\u002Ftree\u002Fmain\u002Fmultiagent-rl\u002Feasy-marl) codebase and ZhiHu blogs: [MARL](https:\u002F\u002Fwww.zhihu.com\u002Fcolumn\u002Fc_1479535265715298304) and [communication-based MARL](https:\u002F\u002Fwww.zhihu.com\u002Fcolumn\u002Fc_1431679500950560768) can serve as a preliminary tutorial.\n\n- **For researchers,** we provide a systematic overview of typical challenges in MARL from different perspectives, each of which is a very valuable research direction and contains a series of recent research works. We hope our research works and the corresponding released code make it easier for researchers to design new algorithms. \n\n  - For example, given the significant interest in designing novel MARL architectures over the past few years, the research direction of [scalable multiagent networks](https:\u002F\u002Fgithub.com\u002FTJU-DRL-LAB\u002FMultiagent-RL\u002Ftree\u002F304dc434f5be947641ab8eed9857a034f3ec1507\u002Fscalability) is definitely of interest to the MARL community. More recently, the notion of *permutation-invariance* and *permutation-equivariance* in the design of MARL agents has drawn less attention than it deserves, and therefore the idea presented in the [API paper](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2203.05285.pdf) is interesting and very relevant to MARL researchers.\n\n- **For practitioners**, we release a series of **efficient, scalable, well-performing** and **easy-to-use** MARL algorithms which achieve superior performance on the typical benchmarks of the MARL research community. \n\n  - For example, the API-QMIX, API-VDN, API-MAPPO and API-MADDPG algorithms proposed in our paper [\"API: Boosting Multi-Agent Reinforcement Learning via Agent-Permutation-Invariant Networks\"](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2203.05285.pdf) achieve state-of-the-art performance on the [StarCraft Multi-Agent Challenge (SMAC)](https:\u002F\u002Fgithub.com\u002Foxwhirl\u002Fsmac) and [Multi-agent Particle Environment](https:\u002F\u002Fgithub.com\u002Fopenai\u002Fmultiagent-particle-envs) benchmarks, reaching **100% win rates in almost all hard and super-hard SMAC scenarios (never achieved before)**.\n  - **We strongly recommend** that practitioners **try our API-Network solution FIRST** when solving practical MARL problems (it is very easy to use and works very well)! We hope our work can promote the real-world deployment of MARL; the SMAC interaction loop that these algorithms plug into is sketched at the end of this section.\n\n  See more [here](https:\u002F\u002Fgithub.com\u002FTJU-DRL-LAB\u002FAI-Optimizer\u002Ftree\u002Fmain\u002Fmultiagent-rl).
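\n\nFor reference, the snippet below shows the standard SMAC interaction loop that such algorithms plug into (adapted from the smac project's own random-agent example, not this repo's training script; it assumes StarCraft II and the `smac` package are installed):\n\n```python\n# Random agents on an SMAC map: the environment loop in which MARL\n# algorithms such as API-QMIX are trained and evaluated.\nimport numpy as np\nfrom smac.env import StarCraft2Env\n\nenv = StarCraft2Env(map_name='3m')\nn_agents = env.get_env_info()['n_agents']\nenv.reset()\nterminated, episode_reward = False, 0.0\nwhile not terminated:\n    actions = []\n    for agent_id in range(n_agents):\n        avail = env.get_avail_agent_actions(agent_id)  # legal-action mask\n        actions.append(np.random.choice(np.nonzero(avail)[0]))\n    reward, terminated, _ = env.step(actions)\n    episode_reward += reward\nenv.close()\nprint('episode reward:', episode_reward)\n```\n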
\n\n## Offline-rl-algorithms (Offrl)\n### ❓ Problem to Solve\nCurrent deep RL methods still typically rely on active data collection to succeed, which hinders their application in the real world, especially when data collection is dangerous or expensive. Offline RL (also known as batch RL) is a data-driven RL paradigm concerned with learning exclusively from static datasets of previously-collected experiences. In this setting, a behavior policy interacts with the environment to collect a set of experiences, which can later be used to learn a policy without further interaction. This paradigm can be extremely valuable in settings where online interaction is impractical. However, current offline RL methods face three challenges: \n * A low performance ceiling: the quality of the offline data bounds the performance of offline RL algorithms. How can low-quality offline data be augmented, without additional interaction, to raise the ceiling of what offline RL can learn?\n * Distributional shift: existing off-policy\u002Foffline algorithms train on the offline data distribution, but the state-action distribution visited when interacting with the environment may differ from it (distributional shift). In this situation, the Q-value of an out-of-distribution \u003Cstate, action> pair is easily overestimated, which hurts overall performance. How can data outside the offline distribution (out-of-distribution, OOD) be characterized to avoid overestimation?\n * Difficult online fine-tuning: due to the limited quality of the dataset, the learned policy cannot be deployed directly in a production environment, and further online learning is required. How should data sampling be designed in the online training phase so that the policy avoids a sudden initial performance drop caused by redundant data under distribution change, and quickly converges to the optimum within a limited number of interactions?\n\n### 💦 Contribution\nThis repository contains the code of representative benchmarks and algorithms on the topic of Offline Reinforcement Learning. The repository is developed on top of [d3rlpy](https:\u002F\u002Fgithub.com\u002Ftakuseno\u002Fd3rlpy) under the MIT license, to shed light on research into the above three challenges. While inheriting its advantages, the additional features include (or will include):\n - A unified algorithm framework with rich and fair comparisons between different algorithms:\n   - REDQ\n   - UWAC\n   - BRED\n   - …\n - Abundant and real-world datasets:\n   - Real-world industrial datasets\n   - Multimodal datasets\n   - Augmented datasets (and corresponding methods)\n   - Datasets obtained using representation learning (and corresponding methods)\n - Easier-to-use logging support:\n   - Wandb\n\n![Ecology of Offline RL](https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002FTJU-DRL-LAB_AI-Optimizer_readme_855955fb011f.png)
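\n\nTo make this workflow concrete, here is a minimal offline training run in the style of the underlying d3rlpy library (a sketch assuming the d3rlpy 1.x API; the entry points of this repo itself may differ):\n\n```python\n# Offline RL with d3rlpy: learn purely from logged transitions, with no\n# environment interaction during training.\nimport d3rlpy\nfrom d3rlpy.metrics.scorer import evaluate_on_environment\n\n# CartPole transitions pre-collected by a behavior policy.\ndataset, env = d3rlpy.datasets.get_cartpole()\n\n# CQL conservatively penalizes Q-values of out-of-distribution actions,\n# directly targeting the overestimation problem described above.\ncql = d3rlpy.algos.DiscreteCQL()\ncql.fit(dataset, n_epochs=1)\n\n# Only evaluation touches the real environment.\nprint('average return:', evaluate_on_environment(env)(cql))\n```\n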
\n\n## Self-supervised Reinforcement Learning (SSRL)\nThe SSRL repo contains the released codes of representative research works of TJU-RL-Lab on Self-supervised Representation Learning for RL. \n\nTo the best of our knowledge, this is the **first** code repository for SSRL established by following **a systematic research taxonomy** and **a unified algorithmic framework**.\n\n### ❓ Problem to Solve\nSince the RL agent always _receives_, _processes_, and _delivers_ all kinds of data in the learning process (i.e., the typical Agent-Environment Interface), how to **properly represent such \"data\"** is naturally the key to the effectiveness and efficiency of RL.\n\nIn this branch, we focus on **three key questions**:\n- **What should a good representation for RL be? (Theory)** \n- **How can we obtain or realize such good representations? (Methodology)**\n- **How can we make use of good representations to improve RL? (Downstream Learning Tasks & Application)**\n\n### ⭐️ Core Idea\nTaking **Self-supervised Learning** (SSL) as our major paradigm for representation learning, we carry out our studies from four perspectives: \n- **State Representation**,\n- **Action Representation**,\n- **Policy Representation**,\n- **Environment (and Task) Representation**.\n\nThese four perspectives cover the major elements involved in the general _Agent-Environment Interface_ of RL. They play the roles of _input_, _optimization target_, etc. in the RL process. The representation of these elements has a great impact on **sample efficiency**, **convergence optimality** and **cross-environment generalization**.\n\nThe central contribution of this repo is **a unified algorithmic framework (implementation design) of SSRL algorithms**. The framework provides a unified interpretation for almost all currently existing SSRL algorithms. Moreover, the framework can also serve as a paradigm when we are going to devise new methods.\n\nOur ultimate goal is to promote the establishment of the ecology of SSRL, which is illustrated below.\n\nTowards addressing the key problems of RL, we study SSRL with four types of representations. For research from all four perspectives, a unified framework of algorithm and implementation serves as the underpinning. The representations studied from different perspectives further boost various downstream RL tasks. Finally, this promotes the real-world deployment of RL.\n\n\u003Cdiv align=center>\u003Cimg align=\"center\" src=\"https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002FTJU-DRL-LAB_AI-Optimizer_readme_aeeddf3c9a61.png\" alt=\"Ecology of SSRL\" style=\"zoom:40%;\" \u002F>\u003C\u002Fdiv>\n\nSee more [here](https:\u002F\u002Fgithub.com\u002FTJU-DRL-LAB\u002Fself-supervised-rl).\n\n### 💦 Contribution\n\nWith this repo and our research works, we want to draw the attention of the RL community to studies on Self-supervised Representation Learning for RL.\n\n- For people who are interested in RL, our introduction in this repo and our [blogs](https:\u002F\u002Fzhuanlan.zhihu.com\u002Fp\u002F413321572) can be a preliminary tutorial.\n- For cutting-edge RL researchers, we believe that our research thoughts and the proposed SSRL framework are insightful and inspiring, opening up new angles for future works on more advanced RL.\n- For RL practitioners (especially those who work in related fields), we provide advanced RL algorithms with strong performance in online RL (e.g., [PPO-PeVFA](.\u002Fself-supervised-rl\u002FRL_with_Policy_Representation\u002FPolicy-based_RL_with_PeVFA\u002FPPO-PeVFA)), hybrid-action decision-making (e.g., [HyAR](.\u002Fself-supervised-rl\u002FRL_with_Action_Representation\u002FHyAR)), policy adaptation from offline experience (e.g., [PAnDR](.\u002Fself-supervised-rl\u002FRL_with_Environment_Representation\u002FPAnDR)), etc., which can be adopted or developed further in associated academic and industrial problems.\n\nWe are also looking forward to feedback in any form to promote more in-depth research.
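\n\nAs a toy example of the state-representation perspective (our own sketch, not an algorithm shipped in this repo; all names are illustrative), an encoder can be trained with a self-supervised forward-prediction objective alongside, or before, the usual RL loss:\n\n```python\n# Self-supervised state representation: make the latent of s_t plus a_t\n# predictive of the latent of s_{t+1}, a common SSRL auxiliary objective.\nimport torch\nimport torch.nn as nn\n\nobs_dim, act_dim, latent_dim = 8, 2, 16\nencoder = nn.Sequential(nn.Linear(obs_dim, latent_dim), nn.ReLU())\nforward_model = nn.Linear(latent_dim + act_dim, latent_dim)\nopt = torch.optim.Adam(\n    [*encoder.parameters(), *forward_model.parameters()], lr=3e-4)\n\ndef ssl_loss(s, a, s_next):\n    z = encoder(s)\n    z_next = encoder(s_next).detach()  # stop-gradient target\n    pred = forward_model(torch.cat([z, a], dim=-1))\n    return nn.functional.mse_loss(pred, z_next)\n\n# One update on a dummy batch of transitions; in practice this loss is\n# added to (or pre-trains) the RL objective.\ns, a, s_next = torch.randn(32, obs_dim), torch.randn(32, act_dim), torch.randn(32, obs_dim)\nloss = ssl_loss(s, a, s_next)\nopt.zero_grad()\nloss.backward()\nopt.step()\n```\n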
\n\n## Transfer and Multi-task Reinforcement Learning\nRecently, Deep Reinforcement Learning (DRL) has achieved a lot of success in human-level control problems, such as video games, robot control, autonomous vehicles, smart grids and so on. However, DRL still faces the **sample-inefficiency problem**, especially when the state-action space becomes large, which makes it difficult to learn from scratch. This means the agent has to use a large number of samples to learn a good policy. Furthermore, the sample-inefficiency problem is much more severe in Multiagent Reinforcement Learning (MARL) due to the exponential growth of the state-action space.\n\n### ❓ Problem to Solve\n\n**Sample-inefficiency problem**: the main challenge that transfer and multi-task RL aim to solve is the sample-inefficiency problem, which forces the agent to collect a huge amount of training data to learn the optimal policy. For example, Rainbow DQN requires around 18 million frames of training data to exceed the average level of human players, which is equivalent to 60 hours of human play. Human players, by contrast, can usually learn an Atari game within a few minutes and reach that average level after about one hour of play. \n\n### ⭐️ Core Idea\n\n- **Transfer RL**, which leverages prior knowledge from previously related tasks to accelerate the learning process of RL, has become a popular research direction for significantly improving the sample efficiency of DRL. \n\n- **Multi-task RL**, in which one network learns policies for multiple tasks, has emerged as another promising direction with fast inference and good performance.\n\n### 💦 Contribution\n\nThis repository contains the released codes of representative benchmarks and algorithms of TJU-RL-Lab on the topic of Transfer and Multi-task Reinforcement Learning, covering both the single-agent and the multi-agent domain and addressing the sample-inefficiency problem in different ways.\n\n\u003Cp align=\"center\">\u003Cimg align=\"center\" src=\"https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002FTJU-DRL-LAB_AI-Optimizer_readme_640be52cf554.png\" alt=\"overview\" style=\"zoom:60%;\" \u002F>\u003C\u002Fp>\n\nIn this repo, we provide specific solutions of our lab, including:\n* **PTF** addresses the **sample-inefficiency problem** in DRL by proposing a novel Policy Transfer Framework (PTF).\n\n* **MAPTF** addresses the **sample-inefficiency problem** in deep MARL by proposing a Multi-Agent Policy Transfer Framework (MAPTF).\n\n* **KTM-DRL (reproduced)**: a Knowledge Transfer based Multi-task Deep Reinforcement Learning framework (KTM-DRL) for continuous control. We reproduce the results on the MuJoCo continuous control task suite; more details can be found [here](https:\u002F\u002Fgithub.com\u002FTJU-DRL-LAB\u002Ftransfer-and-multi-task-reinforcement-learning\u002Ftree\u002Fmain\u002FSingle-agent%20Multi-task%20RL\u002FKTM-DRL).\n\nSee more [here](https:\u002F\u002Fgithub.com\u002FTJU-DRL-LAB\u002Ftransfer-and-multi-task-reinforcement-learning).\n\n## Model-based Reinforcement Learning (MBRL)\nModel-based reinforcement learning (MBRL) is widely seen as having the potential to be significantly more sample-efficient than model-free RL. By learning a model of the environment, model-based methods learn with significantly lower sample complexity. The model of the environment is a representation model that explicitly contains knowledge about the environment or the task, and generally two types of models are included: a transition (dynamics) model and a reward model. Once such a model is learned, it can be properly integrated into the interaction with the environment and the learning of policies; a minimal sketch of this loop follows.
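\n\nThe sketch below is ours, for illustration only (not an algorithm shipped in this repo): fit a one-step dynamics model on real transitions, then roll it forward to generate imagined data for the policy learner, as in Dyna- or MBPO-style methods:\n\n```python\n# Learn a one-step dynamics model s_{t+1} = f(s_t, a_t) from real data and\n# use it to synthesize extra transitions for a model-free learner.\nimport torch\nimport torch.nn as nn\n\nstate_dim, act_dim = 4, 1\nmodel = nn.Sequential(\n    nn.Linear(state_dim + act_dim, 64), nn.ReLU(), nn.Linear(64, state_dim))\nopt = torch.optim.Adam(model.parameters(), lr=1e-3)\n\ndef fit_dynamics(states, actions, next_states):\n    pred = model(torch.cat([states, actions], dim=-1))\n    loss = nn.functional.mse_loss(pred, next_states)\n    opt.zero_grad()\n    loss.backward()\n    opt.step()\n    return loss.item()\n\n@torch.no_grad()\ndef imagine(state, policy, horizon=5):\n    # Imagined rollouts replace costly real interaction, which is where\n    # the sample-efficiency gain of MBRL comes from.\n    rollout = []\n    for _ in range(horizon):\n        action = policy(state)\n        next_state = model(torch.cat([state, action], dim=-1))\n        rollout.append((state, action, next_state))\n        state = next_state\n    return rollout\n```\n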
\n\n### ❓ Problems to Solve\n\nThe current classifications of the mainstream algorithms in the modern model-based RL area are orthogonal, meaning that some algorithms can be grouped into different categories according to different perspectives. In this branch, we focus on two key questions: `How to Learn a Model` and `How to Utilize a Model`.\n\n- `How to Learn a Model` mainly focuses on how to build the environment model. \n- `How to Utilize a Model` cares about how to utilize the learned model. \n\n### ⭐️ Core Directions\n\nIgnoring the differences in specific methods, the purposes of MBRL algorithms can be divided more finely into four directions: `Reduce Model Error`, `Faster Planning`, `Higher Tolerance to Model Error`, and `Scalability to Harder Problems`. For the problem of `How to Learn a Model`, we can work on reducing model error to learn a more accurate world model, or on learning a world model with a higher tolerance to model error. For the problem of `How to Utilize a Model`, we can work on faster planning with a learned model, or on the scalability of the learned model to harder problems.\n\n![](https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002FTJU-DRL-LAB_AI-Optimizer_readme_7d443bd9df3b.png)\n### 💦 Contributions\n#### Why MBRL?\n\nModel-based reinforcement learning (MBRL) enjoys several benefits, such as data-efficiency and planning, by learning a model of the environment's dynamics that explicitly contains knowledge about the environment or the task (see the introduction above).\n\n#### Why Our Lib?\n\nWith this repo and our research works, we want to draw the attention of the RL community to studies on model-based RL.\n\n- For people who are interested in model-based RL, our introductions in this repo and our [ZhiHu blog series](https:\u002F\u002Fzhuanlan.zhihu.com\u002Fp\u002F425318401) can be a preliminary tutorial.\n\n- For researchers in model-based RL, we collect several separate lines of research, some of which are closed-source or not reproducible, and make code-level optimizations for the convenience of finding comparative baselines without the need to search around for implementations.\n\nWe expect that our research thoughts and the topics proposed for the MBRL area can open up new angles for future works on more advanced RL. **What's more, we want to cover as many interesting new directions as possible, organized under the topics listed above, to give you inspiration and ideas for your RESEARCH.** Research in model-based RL has not been very standardized: it is fairly common for authors to experiment with self-designed environments, and several separate lines of research are sometimes closed-source or not reproducible. For this reason, we have collected some of the mainstream MBRL algorithms and made code-level optimizations; bringing these algorithms together in a unified framework saves researchers the time of searching around for implementations of comparative baselines. Currently, we have implemented Dreamer, MBPO, BMPO, MuZero, PlaNet, Sampled MuZero and CaDM, and we plan to keep increasing this list in the future. We will constantly update this repo to include new research by TJU-DRL-Lab, to ensure sufficient coverage and reliability. 
We are also looking forward to feedback in any form to promote more in-depth research. See more [here](https:\u002F\u002Fgithub.com\u002FTJU-DRL-LAB\u002FAI-Optimizer\u002Ftree\u002Fmain\u002Fmodelbased-rl).\n\n# Contributing\nAI-Optimizer is still under development. More algorithms and features are going to be added, and we always welcome contributions to help make AI-Optimizer better. Feel free to contribute.\n","# AI-Optimizer\nAI-Optimizer 是一款新一代深度强化学习套件，提供了从无模型到基于模型的强化学习算法，以及从单智能体到多智能体算法的丰富算法库。此外，AI-Optimizer 还包含一个灵活且易于使用的分布式训练框架，用于高效地训练策略。\n![](https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002FTJU-DRL-LAB_AI-Optimizer_readme_32e2c52b4934.png)\n\nAI-Optimizer 目前提供了以下内置库，更多库和实现即将推出。\n- [多智能体强化学习](multiagent-rl)\n- [自监督表征强化学习](https:\u002F\u002Fgithub.com\u002FTJU-DRL-LAB\u002Fself-supervised-rl)\n- [离线强化学习](offline-rl-algorithms)\n- [迁移与多任务强化学习](https:\u002F\u002Fgithub.com\u002FTJU-DRL-LAB\u002Ftransfer-and-multi-task-reinforcement-learning)\n- [基于模型的强化学习](modelbased-rl)\n\n## 多智能体强化学习 (MARL)\n多智能体 RL 仓库包含了 TJU-RL 实验室在多智能体强化学习 (MARL) 方面具有代表性的研究成果代码。\n\n### ❓ 待解决的问题\n\n\u003Cp align=\"center\">\u003Cimg align=\"center\" src=\"https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002FTJU-DRL-LAB_AI-Optimizer_readme_0b1c5633a7ba.png\" alt=\"近年来 MARL 取得成功的四个代表性应用：无人机、围棋、扑克游戏和团队对战类电子游戏。\"\u002F>\u003C\u002Fp>\n\n多智能体强化学习 (MARL) 已成功解决了许多复杂的现实世界问题，例如下围棋（[AlphaGo](https:\u002F\u002Fidp.nature.com\u002Fauthorize\u002Fcasa?redirect_uri=https:\u002F\u002Fwww.nature.com\u002Farticles\u002Fnature16961%257D&casa_token=JKjRDdaog1cAAAAA:cwpvaRtkWOQi-K-NGT2AT9bNM1kcA5NgXWU7MVIdrI6poJ8FwDxyunnDEpGaOuoUxfA4RzOumao3MqS-8mU)，[AlphaGo Zero](https:\u002F\u002Fwww.nature.com\u002Farticles\u002Fnature24270?sf123103138=1))、玩即时战略多人游戏（[星际争霸 II](https:\u002F\u002Fwww.nature.com\u002Farticles\u002Fs41586-019-1724-z?)，[Dota 2](https:\u002F\u002Farxiv.org\u002Fabs\u002F1912.06680)，[王者荣耀](https:\u002F\u002Farxiv.org\u002Fabs\u002F2011.12895))、玩纸牌游戏（[扑克](https:\u002F\u002Fwww.science.org\u002Fdoi\u002Fabs\u002F10.1126\u002Fscience.aay2400?casa_token=YpsKCNt7LNwAAAAA:POhK0ufRyfzaHXWiywSHPk6nvzugQVsTNYdSZyteYTkRRPA4zccUvmnOBYC2DBFcIytHN9FPqZ-s6SUY)，[无限注扑克](https:\u002F\u002Fwww.onlinecasinoground.nl\u002Fwp-content\u002Fuploads\u002F2018\u002F10\u002FLibratus-super-human-no-limit-poker-Sandholm-Brown.pdf))、[机器人控制](https:\u002F\u002Farxiv.org\u002Fabs\u002F1709.06011) 以及自动驾驶（[Smarts](https:\u002F\u002Farxiv.org\u002Fabs\u002F2010.09776)）。然而，除了单智能体 RL 中存在的挑战之外，MARL 在理论分析方面还面临诸多难题。我们总结了在发展 MARL 理论时认为至关重要的几个挑战：\n\n- **维度灾难（可扩展性）问题**\n- **非平稳性**\n- **学习目标不唯一**\n- **探索与利用之间的权衡**\n- **多智能体信用分配问题**\n- **部分可观测性**\n- **混合动作**\n\n我们的目标是设计 MARL 算法，以解决或缓解上述问题，并推动 MARL 在更多实际应用中的部署和落地。\n\n### ⭐️ 核心方向\n\n我们根据上述挑战开展研究。为了解决维度灾难问题，我们设计了一系列可扩展的多智能体神经网络，这些网络能够通过利用 [排列不变性和排列等变性特性](https:\u002F\u002Farxiv.org\u002Fabs\u002F2203.05285)，明确考虑 [动作语义](https:\u002F\u002Farxiv.org\u002Fabs\u002F1907.11461)，从而有效地缩小搜索空间。为了更好地平衡探索与利用之间的权衡，我们提出了渐进式互信息协作，以实现更高效的协作式探索……以下是所提出方法的整体概览。\n\n\u003Cp align=\"center\">\u003Cimg align=\"center\" src=\"https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002FTJU-DRL-LAB_AI-Optimizer_readme_028d61952839.png\" alt=\"我们的解决方案\"  \u002F>\u003C\u002Fp>\n\n### 💦 贡献\n\n本仓库的主要贡献在于：\n\n- 对于对 MARL 感兴趣的 **初学者**，我们的 [easy-marl](https:\u002F\u002Fgithub.com\u002FTJU-DRL-LAB\u002FAI-Optimizer\u002Ftree\u002Fmain\u002Fmultiagent-rl\u002Feasy-marl) 
代码库以及知乎专栏：[MARL](https:\u002F\u002Fwww.zhihu.com\u002Fcolumn\u002Fc_1479535265715298304) 和 [基于通信的 MARL](https:\u002F\u002Fwww.zhihu.com\u002Fcolumn\u002Fc_1431679500950560768) 可以作为初步教程。\n\n- 对于 **研究人员**，我们从不同角度系统地概述了 MARL 中典型的挑战，每一个挑战都是极具价值的研究方向，并且都包含一系列近期的研究成果。我们希望通过我们的研究工作及相应的开源代码，能够帮助研究人员更轻松地设计新算法。\n\n  - 例如，鉴于近年来设计新型 MARL 架构的兴趣日益浓厚，[可扩展的多智能体网络](https:\u002F\u002Fgithub.com\u002FTJU-DRL-LAB\u002FMultiagent-RL\u002Ftree\u002F304dc434f5be947641ab8eed9857a034f3ec1507\u002Fscalability) 这一研究方向无疑引起了 MARL 社区的广泛关注。最近，在 MARL 智能体设计中，“排列不变性”和“排列等变性”的概念受到的关注相对较少，因此我们在 [API 论文](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2203.05285.pdf) 中提出的观点非常有趣，也与 MARL 研究人员密切相关。\n\n- 对于 **从业者**，我们发布了一系列 **高效、可扩展、性能优异**且 **易于使用**的 MARL 算法，这些算法在 MARL 研究社区的典型基准测试中表现出色。\n\n  - 例如，我们在论文《API：通过智能体排列不变网络提升多智能体强化学习》中提出的 API-QMIX、API-VDN、API-MAPPO 和 API-MADDPG 算法，在 [星际争霸多智能体挑战 (SMAC)](https:\u002F\u002Fgithub.com\u002Foxwhirl\u002Fsmac) 和 [多智能体粒子环境](https:\u002F\u002Fgithub.com\u002Fopenai\u002Fmultiagent-particle-envs) 的基准测试中均取得了最先进的性能，其中在几乎所有的 SMAC 困难和超困难场景中实现了 **100% 的胜率（这是前所未有的成绩）**。\n  - 我们 **强烈建议**从业者在解决实际的 MARL 问题时，**首先尝试并使用我们的 API 网络解决方案**（因为它非常容易使用且效果非常好）。我们希望我们的工作能够促进 MARL 在更多现实世界应用中的部署和落地。\n\n  更多信息请参见 [此处](https:\u002F\u002Fgithub.com\u002FTJU-DRL-LAB\u002FAI-Optimizer\u002Ftree\u002Fmain\u002Fmultiagent-rl)。\n\n  \n\n## 离线强化学习算法 (Offrl)\n\n### ❓ 待解决的问题\n当前的深度强化学习方法通常仍依赖于主动的数据收集才能取得成功，这在数据收集危险或昂贵的情况下，极大地限制了其在现实世界中的应用。离线强化学习（也称为批处理强化学习）是一种数据驱动的强化学习范式，专注于仅从先前收集的静态数据集中进行学习。在这种设置中，行为策略与环境交互以收集一组经验，这些经验随后可用于在无需进一步交互的情况下学习策略。这一范式在在线交互不切实际的场景中具有极高的价值。然而，现有的离线强化学习方法主要受限于以下三个挑战：\n * 算法上限较低：离线数据的质量决定了离线强化学习算法的性能。如何在不进行额外交互的情况下扩展低质量的离线数据，从而提高离线强化学习算法的学习上限？\n * 算法效果不佳：现有的离线\u002F非策略算法是在离线数据分布上进行训练的。当与环境交互时，访问的状态-动作分布可能会相对于离线数据发生改变（分布偏移）。在这种情况下，\u003C状态, 动作>对的Q值容易被高估，从而影响整体性能。如何表征离线数据分布之外的数据（Out Of Distribution, OOD），以避免过估计问题？\n * 算法应用困难：由于数据集质量有限，所学策略无法直接部署到生产环境中，还需要进一步的在线学习。如何设计在线训练阶段的数据采样策略，以避免因分布变化产生的冗余数据导致策略初始性能骤降，并在有限的交互次数内快速收敛到最优解？\n\n### 💦 贡献\n本仓库包含了离线强化学习领域中具有代表性的基准和算法代码。该仓库基于d3rlpy（https:\u002F\u002Fgithub.com\u002Ftakuseno\u002Fd3rlpy）开发，遵循MIT许可证，旨在为上述三个挑战的研究提供参考。在继承其优势的基础上，新增功能包括（或即将加入）：\n - 统一的算法框架，支持不同算法之间丰富且公平的对比：\n   - REDQ\n   - UWAC\n   - BRED\n   - …\n - 丰富且贴近真实世界的数据集：\n   - 工业级真实世界数据集\n   - 多模态数据集\n   - 增强型数据集（及相应方法）\n   - 通过表示学习获得的数据集（及相应方法）\n - 更易用的日志系统支持：\n   - Wandb\n\n\n\n\n![离线强化学习生态](https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002FTJU-DRL-LAB_AI-Optimizer_readme_855955fb011f.png)\n\n## 自监督强化学习（SSRL）\nSSRL仓库包含了天津大学强化学习实验室在自监督表示学习用于强化学习领域的代表性研究成果代码。\n\n据我们所知，这是**首个**按照**系统的研究分类体系**和**统一的算法框架**建立的SSRL代码仓库。\n\n### ❓ 待解决的问题\n由于强化学习智能体在学习过程中始终负责接收、处理和传递各类数据（即典型的智能体-环境接口），因此如何**恰当地表示这些“数据”**自然成为决定强化学习有效性和效率的关键所在。\n\n在本分支中，我们重点关注以下三个关键问题：\n- **什么样的表示才适合强化学习？（理论）**\n- **如何获取或实现这样的良好表示？（方法论）**\n- **如何利用良好的表示来提升强化学习的效果？（下游学习任务与应用）**\n\n### ⭐️ 核心思想\n我们以**自监督学习**（SSL）作为表示学习的主要范式，从四个角度展开研究：\n- **状态表示**\n- **动作表示**\n- **策略表示**\n- **环境（及任务）表示**\n\n这四个视角是强化学习中通用的“智能体-环境接口”所涉及的主要要素。它们在强化学习过程中分别扮演着输入、优化目标等角色。这些要素的表示方式对强化学习的**样本效率**、**收敛最优性**以及**跨环境泛化能力**有着重要影响。\n\n本仓库的核心贡献在于**SSRL算法的统一算法框架（实现设计）**。该框架能够统一解释目前几乎所有现有的SSRL算法，同时也可以作为我们设计新方法时的范式参考。\n\n我们的最终目标是推动SSRL生态系统的建立，具体如下所示。\n\n为了解决强化学习中的关键问题，我们从四种表示入手研究SSRL。针对这四个视角的研究，都以统一的算法与实现框架作为基础支撑。不同视角下研究得到的表示进一步促进了各种下游强化学习任务的开展，最终推动强化学习在实际应用中的部署与落地。\n\n\u003Cdiv align=center>\u003Cimg align=\"center\" src=\"https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002FTJU-DRL-LAB_AI-Optimizer_readme_aeeddf3c9a61.png\" alt=\"SSRL生态\" style=\"zoom:40%;\" \u002F>\u003C\u002Fdiv>\n\n更多信息请参见[此处](https:\u002F\u002Fgithub.com\u002FTJU-DRL-LAB\u002Fself-supervised-rl)。\n\n### 💦 
贡献\n\n借助本仓库及我们的研究成果，我们希望引起强化学习社区对自监督表示学习的关注。\n\n- 对于对强化学习感兴趣的人士，本仓库的介绍以及我们的[博客文章](https:\u002F\u002Fzhuanlan.zhihu.com\u002Fp\u002F413321572)可以作为初步的入门教程。\n- 对于前沿的强化学习研究人员，我们认为我们的研究思路和提出的SSRL框架具有启发性和洞察力，能够为未来更高级的强化学习研究开辟新的方向。\n- 对于强化学习的实践者（尤其是相关领域的从业者），我们提供了在在线强化学习中表现优异的先进算法（例如：[PPO-PeVFA](.\u002Fself-supervised-rl\u002FRL_with_Policy_Representation\u002FPolicy-based_RL_with_PeVFA\u002FPPO-PeVFA)）、混合动作决策算法（例如：[HyAR](.\u002Fself-supervised-rl\u002FRL_with_Action_Representation\u002FHyAR)）以及基于离线经验的策略适应算法（例如：[PAnDR](.\u002Fself-supervised-rl\u002FRL_with_Environment_Representation\u002FPAnDR)）等，这些算法可以直接应用于或进一步开发于相关的学术和工业问题中。\n\n我们也期待收到任何形式的反馈，以促进更深入的研究。\n\n## 迁移与多任务强化学习\n近年来，深度强化学习（DRL）在诸多人类水平的控制问题上取得了显著成果，例如视频游戏、机器人控制、自动驾驶汽车、智能电网等。然而，DRL仍然面临**样本效率低下问题**，尤其是在状态-动作空间变得庞大时，这使得从零开始学习变得十分困难。这意味着智能体需要使用大量的样本才能学习到一个良好的策略。此外，在多智能体强化学习（MARL）中，由于状态-动作空间呈指数级增长，样本效率低下的问题更加严重。\n\n### ❓ 待解决的问题\n\n**样本效率低下问题**：迁移与多任务强化学习旨在解决的主要挑战就是样本效率低下问题。这一问题迫使智能体收集海量的训练数据才能学习到最优策略。例如，Rainbow DQN大约需要1800万帧的训练数据才能超越人类玩家的平均水平，这相当于人类玩家连续游玩60小时的游戏内容。然而，人类玩家通常只需几分钟就能学会一款Atari游戏，并在经过一小时的训练后达到该游戏玩家的平均水平。\n\n### ⭐️ 核心思想\n\n- **迁移强化学习**通过利用先前相关任务中的先验知识来加速强化学习过程，已成为显著提升DRL样本效率的热门研究方向之一。\n  \n- **多任务强化学习**则采用单个网络同时学习多个任务的策略，凭借其推理速度快、性能优异的特点，也逐渐成为另一条极具前景的研究路径。\n\n### 💦 贡献\n\n本仓库包含了TJU-RL-Lab在迁移与多任务强化学习领域发布的代表性基准和算法代码，涵盖单智能体和多智能体场景，以不同方式应对样本效率低下的问题。\n\n\u003Cp align=\"center\">\u003Cimg align=\"center\" src=\"https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002FTJU-DRL-LAB_AI-Optimizer_readme_640be52cf554.png\" alt=\"overview\" style=\"zoom:60%;\" \u002F>\u003C\u002Fp>\n\n\n在本仓库中，我们提供了实验室的具体解决方案，包括：\n* **PTF**通过提出一种新颖的策略迁移框架（PTF），有效解决了DRL中的**样本效率低下问题**。\n  \n* **MAPTF**则针对深度多智能体强化学习中的**样本效率低下问题**，提出了多智能体策略迁移框架（MAPTF）。\n  \n* **KTM-DRL（复现版）**：基于知识迁移的连续控制多任务深度强化学习框架（KTM-DRL）。我们在MuJoCo连续控制任务集中复现了相关结果，更多详情请参见[此处](https:\u002F\u002Fgithub.com\u002FTJU-DRL-LAB\u002Ftransfer-and-multi-task-reinforcement-learning\u002Ftree\u002Fmain\u002FSingle-agent%20Multi-task%20RL\u002FKTM-DRL)。\n\n更多内容请访问[此处](https:\u002F\u002Fgithub.com\u002FTJU-DRL-LAB\u002Ftransfer-and-multi-task-reinforcement-learning)。\n\n## 基于模型的强化学习（MBRL）\n基于模型的强化学习（MBRL）被广泛认为具有比无模型强化学习更高的样本效率潜力。通过学习环境模型，基于模型的方法能够以更低的样本复杂度进行学习。环境模型是一种包含关于环境或任务明确知识的表示模型，通常包括两类：转移模型（动力学模型）和奖励模型。一旦构建出这样的模型，便可以将其有效地融入与环境的交互以及策略的学习过程中。\n\n### ❓待解决的问题\n\n当前主流的基于模型强化学习算法分类方式较为多样化，这意味着某些算法可以根据不同的视角被归入不同的类别。在本分支中，我们重点关注两个核心问题：`如何学习模型`和`如何利用模型`。\n\n- `如何学习模型`主要关注如何构建环境模型。\n- `如何利用模型`则关注如何有效利用已学习到的模型。\n\n### ⭐️ 核心方向\n\n尽管具体方法存在差异，但MBRL算法的目的可以更细致地划分为以下四个方向：`降低模型误差`、`加快规划速度`、`提高对模型误差的容忍度`、`扩展至更复杂问题的能力`。对于`如何学习模型`这一问题，我们可以研究如何减少模型误差以学习更精确的世界模型，或者如何构建对模型误差具有更高容忍度的世界模型。而对于`如何利用模型`这一问题，我们可以探索如何利用已学习的模型实现更快的规划，或者如何将已学习的模型扩展应用于更复杂的任务。\n\n![](https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002FTJU-DRL-LAB_AI-Optimizer_readme_7d443bd9df3b.png)\n\n### 💦 贡献\n#### 为什么选择 MBRL？\n\n基于模型的强化学习（MBRL）通过学习环境动态的模型，具备数据高效性和规划能力等优势。环境模型是一种表征模型，显式地包含关于环境或任务的知识，通常包括两类：转移模型（或称动力学模型）和奖励模型。一旦构建了这样的模型，就可以将其有效地融入与环境的交互以及策略的学习过程中。\n\n#### 为什么需要我们的库？\n\n通过这个仓库及我们的研究工作，我们希望引起强化学习社区对基于模型的强化学习领域的关注。\n\n- 对于对基于模型的强化学习感兴趣的研究者，本仓库中的介绍以及我们的[知乎专栏系列](https:\u002F\u002Fzhuanlan.zhihu.com\u002Fp\u002F425318401)可以作为入门教程。\n\n- 对于从事基于模型的强化学习的研究人员，我们整理了多个独立的研究方向，其中一些可能是闭源的或难以复现的，并在代码层面进行了优化，以便更便捷地找到可比较的基准方法，而无需四处寻找实现细节。\n\n我们期望，我们的研究思路和针对 MBRL 领域提出的研究方向能够为未来更高级的强化学习研究开辟新的视角。**此外，我们希望尽可能覆盖更多有趣的新方向，并将其归类到上述主题中，以激发您的研究灵感和创意。** 目前，基于模型的强化学习研究尚未形成非常标准化的体系。研究者常常会使用自行设计的环境进行实验，且存在多条独立的研究路线，其中部分工作可能是闭源的或难以复现。为此，我们收集了一些主流的 MBRL 算法，并在代码层面进行了优化。将这些算法整合到统一的框架中，可以帮助研究人员节省寻找对比基准的时间，而无需再费力搜索不同的实现。目前，我们已实现了 Dreamer、MBPO、BMPO、MuZero、PlaNet、SampledMuZero 和 CaDM 
等算法，并计划在未来继续扩充这一列表。我们将持续更新本仓库，纳入 TJU-DRL-Lab 的最新研究成果，以确保内容的全面性和可靠性。我们也欢迎任何形式的反馈，以推动更深入的研究。更多信息请参见 [这里](https:\u002F\u002Fgithub.com\u002FTJU-DRL-LAB\u002FAI-Optimizer\u002Ftree\u002Fmain\u002Fmodelbased-rl)。\n\n# 贡献\nAI-Optimizer 仍处于开发阶段。未来还将添加更多算法和功能，我们始终欢迎各方贡献，共同使 AI-Optimizer 更加完善。欢迎您随时参与贡献。","# AI-Optimizer 快速上手指南\n\nAI-Optimizer 是天津大学深度强化学习实验室（TJU-DRL-LAB）推出的下一代深度强化学习套件。它提供了从无模型到基于模型、从单智能体到多智能体的丰富算法库，并包含灵活高效的分布式训练框架。本指南将帮助您快速开始使用其核心模块。（注：官方 README 未给出统一的安装说明，下文命令仅为参考示例，具体入口脚本与参数请以各子目录的 README 为准。）\n\n## 环境准备\n\n在开始之前，请确保您的开发环境满足以下要求：\n\n*   **操作系统**: Linux (推荐 Ubuntu 18.04\u002F20.04) 或 macOS。Windows 用户建议使用 WSL2。\n*   **Python 版本**: Python 3.7 - 3.9\n*   **深度学习框架**: PyTorch 1.8+\n*   **硬件要求**: 推荐使用 NVIDIA GPU 以加速训练（需安装对应的 CUDA 驱动和 Toolkit）。\n\n**前置依赖安装：**\n\n建议先更新 `pip` 并安装基础科学计算库。国内用户可使用清华源加速下载。\n\n```bash\npython -m pip install --upgrade pip -i https:\u002F\u002Fpypi.tuna.tsinghua.edu.cn\u002Fsimple\npip install torch torchvision torchaudio -i https:\u002F\u002Fpypi.tuna.tsinghua.edu.cn\u002Fsimple\npip install numpy gym matplotlib tqdm -i https:\u002F\u002Fpypi.tuna.tsinghua.edu.cn\u002Fsimple\n```\n\n## 安装步骤\n\n克隆仓库并安装项目依赖。\n\n```bash\n# 克隆仓库\ngit clone https:\u002F\u002Fgithub.com\u002FTJU-DRL-LAB\u002FAI-Optimizer.git\ncd AI-Optimizer\n\n# 安装核心依赖\n# 注意：不同子模块可能有特定依赖，建议进入对应子目录安装\npip install -e . -i https:\u002F\u002Fpypi.tuna.tsinghua.edu.cn\u002Fsimple\n```\n\n针对特定算法库的安装（以多智能体强化学习 MARL 为例）：\n\n```bash\ncd multiagent-rl\npip install -r requirements.txt -i https:\u002F\u002Fpypi.tuna.tsinghua.edu.cn\u002Fsimple\n```\n\n> **提示**：若需使用 Offline RL 模块，该库基于 `d3rlpy` 构建，请确保额外安装：\n> `pip install d3rlpy -i https:\u002F\u002Fpypi.tuna.tsinghua.edu.cn\u002Fsimple`\n\n## 基本使用\n\nAI-Optimizer 的不同子库针对不同的应用场景。以下是三个最常用模块的快速启动示例。\n\n### 1. 多智能体强化学习 (MARL)\n\n本模块包含了高性能的 API 系列算法（如 API-QMIX, API-MAPPO），在 SMAC 等基准测试中表现优异。\n\n**运行示例（使用 API-QMIX 在 SMAC 环境中训练）：**\n\n```bash\ncd multiagent-rl\n\n# 运行训练脚本\n# 参数说明：--algo 指定算法，--env 指定环境，--name 指定实验名称\npython train.py --algo api_qmix --env sc2_3s5z --name my_first_marl_run\n\n# 查看结果\n# 训练日志和模型权重通常保存在 .\u002Fresults 目录下\n```\n\n对于初学者，推荐使用 `easy-marl` 子目录中的简化代码进行教程学习：\n\n```bash\ncd easy-marl\npython quick_start.py\n```\n\n### 2. 离线强化学习 (Offline RL)\n\n本模块用于仅利用静态数据集进行策略学习，无需与环境实时交互。\n\n**运行示例（加载数据集并训练 REDQ 算法）：**\n\n```bash\ncd offline-rl-algorithms\n\n# 运行离线训练\n# d3rlpy 风格接口，指定算法、数据集路径和迭代次数\npython train_offline.py --algo redq --dataset hopper-medium-v0 --iterations 100000\n\n# 评估策略\npython evaluate.py --model_path .\u002Fsaved_models\u002Fredq_hopper.pt --dataset hopper-medium-v0\n```\n\n### 3. 
自监督表示强化学习 (SSRL)\n\n该模块提供统一的算法框架，用于状态、动作或策略的表示学习。\n\n**基本调用逻辑：**\n\n```bash\ncd self-supervised-rl\n\n# 运行自监督预训练 + 下游任务微调\npython run_ssrl.py --rep_type state --downstream_task control --env halfcheetah\n```\n\n---\n**下一步建议**：\n*   访问各子目录下的 `README.md` 获取详细的参数配置说明。\n*   参考项目中提供的 ZhiHu 专栏文章深入理解算法原理。\n*   利用内置的 Wandb 支持监控训练过程（需在代码中配置 `wandb login`）。","某物流科技公司正在研发一套由上百台自动导引车（AGV）组成的智能仓储调度系统，旨在实现货物搬运的全自动化协同。\n\n### 没有 AI-Optimizer 时\n- **扩展性瓶颈**：随着 AGV 数量增加，状态空间呈指数级爆炸，传统单智能体算法无法处理高维数据，导致训练难以收敛。\n- **环境非平稳性**：每辆车的策略都在实时变化，其他车辆对个体而言如同“移动的目标”，导致学习过程极不稳定，策略频繁失效。\n- **协作奖励分配难**：当团队整体完成搬运任务时，难以精准量化每辆车的贡献（信用分配问题），导致部分车辆“搭便车”或学习方向错误。\n- **探索效率低下**：在复杂的动态避障场景中，算法难以平衡“尝试新路径”与“利用已知最优解”，常陷入局部最优或发生碰撞。\n- **训练资源浪费**：缺乏高效的分布式训练框架，多车协同策略的训练周期长达数周，严重拖慢项目迭代速度。\n\n### 使用 AI-Optimizer 后\n- **突破维度灾难**：利用其内置的可扩展多智能体神经网络，通过排列不变性特性有效压缩搜索空间，轻松支撑百车规模的协同训练。\n- **稳定动态学习**：采用先进的多智能体强化学习（MARL）算法，专门针对非平稳环境设计，确保在同伴策略变化时仍能稳定优化。\n- **精准信用分配**：内置的多智能体信用分配机制能精确拆解团队奖励，让每辆 AGV 都明确自身动作价值，显著提升协作默契。\n- **高效协同探索**：借助渐进式互信息协作机制，智能体间能主动共享探索信息，快速找到全局最优的避障与路径规划策略。\n- **加速策略落地**：依托灵活的分布式训练框架，将原本数周的训练时间缩短至数天，大幅加快从仿真到真实仓库部署的进程。\n\nAI-Optimizer 通过解决多智能体协作中的核心理论难题，将复杂的群智调度从“不可训”变为“高效落地”，真正释放了群体智能的商业价值。","https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002FTJU-DRL-LAB_AI-Optimizer_a8c96a63.png","TJU-DRL-LAB","Tianjin University - Deep Reinforcement Learning Lab (DRL-LAB)","https:\u002F\u002Foss.gittoolsai.com\u002Favatars\u002FTJU-DRL-LAB_a4df2db8.png","Official account of Tianjin University Deep Reinforcement Learning Lab",null,"yanzheng@tju.edu.cn","http:\u002F\u002Fwww.icdai.org\u002F","https:\u002F\u002Fgithub.com\u002FTJU-DRL-LAB",[81,85,88,92],{"name":82,"color":83,"percentage":84},"Python","#3572A5",97.9,{"name":86,"color":87,"percentage":32},"Cython","#fedf5b",{"name":89,"color":90,"percentage":91},"Shell","#89e051",0.1,{"name":93,"color":94,"percentage":95},"C++","#f34b7d",0,3462,597,"2026-03-28T17:39:43",4,"","未说明",{"notes":103,"python":101,"dependencies":104},"README 主要介绍了算法库的功能（多智能体、离线、自监督强化学习等）及研究成果，未提供具体的安装指南或硬件环境需求。其中离线强化学习模块基于 d3rlpy 开发。建议参考各子模块的具体代码仓库或联系作者获取详细运行环境配置。",[105],"d3rlpy (针对 Offline RL 模块)",[14],[108,109,110],"reinforcement-learning","transfer-learning","deep-learning","2026-03-27T02:49:30.150509","2026-04-07T00:48:25.233122",[114,119],{"id":115,"question_zh":116,"answer_zh":117,"source_url":118},20593,"采样版 MuZero（Sampled MuZero）的实现是否已完成？","目前尚未完成。当前仓库中的代码实际上是通用版 MuZero（General MuZero），仅作为开发采样版 MuZero 的基础。团队已实现的版本尚无法复现论文中的结果，因此在性能达到论文报告水平之前，不打算开源该代码。一旦获得可比的性能，将尽快发布代码。","https:\u002F\u002Fgithub.com\u002FTJU-DRL-LAB\u002FAI-Optimizer\u002Fissues\u002F2",{"id":120,"question_zh":121,"answer_zh":122,"source_url":123},20594,"项目推荐使用哪个版本的 Python？","虽然用户建议 Python 3.7 是机器学习中最稳定且常用的版本，但维护者表示正在改进代码并更新 README 中的相关说明（包括具体的 Python 版本要求）。建议关注 README 文件的后续更新以获取官方指定的版本信息。","https:\u002F\u002Fgithub.com\u002FTJU-DRL-LAB\u002FAI-Optimizer\u002Fissues\u002F1",[]]