[{"data":1,"prerenderedAt":-1},["ShallowReactive",2],{"similar-FoundationAgents--awesome-foundation-agents":3,"tool-FoundationAgents--awesome-foundation-agents":61},[4,18,26,36,44,52],{"id":5,"name":6,"github_repo":7,"description_zh":8,"stars":9,"difficulty_score":10,"last_commit_at":11,"category_tags":12,"status":17},4358,"openclaw","openclaw\u002Fopenclaw","OpenClaw 是一款专为个人打造的本地化 AI 助手，旨在让你在自己的设备上拥有完全可控的智能伙伴。它打破了传统 AI 助手局限于特定网页或应用的束缚，能够直接接入你日常使用的各类通讯渠道，包括微信、WhatsApp、Telegram、Discord、iMessage 等数十种平台。无论你在哪个聊天软件中发送消息，OpenClaw 都能即时响应，甚至支持在 macOS、iOS 和 Android 设备上进行语音交互，并提供实时的画布渲染功能供你操控。\n\n这款工具主要解决了用户对数据隐私、响应速度以及“始终在线”体验的需求。通过将 AI 部署在本地，用户无需依赖云端服务即可享受快速、私密的智能辅助，真正实现了“你的数据，你做主”。其独特的技术亮点在于强大的网关架构，将控制平面与核心助手分离，确保跨平台通信的流畅性与扩展性。\n\nOpenClaw 非常适合希望构建个性化工作流的技术爱好者、开发者，以及注重隐私保护且不愿被单一生态绑定的普通用户。只要具备基础的终端操作能力（支持 macOS、Linux 及 Windows WSL2），即可通过简单的命令行引导完成部署。如果你渴望拥有一个懂你",349277,3,"2026-04-06T06:32:30",[13,14,15,16],"Agent","开发框架","图像","数据工具","ready",{"id":19,"name":20,"github_repo":21,"description_zh":22,"stars":23,"difficulty_score":10,"last_commit_at":24,"category_tags":25,"status":17},3808,"stable-diffusion-webui","AUTOMATIC1111\u002Fstable-diffusion-webui","stable-diffusion-webui 是一个基于 Gradio 构建的网页版操作界面，旨在让用户能够轻松地在本地运行和使用强大的 Stable Diffusion 图像生成模型。它解决了原始模型依赖命令行、操作门槛高且功能分散的痛点，将复杂的 AI 绘图流程整合进一个直观易用的图形化平台。\n\n无论是希望快速上手的普通创作者、需要精细控制画面细节的设计师，还是想要深入探索模型潜力的开发者与研究人员，都能从中获益。其核心亮点在于极高的功能丰富度：不仅支持文生图、图生图、局部重绘（Inpainting）和外绘（Outpainting）等基础模式，还独创了注意力机制调整、提示词矩阵、负向提示词以及“高清修复”等高级功能。此外，它内置了 GFPGAN 和 CodeFormer 等人脸修复工具，支持多种神经网络放大算法，并允许用户通过插件系统无限扩展能力。即使是显存有限的设备，stable-diffusion-webui 也提供了相应的优化选项，让高质量的 AI 艺术创作变得触手可及。",162132,"2026-04-05T11:01:52",[14,15,13],{"id":27,"name":28,"github_repo":29,"description_zh":30,"stars":31,"difficulty_score":32,"last_commit_at":33,"category_tags":34,"status":17},1381,"everything-claude-code","affaan-m\u002Feverything-claude-code","everything-claude-code 是一套专为 AI 编程助手（如 Claude Code、Codex、Cursor 等）打造的高性能优化系统。它不仅仅是一组配置文件，而是一个经过长期实战打磨的完整框架，旨在解决 AI 代理在实际开发中面临的效率低下、记忆丢失、安全隐患及缺乏持续学习能力等核心痛点。\n\n通过引入技能模块化、直觉增强、记忆持久化机制以及内置的安全扫描功能，everything-claude-code 能显著提升 AI 在复杂任务中的表现，帮助开发者构建更稳定、更智能的生产级 AI 代理。其独特的“研究优先”开发理念和针对 Token 消耗的优化策略，使得模型响应更快、成本更低，同时有效防御潜在的攻击向量。\n\n这套工具特别适合软件开发者、AI 研究人员以及希望深度定制 AI 工作流的技术团队使用。无论您是在构建大型代码库，还是需要 AI 协助进行安全审计与自动化测试，everything-claude-code 都能提供强大的底层支持。作为一个曾荣获 Anthropic 黑客大奖的开源项目，它融合了多语言支持与丰富的实战钩子（hooks），让 AI 真正成长为懂上",142651,2,"2026-04-06T23:34:12",[14,13,35],"语言模型",{"id":37,"name":38,"github_repo":39,"description_zh":40,"stars":41,"difficulty_score":32,"last_commit_at":42,"category_tags":43,"status":17},2271,"ComfyUI","Comfy-Org\u002FComfyUI","ComfyUI 是一款功能强大且高度模块化的视觉 AI 引擎，专为设计和执行复杂的 Stable Diffusion 图像生成流程而打造。它摒弃了传统的代码编写模式，采用直观的节点式流程图界面，让用户通过连接不同的功能模块即可构建个性化的生成管线。\n\n这一设计巧妙解决了高级 AI 绘图工作流配置复杂、灵活性不足的痛点。用户无需具备编程背景，也能自由组合模型、调整参数并实时预览效果，轻松实现从基础文生图到多步骤高清修复等各类复杂任务。ComfyUI 拥有极佳的兼容性，不仅支持 Windows、macOS 和 Linux 全平台，还广泛适配 NVIDIA、AMD、Intel 及苹果 Silicon 等多种硬件架构，并率先支持 SDXL、Flux、SD3 等前沿模型。\n\n无论是希望深入探索算法潜力的研究人员和开发者，还是追求极致创作自由度的设计师与资深 AI 绘画爱好者，ComfyUI 都能提供强大的支持。其独特的模块化架构允许社区不断扩展新功能，使其成为当前最灵活、生态最丰富的开源扩散模型工具之一，帮助用户将创意高效转化为现实。",107888,"2026-04-06T11:32:50",[14,15,13],{"id":45,"name":46,"github_repo":47,"description_zh":48,"stars":49,"difficulty_score":10,"last_commit_at":50,"category_tags":51,"status":17},4487,"LLMs-from-scratch","rasbt\u002FLLMs-from-scratch","LLMs-from-scratch 是一个基于 PyTorch 的开源教育项目，旨在引导用户从零开始一步步构建一个类似 ChatGPT 的大型语言模型（LLM）。它不仅是同名技术著作的官方代码库，更提供了一套完整的实践方案，涵盖模型开发、预训练及微调的全过程。\n\n该项目主要解决了大模型领域“黑盒化”的学习痛点。许多开发者虽能调用现成模型，却难以深入理解其内部架构与训练机制。通过亲手编写每一行核心代码，用户能够透彻掌握 Transformer 
架构、注意力机制等关键原理，从而真正理解大模型是如何“思考”的。此外，项目还包含了加载大型预训练权重进行微调的代码，帮助用户将理论知识延伸至实际应用。\n\nLLMs-from-scratch 特别适合希望深入底层原理的 AI 开发者、研究人员以及计算机专业的学生。对于不满足于仅使用 API，而是渴望探究模型构建细节的技术人员而言，这是极佳的学习资源。其独特的技术亮点在于“循序渐进”的教学设计：将复杂的系统工程拆解为清晰的步骤，配合详细的图表与示例，让构建一个虽小但功能完备的大模型变得触手可及。无论你是想夯实理论基础，还是为未来研发更大规模的模型做准备",90106,"2026-04-06T11:19:32",[35,15,13,14],{"id":53,"name":54,"github_repo":55,"description_zh":56,"stars":57,"difficulty_score":10,"last_commit_at":58,"category_tags":59,"status":17},4292,"Deep-Live-Cam","hacksider\u002FDeep-Live-Cam","Deep-Live-Cam 是一款专注于实时换脸与视频生成的开源工具，用户仅需一张静态照片，即可通过“一键操作”实现摄像头画面的即时变脸或制作深度伪造视频。它有效解决了传统换脸技术流程繁琐、对硬件配置要求极高以及难以实时预览的痛点，让高质量的数字内容创作变得触手可及。\n\n这款工具不仅适合开发者和技术研究人员探索算法边界，更因其极简的操作逻辑（仅需三步：选脸、选摄像头、启动），广泛适用于普通用户、内容创作者、设计师及直播主播。无论是为了动画角色定制、服装展示模特替换，还是制作趣味短视频和直播互动，Deep-Live-Cam 都能提供流畅的支持。\n\n其核心技术亮点在于强大的实时处理能力，支持口型遮罩（Mouth Mask）以保留使用者原始的嘴部动作，确保表情自然精准；同时具备“人脸映射”功能，可同时对画面中的多个主体应用不同面孔。此外，项目内置了严格的内容安全过滤机制，自动拦截涉及裸露、暴力等不当素材，并倡导用户在获得授权及明确标注的前提下合规使用，体现了技术发展与伦理责任的平衡。",88924,"2026-04-06T03:28:53",[14,15,13,60],"视频",{"id":62,"github_repo":63,"name":64,"description_en":65,"description_zh":66,"ai_summary_zh":67,"readme_en":68,"readme_zh":69,"quickstart_zh":70,"use_case_zh":71,"hero_image_url":72,"owner_login":73,"owner_name":73,"owner_avatar_url":74,"owner_bio":75,"owner_company":75,"owner_location":75,"owner_email":75,"owner_twitter":75,"owner_website":75,"owner_url":76,"languages":75,"stars":77,"forks":78,"last_commit_at":79,"license":80,"difficulty_score":81,"env_os":82,"env_gpu":83,"env_ram":83,"env_deps":84,"category_tags":87,"github_topics":75,"view_count":32,"oss_zip_url":75,"oss_zip_packed_at":75,"status":17,"created_at":89,"updated_at":90,"faqs":91,"releases":92},4694,"FoundationAgents\u002Fawesome-foundation-agents","awesome-foundation-agents","About Awesome things towards foundation agents. 
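As a flavor of the code such a from-scratch curriculum builds up, here is a minimal causal self-attention block in PyTorch. This is an illustrative sketch, not code from the book or the repo; all dimensions and names are invented for the example.

```python
# A minimal causal self-attention sketch in PyTorch. Illustrative only:
# NOT the book's code; dimensions and names are invented for this example.
import torch
import torch.nn as nn

class CausalSelfAttention(nn.Module):
    def __init__(self, d_model: int, n_heads: int, max_len: int = 1024):
        super().__init__()
        assert d_model % n_heads == 0
        self.n_heads = n_heads
        self.d_head = d_model // n_heads
        self.qkv = nn.Linear(d_model, 3 * d_model)   # joint Q, K, V projection
        self.proj = nn.Linear(d_model, d_model)      # output projection
        # Lower-triangular mask forbids attending to future positions.
        self.register_buffer("mask", torch.tril(torch.ones(max_len, max_len)))

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        b, t, d = x.shape
        q, k, v = self.qkv(x).chunk(3, dim=-1)
        # Reshape to (batch, heads, time, head_dim).
        q, k, v = (z.view(b, t, self.n_heads, self.d_head).transpose(1, 2)
                   for z in (q, k, v))
        att = (q @ k.transpose(-2, -1)) / self.d_head ** 0.5  # scaled dot product
        att = att.masked_fill(self.mask[:t, :t] == 0, float("-inf"))
        att = att.softmax(dim=-1)
        out = (att @ v).transpose(1, 2).contiguous().view(b, t, d)
        return self.proj(out)

x = torch.randn(2, 16, 64)                  # (batch, tokens, d_model)
print(CausalSelfAttention(64, 4)(x).shape)  # torch.Size([2, 16, 64])
```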
# FoundationAgents/awesome-foundation-agents

> About Awesome things towards foundation agents. Papers / Repos / Blogs / ...

awesome-foundation-agents is a curated resource library for the field of "foundation agents", aiming to map out the technical path toward general-purpose agents in a systematic way. It gathers the most cutting-edge papers, open-source repositories, technical blogs, and surveys in this direction, helping practitioners quickly grasp the research landscape.

Today's large models are strong, yet building complete agents with cognition, memory, perception, planning, and self-evolution still faces many challenges, and the relevant research is scattered and hard to track. By structuring this complex research map, organized along core dimensions such as cognitive mechanisms, self-enhancement, multi-agent collaboration, and AI safety, awesome-foundation-agents resolves the fragmentation problem and gives developers one-stop access to key material, from theoretical frameworks to working practice.

The project is especially suited to AI researchers, large-model developers, and technical decision-makers interested in agent architectures. Scholars seeking to understand the underlying principles of agents and engineers looking to reproduce the latest algorithms (such as reinforcement-learning finetuning or chain-of-thought reasoning) will both find high-value pointers here.

Its distinguishing strength is that beyond listing resources, it proposes a clear conceptual framework for foundation agents and continuously tracks the latest results on frontier topics such as self-evolution mechanisms and world-model construction. As an open, community-maintained project, it dynamically collects innovative work from around the world and is an indispensable map for exploring the next generation of autonomous intelligent systems.

---

# Awesome-Foundation-Agents

[![PR Welcome](https://img.shields.io/badge/PRs-welcome-brightgreen)](https://github.com/FoundationAgents/awesome-foundation-agents/pulls)
[![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](LICENSE)
[![Awesome](https://awesome.re/badge.svg)](https://awesome.re)
[![Arxiv](https://img.shields.io/badge/arXiv-FoundationAgents-b31b1b)](https://arxiv.org/pdf/2504.01990)

We maintain a curated collection of papers exploring the path towards Foundation Agents, with a focus on formulating the core concepts and navigating the research landscape.

⌛️ Coming soon: Version 2! We're continuously compiling and updating cutting-edge insights.
Feel free to suggest any related work you find valuable!

## Our Works Towards Foundation Agents

✨✨✨ [Advances and Challenges in Foundation Agents](https://www.arxiv.org/abs/2504.01990) (Paper)

![The key of human brain.](https://oss.gittoolsai.com/images/FoundationAgents_awesome-foundation-agents_readme_e7a4bd74782a.png)
![The Framework of Foundation Agent](https://oss.gittoolsai.com/images/FoundationAgents_awesome-foundation-agents_readme_352986ddca36.png)

# Awesome Papers

## Table of Contents

- [Core Components of Intelligent Agents](#core-components-of-intelligent-agents)
    - [Cognition](#cognition)
    - [Memory](#memory)
    - [Perception](#perception)
    - [World Model](#world-model)
    - [Action](#action)
    - [Reward](#reward)
    - [Emotion](#emotion)
- [Self-Enhancement in Intelligent Agents](#self-enhancement-in-intelligent-agents)
- [Collaborative and Evolutionary Intelligent Systems](#collaborative-and-evolutionary-intelligent-systems)
- [Building Safe and Beneficial AI](#building-safe-and-beneficial-ai)

# Core Components of Intelligent Agents

## Cognition

<div style="display: flex; justify-content: space-between;">
    <img src="https://oss.gittoolsai.com/images/FoundationAgents_awesome-foundation-agents_readme_a47377e3acf7.png" alt="Cognition System" width="100%">
</div>

### Learning
#### Space
##### Full
- *(TODO: add SFT, RLHF, and PEFT entries)*
- **ReFT: Reasoning with Reinforced Fine-Tuning**, arXiv 2024, [[paper](https://arxiv.org/abs/2401.08967)]
- **Search-R1: Training LLMs to Reason and Leverage Search Engines with Reinforcement Learning**, arXiv 2025, [[paper](https://arxiv.org/abs/2503.09516)] [[code](https://github.com/PeterGriffinJin/Search-R1)]
- **R1-Searcher: Incentivizing the Search Capability in LLMs via Reinforcement Learning**, arXiv 2025, [[paper](https://arxiv.org/abs/2503.05592)]

##### Partial
- **Chain-of-Thought Prompting Elicits Reasoning in Large Language Models**, NeurIPS 2022, [[paper](https://arxiv.org/abs/2201.11903)]
- **Voyager: An Open-Ended Embodied Agent with Large Language Models**, arXiv 2023, [[paper](https://arxiv.org/abs/2305.16291)]
- **Reflexion: Language Agents with Verbal Reinforcement Learning**, NeurIPS 2023, [[paper](https://arxiv.org/abs/2303.11366)]
- **ReAct meets ActRe: Autonomous Annotations of Agent Trajectories for Contrastive Self-Training**, arXiv 2024, [[paper](https://arxiv.org/abs/2403.14589)]
- **Generative Agents: Interactive Simulacra of Human Behavior**, ACM UIST 2023, [[paper](https://arxiv.org/abs/2304.03442)]

#### Objective
##### Perception
- **CLIP: Learning Transferable Visual Models from Natural Language Supervision**, ICML 2021, [[paper](https://arxiv.org/abs/2103.00020)]
- **LLaVA: Visual Instruction Tuning**, NeurIPS 2023, [[paper](https://arxiv.org/abs/2304.08485)]
- **CogVLM: Visual Expert for Pretrained Language Models**, NeurIPS 2024, [[paper](https://arxiv.org/abs/2311.03079)]
- **Qwen2-Audio Technical Report**, arXiv 2024, [[paper](https://arxiv.org/abs/2407.10759)]
- **Search-R1: Training LLMs to Reason and Leverage Search Engines with Reinforcement Learning**, arXiv 2025, [[paper](https://arxiv.org/abs/2503.09516)]

##### Reasoning
- **SKY-T1: Train Your Own o1 Preview Model Within $450**, 2025, [[paper](https://arxiv.org/abs/2502.07374)]
- **Open Thoughts**, 2025, [[paper](https://arxiv.org/abs/2506.04178)]
- **LIMO: Less is More for Reasoning**, arXiv 2025, [[paper](https://arxiv.org/abs/2502.03387)]
- **STaR: Bootstrapping Reasoning with Reasoning**, arXiv 2022, [[paper](https://arxiv.org/abs/2203.14465)]
- **ReST: Reinforced Self-Training for Language Modeling**, arXiv 2023, [[paper](https://arxiv.org/abs/2308.08998)]
- **OpenR: An Open Source Framework for Advanced Reasoning with Large Language Models**, arXiv 2024, [[paper](https://arxiv.org/abs/2410.09671)]
- **LLaMA-Berry: Pairwise Optimization for o1-like Olympiad-level Mathematical Reasoning**, arXiv 2024, [[paper](https://arxiv.org/abs/2410.02884)]
- **RAGEN: Training Agents by Reinforcing Reasoning**, arXiv 2025, [[paper](https://arxiv.org/abs/2504.20073)]
- **Open-R1**, 2025, [[blog](https://huggingface.co/blog/open-r1)]

##### World
- **Inner Monologue: Embodied Reasoning through Planning with Language Models**, CoRL 2023, [[paper](https://arxiv.org/abs/2207.05608)]
- **Self-Refine: Iterative Refinement with Self-Feedback**, NeurIPS 2023, [[paper](https://arxiv.org/abs/2303.17651)]
- **Reflexion: Language Agents with Verbal Reinforcement Learning**, NeurIPS 2023, [[paper](https://arxiv.org/abs/2303.11366)]
- **ExpeL: LLM Agents Are Experiential Learners**, AAAI 2024, [[paper](https://arxiv.org/abs/2308.10144)]
- **AutoManual: Generating Instruction Manuals by LLM Agents via Interactive Environmental Learning**, arXiv 2024, [[paper](https://arxiv.org/abs/2405.16247)]
- **ReAct meets ActRe: Autonomous Annotations of Agent Trajectories for Contrastive Self-Training**, arXiv 2024, [[paper](https://arxiv.org/abs/2403.14589)]

### Reasoning
#### Structured
##### Dynamic
- **ReAct: Synergizing Reasoning and Acting in Language Models**, ICLR 2023, [[paper](https://arxiv.org/abs/2210.03629)]
- **Markov Chain of Thought for Efficient Mathematical Reasoning**, arXiv 2024, [[paper](https://arxiv.org/abs/2410.17635)]
- **Tree of Thoughts: Deliberate Problem Solving with Large Language Models**, NeurIPS 2023, [[paper](https://arxiv.org/abs/2305.10601)]
- **Language Agent Tree Search Unifies Reasoning, Acting, and Planning in Language Models**, ICML 2024, [[paper](https://arxiv.org/abs/2310.04406)]
- **Reasoning via Planning (RAP): Improving Language Models with World Models**, EMNLP 2023, [[paper](https://arxiv.org/abs/2305.14992)]
- **Graph of Thoughts: Solving Elaborate Problems with Large Language Models**, AAAI 2024, [[paper](https://arxiv.org/abs/2308.09687)]
- **Path of Thoughts: Extracting and Following Paths for Robust Relational Reasoning with Large Language Models**, arXiv 2024, [[paper](https://arxiv.org/abs/2412.17963)]
- **On the Diagram of Thought**, arXiv 2024, [[paper](https://arxiv.org/abs/2409.10038)]

##### Static
- **Self-Consistency Improves Chain of Thought Reasoning in Language Models**, ICLR 2023, [[paper](https://arxiv.org/abs/2203.11171)] (a toy sketch of this decoding strategy follows this subsection)
- **Self-Refine: Iterative Refinement with Self-Feedback**, NeurIPS 2023, [[paper](https://arxiv.org/abs/2303.17651)]
- **Progressive-Hint Prompting Improves Reasoning in Large Language Models**, arXiv 2023, [[paper](https://arxiv.org/abs/2304.09797)]
- **On the Self-Verification Limitations of Large Language Models on Reasoning and Planning Tasks**, arXiv 2024, [[paper](https://arxiv.org/abs/2402.08115)]
- **Chain-of-Verification Reduces Hallucination in Large Language Models**, ICLR 2024 Workshop, [[paper](https://arxiv.org/abs/2309.11495)]
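The self-consistency entry above replaces greedy decoding with sampling several reasoning chains and keeping the majority answer. Below is a minimal sketch of that decoding strategy; `sample_chain` is a scripted stand-in for a temperature > 0 LLM call.

```python
# Self-consistency in miniature: sample several chains, extract each final
# answer, return the majority vote. `sample_chain` is a scripted LLM stub.
import random
from collections import Counter

def sample_chain(prompt: str) -> str:
    # Stand-in for a sampled LLM completion; real chains differ in reasoning
    # but tend to agree on the final line.
    return random.choice([
        "17 apples minus 5 is 12. Answer: 12",
        "5 fewer than 17 leaves 12. Answer: 12",
        "17 - 5 = 13. Answer: 13",   # an occasional faulty chain
    ])

def extract_answer(chain: str) -> str:
    # Toy convention: the chain ends with "Answer: <value>".
    return chain.rsplit("Answer:", 1)[-1].strip()

def self_consistency(prompt: str, n_samples: int = 10) -> str:
    votes = Counter(extract_answer(sample_chain(prompt)) for _ in range(n_samples))
    return votes.most_common(1)[0][0]

print(self_consistency("A grocer has 17 apples and sells 5. How many remain?"))
```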
##### Domain
- **MathPrompter: Mathematical Reasoning Using Large Language Models**, ACL 2023, [[paper](https://arxiv.org/abs/2303.05398)]
- **LLMs Can Find Mathematical Reasoning Mistakes by Pedagogical Chain-of-Thought**, arXiv 2024, [[paper](https://arxiv.org/abs/2405.06705)]
- **Physics Reasoner: Knowledge-Augmented Reasoning for Solving Physics Problems with Large Language Models**, COLING 2025, [[paper](https://arxiv.org/abs/2412.13791)]

#### Unstructured
##### Prompt
- **Chain of Thought Prompting Elicits Reasoning in Large Language Models**, NeurIPS 2022, [[paper](https://arxiv.org/abs/2201.11903)]
- **Take a Step Back: Evoking Reasoning via Abstraction in Large Language Models**, ICLR 2024, [[paper](https://arxiv.org/abs/2310.06117)]
- **Ask Me Anything: A Simple Strategy for Prompting Language Models**, arXiv 2022, [[paper](https://arxiv.org/abs/2210.02441)]
- **Chain-of-Knowledge: Grounding Large Language Models via Dynamic Knowledge Adapting over Heterogeneous Sources**, arXiv 2023, [[paper](https://arxiv.org/abs/2305.13269)]
- **Self-Explained Keywords Empower Large Language Models for Code Generation**, arXiv 2024, [[paper](https://arxiv.org/abs/2410.15966)]

##### Model
- **DeepSeek-R1: Incentivizing Reasoning Capability in LLMs via Reinforcement Learning**, arXiv 2025, [[paper](https://arxiv.org/abs/2501.12948)]
- **Claude 3.7 Sonnet**, 2025, [[blog](https://www.anthropic.com/news/claude-3-7-sonnet-and-claude-code)]
- **OpenAI o1 System Card**, arXiv 2024, [[paper](https://arxiv.org/abs/2412.16720)]

##### Implicit
- **Quiet-STaR: Language Models Can Teach Themselves to Think Before Speaking**, arXiv 2024, [[paper](https://arxiv.org/abs/2403.09629)]
- **Chain of Continuous Thought (Coconut): Training Large Language Models to Reason in a Continuous Latent Space**, arXiv 2024, [[paper](https://arxiv.org/abs/2412.06769)]

#### Planning
- **Describe, Explain, Plan and Select (DEPS): Interactive Planning with Large Language Models**, arXiv 2023, [[paper](https://arxiv.org/abs/2302.01560)]
- **ProgPrompt: Generating Situated Robot Task Plans Using Large Language Models**, ICRA 2023, [[paper](https://arxiv.org/abs/2209.11302)]
- **ADAPT: As-Needed Decomposition and Planning with Language Models**, arXiv 2023, [[paper](https://arxiv.org/abs/2311.05772)]
- **Tree of Thoughts: Deliberate Problem Solving with Large Language Models**, NeurIPS 2023, [[paper](https://arxiv.org/abs/2305.10601)]
- **Reasoning via Planning (RAP): Improving Language Models with World Models**, EMNLP 2023, [[paper](https://arxiv.org/abs/2305.14992)]
- **TravelPlanner: A Benchmark for Real-World Planning with Language Agents**, ICML 2024, [[paper](https://arxiv.org/abs/2402.01622)]
- **PDDL - The Planning Domain Definition Language**, 1998, [[paper](https://arxiv.org/abs/1106.4561)]
- **Mind2Web: Towards a Generalist Agent for the Web**, NeurIPS 2023, [[paper](https://arxiv.org/abs/2306.06070)]

## Memory

<div style="display: flex; justify-content: space-between;">
    <img src="https://oss.gittoolsai.com/images/FoundationAgents_awesome-foundation-agents_readme_889d4be32c5f.png" alt="Memory in Intelligence Agents" width="100%">
</div>

### Representation

#### Sensory
##### Text-based
- **RecAgent: A Novel Simulation Paradigm for Recommender Systems**, TOIS 2025, [[paper](https://arxiv.org/abs/2306.02552)] [[code](https://github.com/RUC-GSAI/YuLan-Rec)]
- **CoPS: Cognitive Personalized Search: Integrating Large Language Models with an Efficient Memory Mechanism**, WWW 2024, [[paper](https://arxiv.org/abs/2403.15264)]
- **MemoryBank: Enhancing Large Language Models with Long-Term Memory**, AAAI 2024, [[paper](https://arxiv.org/abs/2308.08589)] [[code](https://github.com/zhongwanjun/MemoryBank-SiliconFriend)]
- **Memory Sandbox: Transparent and Interactive Memory Management for Conversational Agents**, UIST 2023 Adjunct, [[paper](https://arxiv.org/abs/2309.09631)]

##### Multi-modal
- **VideoAgent: A Memory-augmented Multimodal Agent for Video Understanding**, ECCV 2024, [[paper](https://arxiv.org/abs/2405.07956)] [[code](https://github.com/wxh1996/VideoAgent)]
- **WorldGPT: Empowering LLM as Multimodal World Model**, arXiv 2024, [[paper](https://arxiv.org/abs/2403.10193)] [[code](https://github.com/DCDmllm/WorldGPT)]
- **Agent S: An Open Agentic Framework that Uses Computers Like a Human**, arXiv 2024, [[paper](https://arxiv.org/abs/2403.05901)] [[code](https://github.com/simular-ai/Agent-S)]
- **OS-Copilot: Towards Generalist Computer Agents with Self-Improvement**, ICLR 2024 LLMAgents Workshop, [[paper](https://arxiv.org/abs/2403.17359)] [[code](https://github.com/OS-Copilot/OS-Copilot)]
- **MuLan: Multimodal-LLM Agent for Progressive and Interactive Multi-Object Diffusion**, arXiv 2024, [[paper](https://arxiv.org/abs/2405.11075)] [[code](https://github.com/measure-infinity/mulan-code)]

#### Short-term
##### Context
- **MemGPT: Towards LLMs as Operating Systems**, arXiv 2023, [[paper](https://arxiv.org/abs/2310.08560)] [[code](https://github.com/cpacker/MemGPT)]
- **KARMA: Augmenting Embodied AI Agents with Long- and Short-Term Memory Systems**, arXiv 2024, [[paper](https://arxiv.org/abs/2403.09692)] [[code](https://github.com/WZX0Swarm0Robotics/KARMA)]
- **LSFS: From Commands to Prompts: LLM-based Semantic File System**, ICLR 2025, [[paper](https://arxiv.org/abs/2404.13007)] [[code](https://github.com/agiresearch/AIOS-LSFS)]
- **OSCAR: Operating System Control via State-Aware Reasoning and Re-Planning**, ICLR 2025, [[paper](https://arxiv.org/abs/2403.08767)]
- **RCI: Language Models Can Solve Computer Tasks (Recursive Criticism and Improvement)**, NeurIPS 2023, [[paper](https://arxiv.org/abs/2305.12934)] [[code](https://github.com/posgnu/rci-agent)]

##### Working
- **Generative Agents: Interactive Simulacra of Human Behavior**, UIST 2023, [[paper](https://arxiv.org/abs/2304.03442)] [[code](https://github.com/joonspk-research/generative_agents)]
- **RLP: Reflective Linguistic Programming (RLP): A Stepping Stone in Socially-Aware AGI**, arXiv 2023, [[paper](https://arxiv.org/abs/2305.12647)]
- **CALYPSO: LLMs as Dungeon Master's Assistants**, AIIDE 2023, [[paper](https://ojs.aaai.org/index.php/AIIDE/article/view/27546)] [[code](https://github.com/northern-lights-province/calypso-aiide-artifact)]
- **HiAgent: Hierarchical Working Memory Management for Solving Long-Horizon Agent Tasks with Large Language Model**, arXiv 2024, [[paper](https://arxiv.org/abs/2405.12790)] [[code](https://github.com/HiAgent2024/HiAgent)]

#### Long-term
##### Semantic
- **AriGraph: Learning Knowledge Graph World Models with Episodic Memory for LLM Agents**, arXiv 2024, [[paper](https://arxiv.org/abs/2407.04363)] [[code](https://github.com/AIRI-Institute/AriGraph)]
- **RecAgent**: see above
- **HippoRAG: Neurobiologically Inspired Long-Term Memory for Large Language Models**, NeurIPS 2024, [[paper](https://arxiv.org/abs/2405.14831)] [[code](https://github.com/OSU-NLP-Group/HippoRAG)]

##### Episodic
- **MobileGPT: Augmenting LLM with Human-like App Memory for Mobile Task Automation**, ACM MobiCom 2024, [[paper](https://arxiv.org/abs/2312.03003)]
- **MemoryBank**: see above
- **Episodic Memory Verbalization Using Hierarchical Representations of Life-Long Robot Experience**, arXiv 2024, [[paper](https://arxiv.org/abs/2409.17702)] [[project](https://hierarchical-emv.github.io)]
- **MrSteve: Instruction-Following Agents in Minecraft with What-Where-When Memory**, ICLR 2025, [[paper](https://arxiv.org/abs/2406.0)] *(project code pending)*

##### Procedural
- **AAG: Analogy-Augmented Generation for LLMs**, ACL ARR 2024, [[paper](https://arxiv.org/abs/2404.07239)]
- **Cradle: Empowering Foundation Agents towards General Computer Control**, ICLR 2025, [[paper](https://openreview.net/forum?id=cradle)] [[code](https://github.com/BAAI-Agents/Cradle)]
- **JARVIS-1: Open-World Multi-Task Agents with Memory-Augmented Multimodal Language Models**, NeurIPS 2023 ALOE Workshop, [[paper](https://arxiv.org/abs/2311.05997)] [[code](https://github.com/CraftJarvis/JARVIS-1)]
- **LARP: Language-Agent Role Play for Open-World Games**, arXiv 2023, [[paper](https://arxiv.org/abs/2312.09352)]

### Lifecycle

#### Acquisition
##### Information Compression
- **HiAgent: Hierarchical Working Memory Management for Solving Long-Horizon Agent Tasks with Large Language Model**, ACL 2025, [[paper](https://aclanthology.org/2025.acl-long.2011)] [[code](https://github.com/HiAgent2024/HiAgent)]
- **LMAgent: A Large-scale Multimodal Agents Society for Multi-user Simulation**, arXiv 2024, [[paper](https://arxiv.org/abs/2412.09237)]
- **A Human-Inspired Reading Agent with Gist Memory of Very Long Contexts**, ICML 2024, [[paper](https://proceedings.mlr.press/v235/huijie_readagent_24a.html)] [[project](https://read-agent.github.io/)] (a toy sketch of this gist pattern follows the next subsection)
- **Leveraging Metamemory Mechanisms for Enhanced Data-Free Code Generation in LLMs**, arXiv 2025, [[paper](https://arxiv.org/abs/2501.07892)]

##### Experience Consolidation
- **ExpeL: LLM Agents Are Experiential Learners**, AAAI 2024, [[paper](https://ojs.aaai.org/index.php/AAAI/article/view/29317)] [[code](https://github.com/LeapLabTHU/ExpeL)]
- **Unified Mind Model: Reimagining Autonomous Agents in the LLM Era**, arXiv 2025, [[paper](https://arxiv.org/abs/2503.03459)]
- **Meta-Learning: A Survey**, PAMI 2021, [[paper](https://arxiv.org/abs/1810.03548)]
- **"My agent understands me better": Integrating Dynamic Human-Like Memory Recall and Consolidation in LLM-Based Agents**, CHI 2024, [[paper](https://arxiv.org/abs/2402.02485)] [[code](https://github.com/tamoharu/Agent-Memory-CHI24)]
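The acquisition entries above share one pattern: raw experience is compressed into compact "gists" before it is stored, and consolidation happens when a buffer fills (ReadAgent's gist memory is the clearest example). Below is a toy sketch of that pattern under assumed details; the `summarize` function is a hypothetical LLM call, stubbed here by truncation.

```python
# Toy acquisition loop: raw turns accumulate in a buffer; once the buffer
# exceeds a budget, it is consolidated into a one-line "gist" and the raw
# turns are discarded. `summarize` stubs a hypothetical LLM summarizer.
from dataclasses import dataclass, field

def summarize(turns: list[str]) -> str:
    return "gist: " + " | ".join(t[:20] for t in turns)  # stub for an LLM summary

@dataclass
class GistMemory:
    budget: int = 4                      # max raw turns kept verbatim
    gists: list[str] = field(default_factory=list)
    buffer: list[str] = field(default_factory=list)

    def observe(self, turn: str) -> None:
        self.buffer.append(turn)
        if len(self.buffer) > self.budget:   # acquisition triggers consolidation
            self.gists.append(summarize(self.buffer))
            self.buffer.clear()

    def context(self) -> str:
        # Gists give cheap long-range recall; the buffer keeps recent detail.
        return "\n".join(self.gists + self.buffer)

mem = GistMemory()
for i in range(10):
    mem.observe(f"turn {i}: user said something about topic {i % 3}")
print(mem.context())
```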
#### Encoding
##### Selective Attention
- **AgentCoord: Visually Exploring Coordination Strategy for LLM-Based Multi-Agent Collaboration**, arXiv 2024, [[paper](https://arxiv.org/abs/2404.11943)] [[code](https://github.com/AgentCoord/AgentCoord)]
- **Memory Sharing for Large Language Model Based Agents**, arXiv 2024, [[paper](https://arxiv.org/abs/2404.09982)]
- **Understanding Long Videos via LLM-Powered Entity Relation Graphs**, arXiv 2025, [[paper](https://arxiv.org/abs/2501.15953)]
- **A-MEM: Agentic Memory for LLM Agents**, arXiv 2025, [[paper](https://arxiv.org/abs/2502.12110)] [[code](https://github.com/WujiangXu/AgenticMemory)]
- **Robots Can Multitask Too: Integrating a Memory Architecture and LLMs for Enhanced Cross-Task Robot Action Generation**, Humanoids 2024, [[paper](https://ieeexplore.ieee.org/abstract/document/10769803)]

##### Multi-modal Fusion
- **Optimus-1: Hybrid Multimodal Memory Empowered Agents Excel in Long-Horizon Tasks**, NeurIPS 2024, [[paper](https://arxiv.org/abs/2408.03615)] [[code](https://github.com/JiuTian-VL/Optimus-1)]
- **Optimus-2: Multimodal Minecraft Agent with Goal-Observation-Action Conditioned Policy**, CVPR 2025, [[paper](https://arxiv.org/abs/2502.19902)] [[code](https://github.com/JiuTian-VL/Optimus-2)]
- **JARVIS-1: Multimodal Memory-Augmented Open-World Agent**, NeurIPS 2023 ALOE Workshop, [[paper](https://arxiv.org/abs/2311.05997)] [[code](https://github.com/CraftJarvis/JARVIS-1)]

#### Derivation
##### Reflection
- **Agent S: An Open Agentic Framework that Uses Computers Like a Human**, ICLR 2025 Poster, [[paper](https://arxiv.org/abs/2403.05901)] [[code](https://github.com/simular-ai/Agent-S)]
- **OSCAR: Operating System Control via State-Aware Reasoning and Re-Planning**, ICLR 2025, [[paper](https://arxiv.org/abs/2403.08767)]
- **R2D2: Remembering, Reflecting and Dynamic Decision Making for Web Agents**, ACL 2025, [[paper](https://aclanthology.org/2025.acl-long.1464)]
- **Mobile-Agent-E: Self-Evolving Mobile Assistant for Complex Tasks**, ACL ARR 2025 (submitted), [[paper](https://arxiv.org/abs/2309.15945)] [[code](https://github.com/X-PLUG/MobileAgent/tree/main/Mobile-Agent-E)]

##### Summarization
- **SummEdits: Edit-based Factuality-Oriented Summarization**, EMNLP 2023, [[paper](https://aclanthology.org/2023.emnlp-main.600)] [[code](https://github.com/salesforce/summedits)]
- **SCM: Enhancing Large Language Model with Self-Controlled Memory Framework**, DASFAA 2025, [[paper](https://arxiv.org/abs/2310.04521)] [[code](https://github.com/wbbeyourself/SCM4LLMs)]
- **Healthcare Copilot: Eliciting the Power of General LLMs for Medical Consultation**, arXiv 2024, [[paper](https://arxiv.org/abs/2402.10045)]
- **Recursively Summarizing Enables Long-Term Dialogue Memory in Large Language Models**, Neurocomputing 2025, [[paper](https://www.sciencedirect.com/science/article/abs/pii/S0925231225008653)]

##### Knowledge Distillation
- **KnowAgent: Knowledge-Augmented Planning for LLM-Based Agents**, Findings of NAACL 2025, [[paper](https://arxiv.org/abs/2402.09419)] [[code](https://github.com/KnowAgent/KnowAgent)]
- **AoTD: Enhancing Video-LLM Reasoning via Agent-of-Thoughts Distillation**, CVPR 2025, [[paper](https://arxiv.org/abs/2412.01694)]
- **LDPD: Language-Driven Policy Distillation**, ICLR 2024 LLM-Agents Workshop, [[paper](https://arxiv.org/abs/2404.19008)]
- **Sub-goal Distillation: Bridging Large Language Models and Goal-Conditioned RL for Long-Horizon Tasks**, CoLLAs 2024, [[paper](https://arxiv.org/abs/2407.06720)]
- **MAGDi: Memory-Augmented Generative Debugger**, ICML 2024, [[paper](https://arxiv.org/abs/2406.11424)] [[code](https://github.com/justinchiu/MAGDi)]

##### Selective Forgetting
- **Lyfe Agents: Generative Agents for Low-Cost Real-Time Social Interactions**, arXiv 2023, [[paper](https://arxiv.org/abs/2311.09816)]
- **TiM: Think-in-Memory Language Models**, ICLR 2024 (submitted), [[paper](https://openreview.net/forum?id=TiM24)]
- **MemoryBank: Enhancing Large Language Models with Long-Term Memory**, AAAI 2024, [[paper](https://arxiv.org/abs/2308.08589)] [[code](https://github.com/zhongwanjun/MemoryBank-SiliconFriend)]
- **S³: Social-Network Simulation System with Large Language Model-Empowered Agents**, arXiv 2023, [[paper](https://arxiv.org/abs/2307.14984)] [[code](https://github.com/GA-S3/Social-Simulation)]
- **"My agent understands me better"**: see above

#### Retrieval
##### Indexing
- **HippoRAG: Neurobiologically Inspired Long-Term Memory for Large Language Models**, NeurIPS 2024, [[paper](https://openreview.net/forum?id=Rz1nVVnp4P)] [[project](https://ix.cs.uoregon.edu/~apouranb/hmn/hmn.html)] [[code](https://github.com/OSU-NLP-Group/HippoRAG)]
- **TradingGPT: Multi-Agent System with Layered Memory for Simulated Stock Trading**, arXiv 2023, [[paper](https://arxiv.org/abs/2311.05767)]
- **LongMemEval: Benchmarking Chat Assistants on Long-Term Interactive Memory**, ICLR 2025, [[paper](https://openreview.net/forum?id=go6gKVh6bV)] [[code](https://github.com/xiaowu0162/LongMemEval)]
- **SeCom: Memory Construction and Retrieval for Long-Term Personalized Conversational Agents**, ICLR 2025, [[paper](https://openreview.net/forum?id=5eY3sG8o2k)] [[project](https://aka.ms/SECOM)] [[blog](https://www.microsoft.com/en-us/research/blog/secom-building-retrieval-based-long-term-memory-for-personalized-conversational-agents/)]

##### Matching
- **Large Memory Layers with Product Keys**, NeurIPS 2019, [[paper](https://arxiv.org/abs/1907.05242)] [[code](https://github.com/facebookresearch/XLM)]
- **OSAgent: Copiloting Operating System with LLM-based Agent**, IJCNN 2024, [[paper](https://dblp.org/rec/conf/ijcnn/Xu0C24.html)]
- **Neural Machine Translation by Jointly Learning to Align and Translate**, ICLR 2015, [[paper](https://arxiv.org/abs/1409.0473)]
- **"My agent understands me better"**: see above
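At their core, the indexing and matching entries above reduce to scoring a query against stored memory keys and returning the best-scoring values. A self-contained sketch of that lookup, using a hashed bag-of-words embedding as a stand-in for a learned text encoder (the `MemoryStore` class and its methods are invented for this example):

```python
# Memory matching in miniature: store (key-embedding, value) pairs, then
# retrieve the top-k values whose keys are most similar to the query.
import math
from collections import Counter

DIM = 64

def embed(text: str) -> list[float]:
    # Hashed bag-of-words, L2-normalized: a toy stand-in for a real encoder.
    v = [0.0] * DIM
    for tok, n in Counter(text.lower().split()).items():
        v[hash(tok) % DIM] += n
    norm = math.sqrt(sum(x * x for x in v)) or 1.0
    return [x / norm for x in v]

class MemoryStore:
    def __init__(self):
        self.items: list[tuple[list[float], str]] = []

    def write(self, key: str, value: str) -> None:
        self.items.append((embed(key), value))

    def read(self, query: str, k: int = 2) -> list[str]:
        q = embed(query)
        # Cosine similarity (dot product of unit vectors), highest first.
        scored = sorted(self.items,
                        key=lambda it: -sum(a * b for a, b in zip(q, it[0])))
        return [value for _, value in scored[:k]]

store = MemoryStore()
store.write("user prefers dark mode", "preference: dark mode")
store.write("user's dog is named Rex", "fact: dog named Rex")
print(store.read("what theme does the user like?", k=1))
```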
#### Neural Memory
##### Associative Memory
- **Hopfield Networks is All You Need**, ICLR 2021, [[paper](https://arxiv.org/abs/2008.02217)] [[openreview](https://openreview.net/pdf?id=tL89RnzIiCd)] [[code](https://github.com/ml-jku/hopfield-layers)]
- **Neural Turing Machines for the Remaining Useful Life Estimation Problem**, Computers in Industry 2022, [[paper](https://www.sciencedirect.com/science/article/pii/S0166361522001678)] [[code](https://github.com/aranciokov/NTM-For-RULEstimation)]

##### Parameter Integration
- **MemoryLLM: Towards Self-Updatable Large Language Models**, ICML 2024, [[paper](https://cseweb.ucsd.edu/~jmcauley/reviews/icml24c.pdf)] [[code](https://github.com/wangyu-ustc/MemoryLLM)]
- **SELF-PARAM: Self-Parameterized Retrofitting for Large Language Models**, ICLR 2025, [[paper](https://openreview.net/forum?id=2f1e7xxycZ)] [[code](https://github.com/XinshuangL/SELF-PARAM)]
- **MemoRAG: Boosting Long Context Processing with Global Memory-Enhanced Retrieval Augmentation**, The Web Conference (WWW) 2025, [[paper](https://arxiv.org/abs/2409.05591)] [[code](https://github.com/qhjqhj00/MemoRAG)]
- **Learning to (Learn at Test Time): RNNs with Expressive Hidden States**, ICLR 2025, [[paper](https://openreview.net/forum?id=N0bdUqPjbB)] [[code](https://github.com/test-time-training/ttt-lm-pytorch)]
- **Titans: Learning to Memorize at Test Time**, arXiv 2024, [[paper](https://arxiv.org/abs/2411.08544)] [[unofficial code](https://github.com/lucidrains/titans-pytorch)]
- **R³Mem: A Third-Order Memory for Large Language Models**, ICLR 2025 (to appear), [[paper](https://arxiv.org/abs/2412.06607)]

#### Utilization
##### RAG
- **RAGLAB: Research Platform for Retrieval-Augmented Generation**, EMNLP 2024, [[paper](https://arxiv.org/abs/2407.03005)] [[code](https://github.com/fate-ubw/RAGLAB)]
- **When Not to Trust Language Models: Investigating Effectiveness of Parametric and Non-Parametric Memories**, ACL 2023, [[paper](https://aclanthology.org/2023.acl-long.546.pdf)]
- **Atlas: Few-shot Learning with Retrieval Augmented Language Models**, arXiv 2022, [[paper](https://arxiv.org/abs/2208.03299)] [[code](https://github.com/facebookresearch/atlas)]
- **Personalized Large Language Model Assistant with Evolving Conditional Memory**, COLING 2025, [[paper](https://aclanthology.org/2025.coling-main.254.pdf)]
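The RAG entries above differ in how they retrieve, but the control flow is shared: fetch the most relevant passages, splice them into the prompt, then generate. A minimal sketch of that flow, with `retrieve` and `generate` as hypothetical stand-ins for a vector-index lookup and an LLM call:

```python
# Retrieval-augmented generation reduced to its control flow: retrieve,
# assemble the prompt, then call the model. Both helpers are stubs.
def retrieve(question: str, corpus: list[str], k: int = 3) -> list[str]:
    # Stand-in scoring: rank passages by word overlap with the question.
    q = set(question.lower().split())
    return sorted(corpus, key=lambda p: -len(q & set(p.lower().split())))[:k]

def generate(prompt: str) -> str:
    return f"<LLM answer conditioned on: {prompt[:60]}...>"  # stub

def rag_answer(question: str, corpus: list[str]) -> str:
    passages = retrieve(question, corpus)
    context = "\n".join(f"[{i + 1}] {p}" for i, p in enumerate(passages))
    prompt = (
        "Answer using only the passages below; cite them by number.\n"
        f"{context}\n\nQuestion: {question}\nAnswer:"
    )
    return generate(prompt)

corpus = [
    "HippoRAG indexes passages with a knowledge graph.",
    "Atlas retrieves documents for few-shot tasks.",
    "Unrelated passage about cooking pasta.",
]
print(rag_answer("Which system uses a knowledge graph index?", corpus))
```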
##### Long-context Modeling
- **Recurrent Memory Transformer**, NeurIPS 2022, [[paper](https://proceedings.neurips.cc/paper_files/paper/2022/file/47e288629a6996a17ce50b90a056a0e1-Paper-Conference.pdf)]
- **Scaling Transformer to 1M tokens and beyond with RMT**, arXiv 2023, [[paper](https://arxiv.org/pdf/2304.11062)]
- **Adapting Language Models to Compress Contexts**, EMNLP 2023, [[paper](https://aclanthology.org/2023.emnlp-main.232.pdf)]
- **In-context Autoencoder for Context Compression in a Large Language Model**, ICLR 2024, [[paper](https://openreview.net/pdf?id=uREj4ZuGJE)]
- **Learning to Compress Prompts with Gist Tokens**, NeurIPS 2023, [[paper](https://papers.nips.cc/paper_files/paper/2023/file/3d77c6dcc7f143aa2154e7f4d5e22d68-Paper-Conference.pdf)]
- **CompAct: Compressing Retrieved Documents Actively for Question Answering**, EMNLP 2024, [[paper](https://aclanthology.org/2024.emnlp-main.1194.pdf)]

##### Alleviating Hallucination
- **Banishing LLM Hallucinations Requires Rethinking Generalization**, arXiv 2024, [[paper](https://arxiv.org/pdf/2406.17642)]
- **Memoria: Resolving Fateful Forgetting Problem through Human-Inspired Memory Architecture**, ICML 2024, [[paper](https://openreview.net/pdf?id=yTz0u4B8ug)]
- **Mixture of A Million Experts**, arXiv 2024, [[paper](https://arxiv.org/pdf/2407.04153)]
- **Retrieve Only When It Needs: Adaptive Retrieval Augmentation for Hallucination Mitigation in Large Language Models**, arXiv 2024, [[paper](https://arxiv.org/pdf/2402.10612)]

## Perception

<div style="display: flex; justify-content: space-between;">
    <img src="https://oss.gittoolsai.com/images/FoundationAgents_awesome-foundation-agents_readme_fe396a8773a4.png" alt="Perception System" width="100%">
</div>

### Unimodal Models

#### Text
- **BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding**, 2018, [[paper](https://arxiv.org/abs/1810.04805)] [[code](https://github.com/google-research/bert)]
- **RoBERTa: A Robustly Optimized BERT Pretraining Approach**, 2019, [[paper](https://arxiv.org/abs/1907.11692)] [[code](https://github.com/facebookresearch/fairseq)]
- **ALBERT: A Lite BERT for Self-supervised Learning of Language Representations**, 2019, [[paper](https://arxiv.org/abs/1909.11942)] [[code](https://github.com/google-research/ALBERT)]

#### Image
- **Deep Residual Learning for Image Recognition**, CVPR 2016, [[paper](https://arxiv.org/abs/1512.03385)] [[code](https://github.com/KaimingHe/deep-residual-networks)]
- **End-to-End Object Detection with Transformers**, 2020, [[paper](https://arxiv.org/abs/2005.12872)] [[code](https://github.com/facebookresearch/detr)]
- **Grounding DINO 1.5: Advance the "Edge" of Open-Set Object Detection**, 2024, [[paper](https://arxiv.org/abs/2405.10300)] [[code](https://github.com/IDEA-Research/Grounding-DINO-1.5-API)]

#### Video
- **ViViT: A Video Vision Transformer**, 2021, [[paper](https://arxiv.org/abs/2103.15691)] [[code](https://github.com/google-research/scenic/tree/main/scenic/projects/vivit)]
- **VideoMAE: Masked Autoencoders are Data-Efficient Learners for Self-Supervised Video Pre-Training**, 2022, [[paper](https://arxiv.org/abs/2203.12602)] [[code](https://github.com/MCG-NJU/VideoMAE)]

#### Audio
- **FastSpeech 2: Fast and High-Quality End-to-End Text to Speech**, 2020, [[paper](https://arxiv.org/abs/2006.04558)] [[project](https://speechresearch.github.io/fastspeech2)]
- **Seamless: Multilingual Expressive and Streaming Speech Translation**, 2023, [[paper](https://arxiv.org/abs/2312.05187)] [[code](https://github.com/facebookresearch/seamless_communication)]
- **wav2vec 2.0: A Framework for Self-Supervised Learning of Speech Representations**, 2020, [[paper](https://arxiv.org/abs/2006.11477)] [[code](https://github.com/facebookresearch/fairseq/tree/main/examples/wav2vec)]

#### Other Unimodal
- **Visual ChatGPT: Talking, Drawing and Editing with Visual Foundation Models**, 2023, [[paper](https://arxiv.org/abs/2303.04671)] [[code](https://github.com/chenfei-wu/TaskMatrix)]
- **HuggingGPT: Solving AI Tasks with ChatGPT and its Friends in Hugging Face**, NeurIPS 2023, [[paper](https://arxiv.org/abs/2303.17580)] [[code](https://github.com/microsoft/JARVIS)]
- **MM-REACT: Prompting ChatGPT for Multimodal Reasoning and Action**, 2023, [[paper](https://arxiv.org/abs/2303.11381)] [[code](https://github.com/microsoft/MM-REACT)]
- **ViperGPT: Visual Inference via Python Execution for Reasoning**, 2023, [[paper](https://arxiv.org/abs/2303.08128)] [[code](https://github.com/cvlab-columbia/viper)]
- **AudioGPT: Understanding and Generating Speech, Music, Sound, and Talking Head**, 2024, [[paper](https://arxiv.org/abs/2304.12995)] [[code](https://github.com/AIGC-Audio/AudioGPT)]
- **LLaVA-Plus: Learning to Use Tools for Creating Multimodal Agents**, 2023, [[paper](https://arxiv.org/abs/2311.05437)] [[code](https://github.com/LLaVA-VL/LLaVA-Plus-Codebase)]

### Cross-modal Models

#### Text-Image
- **Learning Transferable Visual Models From Natural Language Supervision**, 2021, [[paper](https://arxiv.org/abs/2103.00020)] [[code](https://github.com/OpenAI/CLIP)]
- **Scaling Up Visual and Vision-Language Representation Learning With Noisy Text Supervision**, 2021, [[paper](https://arxiv.org/abs/2102.05918)]
- **Improving Image Generation with Better Captions**, 2023, [[paper](https://cdn.openai.com/papers/dall-e-3.pdf)]
- **VisualBERT: A Simple and Performant Baseline for Vision and Language**, 2019, [[paper](https://arxiv.org/abs/1908.03557)] [[code](https://github.com/uclanlp/visualbert)]
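CLIP and ALIGN, listed above, are trained with a symmetric contrastive objective: embed a batch of images and captions, score all pairs, and push each image toward its own caption and away from the others. A minimal PyTorch sketch of that loss, with random linear projections standing in for the real encoder towers:

```python
# The CLIP-style contrastive (InfoNCE) objective in miniature: an (N, N)
# cosine-similarity matrix whose diagonal holds the true image-caption pairs.
import torch
import torch.nn.functional as F

def clip_style_loss(img_emb: torch.Tensor, txt_emb: torch.Tensor,
                    temperature: float = 0.07) -> torch.Tensor:
    img = F.normalize(img_emb, dim=-1)        # unit-norm rows
    txt = F.normalize(txt_emb, dim=-1)
    logits = img @ txt.t() / temperature      # (N, N) similarity matrix
    targets = torch.arange(len(img))          # diagonal pairs are positives
    # Symmetric: classify the right caption per image, and vice versa.
    return (F.cross_entropy(logits, targets) +
            F.cross_entropy(logits.t(), targets)) / 2

img_encoder = torch.nn.Linear(2048, 512)  # stand-ins for the vision/text towers
txt_encoder = torch.nn.Linear(768, 512)
loss = clip_style_loss(img_encoder(torch.randn(8, 2048)),
                       txt_encoder(torch.randn(8, 768)))
loss.backward()                            # gradients flow into both towers
print(float(loss))
```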
#### Text-Video
- **VideoCLIP: Contrastive Pre-training for Zero-shot Video-Text Understanding**, 2021, [[paper](https://arxiv.org/abs/2109.14084)] [[code](https://github.com/facebookresearch/fairseq/tree/main/examples/MMPT)]
- **Phenaki: Variable Length Video Generation From Open Domain Textual Description**, 2022, [[paper](https://arxiv.org/abs/2210.02399)] [[project](https://phenaki.github.io/)]
- **Make-A-Video: Text-to-Video Generation without Text-Video Data**, 2022, [[paper](https://arxiv.org/abs/2209.14792)] [[project](https://make-a-video.github.io/)]

#### Text-Audio
- **Wav2CLIP: Learning Robust Audio Representations From CLIP**, 2022, [[paper](https://arxiv.org/abs/2110.11499)] [[code](https://github.com/descriptinc/lyrebird-wav2clip)]
- **VATT: Transformers for Multimodal Self-Supervised Learning from Raw Video, Audio and Text**, 2021, [[paper](https://arxiv.org/abs/2104.11178)] [[code](https://github.com/google-research/google-research/tree/master/vatt)]
- **AudioCLIP: Extending CLIP to Image, Text and Audio**, 2022, [[paper](https://arxiv.org/abs/2106.13043)] [[code](https://github.com/AndreyGuzhov/AudioCLIP)]

#### Other Cross-modal
- **CLIP-Forge: Towards Zero-Shot Text-to-Shape Generation**, 2022, [[paper](https://arxiv.org/abs/2110.02624)] [[code](https://github.com/AutodeskAILab/Clip-Forge)]
- **Point-E: A System for Generating 3D Point Clouds from Complex Prompts**, 2022, [[paper](https://arxiv.org/abs/2212.08751)] [[code](https://github.com/openai/point-e)]

### MultiModal Models

#### VLM (Vision-Language Models)
- **MiniGPT-v2: large language model as a unified interface for vision-language multi-task learning**, 2023, [[paper](https://arxiv.org/abs/2310.09478)] [[code](https://github.com/Vision-CAIR/MiniGPT-4)]
- **LLaVA-NeXT: Improved reasoning, OCR, and world knowledge**, 2024, [[blog](https://llava-vl.github.io/blog/2024-01-30-llava-next)] [[code](https://github.com/LLaVA-VL/LLaVA-NeXT)]
- **CogVLM2: Visual Language Models for Image and Video Understanding**, 2024, [[paper](https://arxiv.org/abs/2408.16500)] [[code](https://github.com/THUDM/CogVLM2)]
- **Qwen2-VL: Enhancing Vision-Language Model's Perception of the World at Any Resolution**, 2024, [[paper](https://arxiv.org/abs/2409.12191)] [[code](https://github.com/QwenLM/Qwen2.5-VL)]
- **Generative Multimodal Models are In-Context Learners**, 2024, [[paper](https://arxiv.org/abs/2312.13286)] [[code](https://github.com/baaivision/Emu)]

##### Edge-Side VLM
- **TinyGPT-V: Efficient Multimodal Large Language Model via Small Backbones**, 2023, [[paper](https://arxiv.org/abs/2312.16862)] [[code](https://github.com/DLYuanGod/TinyGPT-V)]
- **MobileVLM: A Fast, Strong and Open Vision Language Assistant for Mobile Devices**, 2023, [[paper](https://arxiv.org/abs/2312.16886)] [[code](https://github.com/Meituan-AutoML/MobileVLM)]
- **MiniCPM-V: A GPT-4V Level MLLM on Your Phone**, 2024, [[paper](https://arxiv.org/abs/2408.01800)] [[code](https://github.com/OpenBMB/MiniCPM-V)]
- **OmniParser for Pure Vision Based GUI Agent**, 2024, [[paper](https://arxiv.org/abs/2408.00203)] [[code](https://github.com/microsoft/OmniParser)]

#### VLA (Vision-Language-Action Models)
- **CLIPort: What and Where Pathways for Robotic Manipulation**, 2022, [[paper](https://arxiv.org/abs/2109.12098)] [[code](https://github.com/cliport/cliport)]
- **RT-1: Robotics Transformer for Real-World Control at Scale**, 2022, [[paper](https://arxiv.org/abs/2212.06817)] [[code](https://github.com/google-research/robotics_transformer)]
- **Open-World Object Manipulation using Pre-trained Vision-Language Models**, 2023, [[paper](https://arxiv.org/abs/2303.00905)] [[project](https://robot-moo.github.io/)]
- **Perceiver-Actor: A Multi-Task Transformer for Robotic Manipulation**, 2023, [[paper](https://arxiv.org/abs/2209.05451)] [[code](https://github.com/peract/peract)]
- **Diffusion Policy: Visuomotor Policy Learning via Action Diffusion**, 2023, [[paper](https://arxiv.org/abs/2303.04137)] [[code](https://github.com/real-stanford/diffusion_policy)]
- **PaLM-E: An Embodied Multimodal Language Model**, 2023, [[paper](https://arxiv.org/abs/2303.03378)] [[project](https://palm-e.github.io/)]
- **MultiPLY: A Multisensory Object-Centric Embodied Large Language Model in 3D World**, 2024, [[paper](https://arxiv.org/abs/2401.08577)] [[code](https://github.com/eth-ait/MultiPly)]

#### ALM (Audio-Language Models)
- **Audio Flamingo: A Novel Audio Language Model with Few-Shot Learning and Dialogue Abilities**, 2024, [[paper](https://arxiv.org/abs/2402.01831)] [[project](https://audioflamingo.github.io/)]
- **SpeechVerse: A Large-scale Generalizable Audio Language Model**, 2024, [[paper](https://arxiv.org/abs/2405.08295)]
- **UniAudio 1.5: Large Language Model-driven Audio Codec is A Few-shot Audio Task Learner**, 2024, [[paper](https://arxiv.org/abs/2406.10056)] [[code](https://github.com/yangdongchao/LLM-Codec)]
- **Qwen2-Audio Technical Report**, 2024, [[paper](https://arxiv.org/abs/2407.10759)] [[code](https://github.com/QwenLM/Qwen2-Audio)]
- **AudioLM: a Language Modeling Approach to Audio Generation**, 2022, [[paper](https://arxiv.org/abs/2209.03143)] [[project](https://google-research.github.io/seanet/audiolm/examples)]
- **Mini-Omni: Language Models Can Hear, Talk While Thinking in Streaming**, 2024, [[paper](https://arxiv.org/abs/2408.16725)] [[code](https://github.com/gpt-omni/mini-omni)]
- **SpeechGPT: Empowering Large Language Models with Intrinsic Cross-Modal Conversational Abilities**, 2023, [[paper](https://arxiv.org/abs/2305.11000)] [[code](https://github.com/0nutation/SpeechGPT)]

#### AVLM (Audio-Visual-Language Models)
- **ONE-PEACE: Exploring One General Representation Model Toward Unlimited Modalities**, 2023, [[paper](https://arxiv.org/abs/2305.11172)] [[code](https://github.com/OFA-Sys/ONE-PEACE)]
- **PandaGPT: One Model To Instruction-Follow Them All**, 2023, [[paper](https://arxiv.org/abs/2305.16355)] [[code](https://github.com/yxuansu/PandaGPT)]
- **Macaw-LLM: Multi-Modal Language Modeling with Image, Audio, Video, and Text Integration**, 2023, [[paper](https://arxiv.org/abs/2306.09093)] [[code](https://github.com/lyuchenyang/Macaw-LLM)]
- **LanguageBind: Extending Video-Language Pretraining to N-modality by Language-based Semantic Alignment**, 2023, [[paper](https://arxiv.org/abs/2310.01852)] [[code](https://github.com/PKU-YuanGroup/LanguageBind)]
- **UnIVAL: Unified Model for Image, Video, Audio and Language Tasks**, 2023, [[paper](https://arxiv.org/abs/2307.16184)] [[code](https://github.com/mshukor/UnIVAL)]
- **X-LLM: Bootstrapping Advanced Large Language Models by Treating Multi-Modalities as Foreign Languages**, 2023, [[paper](https://arxiv.org/abs/2305.04160)] [[code](https://github.com/phellonchen/X-LLM)]

#### Other MultiModal
- **PointLLM: Empowering Large Language Models to Understand Point Clouds**, 2023, [[paper](https://arxiv.org/abs/2308.16911)] [[code](https://github.com/OpenRobotLab/PointLLM)]
- **MiniGPT-3D: Efficiently Aligning 3D Point Clouds with Large Language Models using 2D Priors**, 2024, [[paper](https://arxiv.org/abs/2405.01413)] [[code](https://github.com/TangYuan96/MiniGPT-3D)]
- **NExT-GPT: Any-to-Any Multimodal LLM**, 2023, [[paper](https://arxiv.org/abs/2309.05519)] [[code](https://github.com/NExT-GPT/NExT-GPT)]
- **Unified-IO 2: Scaling Autoregressive Multimodal Models with Vision, Language, Audio, and Action**, 2024, [[paper](https://arxiv.org/abs/2312.17172)] [[code](https://github.com/allenai/unified-io-2)]
- **CoDi-2: In-Context, Interleaved, and Interactive Any-to-Any Generation**, 2024, [[paper](https://arxiv.org/abs/2311.18775)] [[code](https://github.com/microsoft/i-Code/tree/main/CoDi-2)]
- **ModaVerse: Efficiently Transforming Modalities with LLMs**, 2024, [[paper](https://arxiv.org/abs/2401.06395)] [[code](https://github.com/xinke-wang/ModaVerse)]

## World Model

<div style="display: flex; justify-content: space-between;">
    <img src="https://oss.gittoolsai.com/images/FoundationAgents_awesome-foundation-agents_readme_f4b9199bfc99.png" alt="World Model in Foundation Agents" width="100%">
</div>

### External Approaches
- **DINO-WM: Video World Models on Pre-trained Visual Features Enable Zero-Shot Planning**, arXiv 2024, [[paper](https://arxiv.org/abs/2411.04983)]
- **SAPIEN: A Simulated Part-based Interactive Environment**, CVPR 2020, [[paper](https://arxiv.org/abs/2003.08515)]
- **MuZero: Mastering Atari, Go, Chess and Shogi by Planning with a Learned Model**, Nature 2020, [[paper](https://www.nature.com/articles/s41586-020-03051-4)]
- **GR-2: A Generative Video-Language-Action Model with Web-Scale Knowledge for Robot Manipulation**, arXiv 2024, [[paper](https://arxiv.org/abs/2410.06158)]
- **COAT: Discovery of the Hidden World with Large Language Models**, arXiv 2024, [[paper](https://arxiv.org/abs/2402.03941)]
- **AutoManual: Generating Instruction Manuals by LLM Agents via Interactive Environmental Learning**, arXiv 2024, [[paper](https://arxiv.org/abs/2405.16247)]
- **PILCO: A Model-Based and Data-Efficient Approach to Policy Search**, ICML 2011, [[paper](https://proceedings.mlr.press/v17/deisenroth12a.html)]

### Internal Approaches
- **ActRe: ReAct meets ActRe: Autonomous Annotations of Agent Trajectories for Contrastive Self-Training**, arXiv 2024, [[paper](https://arxiv.org/abs/2403.14589)]
- **World Models**, NeurIPS 2018, [[paper](https://arxiv.org/abs/1803.10122)]
- **Dreamer: Dream to Control: Learning Behaviors by Latent Imagination**, ICLR 2020, [[paper](https://arxiv.org/abs/1912.01603)]
- **Diffusion WM: Diffusion for World Modeling: Visual Details Matter in Atari**, arXiv 2024, [[paper](https://arxiv.org/abs/2405.12399)]
- **GQN: Neural Scene Representation and Rendering**, Science 2018, [[paper](https://arxiv.org/abs/1807.07422)]
- **Daydreamer: World Models for Physical Robot Learning**, CoRL 2023, [[paper](https://arxiv.org/abs/2206.14176)]
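What the internal approaches above share is the Dreamer-style recipe: encode observations into a latent state, learn a dynamics model in that latent space, and plan by imagining rollouts instead of stepping the real environment. A toy sketch of the inference-time side of that recipe follows; the networks are untrained and all shapes are illustrative, so this is a sketch of the idea, not any paper's implementation.

```python
# A latent world model in miniature: encode an observation, step a learned
# dynamics model under candidate action sequences, and score each imagined
# rollout with a reward head. Untrained toy networks; illustrative only.
import torch
import torch.nn as nn

OBS, LATENT, ACT = 16, 8, 2

encoder = nn.Linear(OBS, LATENT)              # obs -> latent state
dynamics = nn.Linear(LATENT + ACT, LATENT)    # (state, action) -> next state
reward_head = nn.Linear(LATENT, 1)            # state -> predicted reward

def imagine_return(obs: torch.Tensor, actions: torch.Tensor) -> torch.Tensor:
    """Sum of predicted rewards along an imagined rollout (no env steps)."""
    state = torch.tanh(encoder(obs))
    total = torch.zeros(())
    for a in actions:                          # actions: (horizon, ACT)
        state = torch.tanh(dynamics(torch.cat([state, a])))
        total = total + reward_head(state).squeeze()
    return total

obs = torch.randn(OBS)
candidates = [torch.randn(5, ACT) for _ in range(4)]   # 4 plans, horizon 5
best = max(candidates, key=lambda plan: float(imagine_return(obs, plan)))
print("chosen first action:", best[0])
```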
#### Code\n\n- **MetaGPT: Meta Programming for A Multi-Agent Collaborative Framework**, ICLR 2024, [[paper](https:\u002F\u002Farxiv.org\u002Fabs\u002F2308.00352)] [[code](https:\u002F\u002Fgithub.com\u002Fgeekan\u002FMetaGPT)]\n\n- **ChatDev: Communicative Agents for Software Development**, ACL 2024, [[paper](https:\u002F\u002Farxiv.org\u002Fabs\u002F2307.07924)] [[code](https:\u002F\u002Fgithub.com\u002FOpenBMB\u002FChatDev)]\n\n- **SWE-agent: Agent-Computer Interfaces Enable Automated Software Engineering**, NeurIPS 2024, [[paper](https:\u002F\u002Farxiv.org\u002Fabs\u002F2405.15793)] [[code](https:\u002F\u002Fgithub.com\u002FSWE-agent\u002FSWE-agent)]\n\n- **OpenHands: An Open Platform for AI Software Developers as Generalist Agents**, arXiv 2024, [[paper](https:\u002F\u002Farxiv.org\u002Fabs\u002F2407.16741)] [[code](https:\u002F\u002Fgithub.com\u002FAll-Hands-AI\u002FOpenHands)]\n\n#### Chat\n\n- **Generative Agents: Interactive Simulacra of Human Behavior**, UIST 2023, [[paper](https:\u002F\u002Farxiv.org\u002Fabs\u002F2304.03442)] [[code](https:\u002F\u002Fgithub.com\u002Fjoonspk-research\u002Fgenerative_agents)]\n\n- **AutoGen: Enabling Next-Gen LLM Applications via Multi-Agent Conversation**, COLM 2024, [[paper](https:\u002F\u002Farxiv.org\u002Fabs\u002F2308.08155)] [[code](https:\u002F\u002Fgithub.com\u002Fmicrosoft\u002Fautogen)]\n\n### Digital\n\n#### Game\n\n- **MineDojo: Building Open-Ended Embodied Agents with Internet-Scale Knowledge**, NeurIPS 2022, [[paper](https:\u002F\u002Farxiv.org\u002Fabs\u002F2206.08853)] [[code](https:\u002F\u002Fgithub.com\u002FMineDojo\u002FMineDojo)]\n\n- **Voyager: An Open-Ended Embodied Agent with Large Language Models**, TMLR 2024, [[paper](https:\u002F\u002Farxiv.org\u002Fabs\u002F2305.16291)] [[code](https:\u002F\u002Fgithub.com\u002FMineDojo\u002FVoyager)]\n\n- **SwarmBrain: Embodied agent for real-time strategy game StarCraft II via large language models**, arXiv 2024, [[paper](https:\u002F\u002Farxiv.org\u002Fabs\u002F2401.17749)] [[code](https:\u002F\u002Fgithub.com\u002Framsayxiaoshao\u002FSwarmBrain)]\n\n- **JARVIS-1: Open-World Multi-task Agents with Memory-Augmented Multimodal Language Models**, arXiv 2023, [[paper](https:\u002F\u002Farxiv.org\u002Fabs\u002F2311.05997)] [[code](https:\u002F\u002Fgithub.com\u002FCraftJarvis\u002FJARVIS-1)]\n\n#### Multimodal\n\n- **MM-REACT: Prompting ChatGPT for Multimodal Reasoning and Action**, arXiv 2023, [[paper](https:\u002F\u002Farxiv.org\u002Fabs\u002F2303.11381)] [[code](https:\u002F\u002Fgithub.com\u002Fmicrosoft\u002FMM-REACT)]\n\n- **ViperGPT: Visual Inference via Python Execution for Reasoning**, ICCV 2023, [[paper](https:\u002F\u002Farxiv.org\u002Fabs\u002F2303.08128)] [[code](https:\u002F\u002Fgithub.com\u002Fcvlab-columbia\u002Fviper)]\n\n- **Visual ChatGPT: Talking, Drawing and Editing with Visual Foundation Models**, arXiv 2023, [[paper](https:\u002F\u002Farxiv.org\u002Fabs\u002F2303.04671)] [[code](https:\u002F\u002Fgithub.com\u002Fhackiey\u002Fvisual-chatgpt)]\n\n- **HuggingGPT: Solving AI Tasks with ChatGPT and its Friends in Hugging Face**, NeurIPS 2023, [[paper](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2303.17580)] [[code](https:\u002F\u002Fgithub.com\u002Fmicrosoft\u002FJARVIS)]\n\n#### Web\n\n- **WebGPT: Browser-assisted question-answering with human feedback**, arXiv 2021, [[paper](https:\u002F\u002Farxiv.org\u002Fabs\u002F2112.09332)] [[blog](https:\u002F\u002Fopenai.com\u002Findex\u002Fwebgpt\u002F)]\n\n- **WebShop: Towards Scalable Real-World Web Interaction with Grounded Language Agents**, NeurIPS 2022, [[paper](https:\u002F\u002Farxiv.org\u002Fabs\u002F2207.01206)] [[code](https:\u002F\u002Fgithub.com\u002Fprinceton-nlp\u002FWebShop)]\n\n- **A Real-World WebAgent with Planning, Long Context Understanding, and Program Synthesis**, ICLR 2024, [[paper](https:\u002F\u002Farxiv.org\u002Fabs\u002F2307.12856)]\n\n- **Mind2Web: Towards a Generalist Agent for the Web**, NeurIPS 2023, [[paper](https:\u002F\u002Farxiv.org\u002Fabs\u002F2306.06070)] [[code](https:\u002F\u002Fgithub.com\u002FOSU-NLP-Group\u002FMind2Web)]\n\n#### GUI\n\n- **Mobile-Agent: Autonomous Multi-Modal Mobile Device Agent with Visual Perception**, arXiv 2024, [[paper](https:\u002F\u002Farxiv.org\u002Fabs\u002F2401.16158)] [[code](https:\u002F\u002Fgithub.com\u002FX-PLUG\u002FMobileAgent)]\n\n- **AppAgent: Multimodal Agents as Smartphone Users**, arXiv 2023, [[paper](https:\u002F\u002Farxiv.org\u002Fabs\u002F2312.13771)] [[code](https:\u002F\u002Fgithub.com\u002FTencentQQGYLab\u002FAppAgent)]\n\n- **UFO: A UI-Focused Agent for Windows OS Interaction**, arXiv 2024, [[paper](https:\u002F\u002Farxiv.org\u002Fabs\u002F2402.07939)] [[code](https:\u002F\u002Fgithub.com\u002Fmicrosoft\u002FUFO)]\n\n- **OmniParser for Pure Vision Based GUI Agent**, arXiv 2024, [[paper](https:\u002F\u002Farxiv.org\u002Fabs\u002F2408.00203)] [[code](https:\u002F\u002Fgithub.com\u002Fmicrosoft\u002FOmniParser)]\n\n#### DB & KG\n\n- **A Survey of NL2SQL with Large Language Models: Where are we, and where are we going?**, arXiv 2024, [[paper](https:\u002F\u002Farxiv.org\u002Fabs\u002F2408.05109)] [[Handbook](https:\u002F\u002Fgithub.com\u002FHKUSTDial\u002FNL2SQL_Handbook)]\n\n- **Alpha-SQL: Zero-Shot Text-to-SQL using Monte Carlo Tree Search**, ICML 2025, 
[[paper](https:\u002F\u002Farxiv.org\u002Fabs\u002F2502.17248)]\n\n- **NL2SQL-Bugs: A Benchmark for Detecting Semantic Errors in NL2SQL Translation**, SIGKDD 2025, [[paper](https:\u002F\u002Farxiv.org\u002Fabs\u002F2503.11984)] [[code](https:\u002F\u002Fnl2sql-bugs.github.io\u002F)]\n\n- **EllieSQL: Cost-Efficient Text-to-SQL with Complexity-Aware Routing**, arXiv 2025, [[paper](https:\u002F\u002Farxiv.org\u002Fabs\u002F2503.22402)] [[code](https:\u002F\u002Felliesql.github.io\u002F)]\n\n- **nvBench 2.0: A Benchmark for Natural Language to Visualization under Ambiguity**, arXiv 2025, [[paper](https:\u002F\u002Farxiv.org\u002Fabs\u002F2503.12880)] [[code](https:\u002F\u002Fnvbench2.github.io\u002F)]\n\n- **The Dawn of Natural Language to SQL: Are We Fully Ready?**, VLDB 2024, [[paper](https:\u002F\u002Farxiv.org\u002Fabs\u002F2406.01265)] [[code](https:\u002F\u002Fnl2sql360.github.io\u002F)]\n\n- **Are Large Language Models Good Statisticians?**, NeurIPS 2024, [[paper](https:\u002F\u002Farxiv.org\u002Fabs\u002F2406.07815)] [[code](https:\u002F\u002Fstatqa.github.io\u002F)]\n\n- **UnifiedSKG: Unifying and Multi-Tasking Structured Knowledge Grounding with Text-to-Text Language Models**, EMNLP 2022, [[paper](https:\u002F\u002Farxiv.org\u002Fabs\u002F2201.05966)] [[code](https:\u002F\u002Fgithub.com\u002Fxlang-ai\u002FUnifiedSKG)]\n\n- **Don't Generate, Discriminate: A Proposal for Grounding Language Models to Real-World Environments**, ACL 2023, [[paper](https:\u002F\u002Farxiv.org\u002Fabs\u002F2212.09736)] [[code](https:\u002F\u002Fgithub.com\u002Fdki-lab\u002FPangu)]\n\n- **Can LLM Already Serve as A Database Interface? A BIg Bench for Large-Scale Database Grounded Text-to-SQLs**, NeurIPS 2023, [[paper](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2305.03111)] [[project](https:\u002F\u002Fbird-bench.github.io\u002F)]\n\n- **Spider 2.0: Evaluating language models on real-world enterprise text-to-SQL workflows**, ICLR 2025, [[paper](https:\u002F\u002Farxiv.org\u002Fabs\u002F2411.07763)] [[code](https:\u002F\u002Fgithub.com\u002Fxlang-ai\u002FSpider2)]\n\n- **Middleware for LLMs: Tools are instrumental for language agents in complex environments**, EMNLP 2024, [[paper](https:\u002F\u002Farxiv.org\u002Fabs\u002F2402.14672)] [[code](https:\u002F\u002Fgithub.com\u002FOSU-NLP-Group\u002FMiddleware)]\n\n
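*Aside (illustrative):* most NL2SQL systems above share one contract: serialize the schema into the prompt, constrain the model to emit a single SQL statement, and validate it by execution. A minimal sketch against SQLite; `call_llm` is a hypothetical model client, and routing, self-repair, and benchmark-grade evaluation are exactly what the papers above add on top:\n\n```python\nimport sqlite3\n\ndef call_llm(prompt: str) -> str:\n    raise NotImplementedError('plug in any model client here')\n\ndef nl2sql(question: str, db_path: str):\n    conn = sqlite3.connect(db_path)\n    try:\n        # Serialize the schema so the model can ground table and column names.\n        schema = '; '.join(row[0] for row in conn.execute(\n            \"SELECT sql FROM sqlite_master WHERE type='table'\"))\n        sql = call_llm(\n            f'Schema: {schema} Question: {question} '\n            'Answer with one SQLite SELECT statement and nothing else.')\n        return conn.execute(sql).fetchall()  # execution doubles as a validity check\n    finally:\n        conn.close()\n```\n\n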
### Physical\n\n- **RT-1: Robotics Transformer for Real-World Control at Scale**, RSS 2023, [[paper](https:\u002F\u002Farxiv.org\u002Fabs\u002F2212.06817)] [[project](https:\u002F\u002Frobotics-transformer1.github.io\u002F)]\n\n- **RT-2: Vision-Language-Action Models Transfer Web Knowledge to Robotic Control**, CoRL 2023, [[paper](https:\u002F\u002Farxiv.org\u002Fabs\u002F2307.15818)] [[project](https:\u002F\u002Frobotics-transformer2.github.io\u002F)]\n\n- **Open X-Embodiment: Robotic Learning Datasets and RT-X Models**, arXiv 2023, [[paper](https:\u002F\u002Farxiv.org\u002Fabs\u002F2310.08864v4)] [[project](https:\u002F\u002Frobotics-transformer-x.github.io\u002F)]\n\n- **GR-2: A Generative Video-Language-Action Model with Web-Scale Knowledge for Robot Manipulation**, arXiv 2024, [[paper](https:\u002F\u002Farxiv.org\u002Fabs\u002F2410.06158)] [[project](https:\u002F\u002Fgr2-manipulation.github.io\u002F)]\n\n- **π0: A vision-language-action flow model for general robot control**, arXiv 2024, [[paper](https:\u002F\u002Farxiv.org\u002Fabs\u002F2410.24164)]\n\n- **Do as I can, not as I say: Grounding language in robotic affordances**, CoRL 2022, [[paper](https:\u002F\u002Farxiv.org\u002Fabs\u002F2204.01691)] [[project](https:\u002F\u002Fsay-can.github.io\u002F)]\n\n- **VoxPoser: Composable 3D value maps for robotic manipulation with language models**, CoRL 2023, [[paper](https:\u002F\u002Farxiv.org\u002Fabs\u002F2307.05973)] [[code](https:\u002F\u002Fgithub.com\u002Fhuangwl18\u002FVoxPoser)]\n\n- **EmbodiedGPT: Vision-language pre-training via embodied chain of thought**, NeurIPS 2023, [[paper](https:\u002F\u002Farxiv.org\u002Fabs\u002F2305.15021)] [[project](https:\u002F\u002Fembodiedgpt.github.io\u002F)]\n\n### Learning\n\n### ICL (In-Context Learning)\n\n#### Prompt\n\n- **CoT: Chain-of-Thought Prompting Elicits Reasoning in Large Language Models**, NeurIPS 2022, [[paper](https:\u002F\u002Farxiv.org\u002Fabs\u002F2201.11903)]\n\n- **ReAct: Synergizing reasoning and acting in language models**, arXiv 2022, [[paper](https:\u002F\u002Farxiv.org\u002Fabs\u002F2210.03629)] [[project](https:\u002F\u002Freact-lm.github.io\u002F)]\n\n- **Auto-CoT: Automatic Chain of Thought Prompting in Large Language Models**, ICLR 2023, [[paper](https:\u002F\u002Farxiv.org\u002Fabs\u002F2210.03493)] [[code](https:\u002F\u002Fgithub.com\u002Famazon-science\u002Fauto-cot)]\n\n- **ToT: Tree of Thoughts: Deliberate Problem Solving with Large Language Models**, NeurIPS 2023, [[paper](https:\u002F\u002Farxiv.org\u002Fabs\u002F2305.10601)] [[code](https:\u002F\u002Fgithub.com\u002Fprinceton-nlp\u002Ftree-of-thought-llm)]\n\n- **GoT: Graph of Thoughts: Solving Elaborate Problems with Large Language Models**, AAAI 2024, [[paper](https:\u002F\u002Farxiv.org\u002Fabs\u002F2308.09687)] [[code](https:\u002F\u002Fgithub.com\u002Fspcl\u002Fgraph-of-thoughts)]\n\n- **LearnAct: Empowering Large Language Model Agents through Action Learning**, arXiv 2024, [[paper](https:\u002F\u002Farxiv.org\u002Fabs\u002F2402.15809)] [[code](https:\u002F\u002Fgithub.com\u002Fzhao-ht\u002FLearnAct)]\n\n- **CoA: Improving Multi-Agent Debate with Sparse Communication Topology**, arXiv 2024, [[paper](https:\u002F\u002Farxiv.org\u002Fabs\u002F2406.11776)]\n\n#### Decompose\n\n- **Least-to-Most Prompting Enables Complex Reasoning in Large Language Models**, ICLR 2023, [[paper](https:\u002F\u002Farxiv.org\u002Fabs\u002F2205.10625)]\n\n- **HuggingGPT: Solving AI tasks with ChatGPT and its friends in Hugging Face**, NeurIPS 2023, [[paper](https:\u002F\u002Farxiv.org\u002Fabs\u002F2303.17580)] [[code](https:\u002F\u002Fgithub.com\u002Fmicrosoft\u002FJARVIS)]\n\n- **Plan-and-Solve Prompting: Improving Zero-Shot Chain-of-Thought Reasoning by Large Language Models**, ACL 2023, [[paper](https:\u002F\u002Farxiv.org\u002Fabs\u002F2305.04091)] [[code](https:\u002F\u002Fgithub.com\u002FAGI-Edgerunners\u002FPlan-and-Solve-Prompting)]\n\n- **ProgPrompt: Generating situated robot task plans using large language models**, ICRA 2023, [[paper](https:\u002F\u002Farxiv.org\u002Fabs\u002F2209.11302)] [[project](https:\u002F\u002Fprogprompt.github.io\u002F)]\n\n#### Role-play\n\n- **Generative Agents: Interactive simulacra of human behavior**, arXiv 2023, [[paper](https:\u002F\u002Farxiv.org\u002Fabs\u002F2304.03442)] [[code](https:\u002F\u002Fgithub.com\u002Fjoonspk-research\u002Fgenerative_agents)]\n\n- **MetaGPT: Meta Programming for Multi-Agent Collaborative Framework**, ICLR 2024, [[paper](https:\u002F\u002Farxiv.org\u002Fabs\u002F2308.00352)] [[code](https:\u002F\u002Fgithub.com\u002Fgeekan\u002FMetaGPT)]\n\n- 
**ChatDev: Communicative Agents for Software Development**, ACL 2024, [[paper](https:\u002F\u002Farxiv.org\u002Fabs\u002F2307.07924)] [[code](https:\u002F\u002Fgithub.com\u002FOpenBMB\u002FChatDev)]\n\n- **SWE-agent: Agent-Computer Interfaces Enable Automated Software Engineering**, arXiv 2024, [[paper](https:\u002F\u002Farxiv.org\u002Fabs\u002F2405.15793)] [[project](https:\u002F\u002Fswe-agent.com\u002Flatest\u002F)]\n\n#### Refine\n\n- **Reflexion: Language agents with verbal reinforcement learning**, NeurIPS 2023, [[paper](https:\u002F\u002Farxiv.org\u002Fabs\u002F2303.11366)] [[code](https:\u002F\u002Fgithub.com\u002Fnoahshinn\u002Freflexion)]\n\n- **Self-Refine: Iterative refinement with self-feedback**, NeurIPS 2023, [[paper](https:\u002F\u002Farxiv.org\u002Fabs\u002F2303.17651)] [[code](https:\u002F\u002Fgithub.com\u002Fmadaan\u002Fself-refine)]\n\n- **GPTSwarm: Language Agents as Optimizable Graphs**, ICML 2024, [[paper](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2402.16823v3)] [[project](https:\u002F\u002Fgptswarm.org\u002F)]\n\n### PT & SFT (Pre-Training & Supervised Fine-Tuning)\n\n#### Pre-Train\n\n- **RT-1: Robotics Transformer for Real-World Control at Scale**, arXiv 2022, [[paper](https:\u002F\u002Farxiv.org\u002Fabs\u002F2212.06817)] [[project](https:\u002F\u002Frobotics-transformer1.github.io\u002F)]\n\n- **RT-2: Vision-Language-Action Models Transfer Web Knowledge to Robotic Control**, arXiv 2023, [[paper](https:\u002F\u002Farxiv.org\u002Fabs\u002F2307.15818)] [[project](https:\u002F\u002Frobotics-transformer2.github.io\u002F)]\n\n- **RT-X: Open X-Embodiment: Robotic learning datasets and RT-X models**, arXiv 2023, [[paper](https:\u002F\u002Farxiv.org\u002Fabs\u002F2310.08864)] [[project](https:\u002F\u002Frobotics-transformer-x.github.io\u002F)]\n\n- **GR-2: A Generative Video-Language-Action Model with Web-Scale Knowledge for Robot Manipulation**, arXiv 2024, [[paper](https:\u002F\u002Farxiv.org\u002Fabs\u002F2410.06158)] [[project](https:\u002F\u002Fgr2-manipulation.github.io\u002F)]\n\n- **LAM: Large Action Models: From Inception to Implementation**, arXiv 2024, [[paper](https:\u002F\u002Farxiv.org\u002Fabs\u002F2412.10047)] [[code](https:\u002F\u002Fgithub.com\u002Fmicrosoft\u002FUFO\u002Ftree\u002Fmain\u002Fdataflow)]\n\n#### SFT\n\n- **CogACT: A Foundational Vision-Language-Action Model for Synergizing Cognition and Action in Robotic Manipulation**, arXiv 2024, [[paper](https:\u002F\u002Farxiv.org\u002Fabs\u002F2411.19650)] [[project](https:\u002F\u002Fcogact.github.io\u002F)]\n\n- **RT-H: Action Hierarchies Using Language**, arXiv 2024, [[paper](https:\u002F\u002Farxiv.org\u002Fabs\u002F2403.01823)] [[project](https:\u002F\u002Frt-hierarchy.github.io\u002F)]\n\n- **OpenVLA: An Open-Source Vision-Language-Action Model**, arXiv 2024, [[paper](https:\u002F\u002Farxiv.org\u002Fabs\u002F2406.09246)] [[project](https:\u002F\u002Fopenvla.github.io\u002F)]\n\n- **$\\pi_0$: A Vision-Language-Action Flow Model for General Robot Control**, arXiv 2024, [[paper](https:\u002F\u002Farxiv.org\u002Fabs\u002F2410.24164)] [[project](https:\u002F\u002Fwww.physicalintelligence.company\u002Fblog\u002Fpi0)]\n\n- **UniAct: Universal Actions for Enhanced Embodied Foundation Models**, CVPR 2025, [[paper](https:\u002F\u002Farxiv.org\u002Fabs\u002F2501.10105)] [[code](https:\u002F\u002Fgithub.com\u002F2toinf\u002FUniAct)]\n\n### RL (Reinforcement Learning)\n\n- **RLHF: Training language models to follow instructions with human feedback**, NeurIPS 2022, [[paper](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2203.02155)]\n\n- **DPO: Direct preference optimization: Your language model is secretly a reward model**, NeurIPS 2023, [[paper](https:\u002F\u002Farxiv.org\u002Fabs\u002F2305.18290)]\n\n- **RLFP: Reinforcement Learning with Foundation Priors: Let the Embodied Agent Efficiently Learn on Its Own**, CoRL 2024, [[paper](https:\u002F\u002Farxiv.org\u002Fabs\u002F2310.02635)] [[project](https:\u002F\u002Fyewr.github.io\u002Frlfp\u002F)]\n\n- **ELLM: Guiding pretraining in reinforcement learning with large language models**, ICML 2023, [[paper](https:\u002F\u002Farxiv.org\u002Fabs\u002F2302.06692)] [[code](https:\u002F\u002Fgithub.com\u002Fyuqingd\u002Fellm)]\n\n- **GenSim: Generating robotic simulation tasks via large language models**, arXiv 2023, [[paper](https:\u002F\u002Farxiv.org\u002Fabs\u002F2310.01361)] [[project](https:\u002F\u002Fgen-sim.github.io\u002F)]\n\n- **LEA: Reinforcement learning-based recommender systems with large language models for state reward and action modeling**, ACM 2024, [[paper](https:\u002F\u002Farxiv.org\u002Fabs\u002F2403.16948)]\n\n- **MLAQ: Empowering LLM Agents with Zero-Shot Optimal Decision-Making through Q-learning**, ICLR 2025, [[paper](https:\u002F\u002Farxiv.org\u002Fabs\u002F2502.11211)]\n\n- **KALM: Knowledgeable Agents by Offline Reinforcement Learning from Large Language Model Rollouts**, NeurIPS 2024, [[paper](https:\u002F\u002Farxiv.org\u002Fabs\u002F2404.09248)] [[project](https:\u002F\u002Fkalmneurips2024.github.io\u002F)]\n\n- **When2Ask: Enabling intelligent interactions between an agent and an LLM: A reinforcement learning approach**, RLC 2024, [[paper](https:\u002F\u002Farxiv.org\u002Fabs\u002F2306.03604)]\n\n- **Eureka: Human-level reward design via coding large language models**, ICLR 2024, [[paper](https:\u002F\u002Farxiv.org\u002Fabs\u002F2310.12931)] [[project](https:\u002F\u002Feureka-research.github.io\u002F)]\n\n- **ArCHer: Training Language Model Agents via Hierarchical Multi-Turn RL**, arXiv 2024, [[paper](https:\u002F\u002Farxiv.org\u002Fabs\u002F2402.19446)] [[project](https:\u002F\u002Fyifeizhou02.github.io\u002Farcher.io\u002F)]\n\n- **LLaRP: Large Language Models as Generalizable Policies for Embodied Tasks**, ICLR 2024, [[paper](https:\u002F\u002Farxiv.org\u002Fabs\u002F2310.17722)] [[project](https:\u002F\u002Fllm-rl.github.io\u002F)]\n\n- **GPTSwarm: Language Agents as Optimizable Graphs**, ICML 2024, [[paper](https:\u002F\u002Farxiv.org\u002Fabs\u002F2402.16823)] [[project](https:\u002F\u002Fgptswarm.org\u002F)]\n\n
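*Aside (illustrative):* the DPO entry above replaces an explicit reward model with a closed-form objective over preference pairs. A sketch of just the loss, assuming you already have summed log-probabilities of the chosen (y_w) and rejected (y_l) responses under the policy and the frozen reference model; shapes and values are illustrative:\n\n```python\nimport torch\nimport torch.nn.functional as F\n\ndef dpo_loss(logp_w, logp_l, ref_logp_w, ref_logp_l, beta=0.1):\n    # -log sigmoid(beta * ((logp_w - ref_logp_w) - (logp_l - ref_logp_l)))\n    chosen_margin = logp_w - ref_logp_w      # how much the policy upweights y_w\n    rejected_margin = logp_l - ref_logp_l    # how much it upweights y_l\n    return -F.logsigmoid(beta * (chosen_margin - rejected_margin)).mean()\n\n# e.g. a batch of 4 preference pairs\nprint(float(dpo_loss(torch.randn(4), torch.randn(4),\n                     torch.randn(4), torch.randn(4))))\n```\n\n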
## Reward\n\u003Cdiv style=\"display: flex; justify-content: space-between;\">\n    \u003Cimg src=\"https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002FFoundationAgents_awesome-foundation-agents_readme_b69f0704a2cd.png\" alt=\"Reward System\" width=\"100%\">\n\u003C\u002Fdiv>\n\n### Extrinsic Reward\n#### Dense Reward\n- **Training language models to follow instructions with human feedback**, 2022, [[paper](https:\u002F\u002Farxiv.org\u002Fabs\u002F2203.02155)] [[code](https:\u002F\u002Fgithub.com\u002Fopenai\u002Ffollowing-instructions-human-feedback)]\n- **Offline Regularised Reinforcement Learning for Large Language Models Alignment**, 2024, [[paper](https:\u002F\u002Farxiv.org\u002Fabs\u002F2405.19107)]\n- **sDPO: Don't Use Your Data All at Once**, 2024, [[paper](https:\u002F\u002Farxiv.org\u002Fabs\u002F2403.19270)]\n
- **A General Theoretical Paradigm to Understand Learning from Human Preferences**, 2024, [[paper](https:\u002F\u002Farxiv.org\u002Fabs\u002F2310.12036)]\n- **β-DPO: Direct Preference Optimization with Dynamic β**, 2024, [[paper](https:\u002F\u002Farxiv.org\u002Fabs\u002F2407.08639)]\n- **ORPO: Monolithic Preference Optimization without Reference Model**, 2024, [[paper](https:\u002F\u002Farxiv.org\u002Fabs\u002F2403.07691)] [[code](https:\u002F\u002Fgithub.com\u002Fxfactlab\u002Forpo)]\n- **Direct Nash Optimization: Teaching Language Models to Self-Improve with General Preferences**, 2024, [[paper](https:\u002F\u002Farxiv.org\u002Fabs\u002F2404.03715)]\n- **Beyond Reverse KL: Generalizing Direct Preference Optimization with Diverse Divergence Constraints**, 2023, [[paper](https:\u002F\u002Farxiv.org\u002Fabs\u002F2309.16240)]\n- **Some things are more CRINGE than others: Iterative Preference Optimization with the Pairwise Cringe Loss**, 2023, [[paper](https:\u002F\u002Farxiv.org\u002Fabs\u002F2312.16682)]\n- **From r to Q∗: Your Language Model is Secretly a Q-Function**, 2024, [[paper](https:\u002F\u002Farxiv.org\u002Fabs\u002F2404.12358)]\n\n#### Sparse Reward\n- **PAFT: A Parallel Training Paradigm for Effective LLM Fine-Tuning**, 2024, [[paper](https:\u002F\u002Farxiv.org\u002Fabs\u002F2406.17923)]\n- **SimPO: Simple Preference Optimization with a Reference-Free Reward**, 2024, [[paper](https:\u002F\u002Farxiv.org\u002Fabs\u002F2405.14734)] [[code](https:\u002F\u002Fgithub.com\u002Fprinceton-nlp\u002FSimPO)]\n- **LiPO: Listwise Preference Optimization through Learning-to-Rank**, 2024, [[paper](https:\u002F\u002Farxiv.org\u002Fabs\u002F2402.01878)]\n- **RRHF: Rank Responses to Align Language Models with Human Feedback without tears**, 2023, [[paper](https:\u002F\u002Farxiv.org\u002Fabs\u002F2304.05302)] [[code](https:\u002F\u002Fgithub.com\u002FGanjinZero\u002FRRHF)]\n- **Preference Ranking Optimization for Human Alignment**, 2024, [[paper](https:\u002F\u002Farxiv.org\u002Fabs\u002F2306.17492)] [[code](https:\u002F\u002Fgithub.com\u002FAlibabaResearch\u002FDAMO-ConvAI\u002Ftree\u002Fmain\u002FPRO)]\n- **Negating Negatives: Alignment with Human Negative Samples via Distributional Dispreference Optimization**, 2024, [[paper](https:\u002F\u002Farxiv.org\u002Fabs\u002F2403.03419)]\n- **Negative Preference Optimization: From Catastrophic Collapse to Effective Unlearning**, 2024, [[paper](https:\u002F\u002Farxiv.org\u002Fabs\u002F2404.05868)] [[code](https:\u002F\u002Fgithub.com\u002Flicong-lin\u002Fnegative-preference-optimization)]\n- **Back to Basics: Revisiting REINFORCE Style Optimization for Learning from Human Feedback in LLMs**, 2024, [[paper](https:\u002F\u002Farxiv.org\u002Fabs\u002F2402.14740)] [[code](https:\u002F\u002Fgithub.com\u002Fopenai\u002Fsummarize-from-feedback)]\n\n#### Delayed Reward\n- **Contrastive Preference Optimization: Pushing the Boundaries of LLM Performance in Machine Translation**, 2024, [[paper](https:\u002F\u002Farxiv.org\u002Fabs\u002F2401.08417)] [[code](https:\u002F\u002Fgithub.com\u002Ffe1ixxu\u002FALMA)]\n- **Nash Learning from Human Feedback**, 2023, [[paper](https:\u002F\u002Farxiv.org\u002Fabs\u002F2312.00886)]\n- **A Minimaximalist Approach to Reinforcement Learning from Human Feedback**, 2024, [[paper](https:\u002F\u002Farxiv.org\u002Fabs\u002F2401.04056)]\n\n#### Adaptive Reward\n- **Training language models to follow instructions with human feedback**, 2022, [[paper](https:\u002F\u002Farxiv.org\u002Fabs\u002F2203.02155)] [[code](https:\u002F\u002Fgithub.com\u002Fopenai\u002Ffollowing-instructions-human-feedback)]\n- **Offline Regularised Reinforcement Learning for Large Language Models Alignment**, 2024, [[paper](https:\u002F\u002Farxiv.org\u002Fabs\u002F2405.19107)]\n- **β-DPO: Direct Preference Optimization with Dynamic β**, 2024, [[paper](https:\u002F\u002Farxiv.org\u002Fabs\u002F2407.08639)]\n- **ORPO: Monolithic Preference Optimization without Reference Model**, 2024, [[paper](https:\u002F\u002Farxiv.org\u002Fabs\u002F2403.07691)] [[code](https:\u002F\u002Fgithub.com\u002Fxfactlab\u002Forpo)]\n- **PAFT: A Parallel Training Paradigm for Effective LLM Fine-Tuning**, 2024, [[paper](https:\u002F\u002Farxiv.org\u002Fabs\u002F2406.17923)]\n- **SimPO: Simple Preference Optimization with a Reference-Free Reward**, 2024, [[paper](https:\u002F\u002Farxiv.org\u002Fabs\u002F2405.14734)] [[code](https:\u002F\u002Fgithub.com\u002Fprinceton-nlp\u002FSimPO)]\n- **Nash Learning from Human Feedback**, 2023, [[paper](https:\u002F\u002Farxiv.org\u002Fabs\u002F2312.00886)]\n- **A Minimaximalist Approach to Reinforcement Learning from Human Feedback**, 2024, [[paper](https:\u002F\u002Farxiv.org\u002Fabs\u002F2401.04056)]\n- **Beyond Reverse KL: Generalizing Direct Preference Optimization with Diverse Divergence Constraints**, 2023, [[paper](https:\u002F\u002Farxiv.org\u002Fabs\u002F2309.16240)]\n\n### Intrinsic Reward\n#### Curiosity-Driven Reward\n- **Curiosity-driven Exploration by Self-supervised Prediction**, 2017, [[paper](https:\u002F\u002Farxiv.org\u002Fabs\u002F1705.05363)] [[code](https:\u002F\u002Fgithub.com\u002Fpathak22\u002Fnoreward-rl)]\n- **Self-Supervised Exploration via Disagreement**, 2019, [[paper](https:\u002F\u002Farxiv.org\u002Fabs\u002F1906.04161)] [[code](https:\u002F\u002Fgithub.com\u002Fpathak22\u002Fexploration-by-disagreement)]\n- **Planning to Explore via Self-Supervised World Models**, 2020, [[paper](https:\u002F\u002Farxiv.org\u002Fabs\u002F2005.05960)] [[code](https:\u002F\u002Fgithub.com\u002Framanans1\u002Fplan2explore)]\n\n
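*Aside (illustrative):* the curiosity bonus in the papers above (ICM-style) pays the agent the prediction error of a learned forward model, so poorly modeled, i.e. novel, transitions are rewarded. A toy version on raw states; the actual methods predict in a learned feature space:\n\n```python\nimport numpy as np\n\nW = np.zeros((3, 2))   # linear forward model: [s, a] -> s'\nLR = 0.1\n\ndef curiosity_reward(s, a, s_next):\n    global W\n    x = np.hstack([s, a])\n    err = s_next - x @ W\n    W += LR * np.outer(x, err)       # online least-squares update\n    return float(np.sum(err ** 2))   # intrinsic reward = surprise\n\n# Revisiting the same transition drives its bonus toward zero.\ns, a, s_next = np.ones(2), np.array([0.5]), np.array([0.8, 1.2])\nprint([round(curiosity_reward(s, a, s_next), 4) for _ in range(5)])\n```\n\n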
#### Diversity Reward\n- **LIIR: Learning Individual Intrinsic Reward in Multi-Agent Reinforcement Learning**, 2019, [[paper](https:\u002F\u002Fproceedings.neurips.cc\u002Fpaper_files\u002Fpaper\u002F2019\u002Ffile\u002F07a9d3fed4c5ea6b17e80258dee231fa-Paper.pdf)]\n\n#### Competence-Based Reward\n- **CURIOUS: Intrinsically Motivated Modular Multi-Goal Reinforcement Learning**, 2019, [[paper](https:\u002F\u002Farxiv.org\u002Fabs\u002F1810.06284)] [[code](https:\u002F\u002Fgithub.com\u002Fflowersteam\u002Fcurious)]\n- **Skew-Fit: State-Covering Self-Supervised Reinforcement Learning**, 2019, [[paper](https:\u002F\u002Farxiv.org\u002Fabs\u002F1903.03698)]\n- **DISCERN: Diversity-based Selection of Centroids for k-Estimation and Rapid Non-stochastic Clustering**, 2021, [[paper](https:\u002F\u002Farxiv.org\u002Fabs\u002F1910.05933)] [[code](https:\u002F\u002Fgithub.com\u002Falihassanijr\u002FDISCERN)]\n- **Self-Play Fine-Tuning Converts Weak Language Models to Strong Language Models**, 2024, [[paper](https:\u002F\u002Farxiv.org\u002Fabs\u002F2401.01335)] [[code](https:\u002F\u002Fgithub.com\u002Fuclaml\u002FSPIN)]\n- **KTO: Model Alignment as Prospect Theoretic Optimization**, 2024, [[paper](https:\u002F\u002Farxiv.org\u002Fabs\u002F2402.01306)] [[code](https:\u002F\u002Fgithub.com\u002FContextualAI\u002FHALOs)]\n\n#### Exploration Reward\n- **Self-Play Fine-Tuning Converts Weak Language Models to Strong Language Models**, 2024, 
[[paper](https:\u002F\u002Farxiv.org\u002Fabs\u002F2401.01335)] [[code](https:\u002F\u002Fgithub.com\u002Fuclaml\u002FSPIN)]\n- **Exploration by Random Network Distillation**, 2018, [[paper](https:\u002F\u002Farxiv.org\u002Fabs\u002F1810.12894)] [[code](https:\u002F\u002Fgithub.com\u002Fopenai\u002Frandom-network-distillation)]\n\n#### Information Gain Reward\n- **Understanding Chain-of-Thought in LLMs through Information Theory**, 2024, [[paper](https:\u002F\u002Farxiv.org\u002Fabs\u002F2411.11984)]\n- **VIME: Variational Information Maximizing Exploration**, 2016, [[paper](https:\u002F\u002Farxiv.org\u002Fabs\u002F1605.09674)] [[code](https:\u002F\u002Fgithub.com\u002Fopenai\u002Fvime)]\n- **EMI: Exploration with Mutual Information**, 2019, [[paper](https:\u002F\u002Farxiv.org\u002Fabs\u002F1810.01176)] [[code](https:\u002F\u002Fgithub.com\u002Fsnu-mllab\u002FEMI)]\n- **Model-Based Active Exploration**, 2019, [[paper](https:\u002F\u002Farxiv.org\u002Fabs\u002F1810.12162)] [[code](https:\u002F\u002Fgithub.com\u002Fnnaisense\u002Fmax)]\n- **KTO: Model Alignment as Prospect Theoretic Optimization**, 2024, [[paper](https:\u002F\u002Farxiv.org\u002Fabs\u002F2402.01306)] [[code](https:\u002F\u002Fgithub.com\u002FContextualAI\u002FHALOs)]\n\n### Hybrid Reward\n#### Combination of Intrinsic and Extrinsic Reward\n- **RLAIF vs. RLHF: Scaling Reinforcement Learning from Human Feedback with AI Feedback**, 2023, [[paper](https:\u002F\u002Farxiv.org\u002Fabs\u002F2309.00267)]\n- **Constitutional AI: Harmlessness from AI Feedback**, 2022, [[paper](https:\u002F\u002Farxiv.org\u002Fabs\u002F2212.08073)] [[code](https:\u002F\u002Fgithub.com\u002Fanthropics\u002FConstitutionalHarmlessnessPaper)]\n- **Iterative Preference Learning from Human Feedback: Bridging Theory and Practice for RLHF under KL-Constraint**, 2023, [[paper](https:\u002F\u002Farxiv.org\u002Fabs\u002F2312.11456)]\n- **RLHF Workflow: From Reward Modeling to Online RLHF**, 2024, [[paper](https:\u002F\u002Farxiv.org\u002Fabs\u002F2405.07863)] [[code](https:\u002F\u002Fgithub.com\u002FRLHFlow\u002FRLHF-Reward-Modeling)]\n\n### Hierarchical Reward\n#### Hierarchical Reward\n- **Token-level Direct Preference Optimization**, 2024, [[paper](https:\u002F\u002Farxiv.org\u002Fabs\u002F2404.11999)] [[code](https:\u002F\u002Fgithub.com\u002FVance0124\u002FToken-level-Direct-Preference-Optimization)]\n\n## Emotion\n\n# Self-Enhancement in Intelligent Agents\n\u003Cdiv style=\"display: flex; justify-content: space-between;\">\n    \u003Cimg src=\"https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002FFoundationAgents_awesome-foundation-agents_readme_0ddb66b024d4.png\" alt=\"Self-evolution\" width=\"100%\">\n\u003C\u002Fdiv>\n\n### Optimization Spaces\n\n#### Prompt\n\n\n- **Prompt optimization in multi-step tasks (promst): Integrating human feedback and preference alignment**, EMNLP 2024 [[paper](https:\u002F\u002Farxiv.org\u002Fabs\u002F2402.08702)] \n\n- **StraGo: Harnessing strategic guidance for prompt optimization**, EMNLP 2024 [[paper](https:\u002F\u002Farxiv.org\u002Fabs\u002F2410.08601)]\n\n- **Connecting large language models with evolutionary algorithms yields powerful prompt optimizers**, ICLR 2024 [[paper](https:\u002F\u002Farxiv.org\u002Fabs\u002F2309.08532)]\n\n#### Workflow\n\n#### Tools\n\n\n### Optimization Algorithms\n\n#### Optimization Strategies\n\n- **Large Language Models Are Human-Level Prompt Engineers**, ICLR 2023 [[paper](https:\u002F\u002Farxiv.org\u002Fabs\u002F2211.01910)]\n\n- **Automatic Prompt Optimization with \"Gradient 
Descent\" and Beam Search**, EMNLP 2023 [[paper](https:\u002F\u002Farxiv.org\u002Fabs\u002F2305.03495)]\n\n- **GPTSwarm: Language Agents as Optimizable Graphs**, ICML 2024 [[paper](https:\u002F\u002Farxiv.org\u002Fabs\u002F2402.16823)]\n\n- **Promptbreeder: Self-Referential Self-Improvement via Prompt Evolution**, ICML 2024 [[paper](https:\u002F\u002Farxiv.org\u002Fabs\u002F2309.16797)]\n\n- **Teaching Large Language Models to Self-Debug**, ICLR 2024 [[paper](https:\u002F\u002Farxiv.org\u002Fabs\u002F2304.05128)]\n\n- **Large Language Models as Optimizers**, ICLR 2024 [[paper](https:\u002F\u002Farxiv.org\u002Fabs\u002F2309.03409)]\n\n- **DSPy: Compiling Declarative Language Model Calls into Self-Improving Pipelines**, ICLR 2024 [[paper](https:\u002F\u002Farxiv.org\u002Fabs\u002F2310.03714)]\n\n- **Prompt Engineering a Prompt Engineer**, Findings of ACL 2024 [[paper](https:\u002F\u002Farxiv.org\u002Fabs\u002F2311.05661)]\n\n- **Prompt optimization in multi-step tasks (promst): Integrating human feedback and preference alignment**, EMNLP 2024 [[paper](https:\u002F\u002Farxiv.org\u002Fabs\u002F2402.08702)]\n\n- **StraGo: Harnessing strategic guidance for prompt optimization**, EMNLP 2024 [[paper](https:\u002F\u002Farxiv.org\u002Fabs\u002F2410.08601)]\n\n- **Optimizing Instructions and Demonstrations for Multi-Stage Language Model Programs**, EMNLP 2024 [[paper](https:\u002F\u002Farxiv.org\u002Fabs\u002F2406.11695)]\n\n- **Trace is the Next AutoDiff: Generative Optimization with Rich Feedback, Execution Traces, and LLMs**, NeurIPS 2024 [[paper](https:\u002F\u002Farxiv.org\u002Fabs\u002F2406.16218)]\n\n- **Optimizing Generative AI by Backpropagating Language Model Feedback**, Nature [[paper](https:\u002F\u002Fwww.nature.com\u002Farticles\u002Fs41586-025-08661-4)]\n\n- **Are Large Language Models Good Prompt Optimizers?**, arXiv [[paper](https:\u002F\u002Farxiv.org\u002Fabs\u002F2402.02101)]\n\n
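*Aside (illustrative):* stripped of their individual refinements, most optimizers above run a propose-score-select loop over instructions. A greedy sketch; `call_llm` and `evaluate` are hypothetical hooks (evaluate should return dev-set accuracy of a prompt), and beams, textual \"gradients\", and richer meta-prompts are the papers' actual contributions:\n\n```python\ndef call_llm(prompt: str) -> str:\n    raise NotImplementedError('plug in any model client here')\n\ndef evaluate(prompt: str) -> float:\n    raise NotImplementedError('score the prompt on a held-out dev set')\n\ndef optimize_prompt(seed: str, rounds: int = 5, width: int = 4) -> str:\n    best, best_score = seed, evaluate(seed)\n    for _ in range(rounds):\n        # Ask the model itself to rewrite the current best instruction.\n        candidates = [call_llm(\n            f'Improve this task instruction; keep it to one sentence:\\n{best}')\n            for _ in range(width)]\n        for cand in candidates:\n            score = evaluate(cand)\n            if score > best_score:   # greedy hill climbing\n                best, best_score = cand, score\n    return best\n```\n\n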
#### Theoretical Perspectives\n\n- **An Explanation of In-context Learning as Implicit Bayesian Inference**, ICLR 2022, [[paper](https:\u002F\u002Farxiv.org\u002Fabs\u002F2111.02080)]\n\n- **Rethinking the Role of Demonstrations: What Makes In-Context Learning Work?**, EMNLP 2022, [[paper](https:\u002F\u002Farxiv.org\u002Fabs\u002F2202.12837)]\n\n- **What Can Transformers Learn In-Context? A Case Study of Simple Function Classes**, NeurIPS 2022, [[paper](https:\u002F\u002Farxiv.org\u002Fabs\u002F2208.01066)]\n\n- **What Learning Algorithm Is In-Context Learning? Investigations with Linear Models**, ICLR 2023, [[paper](https:\u002F\u002Farxiv.org\u002Fabs\u002F2211.15661)]\n\n- **Transformers Learn In-Context by Gradient Descent**, ICML 2023, [[paper](https:\u002F\u002Farxiv.org\u002Fabs\u002F2212.07677)]\n\n- **Transformers Learn to Achieve Second-Order Convergence Rates for In-Context Linear Regression**, NeurIPS 2024, [[paper](https:\u002F\u002Farxiv.org\u002Fabs\u002F2310.17086)]\n\n### Utilization Scenario\n\n#### Online Optimization\n\n- **Reflexion: Language agents with verbal reinforcement learning**, NeurIPS 2023, [[paper](https:\u002F\u002Farxiv.org\u002Fabs\u002F2303.11366)]\n\n- **Self-Refine: Iterative refinement with self-feedback**, NeurIPS 2023, [[paper](https:\u002F\u002Farxiv.org\u002Fabs\u002F2303.17651)]\n\n- **ReAct: Synergizing Reasoning and Acting in Language Models**, ICLR 2023, [[paper](https:\u002F\u002Farxiv.org\u002Fabs\u002F2210.03629)]\n\n- **Tree of Thoughts: Deliberate problem solving with large language models**, NeurIPS 2023, [[paper](https:\u002F\u002Farxiv.org\u002Fabs\u002F2305.10601)]\n\n- **Voyager: An Open-Ended Embodied Agent with Large Language Models**, TMLR 2024, [[paper](https:\u002F\u002Farxiv.org\u002Fabs\u002F2305.16291)]\n\n- **Let's Verify Step by Step**, ICLR 2024, [[paper](https:\u002F\u002Farxiv.org\u002Fabs\u002F2305.20050)]\n\n- **MetaGPT: Meta programming for multi-agent collaborative framework**, ICLR 2024, [[paper](https:\u002F\u002Farxiv.org\u002Fabs\u002F2308.00352)]\n\n- **Camel: Communicative agents for “mind” exploration of large language model society**, NeurIPS 2023, [[paper](https:\u002F\u002Farxiv.org\u002Fabs\u002F2303.17760)]\n\n- **ChatDev: Communicative Agents for Software Development**, ACL 2024, [[paper](https:\u002F\u002Farxiv.org\u002Fabs\u002F2307.07924)]\n\n- **HuggingGPT: Solving AI tasks with ChatGPT and its friends in Hugging Face**, NeurIPS 2023, [[paper](https:\u002F\u002Farxiv.org\u002Fabs\u002F2303.17580)]\n\n- **Self-Taught Optimizer (STOP): Recursively Self-Improving Code Generation**, COLM 2024, [[paper](https:\u002F\u002Farxiv.org\u002Fabs\u002F2310.02304)]\n\n- **Quiet-STaR: Language models can teach themselves to think before speaking**, CoRR 2024, [[paper](https:\u002F\u002Farxiv.org\u002Fabs\u002F2403.09629)]\n\n- **Text2Reward: Automated dense reward function generation for reinforcement learning
**, ICLR 2024, [[paper](https:\u002F\u002Farxiv.org\u002Fabs\u002F2309.11489)]\n\n- **Extracting prompts by inverting LLM outputs**, ACL 2024, [[paper](https:\u002F\u002Farxiv.org\u002Fabs\u002F2405.15012)]\n\n- **Aligning large language models via self-steering optimization**, arXiv 2024, [[paper](https:\u002F\u002Farxiv.org\u002Fabs\u002F2410.17131)]\n\n#### Offline Optimization\n\n- **Are Large Language Models Good Statisticians?**, NeurIPS 2024, [[paper](https:\u002F\u002Farxiv.org\u002Fabs\u002F2406.07815)]\n\n- **nvBench 2.0: A Benchmark for Natural Language to Visualization under Ambiguity**, arXiv 2025, [[paper](https:\u002F\u002Farxiv.org\u002Fabs\u002F2503.12880)]\n\n- **SRAG: Structured retrieval-augmented generation for multi-entity question answering over Wikipedia graph**, arXiv 2025, [[paper](https:\u002F\u002Farxiv.org\u002Fabs\u002F2503.01346)]\n\n- **Fine-grained retrieval-augmented generation for visual question answering**, arXiv 2025, [[paper](https:\u002F\u002Farxiv.org\u002Fabs\u002F2502.20964)]\n\n- **xLAM: A Family of Large Action Models to Empower AI Agent Systems**, arXiv 2024, [[paper](https:\u002F\u002Farxiv.org\u002Fabs\u002F2409.03215)]\n\n- **Automated design of agentic systems**, arXiv 2024, [[paper](https:\u002F\u002Farxiv.org\u002Fabs\u002F2408.08435)]\n\n- **LIRE: Listwise reward enhancement for preference alignment**, ACL 2024, [[paper](https:\u002F\u002Farxiv.org\u002Fabs\u002F2405.13516)]\n\n### Scientific Knowledge Discovery\n\n#### Hypothesis Generation and Testing\n\n- **Can LLMs Generate Novel Research Ideas? A Large-Scale Human Study with 100+ NLP Researchers**, arXiv 2024, [[paper](https:\u002F\u002Farxiv.org\u002Fabs\u002F2409.04109)]\n\n- **SciAgents: Automating Scientific Discovery Through Bioinspired Multi-Agent Intelligent Graph Reasoning**, Advanced Materials 2024, [[paper](https:\u002F\u002Fdoi.org\u002F10.1002\u002Fadma.202413523)]\n\n- **Genesis: Towards the Automation of Systems Biology Research**, arXiv 2024, [[paper](https:\u002F\u002Farxiv.org\u002Fabs\u002F2409.04109)]\n\n- **The AI Scientist: Towards Fully Automated Open-Ended Scientific Discovery**, arXiv 2024, [[paper](https:\u002F\u002Farxiv.org\u002Fabs\u002F2408.06292)]\n\n- **Agent Laboratory: Using LLM Agents as Research Assistants**, arXiv 2025, [[paper](https:\u002F\u002Farxiv.org\u002Fabs\u002F2501.04227)]\n\n- **ChemAgent: Self-updating Library in Large Language Models Improves Chemical Reasoning**, arXiv 2025, [[paper](https:\u002F\u002Farxiv.org\u002Fabs\u002F2501.06590)]\n\n- **ChemOS 2.0: An orchestration architecture for chemical self-driving laboratories**, Matter 2024, [[paper](https:\u002F\u002Fdoi.org\u002F10.1016\u002Fj.matt.2024.04.022)]\n\n- **Towards an AI co-scientist**, arXiv 2025, [[paper](https:\u002F\u002Farxiv.org\u002Fabs\u002F2502.18864)]\n\n#### Protocol Planning and Tool Innovation\n\n- **Autonomous mobile robots for exploratory synthetic chemistry**, Nature 2024, [[paper](https:\u002F\u002Fdoi.org\u002F10.1038\u002Fs41586-024-08173-7)]\n\n- **Delocalized, asynchronous, closed-loop discovery of organic laser emitters**, Science 2024, [[paper](https:\u002F\u002Fdoi.org\u002F10.1126\u002Fscience.adk9227)]\n\n- **The Virtual Lab: AI Agents Design New SARS-CoV-2 Nanobodies with Experimental Validation**, bioRxiv 2024, [[paper](https:\u002F\u002Fdoi.org\u002F10.1101\u002F2024.11.11.623004)] 
\n\n#### Data Analysis and Implication Derivation\n\n- **Solving olympiad geometry without human demonstrations**, Nature 2024, [[paper](https:\u002F\u002Fdoi.org\u002F10.1038\u002Fs41586-023-06747-5)] \n\n- **Toward a Team of AI-made Scientists for Scientific Discovery from Gene Expression Data**, arXiv 2024, [[paper](https:\u002F\u002Farxiv.org\u002Fabs\u002F2402.12391)] \n\n- **Data Interpreter: An LLM Agent For Data Science**, arXiv 2024, [[paper](https:\u002F\u002Farxiv.org\u002Fabs\u002F2402.18679)] \n\n- **Curie: Toward Rigorous and Automated Scientific Experimentation with AI Agents**, arXiv 2025, [[paper](https:\u002F\u002Farxiv.org\u002Fabs\u002F2502.16069)], [[github](https:\u002F\u002Fgithub.com\u002FJust-Curieous\u002FCurie)]\n\n# Collaborative and Evolutionary Intelligent Systems\n\u003Cdiv style=\"display: flex; justify-content: space-between;\">\n    \u003Cimg src=\"https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002FFoundationAgents_awesome-foundation-agents_readme_b43af177b08b.png\" alt=\"LLM-based Multi-Agent Systems\" width=\"100%\">\n\u003C\u002Fdiv>\n\n## Application\n### Strategic Learning\n- RECONCILE (Chen et al., 2023)\n- LLM-Game-Agent (Lan et al., 2023)\n- BattleAgentBench (Wang et al., 2024)\n\n### Modeling and Simulation\n- Generative Agents (Park et al., 2023)\n- Agent Hospital (Li et al., 2024)\n- MedAgents (Tang et al., 2024)\n- MEDCO (Wei et al., 2024)\n\n### Collaborative Task Solving\n- MetaGPT (Hong et al., 2023)\n- ChatDev (Qian et al., 2024)\n- Agent Laboratory (Schmidgall et al., 2025)\n- The Virtual Lab (Swanson et al., 2024)\n\n## Composition and Protocol\n### Agent Composition\n#### Homogeneous\n- CoELA (Zhang et al., 2023)\n- VillagerAgent (Dong et al., 2024)\n- LLM-Coordination (Agashe et al., 2024)\n\n#### Heterogeneous\n- MetaGPT (Hong et al., 2023)\n- ChatDev (Qian et al., 2024)\n- Generative Agents (Park et al., 2023)\n- S-Agents (Chen et al., 2024)\n\n### Interaction Protocols\n#### Message Types\n- SciAgents (Ghafarollahi et al., 2024)\n- AppAgent (Chi et al., 2023)\n- MetaGPT (Hong et al., 2023)\n\n#### Communication Interfaces\n- AgentBench (Liu et al., 2023)\n- VAB (Liu et al., 2024)\n- TaskWeaver (Qiao et al., 2024)\n- HULA (Takerngsaksiri et al., 2025)\n\n### Next Generation Protocol\n- MCP (Anthropic)\n- Agora (Marro et al., 2024)\n- IoA (Chen et al., 2024)\n\n## Topology\n### Static Topology\n- MEDCO (Wei et al., 2024)\n- Agent Hospital (Li et al., 2024)\n- Welfare Diplomacy (Mukobi et al., 2023)\n- MedAgents (Tang et al., 2024)\n\n### Dynamic Topology\n- DyLAN (Liu et al., 2023)\n- GPTSwarm (Zhuge et al., 2024)\n- CodeR (Chen et al., 2024)\n- Oasis (Yang et al., 2024)\n\n## Collaboration\n### Agent-Agent Collaboration\n#### Consensus-oriented\n- Agent Laboratory (Schmidgall et al., 2025)\n- The Virtual Lab (Swanson et al., 2024)\n- OASIS (Yang et al., 2024)\n\n#### Collaborative Learning\n- Generative Agents (Park et al., 2023)\n- Welfare Diplomacy (Mukobi et al., 2023)\n- LLM-Game-Agent (Lan et al., 2023)\n- BattleAgentBench (Wang et al., 2024)\n\n#### Teaching\u002FMentoring\n- MEDCO (Wei et al., 2024)\n- Agent Hospital (Li et al., 2024)\n\n#### Task-oriented\n- MedAgents (Tang et al., 2024)\n- S-Agents (Chen et al., 2024)\n\n### Human-AI Collaboration\n- Dittos (Leong et al., 2024)\n- PRELUDE (Gao et al., 2024)\n\n## Evolution\n### Collective Intelligence\n- Generative Agents (Park et al., 2023)\n- Welfare Diplomacy (Mukobi et al., 2023)\n- LLM-Game-Agent (Lan et al., 2023)\n- BattleAgentBench (Wang et al., 2024)\n\n### 
Individual Adaptability\n- Agent Hospital (Li et al., 2024)\n- Agent Laboratory (Schmidgall et al., 2025)\n- MEDCO (Wei et al., 2024)\n\n## Evaluation\n### Benchmark for Specific Tasks\n- MBPP\n- HotpotQA\n- MATH\n- SVAMP\n- MultiArith\n\n### Benchmark for MAS\n- Collab-Overcooked (Sun et al., 2025)\n- REALM-Bench (Geng et al., 2025)\n- PARTNR (Chang et al., 2024)\n- VillagerBench (Dong et al., 2024)\n- AutoArena (Zhao et al., 2024)\n- MultiagentBench (Zhu et al., 2025)\n\n\n# Building Safe and Beneficial AI\n\u003Cdiv style=\"display: flex; justify-content: space-between;\">\n    \u003Cimg src=\"https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002FFoundationAgents_awesome-foundation-agents_readme_c4fe683b95ab.png\" alt=\"Agent Intrinsic Safety\" width=\"100%\">\n\u003C\u002Fdiv>\n\n## Safety Threats\n\n### Jailbreak\n\n#### White-box Jailbreak\n\n- **Jailbreak attacks and defenses against large language models: A survey**, arXiv 2024, [[paper](https:\u002F\u002Farxiv.org\u002Fabs\u002F2407.04295)]\n  \n- **Universal and transferable adversarial attacks on aligned language models**, arXiv 2023, [[paper](https:\u002F\u002Farxiv.org\u002Fabs\u002F2307.15043)]\n  \n- **Boosting jailbreak attack with momentum**, arXiv 2024, [[paper](https:\u002F\u002Farxiv.org\u002Fabs\u002F2405.01229)]\n  \n- **Improved techniques for optimization-based jailbreaking on large language models**, arXiv 2024, [[paper](https:\u002F\u002Farxiv.org\u002Fabs\u002F2405.21018)]\n  \n- **Jailbreak Instruction-Tuned LLMs via end-of-sentence MLP Re-weighting**, arXiv 2024, [[paper](https:\u002F\u002Farxiv.org\u002Fabs\u002F2410.10150)]\n  \n- **Open the Pandora's Box of LLMs: Jailbreaking LLMs through Representation Engineering**, arXiv 2024, [[paper](https:\u002F\u002Farxiv.org\u002Fabs\u002F2401.06824)]\n  \n- **DROJ: A Prompt-Driven Attack against Large Language Models**, arXiv 2024, [[paper](https:\u002F\u002Farxiv.org\u002Fabs\u002F2411.09125)]\n  \n- **AutoDAN: Generating stealthy jailbreak prompts on aligned large language models**, arXiv 2023, [[paper](https:\u002F\u002Farxiv.org\u002Fabs\u002F2310.04451)]\n  \n- **POEX: Policy Executable Embodied AI Jailbreak Attacks**, arXiv 2024, [[paper](https:\u002F\u002Farxiv.org\u002Fabs\u002F2412.16633)]\n\n\n#### Black-box Jailbreak\n\n- **Jailbroken: How does LLM safety training fail?**, NeurIPS 2023, [[paper](https:\u002F\u002Fpapers.nips.cc\u002Fpaper_files\u002Fpaper\u002F2023\u002Fhash\u002F063b264250add1efdb3e3f7f5686b4e0-Abstract-Conference.html)]\n  \n- **Jailbreaking black box large language models in twenty queries**, arXiv 2023, [[paper](https:\u002F\u002Farxiv.org\u002Fabs\u002F2310.08419)]\n  \n- **Jailbreaking large language models against moderation guardrails via cipher characters**, NeurIPS 2024, [[paper](https:\u002F\u002Fpapers.nips.cc\u002Fpaper_files\u002Fpaper\u002F2025\u002Ffile\u002F59408-59435-Abstract-Conference.html)]\n  \n- **Visual adversarial examples jailbreak aligned large language models**, AAAI 2024, [[paper](https:\u002F\u002Fojs.aaai.org\u002Findex.php\u002FAAAI\u002Farticle\u002Fview\u002F29398)]\n  \n- **POEX: Policy Executable Embodied AI Jailbreak Attacks**, arXiv 2024, [[paper](https:\u002F\u002Farxiv.org\u002Fabs\u002F2412.16633)]\n  \n- **AutoDAN: Generating stealthy jailbreak prompts on aligned large language models**, arXiv 2023, [[paper](https:\u002F\u002Farxiv.org\u002Fabs\u002F2310.04451)]\n  \n- **GUARD: Role-playing to generate natural-language jailbreakings to test guideline adherence of large language models**, arXiv 2024, [[paper](https:\u002F\u002Farxiv.org\u002Fabs\u002F2402.03299)]\n  \n- **Heuristic-Induced Multimodal Risk Distribution Jailbreak Attack for Multimodal Large Language Models**, arXiv 2024, [[paper](https:\u002F\u002Farxiv.org\u002Fabs\u002F2412.05934)]\n  \n- **RT-Attack: Jailbreaking text-to-image models via random token**, arXiv 2024, [[paper](https:\u002F\u002Farxiv.org\u002Fabs\u002F2408.13896)]\n\n
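*Aside (illustrative):* on the defensive side, the attack literature above is typically consumed as an evaluation suite: replay known jailbreak prompts against a target model and measure how often its safety behavior holds. A minimal harness; `call_llm` and the keyword-based refusal check are hypothetical simplifications (real red-teaming uses curated suites and stronger judges):\n\n```python\nfrom statistics import mean\n\nREFUSAL_MARKERS = (\"i can't\", 'i cannot', \"i won't\", 'sorry')\n\ndef call_llm(prompt: str) -> str:\n    raise NotImplementedError('plug in the model under test')\n\ndef looks_like_refusal(reply: str) -> bool:\n    return any(marker in reply.lower() for marker in REFUSAL_MARKERS)\n\ndef refusal_rate(attack_prompts) -> float:\n    # Fraction of attack prompts the model refuses (higher is safer).\n    return mean(looks_like_refusal(call_llm(p)) for p in attack_prompts)\n```\n\n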
### Prompt Injection\n\n#### Direct Prompt Injection\n\n- **Not what you've signed up for: Compromising real-world LLM-integrated applications with indirect prompt injection**, AISec@CCS 2023, [[paper](https:\u002F\u002Fdl.acm.org\u002Fdoi\u002F10.1145\u002F3617415.3624610)]\n  \n- **Automatic and universal prompt injection attacks against large language models**, arXiv 2024, [[paper](https:\u002F\u002Farxiv.org\u002Fabs\u002F2403.04957)]\n  \n- **Optimization-based prompt injection attack to LLM-as-a-judge**, CCS 2024, [[paper](https:\u002F\u002Fdl.acm.org\u002Fdoi\u002F10.1145\u002F3636696.3654467)]\n  \n- **Benchmarking indirect prompt injections in tool-integrated large language model agents**, arXiv 2024, [[paper](https:\u002F\u002Farxiv.org\u002Fabs\u002F2403.02691)]\n  \n- **Trust No AI: Prompt Injection Along The CIA Security Triad**, arXiv 2024, [[paper](https:\u002F\u002Farxiv.org\u002Fabs\u002F2412.06090)]\n  \n- **Empirical analysis of large vision-language models against goal hijacking via visual prompt injection**, arXiv 2024, [[paper](https:\u002F\u002Farxiv.org\u002Fabs\u002F2408.03554)]\n  \n- **Dataset and Lessons Learned from the 2024 SaTML LLM Capture-the-Flag Competition**, arXiv 2024, [[paper](https:\u002F\u002Farxiv.org\u002Fabs\u002F2406.07954)]\n  \n- **Ignore this title and HackAPrompt: Exposing systemic vulnerabilities of LLMs through a global prompt hacking competition**, EMNLP 2023, [[paper](https:\u002F\u002Faclanthology.org\u002F2023.emnlp-main.316)]\n\n\n#### Indirect Prompt Injection\n\n- **Not what you've signed up for: Compromising real-world LLM-integrated applications with indirect prompt injection**, AISec@CCS 2023, [[paper](https:\u002F\u002Fdl.acm.org\u002Fdoi\u002F10.1145\u002F3617415.3624610)]\n  \n- **HijackRAG: Hijacking Attacks against Retrieval-Augmented Large Language Models**, arXiv 2025, [[paper](https:\u002F\u002Farxiv.org\u002Fabs\u002F2410.22832)]\n  \n- **Backdoored Retrievers for Prompt Injection Attacks on Retrieval Augmented Generation of Large Language Models**, arXiv 2024, [[paper](https:\u002F\u002Farxiv.org\u002Fabs\u002F2410.14479)]\n  \n- **Prompt Infection: LLM-to-LLM Prompt Injection within Multi-Agent Systems**, arXiv 2024, [[paper](https:\u002F\u002Farxiv.org\u002Fabs\u002F2410.07283)]\n  \n- **Adversarial search engine optimization for large language models**, arXiv 2024, [[paper](https:\u002F\u002Farxiv.org\u002Fabs\u002F2406.18382)]\n\n### Hallucination\n\n#### Knowledge-conflict Hallucination\n\n- **Survey of hallucination in natural language generation**, ACM Computing Surveys 2023, [[paper](https:\u002F\u002Fdl.acm.org\u002Fdoi\u002F10.1145\u002F3571730)]\n  \n- **A survey on hallucination in large language models: Principles, taxonomy, challenges, and open questions**, arXiv 2023, [[paper](https:\u002F\u002Farxiv.org\u002Fabs\u002F2311.05232)]\n  \n- **DELUCIONQA: Detecting Hallucinations in Domain-specific Question Answering**, Findings of EMNLP 2023, 
[[paper](https:\u002F\u002Faclanthology.org\u002F2023.findings-emnlp.258)]\n  \n- **Deficiency of large language models in finance: An empirical examination of hallucination**, Failure Modes Workshop @ NeurIPS 2023, [[paper](https:\u002F\u002Fopenreview.net\u002Fforum?id=ywJqHknZKf)]\n  \n- **MetaGPT: Meta Programming for Multi-Agent Collaborative Framework**, ICLR 2024, [[paper](https:\u002F\u002Fopenreview.net\u002Fforum?id=6vBvP6uLTQU)]\n  \n- **Hallucination is inevitable: An innate limitation of large language models**, arXiv 2024, [[paper](https:\u002F\u002Farxiv.org\u002Fabs\u002F2401.11817)]\n  \n- **ERBench: An Entity-Relationship based Automatically Verifiable Hallucination Benchmark for Large Language Models**, arXiv 2024, [[paper](https:\u002F\u002Farxiv.org\u002Fabs\u002F2403.05266)]\n\n\n#### Context-conflict Hallucination\n\n- **Truth-Aware Context Selection: Mitigating the Hallucinations of Large Language Models Being Misled by Untruthful Contexts**, arXiv 2024, [[paper](https:\u002F\u002Farxiv.org\u002Fabs\u002F2403.07556)]\n  \n- **Large Language Models are Easily Confused: A Quantitative Metric, Security Implications and Typological Analysis**, arXiv 2024, [[paper](https:\u002F\u002Farxiv.org\u002Fabs\u002F2410.13237)]\n  \n- **HaluEval-Wild: Evaluating Hallucinations of Language Models in the Wild**, arXiv 2024, [[paper](https:\u002F\u002Farxiv.org\u002Fabs\u002F2403.04307)]\n  \n- **Analyzing and Mitigating Object Hallucination in Large Vision-Language Models**, ICLR 2024, [[paper](https:\u002F\u002Fopenreview.net\u002Fforum?id=2CeF2U2CFi)]\n  \n- **Mitigating object hallucination in large vision-language models via classifier-free guidance**, arXiv 2024, [[paper](https:\u002F\u002Farxiv.org\u002Fabs\u002F2402.08680)]\n  \n- **When Large Language Models contradict humans? 
Large Language Models' Sycophantic Behaviour**, arXiv 2023, [[paper](https:\u002F\u002Farxiv.org\u002Fabs\u002F2311.09410)]\n  \n- **HallusionBench: An Advanced Diagnostic Suite for Entangled Language Hallucination and Visual Illusion in Large Vision-Language Models**, CVPR 2024, [[paper](https:\u002F\u002Fopenaccess.thecvf.com\u002Fcontent\u002FCVPR2024\u002Fhtml\u002FGuan_HallusionBench_An_Advanced_Diagnostic_Suite_for_Entangled_Language_Hallucination_and_CVPR_2024_paper.html)]\n  \n- **DiaHalu: A Dialogue-level Hallucination Evaluation Benchmark for Large Language Models**, arXiv 2024, [[paper](https:\u002F\u002Farxiv.org\u002Fabs\u002F2403.00896)]\n\n\n### Misalignment\n\n#### Goal-misguided Misalignment\n\n- **AI alignment: A comprehensive survey**, arXiv 2023, [[paper](https:\u002F\u002Farxiv.org\u002Fabs\u002F2310.19852)]\n  \n- **Specification Gaming: The Flip Side of AI Ingenuity**, DeepMind Blog 2020, [[paper](https:\u002F\u002Fdeepmind.google\u002Fdiscover\u002Fblog\u002Fspecification-gaming-the-flip-side-of-ai-ingenuity\u002F)]\n  \n- **The alignment problem from a deep learning perspective**, arXiv 2022, [[paper](https:\u002F\u002Farxiv.org\u002Fabs\u002F2209.00626)]\n  \n- **Emulated Disalignment: Safety Alignment for Large Language Models May Backfire!**, arXiv 2024, [[paper](https:\u002F\u002Farxiv.org\u002Fabs\u002F2402.12343)]\n  \n- **Agent Alignment in Evolving Social Norms**, arXiv 2024, [[paper](https:\u002F\u002Farxiv.org\u002Fabs\u002F2401.04620)]\n  \n- **Model Merging and Safety Alignment: One Bad Model Spoils the Bunch**, arXiv 2024, [[paper](https:\u002F\u002Farxiv.org\u002Fabs\u002F2406.14563)]\n  \n\n#### Capability-misused Misalignment\n\n- **Trustworthy LLMs: A survey and guideline for evaluating large language models' alignment**, arXiv 2023, [[paper](https:\u002F\u002Farxiv.org\u002Fabs\u002F2308.05374)]\n  \n- **Assessing the brittleness of safety alignment via pruning and low-rank modifications**, arXiv 2024, [[paper](https:\u002F\u002Farxiv.org\u002Fabs\u002F2402.05162)]\n  \n- **AI alignment: A comprehensive survey**, arXiv 2023, [[paper](https:\u002F\u002Farxiv.org\u002Fabs\u002F2310.19852)]\n  \n- **Fine-tuning aligned language models compromises safety, even when users do not intend to!**, arXiv 2023, [[paper](https:\u002F\u002Farxiv.org\u002Fabs\u002F2310.03693)]\n  \n- **Fundamental limitations of alignment in large language models**, arXiv 2023, [[paper](https:\u002F\u002Farxiv.org\u002Fabs\u002F2304.11082)]\n  \n\n\n### Poisoning Attacks\n\n#### Model Poisoning\n\n- **Weight poisoning attacks on pre-trained models**, ACL 2020, [[paper](https:\u002F\u002Faclanthology.org\u002F2020.acl-main.495)]\n  \n- **BadEdit: Backdooring large language models by model editing**, arXiv 2024, [[paper](https:\u002F\u002Farxiv.org\u002Fabs\u002F2403.13355)]\n  \n- **The philosopher's stone: Trojaning plugins of large language models**, arXiv 2023, [[paper](https:\u002F\u002Farxiv.org\u002Fabs\u002F2312.00374)]\n  \n- **Obliviate: Neutralizing Task-agnostic Backdoors within the Parameter-efficient Fine-tuning Paradigm**, arXiv 2024, [[paper](https:\u002F\u002Farxiv.org\u002Fabs\u002F2409.14119)]\n  \n- **Poisoned ChatGPT finds work for idle hands: Exploring developers’ coding practices with insecure suggestions from poisoned AI models**, IEEE S&P 2024, [[paper](https:\u002F\u002Fieeexplore.ieee.org\u002Fdocument\u002F10432536)]\n  \n- **Secret Collusion Among Generative AI Agents**, arXiv 2024, [[paper](https:\u002F\u002Farxiv.org\u002Fabs\u002F2402.07510)]\n  \n- 
**Exploiting the Vulnerability of Large Language Models via Defense-Aware Architectural Backdoor**, arXiv 2024, [[paper](https:\u002F\u002Farxiv.org\u002Fabs\u002F2409.01952)]\n  \n\n\n#### Data Poisoning\n\n- **Poisoning language models during instruction tuning**, ICML 2023, [[paper](https:\u002F\u002Fproceedings.mlr.press\u002Fv202\u002Fwan23b.html)]\n  \n- **AgentPoison: Red-teaming LLM agents via poisoning memory or knowledge bases**, NeurIPS 2024, [[paper](https:\u002F\u002Fpapers.nips.cc\u002Fpaper_files\u002Fpaper\u002F2025\u002Fhash\u002F130185-130213-Abstract-Conference.html)]\n  \n- **Poison-RAG: Adversarial Data Poisoning Attacks on Retrieval-Augmented Generation in Recommender Systems**, arXiv 2025, [[paper](https:\u002F\u002Farxiv.org\u002Fabs\u002F2501.11759)]\n  \n- **PoisonBench: Assessing Large Language Model Vulnerability to Data Poisoning**, arXiv 2024, [[paper](https:\u002F\u002Farxiv.org\u002Fabs\u002F2410.08811)]\n  \n- **The dark side of human feedback: Poisoning large language models via user inputs**, arXiv 2024, [[paper](https:\u002F\u002Farxiv.org\u002Fabs\u002F2409.00787)]\n  \n- **Scaling laws for data poisoning in LLMs**, arXiv 2024, [[paper](https:\u002F\u002Farxiv.org\u002Fabs\u002F2408.02946)]\n  \n- **Talk too much: Poisoning large language models under token limit**, arXiv 2024, [[paper](https:\u002F\u002Farxiv.org\u002Fabs\u002F2404.14795)]\n  \n- **Best-of-Venom: Attacking RLHF by Injecting Poisoned Preference Data**, arXiv 2024, [[paper](https:\u002F\u002Farxiv.org\u002Fabs\u002F2404.05530)]\n\n\n#### Backdoor Injection\n\n- **Sleeper agents: Training deceptive LLMs that persist through safety training**, arXiv 2024, [[paper](https:\u002F\u002Farxiv.org\u002Fabs\u002F2401.05566)]\n  \n- **Wipi: A new web threat for LLM-driven web agents**, arXiv 2024, [[paper](https:\u002F\u002Farxiv.org\u002Fabs\u002F2403.09875)]\n  \n- **Exploring backdoor attacks against large language model-based decision making**, arXiv 2024, [[paper](https:\u002F\u002Farxiv.org\u002Fabs\u002F2405.20774)]\n  \n- **When Backdoors Speak: Understanding LLM Backdoor Attacks Through Model-Generated Explanations**, arXiv 2024, [[paper](https:\u002F\u002Farxiv.org\u002Fabs\u002F2411.12701)]\n  \n- **Backdooring instruction-tuned large language models with virtual prompt injection**, NAACL 2024, [[paper](https:\u002F\u002Faclanthology.org\u002F2024.naacl-long.338)]\n\n\n## Privacy Threats\n\n### Training Data Inference\n\n#### Membership Inference Attacks\n\n- **Membership inference attacks against machine learning models**, IEEE S&P 2017, [[paper](https:\u002F\u002Fieeexplore.ieee.org\u002Fdocument\u002F7958568)]\n  \n- **The secret sharer: Evaluating and testing unintended memorization in neural networks**, USENIX Security 2019, [[paper](https:\u002F\u002Fwww.usenix.org\u002Fconference\u002Fusenixsecurity19\u002Fpresentation\u002Fcarlini)]\n  \n- **Label-only membership inference attacks**, ICML 2021, [[paper](https:\u002F\u002Fproceedings.mlr.press\u002Fv139\u002Fchoquette-choo21a.html)]\n  \n- **Practical membership inference attacks against fine-tuned large language models via self-prompt calibration**, arXiv 2023, [[paper](https:\u002F\u002Farxiv.org\u002Fabs\u002F2311.06062)]\n  \n- **Membership inference attacks from first principles**, IEEE S&P 2022, [[paper](https:\u002F\u002Fieeexplore.ieee.org\u002Fdocument\u002F9833683)]\n  \n- **Membership inference attacks on machine learning: A survey**, ACM Computing Surveys 2022, [[paper](https:\u002F\u002Fdl.acm.org\u002Fdoi\u002F10.1145\u002F3505244)]\n\n
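*Aside (illustrative):* the simplest attack in the list above thresholds per-example loss: training members tend to score lower loss than non-members. A sketch with a hypothetical `model_loss` hook (e.g. a language model's negative log-likelihood); calibrated attacks such as LiRA replace the fixed threshold with per-example shadow-model statistics:\n\n```python\nfrom statistics import mean\n\ndef model_loss(example) -> float:\n    raise NotImplementedError('return the target model loss on this example')\n\ndef is_member(example, threshold: float) -> bool:\n    return model_loss(example) < threshold   # low loss => likely seen in training\n\ndef attack_accuracy(members, non_members, threshold: float) -> float:\n    guesses = [is_member(x, threshold) for x in members]\n    guesses += [not is_member(x, threshold) for x in non_members]\n    return mean(guesses)\n```\n\n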
#### Data Extraction Attacks\n\n- **Extracting training data from large language models**, USENIX Security 2021, [[paper](https:\u002F\u002Fwww.usenix.org\u002Fconference\u002Fusenixsecurity21\u002Fpresentation\u002Fcarlini-extracting)]\n  \n- **Special characters attack: Toward scalable training data extraction from large language models**, arXiv 2024, [[paper](https:\u002F\u002Farxiv.org\u002Fabs\u002F2405.05990)]\n  \n- **Ethicist: Targeted training data extraction through loss smoothed soft prompting and calibrated confidence estimation**, arXiv 2023, [[paper](https:\u002F\u002Farxiv.org\u002Fabs\u002F2307.04401)]\n  \n- **Language model inversion**, arXiv 2023, [[paper](https:\u002F\u002Farxiv.org\u002Fabs\u002F2311.13647)]\n  \n- **Privacy risks of general-purpose language models**, IEEE S&P 2020, [[paper](https:\u002F\u002Fieeexplore.ieee.org\u002Fdocument\u002F9152683)]\n  \n- **Quantifying memorization across neural language models**, arXiv 2022, [[paper](https:\u002F\u002Farxiv.org\u002Fabs\u002F2202.07646)]\n  \n- **Stealing part of a production language model**, arXiv 2024, [[paper](https:\u002F\u002Farxiv.org\u002Fabs\u002F2403.06634)]\n\n\n### Interaction Data Inference\n\n#### System Prompt Stealing\n\n- **Ignore previous prompt: Attack techniques for language models**, TSRML@NeurIPS 2022, [[paper](https:\u002F\u002Farxiv.org\u002Fabs\u002F2211.09597)]\n  \n- **Prompt Stealing Attacks Against Text-to-Image Generation Models**, USENIX Security 2024, [[paper](https:\u002F\u002Fwww.usenix.org\u002Fconference\u002Fusenixsecurity24\u002Fpresentation\u002Fshen-xinyue)]\n  \n- **Safeguarding System Prompts for LLMs**, arXiv 2024, [[paper](https:\u002F\u002Farxiv.org\u002Fabs\u002F2412.13426)]\n  \n- **InputSnatch: Stealing Input in LLM Services via Timing Side-Channel Attacks**, arXiv 2024, [[paper](https:\u002F\u002Farxiv.org\u002Fabs\u002F2411.18191)]\n  \n- **Effective prompt extraction from language models**, arXiv 2023, [[paper](https:\u002F\u002Farxiv.org\u002Fabs\u002F2307.06865)]\n  \n- **Last one standing: A comparative analysis of security and privacy of soft prompt tuning, LoRA, and in-context learning**, arXiv 2023, [[paper](https:\u002F\u002Farxiv.org\u002Fabs\u002F2310.11397)]\n  \n- **LLM app store analysis: A vision and roadmap**, ACM TOSEM 2024, [[paper](https:\u002F\u002Fdl.acm.org\u002Fdoi\u002Fabs\u002F10.1145\u002F3657416)]\n\n\n#### User Prompt Stealing\n\n- **PRSA: Prompt reverse stealing attacks against large language models**, arXiv 2024, [[paper](https:\u002F\u002Farxiv.org\u002Fabs\u002F2402.07870)]\n  \n- **Prompt Leakage effect and defense strategies for multi-turn LLM interactions**, arXiv 2024, [[paper](https:\u002F\u002Farxiv.org\u002Fabs\u002F2404.16251)]\n  \n- **Investigating the prompt leakage effect and black-box defenses for multi-turn LLM interactions**, arXiv 2024, [[paper](https:\u002F\u002Farxiv.org\u002Fabs\u002F2402.06770)]\n  \n- **Why Are My Prompts Leaked? 
\n\n#### User Prompt Stealing\n\n- **Prsa: Prompt reverse stealing attacks against large language models**, arXiv 2024, [[paper](https:\u002F\u002Farxiv.org\u002Fabs\u002F2402.07870)]\n  \n- **Prompt Leakage effect and defense strategies for multi-turn LLM interactions**, arXiv 2024, [[paper](https:\u002F\u002Farxiv.org\u002Fabs\u002F2404.16251)]\n  \n- **Investigating the prompt leakage effect and black-box defenses for multi-turn LLM interactions**, arXiv 2024, [[paper](https:\u002F\u002Farxiv.org\u002Fabs\u002F2402.06770)]\n  \n- **Why Are My Prompts Leaked? Unraveling Prompt Extraction Threats in Customized Large Language Models**, arXiv 2024, [[paper](https:\u002F\u002Farxiv.org\u002Fabs\u002F2408.02416)]\n  \n- **Pleak: Prompt leaking attacks against large language model applications**, CCS 2024, [[paper](https:\u002F\u002Fdl.acm.org\u002Fdoi\u002F10.1145\u002F3644473.3665630)]\n  \n- **Stealing User Prompts from Mixture of Experts**, arXiv 2024, [[paper](https:\u002F\u002Farxiv.org\u002Fabs\u002F2410.22884)]\n  \n- **Extracting Prompts by Inverting LLM Outputs**, arXiv 2024, [[paper](https:\u002F\u002Farxiv.org\u002Fabs\u002F2405.15012)]\n\n## Threats on Non-Brain\n\n\u003Cdiv style=\"display: flex; justify-content: space-between;\">\n    \u003Cimg src=\"https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002FFoundationAgents_awesome-foundation-agents_readme_d3d7feb62500.jpg\" alt=\"Threats on LLM Non-Brains\" width=\"100%\">\n\u003C\u002Fdiv>\n\n### Perception Safety Threats\n\n#### Adversarial Attacks\n\n##### Textual\n\n- **An LLM can Fool Itself: A Prompt-Based Adversarial Attack**, arXiv 2023, [[paper](https:\u002F\u002Farxiv.org\u002Fabs\u002F2310.13345)]\n  \n- **Revisiting Character-level Adversarial Attacks for Language Models**, ICML 2024, [[paper](https:\u002F\u002Fopenreview.net\u002Fforum?id=Z9tyM3Rru1)]\n  \n- **Hard prompts made easy: Gradient-based discrete optimization for prompt tuning and discovery**, NeurIPS 2024, [[paper](https:\u002F\u002Fpapers.nips.cc\u002Fpaper_files\u002Fpaper\u002F2024\u002Fhash\u002F36-Hard-Prompts-Made-Easy-Abstract-Conference.html)]\n  \n- **Universal and transferable adversarial attacks on aligned language models**, arXiv 2023, [[paper](https:\u002F\u002Farxiv.org\u002Fabs\u002F2307.15043)]\n\n##### Visual\n\n- **Image hijacks: Adversarial images can control generative models at runtime**, arXiv 2023, [[paper](https:\u002F\u002Farxiv.org\u002Fabs\u002F2309.00236)]\n  \n- **Image-based Multimodal Models as Intruders: Transferable Multimodal Attacks on Video-based MLLMs**, arXiv 2025, [[paper](https:\u002F\u002Farxiv.org\u002Fabs\u002F2501.01042)]\n  \n- **Dissecting Adversarial Robustness of Multimodal LM Agents**, ICLR 2025, [[paper](https:\u002F\u002Fopenreview.net\u002Fforum?id=zCPIQdjMkJ)]\n  \n- **Poltergeist: Acoustic adversarial machine learning against cameras and computer vision**, IEEE S&P 2021, [[paper](https:\u002F\u002Fieeexplore.ieee.org\u002Fdocument\u002F9474293)]\n\n\n##### Auditory\n\n- **Inaudible adversarial perturbation: Manipulating the recognition of user speech in real time**, arXiv 2023, [[paper](https:\u002F\u002Farxiv.org\u002Fabs\u002F2308.01040)]\n  \n- **The Silent Manipulator: A Practical and Inaudible Backdoor Attack against Speech Recognition Systems**, ACM Multimedia 2023, [[paper](https:\u002F\u002Fdl.acm.org\u002Fdoi\u002F10.1145\u002F3581783.3612104)]\n  \n- **Enrollment-stage backdoor attacks on speaker recognition systems via adversarial ultrasound**, IEEE IoT Journal 2023, [[paper](https:\u002F\u002Fieeexplore.ieee.org\u002Fdocument\u002F10162210)]\n  \n- **Ultrabd: Backdoor attack against automatic speaker verification systems via adversarial ultrasound**, ICPADS 2023, [[paper](https:\u002F\u002Fieeexplore.ieee.org\u002Fdocument\u002F10089999)]\n  \n- **DolphinAttack: Inaudible voice commands**, CCS 2017, [[paper](https:\u002F\u002Fdl.acm.org\u002Fdoi\u002F10.1145\u002F3133956.3134052)]\n  \n\n\n##### Other Modality\n\n- **A Survey on Adversarial Robustness of LiDAR-based Machine Learning Perception in Autonomous Vehicles**, arXiv 2024, 
[[paper](https:\u002F\u002Farxiv.org\u002Fabs\u002F2411.13778)]\n  \n- **Rocking drones with intentional sound noise on gyroscopic sensors**, USENIX Security 2015, [[paper](https:\u002F\u002Fwww.usenix.org\u002Fconference\u002Fusenixsecurity15\u002Ftechnical-sessions\u002Fpresentation\u002Fson)]\n  \n- **Adversarial attacks on multi-agent communication**, ICCV 2021, [[paper](https:\u002F\u002Fopenaccess.thecvf.com\u002Fcontent\u002FICCV2021\u002Fhtml\u002FTu_Adversarial_Attacks_on_Multi-Agent_Communication_ICCV_2021_paper.html)]\n  \n- **GPS location spoofing attack detection for enhancing the security of autonomous vehicles**, IEEE VTC-Fall 2021, [[paper](https:\u002F\u002Fieeexplore.ieee.org\u002Fdocument\u002F9623147)]\n\n\n#### Misperception Issues\n\n- **Grounding large language models in interactive environments with online reinforcement learning**, ICML 2023, [[paper](https:\u002F\u002Fproceedings.mlr.press\u002Fv202\u002Fcarta23a.html)]\n  \n- **Bias and fairness in large language models: A survey**, Computational Linguistics 2024, [[paper](https:\u002F\u002Fdirect.mit.edu\u002Fcoli\u002Farticle\u002Fdoi\u002F10.1162\u002Fcoli_a_00491\u002F123804)]\n  \n- **Domain generalization using causal matching**, ICML 2021, [[paper](https:\u002F\u002Fproceedings.mlr.press\u002Fv139\u002Fmahajan21a.html)]\n  \n- **GEM: Glare or gloom, I can still see you—End-to-end multi-modal object detection**, IEEE RA-L 2021, [[paper](https:\u002F\u002Fieeexplore.ieee.org\u002Fdocument\u002F9513373)]\n  \n- **NPHardEval: Dynamic benchmark on reasoning ability of large language models via complexity classes**, arXiv 2023, [[paper](https:\u002F\u002Farxiv.org\u002Fabs\u002F2312.14890)]\n  \n- **Modeling opinion misperception and the emergence of silence in online social system**, PLOS ONE 2024, [[paper](https:\u002F\u002Fjournals.plos.org\u002Fplosone\u002Farticle?id=10.1371\u002Fjournal.pone.0296075)]\n  \n- **Bridging the domain gap for multi-agent perception**, ICRA 2023, [[paper](https:\u002F\u002Fieeexplore.ieee.org\u002Fdocument\u002F10161095)]\n  \n- **Cooperative and competitive biases for multi-agent reinforcement learning**, arXiv 2021, [[paper](https:\u002F\u002Farxiv.org\u002Fabs\u002F2101.06890)]\n  \n- **Model-agnostic multi-agent perception framework**, ICRA 2023, [[paper](https:\u002F\u002Fieeexplore.ieee.org\u002Fdocument\u002F10159709)]\n  \n- **Mutual influence between language and perception in multi-agent communication games**, PLOS Computational Biology 2022, [[paper](https:\u002F\u002Fjournals.plos.org\u002Fploscompbiol\u002Farticle?id=10.1371\u002Fjournal.pcbi.1010658)]\n\n\n\n### Action Safety Threats\n\n#### Supply Chain Attack\n\n- **A new era in LLM security: Exploring security concerns in real-world LLM-based systems**, arXiv 2024, [[paper](https:\u002F\u002Farxiv.org\u002Fabs\u002F2402.18649)]\n  \n- **Wipi: A new web threat for LLM-driven web agents**, arXiv 2024, [[paper](https:\u002F\u002Farxiv.org\u002Fabs\u002F2403.09875)]\n  \n- **Identifying the risks of LM agents with an LM-emulated sandbox**, arXiv 2023, [[paper](https:\u002F\u002Farxiv.org\u002Fabs\u002F2309.15817)]\n  \n- **Not what you've signed up for: Compromising real-world LLM-integrated applications with indirect prompt injection**, AISec@CCS 2023, [[paper](https:\u002F\u002Fdl.acm.org\u002Fdoi\u002F10.1145\u002F3617415.3624610)]\n  \n- **Benchmarking indirect prompt injections in tool-integrated large language model agents**, arXiv 2024, [[paper](https:\u002F\u002Farxiv.org\u002Fabs\u002F2403.02691)]\n\n\n#### 
Tool Use Risk\n\n- **Identifying the risks of LM agents with an LM-emulated sandbox**, arXiv 2023, [[paper](https:\u002F\u002Farxiv.org\u002Fabs\u002F2309.15817)]\n  \n- **Toolsword: Unveiling safety issues of large language models in tool learning across three stages**, arXiv 2024, [[paper](https:\u002F\u002Farxiv.org\u002Fabs\u002F2402.10753)]\n  \n- **Benchmarking indirect prompt injections in tool-integrated large language model agents**, arXiv 2024, [[paper](https:\u002F\u002Farxiv.org\u002Fabs\u002F2403.02691)]\n\n\n## Agent Extrinsic Safety\n  \n\u003Cdiv style=\"display: flex; justify-content: space-between;\">\n    \u003Cimg src=\"https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002FFoundationAgents_awesome-foundation-agents_readme_19ac5f4e4b85.jpg\" alt=\"Agent Extrinsic Safety:\" width=\"100%\">\n\u003C\u002Fdiv>\n\n### Agent-Memory Interaction Threats\n\n#### Retrieval Augmented Generation\n\n- **Agentpoison: Red-teaming LLM agents via poisoning memory or knowledge bases**, NeurIPS 2025, [[paper](https:\u002F\u002Fpapers.nips.cc\u002Fpaper_files\u002Fpaper\u002F2025\u002Fhash\u002F130185-130213-Abstract-Conference.html)]\n  \n- **ConfusedPilot: Confused deputy risks in RAG-based LLMs**, arXiv 2024, [[paper](https:\u002F\u002Farxiv.org\u002Fabs\u002F2408.04870)]\n  \n- **PoisonedRAG: Knowledge corruption attacks to retrieval-augmented generation of large language models**, arXiv 2024, [[paper](https:\u002F\u002Farxiv.org\u002Fabs\u002F2402.07867)]\n  \n- **Machine against the RAG: Jamming retrieval-augmented generation with blocker documents**, arXiv 2024, [[paper](https:\u002F\u002Farxiv.org\u002Fabs\u002F2406.05870)]\n  \n- **BadRAG: Identifying vulnerabilities in retrieval augmented generation of large language models**, arXiv 2024, [[paper](https:\u002F\u002Farxiv.org\u002Fabs\u002F2406.00083)]\n  \n- **TrojanRAG: Retrieval-Augmented Generation Can Be Backdoor Driver in Large Language Models**, arXiv 2024, [[paper](https:\u002F\u002Farxiv.org\u002Fabs\u002F2405.13401)]\n  \n- **Whispers in Grammars: Injecting Covert Backdoors to Compromise Dense Retrieval Systems**, arXiv 2024, [[paper](https:\u002F\u002Farxiv.org\u002Fabs\u002F2402.13532)]\n\n\n### Agent-Environment Interaction Threats\n\n#### Physical Environment\n\n- **Autonomous vehicles: Sophisticated attacks, safety issues, challenges, open topics, blockchain, and future directions**, JCP 2023, [[paper](https:\u002F\u002Fwww.mdpi.com\u002F2624-800X\u002F3\u002F3\u002F27)]\n  \n- **Engineering challenges ahead for robot teamwork in dynamic environments**, Applied Sciences 2020, [[paper](https:\u002F\u002Fwww.mdpi.com\u002F2076-3417\u002F10\u002F4\u002F1368)]\n  \n- **On GPS spoofing of aerial platforms: a review of threats, challenges, methodologies, and future research directions**, PeerJ Computer Science 2021, [[paper](https:\u002F\u002Fpeerj.com\u002Farticles\u002Fcs-507)]\n  \n- **Security and privacy in cyber-physical systems: A survey**, IEEE Communications Surveys & Tutorials 2017, [[paper](https:\u002F\u002Fieeexplore.ieee.org\u002Fdocument\u002F7815411)]\n  \n- **Adversarial objects against LiDAR-based autonomous driving systems**, arXiv 2019, [[paper](https:\u002F\u002Farxiv.org\u002Fabs\u002F1907.05418)]\n  \n- **Learning to walk in the real world with minimal human effort**, arXiv 2020, [[paper](https:\u002F\u002Farxiv.org\u002Fabs\u002F2002.08550)]\n  \n- **Prioritizing Safeguarding Over Autonomy: Risks of LLM Agents for Science**, arXiv 2024, 
[[paper](https:\u002F\u002Farxiv.org\u002Fabs\u002F2402.04247)]\n\n#### Digital Environment\n\n- **A new era in LLM security: Exploring security concerns in real-world LLM-based systems**, arXiv 2024, [[paper](https:\u002F\u002Farxiv.org\u002Fabs\u002F2402.18649)]\n  \n- **Demystifying RCE vulnerabilities in LLM-integrated apps**, CCS 2024, [[paper](https:\u002F\u002Fdl.acm.org\u002Fdoi\u002F10.1145\u002F3618271.3630876)]\n  \n- **Wipi: A new web threat for LLM-driven web agents**, arXiv 2024, [[paper](https:\u002F\u002Farxiv.org\u002Fabs\u002F2403.09875)]\n  \n- **Application of large language models to DDoS attack detection**, SPCPS 2023, [[paper](https:\u002F\u002Flink.springer.com\u002Fchapter\u002F10.1007\u002F978-3-031-45910-2_6)]\n  \n- **Coercing LLMs to do and reveal (almost) anything**, arXiv 2024, [[paper](https:\u002F\u002Farxiv.org\u002Fabs\u002F2402.14020)]\n  \n- **Prioritizing Safeguarding Over Autonomy: Risks of LLM Agents for Science**, arXiv 2024, [[paper](https:\u002F\u002Farxiv.org\u002Fabs\u002F2402.04247)]\n  \n- **EIA: Environmental Injection Attack on Generalist Web Agents for Privacy Leakage**, arXiv 2024, [[paper](https:\u002F\u002Farxiv.org\u002Fabs\u002F2409.11295)]\n  \n- **AdvWeb: Controllable Black-Box Attacks on VLM-Powered Web Agents**, arXiv 2024, [[paper](https:\u002F\u002Farxiv.org\u002Fabs\u002F2410.17401)]\n  \n- **AGrail: A Lifelong Agent Guardrail with Effective and Adaptive Safety Detection**, arXiv 2025, [[paper](https:\u002F\u002Farxiv.org\u002Fabs\u002F2502.11448)]\n\n### Agent-Agent Interaction Threats\n\n#### Competitive Interactions\n\n- **Multi-Agent Risks from Advanced AI**, arXiv 2025, [[paper](https:\u002F\u002Farxiv.org\u002Fabs\u002F2502.14143)]\n  \n- **Hoodwinked: Deception and cooperation in a text-based game for language models**, arXiv 2023, [[paper](https:\u002F\u002Farxiv.org\u002Fabs\u002F2308.01404)]\n  \n- **Attacking deep reinforcement learning with decoupled adversarial policy**, IEEE TDSC 2022, [[paper](https:\u002F\u002Fieeexplore.ieee.org\u002Fdocument\u002F9519422)]\n  \n- **Secure consensus of multi-agent systems under denial-of-service attacks**, Asian Journal of Control 2023, [[paper](https:\u002F\u002Fonlinelibrary.wiley.com\u002Fdoi\u002Fabs\u002F10.1002\u002Fasjc.2921)]\n  \n- **A Perfect Collusion Benchmark: How can AI agents be prevented from colluding with information-theoretic undetectability?**, Multi-Agent Security Workshop @ NeurIPS 2023, [[paper](https:\u002F\u002Fopenreview.net\u002Fforum?id=NEURIPS-2023-perfect-collusion)]\n\n\n#### Cooperative Interactions\n\n- **On the risk of misinformation pollution with large language models**, arXiv 2023, [[paper](https:\u002F\u002Farxiv.org\u002Fabs\u002F2305.13661)]\n  \n- **Agent Smith: A single image can jailbreak one million multimodal LLM agents exponentially fast**, arXiv 2024, [[paper](https:\u002F\u002Farxiv.org\u002Fabs\u002F2402.08567)]\n\n","# 令人惊叹的基础智能体\n\n[![欢迎提交PR](https:\u002F\u002Fimg.shields.io\u002Fbadge\u002FPRs-welcome-brightgreen)](https:\u002F\u002Fgithub.com\u002FFoundationAgents\u002Fawesome-foundation-agents\u002Fpulls)\n[![许可证：MIT](https:\u002F\u002Fimg.shields.io\u002Fbadge\u002FLicense-MIT-yellow.svg)](LICENSE)\n[![Awesome](https:\u002F\u002Fawesome.re\u002Fbadge.svg)](https:\u002F\u002Fawesome.re)\n[![Arxiv](https:\u002F\u002Fimg.shields.io\u002Fbadge\u002FarXiv-FoundationAgents-b31b1b)](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2504.01990)\n\n我们维护着一份精心整理的论文合集，旨在探索通往基础智能体的道路，重点关注核心概念的构建以及研究领域的梳理。\n\n⌛️ 
即将推出版本2！我们将持续收集并更新前沿洞见。如果您发现任何有价值的相关工作，欢迎随时提出建议！\n\n## 我们在基础智能体方向上的成果\n\n✨✨✨ [基础智能体的进展与挑战](https:\u002F\u002Fwww.arxiv.org\u002Fabs\u002F2504.01990)（论文）\n\n![人类大脑的关键](https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002FFoundationAgents_awesome-foundation-agents_readme_e7a4bd74782a.png)\n![基础智能体框架](https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002FFoundationAgents_awesome-foundation-agents_readme_352986ddca36.png)\n\n# 令人惊叹的论文\n\n\u003Cfont size=5>\u003Ccenter>\u003Cb> 目录 \u003C\u002Fb> \u003C\u002Fcenter>\u003C\u002Ffont>\n- [智能体的核心组件](#core-components-of-intelligent-agents)\n    - [认知](#cognition)\n    - [记忆](#memory)\n    - [感知](#perception)\n    - [世界模型](#world-model)\n    - [行动](#action)\n    - [奖励](#reward)\n    - [情感](#emotion)\n- [智能体的自我提升](#self-enhancement-in-intelligent-agents)\n- [协作与进化型智能系统](#collaborative-and-evolutionary-intelligent-systems)\n- [构建安全且有益的人工智能](#building-safe-and-beneficial-ai)\n\n\n# 智能体的核心组件\n\n## 认知\n\n\u003Cdiv style=\"display: flex; justify-content: space-between;\">\n    \u003Cimg src=\"https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002FFoundationAgents_awesome-foundation-agents_readme_a47377e3acf7.png\" alt=\"认知系统\" width=\"100%\">\n\u003C\u002Fdiv>\n\n### 学习\n#### 空间\n##### 完整\n- **添加SFT、RLHF、PEFT**\n - **ReFT：通过强化微调进行推理**，arxiv 2024年，[[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2401.08967)] [[代码]()] \n- **Search-R1：利用强化学习训练大语言模型进行推理并调用搜索引擎**，arxiv 2025年，[[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2503.09516)] [[代码](https:\u002F\u002Fgithub.com\u002FPeterGriffinJin\u002FSearch-R1)]\n - **R1-Searcher：通过强化学习激励大语言模型的搜索能力**，arxiv 2025年，[[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2503.05592)] [[代码]()] \n\n\n##### 部分\n- **思维链提示激发大型语言模型的推理能力**，Wei等人，2022年，[[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2201.11903)] [[代码]()]\n- **Voyager：基于大型语言模型的开放式具身智能体**，arxiv 2023年，[[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2305.16291)] [[代码]()]\n- **Reflexion：具有口头强化学习的语言智能体**，NeurIPS 2023年，[[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2303.11366)] [[代码]()]\n- **ReAct遇见ActRe：用于对比自监督学习的智能体轨迹自主标注**，arxiv 2024年，[[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2403.14589)] [[代码]()]\n- **生成式智能体：人类行为的交互式模拟**，ACM UIST 2023年，[[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2304.03442)] [[代码]()]\n\n#### 目标\n##### 感知\n- **CLIP：从自然语言监督中学习可迁移的视觉模型**，ICML 2021年，[[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2103.00020)] [[代码]()]\n- **LLaVA：视觉指令微调**，NeurIPS 2023年，[[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2304.08485)] [[代码]()]\n- **CogVLM：预训练语言模型的视觉专家**，NeurIPS 2024年，[[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2311.03079)] [[代码]()]\n - **Qwen2-Audio技术报告**，arxiv 2024年，[[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2407.10759)] [[代码]()] \n - **Search-R1：利用强化学习训练大语言模型进行推理并调用搜索引擎**，arxiv 2025年，[[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2503.09516)] [[代码]()] \n\n\n##### 推理\n - **SKY-T1：以450美元以内训练属于你自己的o1预览模型**，2025年，[[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2502.07374)] [[代码]()] \n - **开放思想**，2025年，[[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2506.04178)] [[代码]()] \n - **LIMO：少即是多的推理方法**，arxiv 2025年，[[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2502.03387)] [[代码]()] \n - **STaR：用推理来启动推理**，arxiv 2022年，[[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2203.14465)] [[代码]()] \n- **ReST：用于语言建模的强化自训练**，arxiv 2023年，[[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2308.08998)] [[代码]()]\n - **OpenR：一个用于大语言模型高级推理的开源框架**，arxiv 2024年，[[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2410.09671)] [[代码]()] \n
 - **LLaMA-Berry：针对o1级别奥林匹克数学推理的成对优化**，arxiv 2024年，[[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2410.02884)] [[代码]()] \n - **RAGEN：通过强化推理来训练智能体**，arxiv 2025年，[[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2504.20073)] [[代码]()] \n - **Open-R1**，2024年，[[论文](https:\u002F\u002Fhuggingface.co\u002Fblog\u002Fopen-r1)] [[代码]()] \n\n##### 世界\n- **内心独白：通过语言模型规划实现具身推理**，CoRL 2022年，[[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2207.05608)] [[代码]()]\n- **自我精炼：基于自我反馈的迭代优化**，NeurIPS 2023年，[[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2303.17651)] [[代码]()]\n- **Reflexion：具有口头强化学习的语言智能体**，NeurIPS 2023年，[[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2303.11366)] [[代码]()]\n- **ExpeL：大语言模型智能体是体验式学习者**，AAAI 2024年，[[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2308.10144)] [[代码]()]\n- **AutoManual：由大语言模型智能体通过交互式环境学习生成使用说明书**，arxiv 2024年，[[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2405.16247)] [[代码]()]\n- **ReAct遇见ActRe：用于对比自监督学习的智能体轨迹自主标注**，arxiv 2024年，[[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2403.14589)] [[代码]()]\n\n### 推理\n#### 结构化\n##### 动态\n- **ReAct：在语言模型中协同推理与行动**，arXiv 2022，[[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2210.03629)] [[代码]()]\n - **用于高效数学推理的思维马尔可夫链**，arXiv 2024，[[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2410.17635)] [[代码]()] \n- **思维之树：利用大型语言模型进行深思熟虑的问题解决**，NeurIPS 2023，[[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2305.10601)] [[代码]()]\n- **语言智能体树搜索统一了语言模型中的推理、行动和规划**，ICML 2024，[[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2310.04406)] [[代码]()]\n- **通过规划进行推理（RAP）：利用世界模型改进语言模型**，EMNLP 2023，[[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2305.14992)] [[代码]()]\n- **思维图：利用大型语言模型解决复杂问题**，AAAI 2024，[[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2308.09687)] [[代码]()]\n - **思维路径：提取并遵循路径以实现稳健的关系推理**，arXiv 2024，[[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2412.17963)] [[代码]()] \n - **论思维图**，arXiv 2024，[[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2409.10038)] [[代码]()] \n\n##### 静态\n- **自我一致性提升语言模型中的思维链推理**，ICLR 2023，[[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2203.11171)] [[代码]()]\n- **自我精炼：基于自我反馈的迭代优化**，NeurIPS 2023，[[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2303.17651)] [[代码]()]\n- **渐进式提示引导提升大型语言模型的推理能力**，arXiv 2023，[[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2304.09797)] [[代码]()]\n - **论大型语言模型在推理和规划任务中的自我验证局限性**，arXiv 2024，[[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2402.08115)] [[代码]()] \n- **验证链降低大型语言模型中的幻觉现象**，ICLR 2024研讨会，[[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2309.11495)] [[代码]()]\n\n\n##### 领域\n- **MathPrompter：利用大型语言模型进行数学推理**，ACL 2023，[[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2303.05398)] [[代码]()]\n - **LLMs 可以通过教学型思维链发现数学推理中的错误**，arXiv 2024，[[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2405.06705)] [[代码]()] \n - **物理推理器：利用知识增强的推理解决物理问题**，COLING 2025，[[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2412.13791)] [[代码]()] \n\n\n#### 非结构化\n##### 提示\n- **思维链提示激发大型语言模型的推理能力**，NeurIPS 2022，[[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2201.11903)] [[代码]()]\n- **退一步：通过抽象激发大型语言模型的推理能力**，ICLR 2024，[[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2310.06117)] [[代码]()]\n- **问我任何问题：一种简单的语言模型提示策略**，arXiv 2022，[[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2210.02441)] [[代码]()]\n- **知识链：通过动态适应异构知识源实现大型语言模型的知识接地**，arXiv 2023，[[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2305.13269)] [[代码]()]\n - **自解释关键词赋能大型语言模型进行代码生成**，arXiv 2024，[[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2410.15966)] [[代码]()] \n\n\n##### 模型\n - **DeepSeek-R1：通过强化学习激励 LLM 的推理能力**，arXiv 2025，[[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2501.12948)] [[代码]()] \n - **Claude 3.7 Sonnet**，2025，[[论文](https:\u002F\u002Fwww.anthropic.com\u002Fnews\u002Fclaude-3-7-sonnet-and-claude-code)] [[代码]()] \n - **OpenAI o1 系统卡片**，arXiv 2024，[[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2412.16720)] [[代码]()] 
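\n\n上文「静态」一节中的自我一致性方法，骨架只是对多条采样推理链的最终答案做多数投票。下面给出一个与具体模型无关的最小示意（`sample_cot` 为调用方注入的假设性采样接口，并非任何论文的官方实现）：\n\n```python\nfrom collections import Counter\nimport random\n\ndef self_consistency(sample_cot, question, n=8):\n    \"\"\"自我一致性的极简示意：采样 n 条思维链，对最终答案做多数投票。\n    sample_cot(question) 应返回 (推理文本, 最终答案)，由调用方提供。\"\"\"\n    answers = [sample_cot(question)[1] for _ in range(n)]\n    best, votes = Counter(answers).most_common(1)[0]\n    return best, votes  # 票数可作为粗略的置信度信号\n\n# 玩具用法：用一个伪采样器演示投票逻辑\nfake = lambda q: (\"...\", random.choice([\"42\", \"42\", \"42\", \"41\"]))\nprint(self_consistency(fake, \"6*7=?\"))\n```\n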
\n\n##### 隐式\n- **Quiet-STaR：语言模型可以自我训练，在开口前先思考**，arXiv 2024，[[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2403.09629)] [[代码]()]\n - **连续思维链（Coconut）：训练大型语言模型在连续潜在空间中进行推理**，arXiv 2024，[[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2412.06769)] [[代码]()] \n\n\n#### 规划\n- **描述、解释、计划与选择（DEPS）：与大型语言模型交互式规划**，arXiv 2023，[[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2302.01560)] [[代码]()]\n- **ProgPrompt：利用大型语言模型生成情境化的机器人任务计划**，ICRA 2023，[[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2209.11302)] [[代码]()]\n - **ADAPT：按需分解与语言模型结合的规划**，arXiv 2023，[[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2311.05772)] [[代码]()] \n- **思维之树：利用大型语言模型进行深思熟虑的问题解决**，NeurIPS 2023，[[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2305.10601)] [[代码]()]\n- **通过规划进行推理（RAP）：利用世界模型改进语言模型**，EMNLP 2023，[[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2305.14992)] [[代码]()]\n- **TravelPlanner：面向语言智能体的真实世界规划基准测试**，ICML 2024，[[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2402.01622)] [[代码]()]\n- **PDDL—规划领域定义语言**，1998，[[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F1106.4561)] [[代码]()]\n- **Mind2Web：迈向通用网络智能体**，NeurIPS 2023，[[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2306.06070)] [[代码]()]\n\n## 记忆\n\u003Cdiv style=\"display: flex; justify-content: space-between;\">\n    \u003Cimg src=\"https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002FFoundationAgents_awesome-foundation-agents_readme_889d4be32c5f.png\" alt=\"智能体中的记忆\" width=\"100%\">\n\u003C\u002Fdiv>\n\n### 表征\n\n#### 感知\n##### 基于文本\n- **RecAgent: 推荐系统的新仿真范式**, TOIS 2025, [[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2306.02552)] [[代码](https:\u002F\u002Fgithub.com\u002FRUC-GSAI\u002FYuLan-Rec)]\n- **CoPS: 认知个性化搜索：将大型语言模型与高效记忆机制结合**, WWW 2024, [[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2403.15264)]\n- **MemoryBank: 利用长期记忆增强大型语言模型**, AAAI 2024, [[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2308.08589)] [[代码](https:\u002F\u002Fgithub.com\u002Fzhongwanjun\u002FMemoryBank-SiliconFriend)]\n- **Memory Sandbox: 面向对话代理的透明且可交互的记忆管理**, UIST 2023 附录, [[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2309.09631)]\n\n##### 多模态\n- **VideoAgent: 一种用于视频理解的记忆增强型多模态代理**, ECCV 2024, [[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2405.07956)] [[代码](https:\u002F\u002Fgithub.com\u002Fwxh1996\u002FVideoAgent)]\n- **WorldGPT: 赋能大语言模型成为多模态世界模型**, arXiv 2024, [[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2403.10193)] [[代码](https:\u002F\u002Fgithub.com\u002FDCDmllm\u002FWorldGPT)]\n- **Agent S: 一个像人类一样使用计算机的开放代理框架**, arXiv 2024, [[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2403.05901)][[代码](https:\u002F\u002Fgithub.com\u002Fsimular-ai\u002FAgent-S)]\n- **OS‑Copilot: 朝着具备自我改进能力的通用计算机代理迈进**, ICLR 2024 LLMAgents Workshop, [[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2403.17359)] [[代码](https:\u002F\u002Fgithub.com\u002FOS-Copilot\u002FOS-Copilot)]\n- **MuLan: 用于渐进式和交互式多对象扩散的多模态‑LLM代理**, arXiv 2024, [[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2405.11075)] [[代码](https:\u002F\u002Fgithub.com\u002Fmeasure-infinity\u002Fmulan-code)]\n\n#### 短期\n##### 上下文\n- **MemGPT: 朝着将大语言模型作为操作系统的方向发展**, arXiv 2023, [[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2309.07508)] [[代码](https:\u002F\u002Fgithub.com\u002Fcpacker\u002FMemGPT)]\n
- **KARMA: 利用长短期记忆系统增强具身智能体**, arXiv 2024, [[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2403.09692)] [[代码](https:\u002F\u002Fgithub.com\u002FWZX0Swarm0Robotics\u002FKARMA)]\n- **LSFS: 从命令到提示：基于大语言模型的语义文件系统**, ICLR 2025, [[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2404.13007)] [[代码](https:\u002F\u002Fgithub.com\u002Fagiresearch\u002FAIOS-LSFS)]\n- **OSCAR: 基于状态感知推理与重规划的操作系统控制**, ICLR 2025, [[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2403.08767)]\n- **RCI: 语言模型可以解决计算机任务（递归批评与改进）**, NeurIPS 2023, [[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2305.12934)] [[代码](https:\u002F\u002Fgithub.com\u002Fposgnu\u002Frci-agent)]\n\n##### 工作\n- **Generative Agent: 人类行为的交互式模拟体**, UIST 2023, [[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2304.03442)] [[代码](https:\u002F\u002Fgithub.com\u002Fjoonspk-research\u002Fgenerative_agents)]\n- **RLP: 反思性语言编程 (RLP): 社会意识型 AGI 的垫脚石**, arXiv 2023, [[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2305.12647)]\n- **CALYPSO: 大语言模型作为地下城主的助手**, AIIDE 2023, [[论文](https:\u002F\u002Fojs.aaai.org\u002Findex.php\u002FAIIDE\u002Farticle\u002Fview\u002F27546)] [[代码](https:\u002F\u002Fgithub.com\u002Fnorthern-lights-province\u002Fcalypso-aiide-artifact)]\n- **HiAgent: 用于解决长时程代理任务的大语言模型的层次化工作记忆管理**, arXiv 2024, [[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2405.12790)] [[代码](https:\u002F\u002Fgithub.com\u002FHiAgent2024\u002FHiAgent)]\n\n#### 长期\n##### 语义\n- **AriGraph: 利用情景记忆为大语言模型代理学习知识图谱世界模型**, arXiv 2024, [[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2407.04363)] [[代码](https:\u002F\u002Fgithub.com\u002FAIRI-Institute\u002FAriGraph)]\n- **RecAgent**: 见上文\n- **HippoRAG: 受神经生物学启发的大语言模型长期记忆**, NeurIPS 2024, [[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2405.14831)] [[代码](https:\u002F\u002Fgithub.com\u002FOSU-NLP-Group\u002FHippoRAG)]\n\n##### 情景\n- **MobileGPT: 为移动任务自动化赋予大语言模型类人应用记忆**, ACM MobiCom 2024, [[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2312.03003)]\n- **MemoryBank**: 见上文\n- **利用生命历程机器人经验的层次化表示进行情景记忆言语化**, arXiv 2024, [[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2409.17702)] [[代码](https:\u002F\u002Fhierarchical-emv.github.io)]\n- **MrSteve: 在 Minecraft 中具有“何地何时”记忆的指令遵循型代理**, ICLR 2025, [[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2406.0)] *(项目代码待定)*\n\n##### 过程\n- **AAG: 针对大语言模型的类比增强生成**, ACL ARR 2024, [[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2404.07239)]\n- **Cradle: 赋能基础代理迈向通用计算机控制**, ICLR 2025, [[论文](https:\u002F\u002Fopenreview.net\u002Fforum?id=cradle)] [[代码](https:\u002F\u002Fgithub.com\u002FBAAI-Agents\u002FCradle)]\n- **JARVIS‑1: 具有记忆增强型多模态语言模型的开放世界多任务代理**, NeurIPS 2023 ALOE Workshop, [[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2311.05997)] [[代码](https:\u002F\u002Fgithub.com\u002FCraftJarvis\u002FJARVIS-1)]\n- **LARP: 面向开放世界游戏的语言‑代理角色扮演**, arXiv 2023, [[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2312.09352)]\n\n\n### 生命周期\n\n#### 获取\n##### 信息压缩\n- **HiAgent: 用于解决长时程代理任务的大语言模型的层次化工作记忆管理**, ACL 2025, [[论文](https:\u002F\u002Faclanthology.org\u002F2025.acl-long.2011)] [[代码](https:\u002F\u002Fgithub.com\u002FHiAgent2024\u002FHiAgent)]\n- **LMAgent: 用于多用户仿真的大规模多模态代理社会**, arXiv 2024, [[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2412.09237)]\n- **一种受人类启发、具有超长上下文概要记忆的阅读代理**, ICML 2024, [[论文](https:\u002F\u002Fproceedings.mlr.press\u002Fv235\u002Fhuijie_readagent_24a.html)] [[代码](https:\u002F\u002Fread-agent.github.io\u002F)]\n- **利用元记忆机制提升大语言模型中的无数据代码生成能力**, arXiv 2025, [[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2501.07892)]\n\n##### 经验整合\n
- **ExpeL: 大语言模型代理是体验式学习者**, AAAI 2024, [[论文](https:\u002F\u002Fojs.aaai.org\u002Findex.php\u002FAAAI\u002Farticle\u002Fview\u002F29317)] [[代码](https:\u002F\u002Fgithub.com\u002FLeapLabTHU\u002FExpeL)]\n- **统一心智模型: 重新构想大语言模型时代的自主代理**, arXiv 2025, [[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2503.03459)]\n- **元学习: 一篇综述**, PAMI 2021, [[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F1810.03548)]\n- **“我的代理更懂我”: 在基于大语言模型的代理中集成动态类人记忆回忆与整合**, CHI 2024, [[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2402.02485)] [[代码](https:\u002F\u002Fgithub.com\u002Ftamoharu\u002FAgent-Memory-CHI24)]\n\n#### 编码\n##### 选择性注意力\n- **AgentCoord: 面向基于LLM的多智能体协作协调策略的可视化探索**, arXiv 2024, [[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2404.11943)] [[代码](https:\u002F\u002Fgithub.com\u002FAgentCoord\u002FAgentCoord)]\n- **基于大型语言模型的智能体的记忆共享**, arXiv 2024, [[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2404.09982)]\n- **通过LLM驱动的实体关系图理解长视频**, arXiv 2025, [[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2501.15953)]\n- **A-MEM: LLM智能体的代理记忆**, arXiv 2025, [[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2502.12110)] [[代码](https:\u002F\u002Fgithub.com\u002FWujiangXu\u002FAgenticMemory)]\n- **机器人也能多任务处理：集成记忆架构与LLM以增强跨任务机器人动作生成**, Humanoids 2024, [[论文](https:\u002F\u002Fieeexplore.ieee.org\u002Fabstract\u002Fdocument\u002F10769803)]\n\n##### 多模态融合\n- **Optimus-1: 混合多模态记忆赋能的智能体在长时程任务中表现出色**, NeurIPS 2024, [[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2408.03615)] [[代码](https:\u002F\u002Fgithub.com\u002FJiuTian-VL\u002FOptimus-1)]\n- **Optimus-2: 具有目标-观察-动作条件策略的多模态Minecraft智能体**, CVPR 2025, [[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2502.19902)] [[代码](https:\u002F\u002Fgithub.com\u002FJiuTian-VL\u002FOptimus-2)]\n- **JARVIS-1: 多模态记忆增强的开放世界智能体**, NeurIPS 2023 ALOE Workshop, [[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2311.05997)] [[代码](https:\u002F\u002Fgithub.com\u002FCraftJarvis\u002FJARVIS-1)]\n\n#### 推导\n##### 反思\n- **Agent S: 一个像人类一样使用计算机的开放式代理框架**, ICLR 2025海报, [[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2403.05901)] [[代码](https:\u002F\u002Fgithub.com\u002Fsimular-ai\u002FAgent-S)]  \n- **OSCAR: 基于状态感知推理和重规划的操作系统控制**, ICLR 2025, [[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2403.08767)]  \n- **R2D2: 面向网络智能体的记忆、反思与动态决策**, ACL 2025, [[论文](https:\u002F\u002Faclanthology.org\u002F2025.acl-long.1464)]  \n- **Mobile-Agent-E: 用于复杂任务的自我进化移动助手**, ACL ARR 2025（已提交）, [[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2309.15945)] [[代码](https:\u002F\u002Fgithub.com\u002FX-PLUG\u002FMobileAgent\u002Ftree\u002Fmain\u002FMobile-Agent-E)]\n\n##### 摘要\n- **SummEdits: 基于编辑的事实导向摘要**, EMNLP 2023, [[论文](https:\u002F\u002Faclanthology.org\u002F2023.emnlp-main.600)] [[代码](https:\u002F\u002Fgithub.com\u002Fsalesforce\u002Fsummedits)]  \n- **SCM: 通过自控记忆框架增强大型语言模型**, DASFAA 2025, [[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2310.04521)] [[代码](https:\u002F\u002Fgithub.com\u002Fwbbeyourself\u002FSCM4LLMs)] \n- **医疗健康助手：激发通用LLM在医疗咨询中的潜力**, arXiv 2024, [[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2402.10045)]\n- **递归摘要使大型语言模型具备长期对话记忆**, Neurocomputing 2025, [[论文](https:\u002F\u002Fwww.sciencedirect.com\u002Fscience\u002Farticle\u002Fabs\u002Fpii\u002FS0925231225008653)]\n\n##### 知识蒸馏\n- **KnowAgent: 基于LLM的智能体的知识增强规划**, Findings of NAACL 2025, [[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2402.09419)] [[代码](https:\u002F\u002Fgithub.com\u002FKnowAgent\u002FKnowAgent)]  \n- **AoTD: 通过思想代理蒸馏提升视频-LLM推理能力**, CVPR 2025, [[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2412.01694)]  \n- **LDPD: 语言驱动的策略蒸馏**, ICLR 2024 LLM-Agents研讨会, [[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2404.19008)]  \n- **子目标蒸馏：弥合大型语言模型与目标条件强化学习，用于长时程任务**, CoLLAs 2024, [[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2407.06720)]  \n- **MAGDi: 多智能体交互图的结构化蒸馏，提升小型语言模型的推理能力**, ICML 2024, [[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2402.01620)] [[代码](https:\u002F\u002Fgithub.com\u002Fjustinchiu\u002FMAGDi)]
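\n\n上面「知识蒸馏」各条目的共同骨架，是让学生模型在温度软化后拟合教师分布。下面是该损失的最小 PyTorch 示意（仅为概念演示，与上述任何论文的官方实现无关）：\n\n```python\nimport torch\nimport torch.nn.functional as F\n\ndef distillation_loss(student_logits, teacher_logits, temperature=2.0):\n    \"\"\"经典软标签蒸馏损失：在温度 T 下对齐学生与教师的输出分布。\n    两个输入的形状均为 (batch, num_classes)。\"\"\"\n    t = temperature\n    soft_teacher = F.softmax(torch.div(teacher_logits, t), dim=-1)\n    log_student = F.log_softmax(torch.div(student_logits, t), dim=-1)\n    # 乘以 t*t，使梯度量级与温度近似无关（常用约定）\n    return F.kl_div(log_student, soft_teacher, reduction=\"batchmean\") * (t * t)\n\nstudent = torch.randn(4, 10, requires_grad=True)\nteacher = torch.randn(4, 10)\nprint(distillation_loss(student, teacher).item())\n```\n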
\n\n##### 选择性遗忘\n- **Lyfe Agents: 用于低成本实时社交互动的生成式智能体**, arXiv 2023, [[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2311.09816)]  \n- **TiM: 思考即记忆的语言模型**, ICLR 2024（已提交）, [[论文](https:\u002F\u002Fopenreview.net\u002Fforum?id=TiM24)]  \n- **MemoryBank: 通过长期记忆增强大型语言模型**, AAAI 2024, [[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2308.08589)] [[代码](https:\u002F\u002Fgithub.com\u002Fzhongwanjun\u002FMemoryBank-SiliconFriend)]  \n- **S³: 基于大型语言模型智能体的社会网络模拟系统**, arXiv 2023, [[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2307.14984)] [[代码](https:\u002F\u002Fgithub.com\u002FGA-S3\u002FSocial-Simulation)]  \n- **“我的代理更懂我”**: 见上文\n\n#### 检索\n##### 索引\n- **HippoRAG: 受神经生物学启发的大型语言模型长期记忆**, NeurIPS 2024, [[论文](https:\u002F\u002Fopenreview.net\u002Fforum?id=Rz1nVVnp4P)] [[项目](https:\u002F\u002Fix.cs.uoregon.edu\u002F~apouranb\u002Fhmn\u002Fhmn.html)] [[代码](https:\u002F\u002Fgithub.com\u002FOSU-NLP-Group\u002FHippoRAG)]\n- **TradingGPT: 具有分层记忆的多智能体系统，用于模拟股票交易**, arXiv 2023, [[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2311.05767)]\n- **LongMemEval: 对聊天助手长期交互记忆的基准测试**, ICLR 2025, [[论文](https:\u002F\u002Fopenreview.net\u002Fforum?id=go6gKVh6bV)] [[代码](https:\u002F\u002Fgithub.com\u002Fxiaowu0162\u002FLongMemEval)]\n- **SeCom: 长期个性化对话智能体的记忆构建与检索**, ICLR 2025, [[论文](https:\u002F\u002Fopenreview.net\u002Fforum?id=5eY3sG8o2k)] [[项目](https:\u002F\u002Faka.ms\u002FSECOM)] [[博客](https:\u002F\u002Fwww.microsoft.com\u002Fen-us\u002Fresearch\u002Fblog\u002Fsecom-building-retrieval-based-long-term-memory-for-personalized-conversational-agents\u002F)]\n\n##### 匹配\n- **带有产品密钥的大内存层**, NeurIPS 2019, [[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F1907.05242)] [[代码](https:\u002F\u002Fgithub.com\u002Ffacebookresearch\u002FXLM)]\n- **OSAgent: 由LLM驱动的智能体辅助操作系统**, IJCNN 2024, [[论文](https:\u002F\u002Fdblp.org\u002Frec\u002Fconf\u002Fijcnn\u002FXu0C24.html)]\n- **通过联合学习对齐与翻译实现神经机器翻译**, ICLR 2015, [[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F1409.0473)]\n- **“我的代理更懂我”**: 见上文\n\n#### 神经记忆\n##### 关联记忆\n- **霍普菲尔德网络就是你需要的一切**, ICLR 2021, [[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2008.02217)] [[OpenReview](https:\u002F\u002Fopenreview.net\u002Fpdf?id=tL89RnzIiCd)] [[代码](https:\u002F\u002Fgithub.com\u002Fml-jku\u002Fhopfield-layers)]\n- **用于剩余使用寿命估计问题的神经图灵机**, Computers in Industry 2022, [[论文](https:\u002F\u002Fwww.sciencedirect.com\u002Fscience\u002Farticle\u002Fpii\u002FS0166361522001678)] [[代码](https:\u002F\u002Fgithub.com\u002Faranciokov\u002FNTM-For-RULEstimation)]\n\n##### 参数集成\n- **MemoryLLM：迈向自我更新的大语言模型**，ICML 2024，[[论文](https:\u002F\u002Fcseweb.ucsd.edu\u002F~jmcauley\u002Freviews\u002Ficml24c.pdf)] [[代码](https:\u002F\u002Fgithub.com\u002Fwangyu-ustc\u002FMemoryLLM)]\n- **SELF-PARAM：大语言模型的自参数化改造**，ICLR 2025，[[论文](https:\u002F\u002Fopenreview.net\u002Fforum?id=2f1e7xxycZ)] [[代码](https:\u002F\u002Fgithub.com\u002FXinshuangL\u002FSELF-PARAM)]\n- **MemoRAG：通过全局记忆增强的检索增强技术提升长上下文处理能力**，万维网大会（WWW）2025，[[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2409.05591)] [[代码](https:\u002F\u002Fgithub.com\u002Fqhjqhj00\u002FMemoRAG)]\n- **学会（在测试时学习）：具有表达性隐藏状态的循环神经网络**，ICLR 2025，[[论文](https:\u002F\u002Fopenreview.net\u002Fforum?id=N0bdUqPjbB)] [[代码](https:\u002F\u002Fgithub.com\u002Ftest-time-training\u002Fttt-lm-pytorch)]\n- **泰坦：在测试时学习记忆**，arXiv 2024，[[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2411.08544)] [[非官方代码](https:\u002F\u002Fgithub.com\u002Flucidrains\u002Ftitans-pytorch)]\n- **R³Mem：一种用于大语言模型的三阶记忆**，ICLR 2025（待发表），[[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2412.06607)]
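\n\n上文「关联记忆」中的现代霍普菲尔德网络，其一步检索与注意力同构：按相似度做 softmax 加权读出。下面是 numpy 最小示意（仅为概念演示，非官方实现）：\n\n```python\nimport numpy as np\n\ndef hopfield_retrieve(query, memory, beta=8.0):\n    \"\"\"现代连续霍普菲尔德网络的一步检索：softmax(beta * M q) 加权读出。\n    query 形状为 (d,)，memory 形状为 (n, d)；beta 越大越接近最近的存储模式。\"\"\"\n    scores = beta * (memory @ query)\n    weights = np.exp(scores - scores.max())\n    weights = np.divide(weights, weights.sum())  # softmax 权重\n    return weights @ memory                      # 存储模式的凸组合\n\nmemory = np.array([[1.0, 0.0, 0.0], [0.0, 1.0, 0.0]])\nnoisy = np.array([0.9, 0.1, 0.0])  # 含噪查询\nprint(hopfield_retrieve(noisy, memory))  # 约等于第一个存储模式\n```\n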
\n\n#### 应用\n##### RAG\n- **RAGLAB：检索增强生成研究平台**，EMNLP 2024，[[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2407.03005)] [[代码](https:\u002F\u002Fgithub.com\u002Ffate-ubw\u002FRAGLAB)]\n- **何时不应信任语言模型：探究参数化与非参数化记忆的有效性**，ACL 2023，[[论文](https:\u002F\u002Faclanthology.org\u002F2023.acl-long.546.pdf)]\n- **Atlas：基于检索增强语言模型的少样本学习**，arXiv 2022，[[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2208.03299)] [[代码](https:\u002F\u002Fgithub.com\u002Ffacebookresearch\u002Fatlas)]\n- **具有演化条件记忆的个性化大语言模型助手**，COLING 2025，[[论文](https:\u002F\u002Faclanthology.org\u002F2025.coling-main.254.pdf)]\n\n##### 长上下文建模\n- **递归记忆Transformer**，NeurIPS 2022，[[论文](https:\u002F\u002Fproceedings.neurips.cc\u002Fpaper_files\u002Fpaper\u002F2022\u002Ffile\u002F47e288629a6996a17ce50b90a056a0e1-Paper-Conference.pdf)]\n- **利用RMT将Transformer扩展到100万 tokens及以上**，arXiv 2023，[[论文](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2304.11062)]\n- **调整语言模型以压缩上下文**，EMNLP 2023，[[论文](https:\u002F\u002Faclanthology.org\u002F2023.emnlp-main.232.pdf)]\n- **大型语言模型中用于上下文压缩的上下文自编码器**，ICLR 2024，[[论文](https:\u002F\u002Fopenreview.net\u002Fpdf?id=uREj4ZuGJE)]\n- **学习使用要点令牌压缩提示**，NeurIPS 2023，[[论文](https:\u002F\u002Fpapers.nips.cc\u002Fpaper_files\u002Fpaper\u002F2023\u002Ffile\u002F3d77c6dcc7f143aa2154e7f4d5e22d68-Paper-Conference.pdf)]\n- **CompAct：为问答任务主动压缩检索文档**，EMNLP 2024，[[论文](https:\u002F\u002Faclanthology.org\u002F2024.emnlp-main.1194.pdf)]\n\n##### 缓解幻觉\n- **消除大语言模型幻觉需要重新思考泛化能力**，arXiv 2024，[[论文](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2406.17642)]\n- **Memoria：通过受人类启发的记忆架构解决致命遗忘问题**，ICML 2024，[[论文](https:\u002F\u002Fopenreview.net\u002Fpdf?id=yTz0u4B8ug)]\n- **百万专家混合模型**，arXiv 2024，[[论文](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2407.04153)]\n- **仅在需要时检索：用于缓解大语言模型幻觉的自适应检索增强技术**，arXiv 2024，[[论文](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2402.10612)]\n\n\n\n## 感知\n\u003Cdiv style=\"display: flex; justify-content: space-between;\">\n    \u003Cimg src=\"https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002FFoundationAgents_awesome-foundation-agents_readme_fe396a8773a4.png\" alt=\"感知系统\" width=\"100%\">\n\u003C\u002Fdiv>\n\n### 单模态模型\n\n#### 文本\n- **BERT：面向语言理解的深度双向Transformer预训练**，2018年，[[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F1810.04805)] [[代码](https:\u002F\u002Fgithub.com\u002Fgoogle-research\u002Fbert)]\n- **RoBERTa：鲁棒优化的BERT预训练方法**，2019年，[[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F1907.11692)] [[代码](https:\u002F\u002Fgithub.com\u002Ffacebookresearch\u002Ffairseq)]\n- **ALBERT：用于语言表征自监督学习的轻量级BERT**，2019年，[[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F1909.11942)] [[代码](https:\u002F\u002Fgithub.com\u002Fgoogle-research\u002FALBERT)]\n\n#### 图像\n- **用于图像识别的深度残差学习**，CVPR 2016，[[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F1512.03385)] [[代码](https:\u002F\u002Fgithub.com\u002FKaimingHe\u002Fdeep-residual-networks)]\n- **基于Transformer的端到端目标检测**，2020年，[[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2005.12872)] [[代码](https:\u002F\u002Fgithub.com\u002Ffacebookresearch\u002Fdetr)]\n- **Grounding DINO 1.5：推进开放集目标检测的“前沿”**，2024年，[[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2405.10300)] [[代码](https:\u002F\u002Fgithub.com\u002FIDEA-Research\u002FGrounding-DINO-1.5-API)]\n\n#### 视频\n
- **ViViT：视频视觉Transformer**，2021年，[[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2103.15691)] [[代码](https:\u002F\u002Fgithub.com\u002Fgoogle-research\u002Fscenic\u002Ftree\u002Fmain\u002Fscenic\u002Fprojects\u002Fvivit)]\n- **VideoMAE：掩码自编码器是自监督视频预训练的数据高效学习者**，2022年，[[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2203.12602)] [[代码](https:\u002F\u002Fgithub.com\u002FMCG-NJU\u002FVideoMAE)]\n\n#### 音频\n- **FastSpeech 2：快速且高质量的端到端文本转语音**，2020年，[[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2006.04558)] [[代码](https:\u002F\u002Fspeechresearch.github.io\u002Ffastspeech2)]\n- **Seamless：多语言、富有表现力且流式传输的语音翻译**，2023年，[[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2312.05187)] [[代码](https:\u002F\u002Fgithub.com\u002Ffacebookresearch\u002Fseamless_communication)]\n- **wav2vec 2.0：用于语音表征自监督学习的框架**，2020年，[[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2006.11477)] [[代码](https:\u002F\u002Fgithub.com\u002Ffacebookresearch\u002Ffairseq\u002Ftree\u002Fmain\u002Fexamples\u002Fwav2vec)]\n\n#### 其他单模态\n- **Visual ChatGPT：与视觉基础模型对话、绘图和编辑**，2023年，[[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2303.04671)] [[代码](https:\u002F\u002Fgithub.com\u002Fchenfei-wu\u002FTaskMatrix)]\n- **HuggingGPT：借助ChatGPT及其在Hugging Face中的伙伴解决AI任务**，2024年，[[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2303.17580)] [[代码](https:\u002F\u002Fgithub.com\u002Fmicrosoft\u002FJARVIS)]\n- **MM-REACT：引导ChatGPT进行多模态推理与行动**，2023年，[[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2303.11381)] [[代码](https:\u002F\u002Fgithub.com\u002Fmicrosoft\u002FMM-REACT)]\n- **ViperGPT：通过Python执行实现视觉推理**，2023年，[[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2303.08128)] [[代码](https:\u002F\u002Fgithub.com\u002Fcvlab-columbia\u002Fviper)]\n- **AudioGPT：理解并生成语音、音乐、声音以及会说话的头部**，2024年，[[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2304.12995)] [[代码](https:\u002F\u002Fgithub.com\u002FAIGC-Audio\u002FAudioGPT)]\n- **LLaVA-Plus：学习使用工具创建多模态智能体**，2025年，[[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2311.05437)] [[代码](https:\u002F\u002Fgithub.com\u002FLLaVA-VL\u002FLLaVA-Plus-Codebase)]\n\n### 跨模态模型\n\n#### 文本-图像\n- **从自然语言监督中学习可迁移的视觉模型**，2021年，[[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2103.00020)] [[代码](https:\u002F\u002Fgithub.com\u002FOpenAI\u002FCLIP)]\n- **利用噪声文本监督扩展视觉及视觉-语言表征学习**，2021年，[[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2102.05918)]\n- **通过更优质的标题提升图像生成效果**，2023年，[[论文](https:\u002F\u002Fcdn.openai.com\u002Fpapers\u002Fdall-e-3.pdf)]\n- **VisualBERT：一种简单高效的视觉与语言基准模型**，2019年，[[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F1908.03557)] [[代码](https:\u002F\u002Fgithub.com\u002Fuclanlp\u002Fvisualbert)]\n\n#### 文本-视频\n- **VideoCLIP：用于零样本视频-文本理解的对比预训练**，2021年，[[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2109.14084)] [[代码](https:\u002F\u002Fgithub.com\u002Ffacebookresearch\u002Ffairseq\u002Ftree\u002Fmain\u002Fexamples\u002FMMPT)]\n- **Phenaki：基于开放域文本描述的变长视频生成**，2022年，[[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2210.02399)] [[代码](https:\u002F\u002Fphenaki.github.io\u002F)]\n- **Make-A-Video：无需文本-视频数据的文本到视频生成**，2022年，[[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2209.14792)] [[代码](https:\u002F\u002Fmake-a-video.github.io\u002F)]\n\n#### 文本-音频\n- **Wav2CLIP：从CLIP中学习鲁棒的音频表征**，2022年，[[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2110.11499)] [[代码](https:\u002F\u002Fgithub.com\u002Fdescriptinc\u002Flyrebird-wav2clip)]\n- **VATT：用于从原始视频、音频和文本进行多模态自监督学习的Transformer**，2021年，[[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2104.11178)] [[代码](https:\u002F\u002Fgithub.com\u002Fgoogle-research\u002Fgoogle-research\u002Ftree\u002Fmaster\u002Fvatt)]\n- **AudioCLIP：将CLIP扩展至图像、文本和音频**，2022年，[[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2106.13043)] [[代码](https:\u002F\u002Fgithub.com\u002FAndreyGuzhov\u002FAudioCLIP)]
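\n\n本节 CLIP 系列方法（CLIP、VideoCLIP、Wav2CLIP、AudioCLIP 等）的共同骨架，是把两种模态嵌入同一空间后用归一化点积打分。下面是 numpy 最小示意（嵌入为随机占位数据，仅演示打分逻辑）：\n\n```python\nimport numpy as np\n\ndef clip_style_scores(image_emb, text_embs, temperature=0.01):\n    \"\"\"CLIP 风格匹配打分：L2 归一化后做点积，再按温度 softmax。\n    image_emb 形状为 (d,)，text_embs 形状为 (k, d)。\"\"\"\n    img = np.divide(image_emb, np.linalg.norm(image_emb))\n    txt = np.divide(text_embs, np.linalg.norm(text_embs, axis=1, keepdims=True))\n    logits = np.divide(txt @ img, temperature)\n    probs = np.exp(logits - logits.max())\n    return np.divide(probs, probs.sum())  # 各候选文本与图像匹配的概率\n\nrng = np.random.default_rng(0)\nprint(clip_style_scores(rng.normal(size=8), rng.normal(size=(3, 8))))\n```\n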
\n\n#### 其他跨模态\n- **CLIP-Forge：迈向零样本文本到形状生成**，2022年，[[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2110.02624)] [[代码](https:\u002F\u002Fgithub.com\u002FAutodeskAILab\u002FClip-Forge)]\n- **Point-E：一种可根据复杂提示生成三维点云的系统**，2022年，[[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2212.08751)] [[代码](https:\u002F\u002Fgithub.com\u002Fopenai\u002Fpoint-e)]\n\n### 多模态模型\n\n#### VLM（视觉-语言模型）\n- **MiniGPT-v2：大型语言模型作为视觉-语言多任务学习的统一接口**，2023年，[[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2310.09478)] [[代码](https:\u002F\u002Fgithub.com\u002FVision-CAIR\u002FMiniGPT-4)]\n- **LLaVA-NeXT：改进的推理、OCR和世界知识**，2024年，[[论文](https:\u002F\u002Fllava-vl.github.io\u002Fblog\u002F2024-01-30-llava-next)] [[代码](https:\u002F\u002Fgithub.com\u002FLLaVA-VL\u002FLLaVA-NeXT)]\n- **CogVLM2：用于图像和视频理解的视觉语言模型**，2024年，[[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2408.16500)] [[代码](https:\u002F\u002Fgithub.com\u002FTHUDM\u002FCogVLM2)]\n- **Qwen2-VL：在任何分辨率下增强视觉-语言模型对世界的感知能力**，2024年，[[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2409.12191)] [[代码](https:\u002F\u002Fgithub.com\u002FQwenLM\u002FQwen2.5-VL)]\n- **生成式多模态模型是上下文学习者**，2024年，[[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2312.13286)] [[代码](https:\u002F\u002Fgithub.com\u002Fbaaivision\u002FEmu)]\n\n##### 边缘侧VLM\n- **TinyGPT-V：通过小型骨干网络实现高效的多模态大型语言模型**，2023年，[[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2312.16862)] [[代码](https:\u002F\u002Fgithub.com\u002FDLYuanGod\u002FTinyGPT-V)]\n- **MobileVLM：面向移动设备的快速、强大且开源的视觉语言助手**，2023年，[[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2312.16886)] [[代码](https:\u002F\u002Fgithub.com\u002FMeituan-AutoML\u002FMobileVLM)]\n- **MiniCPM-V：你手机上的GPT-4V级别多模态大型语言模型**，2024年，[[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2408.01800)] [[代码](https:\u002F\u002Fgithub.com\u002FOpenBMB\u002FMiniCPM-V)]\n- **OmniParser：纯视觉驱动的GUI代理**，2024年，[[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2408.00203)] [[代码](https:\u002F\u002Fgithub.com\u002Fmicrosoft\u002FOmniParser)]\n\n#### VLA（视觉-语言-行动模型）\n- **CLIPort：用于机器人操作的“什么”与“哪里”路径规划**，2022年，[[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2109.12098)] [[代码](https:\u002F\u002Fgithub.com\u002Fcliport\u002Fcliport)]\n- **RT-1：面向大规模真实世界控制的机器人Transformer**，2022年，[[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2212.06817)] [[代码](https:\u002F\u002Fgithub.com\u002Fgoogle-research\u002Frobotics_transformer)]\n- **利用预训练视觉-语言模型进行开放世界物体操作**，2023年，[[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2303.00905)] [[代码](https:\u002F\u002Frobot-moo.github.io\u002F)]\n- **Perceiver-Actor：用于机器人操作的多任务Transformer**，2023年，[[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2209.05451)] [[代码](https:\u002F\u002Fgithub.com\u002Fperact\u002Fperact)]\n- **Diffusion Policy：基于动作扩散的视觉-运动策略学习**，2023年，[[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2303.04137)] [[代码](https:\u002F\u002Fgithub.com\u002Freal-stanford\u002Fdiffusion_policy)]\n- **PaLM-E：具身多模态语言模型**，2023年，[[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2303.03378)] [[代码](https:\u002F\u002Fpalm-e.github.io\u002F)]\n- **MultiPLY：3D世界中的多感官、以对象为中心的具身大型语言模型**，2024年，[[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2401.08577)] [[代码](https:\u002F\u002Fgithub.com\u002Feth-ait\u002FMultiPly)]\n\n#### ALM（音频-语言模型）\n- **Audio Flamingo：一种具有少样本学习和对话能力的新颖音频语言模型**，2024年，[[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2402.01831)] [[代码](https:\u002F\u002Faudioflamingo.github.io\u002F)]\n- **SpeechVerse：大规模可泛化的音频语言模型**，2024年，[[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2405.08295)]\n
- **UniAudio 1.5：由大型语言模型驱动的音频编解码器是少样本音频任务学习者**，2024年，[[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2406.10056)] [[代码](https:\u002F\u002Fgithub.com\u002Fyangdongchao\u002FLLM-Codec)]\n- **Qwen2-Audio技术报告**，2024年，[[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2407.10759)] [[代码](https:\u002F\u002Fgithub.com\u002FQwenLM\u002FQwen2-Audio)]\n- **AudioLM：一种基于语言建模的音频生成方法**，2024年，[[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2209.03143)] [[代码](https:\u002F\u002Fgoogle-research.github.io\u002Fseanet\u002Faudiolm\u002Fexamples)]\n- **Mini-Omni：语言模型可以在流式处理中听、说并思考**，2024年，[[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2408.16725)] [[代码](https:\u002F\u002Fgithub.com\u002Fgpt-omni\u002Fmini-omni)]\n- **SpeechGPT：赋予大型语言模型内在的跨模态对话能力**，2023年，[[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2305.11000)] [[代码](https:\u002F\u002Fgithub.com\u002F0nutation\u002FSpeechGPT)]\n\n#### AVLM（音频-视觉-语言模型）\n- **ONE-PEACE：探索一种通用表示模型，迈向无限模态**，2023年，[[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2305.11172)] [[代码](https:\u002F\u002Fgithub.com\u002FOFA-Sys\u002FONE-PEACE)]\n- **PandaGPT：一个模型即可完成所有指令遵循任务**，2023年，[[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2305.16355)] [[代码](https:\u002F\u002Fgithub.com\u002Fyxuansu\u002FPandaGPT)]\n- **Macaw-LLM：融合图像、音频、视频和文本的多模态语言建模**，2023年，[[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2306.09093)] [[代码](https:\u002F\u002Fgithub.com\u002Flyuchenyang\u002FMacaw-LLM)]\n- **LanguageBind：通过基于语言的语义对齐，将视频-语言预训练扩展到N模态**，2023年，[[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2310.01852)] [[代码](https:\u002F\u002Fgithub.com\u002FPKU-YuanGroup\u002FLanguageBind)]\n- **UnIVAL：用于图像、视频、音频和语言任务的统一模型**，2023年，[[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2307.16184)] [[代码](https:\u002F\u002Fgithub.com\u002Fmshukor\u002FUnIVAL)]\n- **X-LLM：通过将多模态视为外语来构建先进的大型语言模型**，2023年，[[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2305.04160)] [[代码](https:\u002F\u002Fgithub.com\u002Fphellonchen\u002FX-LLM)]\n\n#### 其他多模态\n- **PointLLM：赋能大型语言模型理解点云数据**，2025年，[[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2308.16911)] [[代码](https:\u002F\u002Fgithub.com\u002FOpenRobotLab\u002FPointLLM)]\n- **MiniGPT-3D：利用2D先验高效对齐3D点云与大型语言模型**，2024年，[[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2405.01413)] [[代码](https:\u002F\u002Fgithub.com\u002FTangYuan96\u002FMiniGPT-3D)]\n- **NExT-GPT：任意模态之间的多模态大型语言模型**，2023年，[[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2309.05519)] [[代码](https:\u002F\u002Fgithub.com\u002FNExT-GPT\u002FNExT-GPT)]\n- **Unified-IO 2：扩展自回归多模态模型，涵盖视觉、语言、音频和行动**，2024年，[[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2312.17172)] [[代码](https:\u002F\u002Fgithub.com\u002Fallenai\u002Funified-io-2)]\n- **CoDi-2：任意模态之间的上下文内、交错式和交互式生成**，2024年，[[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2311.18775)] [[代码](https:\u002F\u002Fgithub.com\u002Fmicrosoft\u002Fi-Code\u002Ftree\u002Fmain\u002FCoDi-2)]\n- **ModaVerse：利用大型语言模型高效转换模态**，2024年，[[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2401.06395)] [[代码](https:\u002F\u002Fgithub.com\u002Fxinke-wang\u002FModaVerse)]\n\n## 世界模型\n\n\u003Cdiv style=\"display: flex; justify-content: space-between;\">\n    \u003Cimg src=\"https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002FFoundationAgents_awesome-foundation-agents_readme_f4b9199bfc99.png\" alt=\"Foundation Agents中的世界模型\" width=\"100%\">\n\u003C\u002Fdiv>\n\n### 外部方法\n**DINO-WM [358]：基于预训练视觉特征的视频世界模型实现零样本规划**，arXiv 2024年，[[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2411.04983)]，[[代码]()]\n\n**SAPIEN [351]：基于部件的交互式仿真环境**，CVPR 2020年，[[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2003.08515)]，[[代码]()]\n
\n**MuZero [349]：通过学习模型规划掌握雅达利、围棋、国际象棋和将棋**，Nature 2020年，[[论文](https:\u002F\u002Fwww.nature.com\u002Farticles\u002Fs41586-020-03051-4)]，[[代码]()]\n\n**GR-2 [357]：具有网络规模知识的生成式视频-语言-动作模型用于机器人操作**，arXiv 2024年，[[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2410.06158)]，[[代码]()]\n\n**COAT [356]：利用大型语言模型发现隐藏的世界**，arXiv 2024年，[[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2402.03941)]，[[代码]()]\n\n**AutoManual [108]：通过LLM智能体结合交互式环境学习生成使用说明书**，arXiv 2024年，[[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2405.16247)]，[[代码]()]\n\n**PILCO [355]：一种基于模型且数据高效的策略搜索方法**，ICML 2011年，[[论文](https:\u002F\u002Fproceedings.mlr.press\u002Fv17\u002Fdeisenroth12a.html)]，[[代码]()]\n\n### 内部方法\n**ActRe [49]：ReAct遇上ActRe：用于对比自监督学习的智能体轨迹自主标注**，arXiv 2024年，[[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2403.14589)]，[[代码]()]\n\n**世界模型 [348]：世界模型**，NeurIPS 2018年，[[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F1803.10122)]，[[代码]()]\n\n**Dreamer [350]：梦想即控制：通过潜在想象学习行为**，ICLR 2020年，[[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F1912.01603)]，[[代码]()]\n\n**扩散世界模型 [353]：用于世界建模的扩散模型——雅达利游戏中视觉细节至关重要**，arXiv 2024年，[[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2405.12399)]，[[代码]()]\n\n**GQN [354]：神经场景表示与渲染**，Science 2018年，[[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F1807.07422)]，[[代码]()]\n\n**Daydreamer [352]：用于物理机器人学习的世界模型**，CoRL 2023年，[[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2206.14176)]，[[代码]()]\n\n## 行动\n\u003Cdiv style=\"display: flex; justify-content: space-between;\">\n    \u003Cimg src=\"https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002FFoundationAgents_awesome-foundation-agents_readme_110777c4a2df.jpg\" alt=\"行动。\" width=\"100%\">\n\u003C\u002Fdiv>\n\n### 行动空间\n\n### 语言\n\n#### 文本\n\n- **ReAct：在语言模型中协同推理与行动**，ICLR 2023年，[[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2210.03629)] [[代码](https:\u002F\u002Fgithub.com\u002Fysymyth\u002FReAct)]\n\n- **AutoGPT：构建、部署和运行AI智能体**，GitHub，[[代码](https:\u002F\u002Fgithub.com\u002FSignificant-Gravitas\u002FAutoGPT)]\n\n- **Reflexion：具备言语强化学习能力的语言智能体**，NeurIPS 2023年，[[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2303.11366)] [[代码](https:\u002F\u002Fgithub.com\u002Fnoahshinn\u002Freflexion)]\n\n- **LLM+P：以最优规划能力赋能大型语言模型**，arXiv 2023年，[[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2304.11477)] [[代码](https:\u002F\u002Fgithub.com\u002FCranial-XIX\u002Fllm-pddl)]\n\n#### 代码\n\n- **MetaGPT：面向多智能体协作框架的元编程**，ICLR 2024年，[[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2308.00352)] [[代码](https:\u002F\u002Fgithub.com\u002Fgeekan\u002FMetaGPT)]\n\n- **ChatDev：用于软件开发的沟通型智能体**，ACL 2024年，[[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2307.07924)] [[代码](https:\u002F\u002Fgithub.com\u002FOpenBMB\u002FChatDev)]\n\n- **SWE-agent：智能体-计算机接口助力自动化软件工程**，NeurIPS 2024年，[[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2405.15793)] [[代码](https:\u002F\u002Fgithub.com\u002FSWE-agent\u002FSWE-agent)]\n\n- **OpenHands：面向通用型AI软件开发者的开放平台**，arXiv 2024年，[[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2407.16741)] [[代码](https:\u002F\u002Fgithub.com\u002FAll-Hands-AI\u002FOpenHands)]\n\n#### 聊天\n\n- **生成式智能体：人类行为的交互式模拟**，UIST 2023年，[[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2304.03442)] [[代码](https:\u002F\u002Fgithub.com\u002Fjoonspk-research\u002Fgenerative_agents)]\n\n- **AutoGen：通过多智能体对话实现下一代LLM应用**，COLM 2024年，[[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2308.08155)] [[代码](https:\u002F\u002Fgithub.com\u002Fmicrosoft\u002Fautogen)]\n\n### 数字\n\n#### 游戏\n\n
- **MineDojo：利用互联网规模知识构建开放式具身智能体**，NeurIPS 2022，[[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2206.08853)] [[代码](https:\u002F\u002Fgithub.com\u002FMineDojo\u002FMineDojo)]\n  \n- **Voyager：基于大型语言模型的开放式具身智能体**，TMLR 2024，[[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2305.16291)] [[代码](https:\u002F\u002Fgithub.com\u002FMineDojo\u002FVoyager)]\n\n- **SwarmBrain：通过大型语言模型实现的即时战略游戏《星际争霸II》具身智能体**，arXiv 2024，[[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2401.17749)] [[代码](https:\u002F\u002Fgithub.com\u002Framsayxiaoshao\u002FSwarmBrain)]\n\n- **JARVIS-1：具有记忆增强型多模态语言模型的开放世界多任务智能体**，NeurIPS 2023 ALOE Workshop，[[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2311.05997)] [[代码](https:\u002F\u002Fgithub.com\u002FCraftJarvis\u002FJARVIS-1)]\n\n#### 多模态\n\n- **MM-REACT：提示ChatGPT进行多模态推理与行动**，arXiv 2023，[[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2303.11381)] [[代码](https:\u002F\u002Fgithub.com\u002Fmicrosoft\u002FMM-REACT)]\n\n- **ViperGPT：通过Python执行实现视觉推理**，ICCV 2023，[[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2303.08128)] [[代码](https:\u002F\u002Fgithub.com\u002Fcvlab-columbia\u002Fviper)]\n\n- **Visual ChatGPT：与视觉基础模型对话、绘图和编辑**，arXiv 2023，[[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2303.04671)] [[代码](https:\u002F\u002Fgithub.com\u002Fchenfei-wu\u002FTaskMatrix)]\n\n- **HuggingGPT：借助ChatGPT及其在Hugging Face中的伙伴解决AI任务**，NeurIPS 2023，[[论文](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2303.17580)] [[代码](https:\u002F\u002Fgithub.com\u002Fmicrosoft\u002FJARVIS)]\n\n#### 网络\n\n- **WebGPT：基于浏览器辅助的人工反馈问答系统**，arXiv 2021，[[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2112.09332)] [[博客](https:\u002F\u002Fopenai.com\u002Findex\u002Fwebgpt\u002F)]\n\n- **WebShop：迈向可扩展的真实世界网络交互——基于接地语言模型的智能体**，NeurIPS 2022，[[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2207.01206)] [[代码](https:\u002F\u002Fgithub.com\u002Fprinceton-nlp\u002FWebShop)]\n\n- **具备规划、长上下文理解与程序合成能力的真实世界网络智能体**，ICLR 2024，[[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2307.12856)]\n\n- **Mind2Web：迈向通用的网络智能体**，NeurIPS 2023，[[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2306.06070)] [[代码](https:\u002F\u002Fgithub.com\u002FOSU-NLP-Group\u002FMind2Web)]\n\n#### GUI\n\n- **Mobile-Agent：具有视觉感知能力的自主多模态移动设备智能体**，arXiv 2024，[[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2401.16158)] [[代码](https:\u002F\u002Fgithub.com\u002FX-PLUG\u002FMobileAgent)]\n\n- **AppAgent：作为智能手机用户的多模态智能体**，arXiv 2023，[[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2312.13771)] [[代码](https:\u002F\u002Fgithub.com\u002FTencentQQGYLab\u002FAppAgent)]\n\n- **UFO：专注于Windows操作系统交互的UI导向智能体**，arXiv 2024，[[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2402.07939)] [[代码](https:\u002F\u002Fgithub.com\u002Fmicrosoft\u002FUFO)]\n\n- **OmniParser：纯视觉驱动的GUI智能体**，arXiv 2024，[[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2408.00203)] [[代码](https:\u002F\u002Fgithub.com\u002Fmicrosoft\u002FOmniParser)]\n\n#### 数据库与知识图谱\n\n- **大型语言模型驱动的NL2SQL综述：我们目前处于什么阶段？未来又将走向何方？**，arXiv 2024，[[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2408.05109)] [[手册](https:\u002F\u002Fgithub.com\u002FHKUSTDial\u002FNL2SQL_Handbook)]\n\n- **Alpha-SQL：基于蒙特卡洛树搜索的零样本文本到SQL转换**，ICML 2025，[[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2502.17248)]\n  \n- **NL2SQL-Bugs：用于检测NL2SQL翻译中语义错误的基准测试**，SIGKDD 2025，[[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2503.11984)] [[代码](https:\u002F\u002Fnl2sql-bugs.github.io\u002F)]\n  \n- **EllieSQL：具有复杂度感知路由的成本高效文本到SQL转换**，arXiv 2025，[[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2503.22402)] [[代码](https:\u002F\u002Felliesql.github.io\u002F)]\n  \n- **nvBench 2.0：面向歧义情况下的自然语言到可视化基准测试**，arXiv 2025，[[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2503.12880)] [[代码](https:\u002F\u002Fnvbench2.github.io\u002F)]
\n\n- **自然语言到SQL的曙光：我们准备好了吗？**，VLDB 2024，[[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2406.01265)] [[代码](https:\u002F\u002Fnl2sql360.github.io\u002F)]\n  \n- **大型语言模型是优秀的统计学家吗？**，NeurIPS 2024，[[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2406.07815)] [[代码](https:\u002F\u002Fstatqa.github.io\u002F)]\n\n- **UnifiedSKG：利用文本到文本语言模型实现结构化知识的统一与多任务接地**，EMNLP 2022，[[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2201.05966)] [[代码](https:\u002F\u002Fgithub.com\u002Fxlang-ai\u002FUnifiedSKG)]\n\n- **不要生成，要判别：一种将语言模型接地于真实世界环境的方案**，ACL 2023，[[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2212.09736)] [[代码](https:\u002F\u002Fgithub.com\u002Fdki-lab\u002FPangu)]\n\n- **LLM是否已经可以作为数据库接口？大规模数据库接地文本到SQL的大规模基准测试**，NeurIPS 2023，[[论文](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2305.03111)] [[项目](https:\u002F\u002Fbird-bench.github.io\u002F)]\n\n- **Spider 2.0：评估语言模型在真实世界企业文本到SQL工作流中的表现**，ICLR 2025，[[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2411.07763)] [[代码](https:\u002F\u002Fgithub.com\u002Fxlang-ai\u002FSpider2)]\n\n- **LLM中间件：工具对于复杂环境中语言智能体至关重要**，EMNLP 2024，[[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2402.14672)] [[代码](https:\u002F\u002Fgithub.com\u002FOSU-NLP-Group\u002FMiddleware)]\n\n### 物理\n\n- **RT-1：用于大规模真实世界控制的机器人Transformer**，RSS 2023，[[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2212.06817)] [[项目](https:\u002F\u002Frobotics-transformer1.github.io\u002F)]\n\n- **RT-2：视觉-语言-动作模型将网络知识迁移到机器人控制中**，CoRL 2023，[[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2307.15818)] [[项目](https:\u002F\u002Frobotics-transformer2.github.io\u002F)]\n\n- **Open X-Embodiment：机器人学习数据集与RT-X模型**，arXiv 2023，[[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2310.08864v4)] [[项目](https:\u002F\u002Frobotics-transformer-x.github.io\u002F)]\n  \n- **GR-2：具有网络规模知识的生成式视频-语言-动作模型，用于机器人操作**，arXiv 2024，[[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2410.06158)] [[项目](https:\u002F\u002Fgr2-manipulation.github.io\u002F)]\n  \n- **π0：用于通用机器人控制的视觉-语言-动作流模型**，arXiv 2024，[[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2410.24164)]\n\n- **按我能做的做，而非我所说的做：将语言接地于机器人的可用性**，CoRL 2022，[[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2204.01691)] [[项目](https:\u002F\u002Fsay-can.github.io\u002F)]\n\n- **Voxposer：用于机器人操作的语言模型可组合3D价值地图**，CoRL 2023，[[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2307.05973)] [[代码](https:\u002F\u002Fgithub.com\u002Fhuangwl18\u002FVoxPoser)]\n\n- **Embodiedgpt：通过具身思维链进行视觉-语言预训练**，NeurIPS 2023，[[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2305.15021)] [[项目](https:\u002F\u002Fembodiedgpt.github.io\u002F)]\n\n### 学习\n\n### ICL（上下文学习）\n\n#### 提示\n\n- **CoT：思维链提示在大型语言模型中激发推理**，NeurIPS 2022，[[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2201.11903)]\n\n- **ReAct：ReAct——在语言模型中协同推理与行动**，arXiv 2022，[[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2210.03629)] [[项目](https:\u002F\u002Freact-lm.github.io\u002F)]\n\n- **Auto-CoT：大型语言模型中的自动思维链提示**，ICLR 2023，[[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2210.03493)] [[代码](https:\u002F\u002Fgithub.com\u002Famazon-science\u002Fauto-cot)]\n\n- **ToT：思维树——利用大型语言模型进行审慎问题解决**，NeurIPS 2023，[[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2305.10601)] [[代码](https:\u002F\u002Fgithub.com\u002Fprinceton-nlp\u002Ftree-of-thought-llm)]\n\n- **GoT：思维图——用大型语言模型解决复杂问题**，AAAI 2024，[[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2308.09687)] [[代码](https:\u002F\u002Fgithub.com\u002Fspcl\u002Fgraph-of-thoughts)]\n\n- **LearnAct：通过动作学习赋能大型语言模型智能体**，arXiv 2024，[[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2402.15809)] [[代码](https:\u002F\u002Fgithub.com\u002Fzhao-ht\u002FLearnAct)]\n\n- **CoA：利用稀疏通信拓扑改进多智能体辩论**，arXiv 2024，[[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2406.11776)]
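\n\n上面「提示」一节中 ReAct 一类方法的核心，只是“思考→行动→观察”的文本控制循环。下面给出与具体模型和工具无关的最小示意（`llm` 与 `tools` 均为调用方注入的假设性接口）：\n\n```python\ndef react_loop(llm, tools, question, max_steps=5):\n    \"\"\"ReAct 风格的极简控制循环：模型产出形如 Action: 工具名[参数] 的行，\n    宿主执行工具并把 Observation 拼回上下文；llm(prompt) 返回字符串。\"\"\"\n    prompt = \"Question: \" + question + \"\\n\"\n    for _ in range(max_steps):\n        output = llm(prompt)          # 期望包含 Thought 与 Action 行\n        prompt += output + \"\\n\"\n        if \"Action:\" not in output:   # 没有动作，视为给出最终答案\n            return output\n        call = output.split(\"Action:\", 1)[1].strip()\n        if \"[\" not in call:\n            return output\n        tool, arg = call.split(\"[\", 1)\n        obs = tools[tool.strip()](arg.rstrip(\"]\"))  # 执行工具调用\n        prompt += \"Observation: \" + str(obs) + \"\\n\"\n    return prompt\n```\n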
\n\n#### 分解\n\n- **Least-to-Most：由低到高提示使大型语言模型具备复杂推理能力**，ICLR 2023，[[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2205.10625)]\n\n- **HuggingGPT：结合ChatGPT及其在Hugging Face中的伙伴解决AI任务**，NeurIPS 2023，[[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2303.17580)] [[代码](https:\u002F\u002Fgithub.com\u002Fmicrosoft\u002FJARVIS)]\n\n- **Plan-and-Solve：计划与求解提示——提升大型语言模型的零样本思维链推理能力**，ACL 2023，[[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2305.04091)] [[代码](https:\u002F\u002Fgithub.com\u002FAGI-Edgerunners\u002FPlan-and-Solve-Prompting)]\n\n- **ProgPrompt：利用大型语言模型生成情境化的机器人任务规划**，ICRA 2023，[[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2209.11302)] [[项目](https:\u002F\u002Fprogprompt.github.io\u002F)]\n\n#### 角色扮演\n\n- **Generative Agents：人类行为的交互式模拟**，arXiv 2023，[[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2304.03442)] [[代码](https:\u002F\u002Fgithub.com\u002Fjoonspk-research\u002Fgenerative_agents)]\n\n- **MetaGPT：面向多智能体协作框架的元编程**，ICLR 2024，[[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2308.00352)] [[代码](https:\u002F\u002Fgithub.com\u002Fgeekan\u002FMetaGPT)]\n\n- **ChatDev：用于软件开发的沟通型智能体**，ACL 2024，[[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2307.07924)] [[代码](https:\u002F\u002Fgithub.com\u002FOpenBMB\u002FChatDev)]\n\n- **SWE-Agent：智能体-计算机接口助力自动化软件工程**，arXiv 2024，[[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2405.15793)] [[项目](https:\u002F\u002Fswe-agent.com\u002Flatest\u002F)]\n\n#### 精炼\n\n- **Reflexion：具有语言强化学习能力的语言智能体**，NeurIPS 2023，[[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2303.11366)] [[代码](https:\u002F\u002Fgithub.com\u002Fnoahshinn\u002Freflexion)]\n\n- **Self-refine：基于自我反馈的迭代精炼**，NeurIPS 2023，[[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2303.17651)] [[代码](https:\u002F\u002Fgithub.com\u002Fmadaan\u002Fself-refine)]\n\n- **GPTSwarm：可优化图结构的语言智能体**，ICML 2024，[[论文](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2402.16823v3)] [[项目](https:\u002F\u002Fgptswarm.org\u002F)]
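\n\n上面"精炼"类方法（Reflexion、Self-refine 等）细节各异，但都围绕"生成—自我反馈—修订"的循环展开。下面是一个与具体论文无关的最小骨架示意（`ask_llm` 仍为假设的文本补全接口）：\n\n```python\ndef self_refine(ask_llm, task: str, max_rounds: int = 3) -> str:\n    draft = ask_llm(f'请完成任务：{task}')\n    for _ in range(max_rounds):\n        # 让模型批评自己的输出；约定足够好时只回答 DONE\n        feedback = ask_llm(\n            f'任务：{task}\\n当前答案：{draft}\\n'\n            f'请指出不足之处；若已足够好，只回答 DONE。'\n        )\n        if 'DONE' in feedback:\n            break\n        # 按反馈修订答案，进入下一轮循环\n        draft = ask_llm(\n            f'任务：{task}\\n当前答案：{draft}\\n'\n            f'改进意见：{feedback}\\n请给出修订后的答案：'\n        )\n    return draft\n```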
\n\n### PT & SFT（预训练与监督微调）\n\n#### 预训练\n\n- **RT-1：面向大规模真实世界控制的机器人Transformer**，arXiv 2022，[[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2212.06817)] [[项目](https:\u002F\u002Frobotics-transformer1.github.io\u002F)]\n\n- **RT-2：视觉-语言-动作模型将网络知识迁移到机器人控制中**，arXiv 2023，[[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2307.15818)] [[项目](https:\u002F\u002Frobotics-transformer2.github.io\u002F)]\n\n- **RT-X：Open X-Embodiment——机器人学习数据集及RT-X模型**，arXiv 2023，[[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2310.08864)] [[项目](https:\u002F\u002Frobotics-transformer-x.github.io\u002F)]\n\n- **GR-2：具有网络规模知识的生成式视频-语言-动作模型，用于机器人操作**，arXiv 2024，[[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2410.06158)] [[项目](https:\u002F\u002Fgr2-manipulation.github.io\u002F)]\n\n- **LAM：大型动作模型——从构思到实现**，arXiv 2024，[[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2412.10047)] [[代码](https:\u002F\u002Fgithub.com\u002Fmicrosoft\u002FUFO\u002Ftree\u002Fmain\u002Fdataflow)]\n\n#### 微调\n\n- **CogACT：用于协同机器人操作中认知与动作的基础性视觉-语言-动作模型**，arXiv 2024，[[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2411.19650)] [[项目](https:\u002F\u002Fcogact.github.io\u002F)]\n\n- **RT-H：使用语言构建动作层次结构**，arXiv 2024，[[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2403.01823)] [[项目](https:\u002F\u002Frt-hierarchy.github.io\u002F)]\n\n- **OpenVLA：一个开源的视觉-语言-动作模型**，arXiv 2024，[[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2406.09246)] [[项目](https:\u002F\u002Fopenvla.github.io\u002F)]\n\n- **$\\pi_0$：一种用于通用机器人控制的视觉-语言-动作流模型**，arXiv 2024，[[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2410.24164)] [[项目](https:\u002F\u002Fwww.physicalintelligence.company\u002Fblog\u002Fpi0)]\n\n- **UniAct：增强具身基础模型的通用动作**，CVPR 2025，[[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2501.10105)] [[代码](https:\u002F\u002Fgithub.com\u002F2toinf\u002FUniAct)]\n\n### 强化学习 (RL)\n\n- **RLHF：利用人类反馈训练语言模型遵循指令**，NeurIPS 2022，[[论文](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2203.02155)]\n\n- **DPO：直接偏好优化——你的语言模型其实是一个奖励模型**，NeurIPS 2023，[[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2305.18290)]\n\n- **RLFP：基于基础先验的强化学习——让具身智能体高效地自主学习**，CoRL 2024，[[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2310.02635)] [[项目](https:\u002F\u002Fyewr.github.io\u002Frlfp\u002F)]\n\n- **ELLM：利用大型语言模型指导强化学习中的预训练**，ICML 2023，[[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2302.06692)] [[代码](https:\u002F\u002Fgithub.com\u002Fyuqingd\u002Fellm)]\n\n- **GenSim：通过大型语言模型生成机器人仿真任务**，arXiv 2023，[[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2310.01361)] [[项目](https:\u002F\u002Fgen-sim.github.io\u002F)]\n\n- **LEA：基于强化学习与大型语言模型进行状态、奖励与动作建模的推荐系统**，ACM 2024，[[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2403.16948)]\n\n- **MLAQ：通过Q-learning实现零样本最优决策，赋能LLM智能体**，ICLR 2025，[[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2502.11211)]\n\n- **KALM：基于大型语言模型回放的离线强化学习构建知识型智能体**，NeurIPS 2024，[[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2404.09248)] [[项目](https:\u002F\u002Fkalmneurips2024.github.io\u002F)]\n\n- **When2Ask：通过强化学习实现智能体与LLM之间的智能交互**，RLC 2024，[[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2306.03604)]\n\n- **Eureka：利用大型语言模型编程实现人类水平的奖励设计**，ICLR 2024，[[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2310.12931)] [[项目](https:\u002F\u002Feureka-research.github.io\u002F)]\n\n- **ArCHer：通过分层多轮强化学习训练语言模型智能体**，arXiv 2024，[[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2402.19446)] [[项目](https:\u002F\u002Fyifeizhou02.github.io\u002Farcher.io\u002F)]\n\n- **LLaRP：大型语言模型作为具身任务的可泛化策略**，ICLR 2024，[[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2310.17722)] [[项目](https:\u002F\u002Fllm-rl.github.io\u002F)]\n\n- **GPTSwarm：将语言智能体视为可优化图结构**，ICML 2024，[[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2402.16823)] [[项目](https:\u002F\u002Fgptswarm.org\u002F)]
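\n\n上表中 RLHF 与 DPO 是最常被引用的两条对齐路线。作为参考，DPO 论文给出的优化目标可以写成（$\\pi_{\\theta}$ 为待训策略，$\\pi_{\\mathrm{ref}}$ 为参考模型，$(x, y_w, y_l)$ 为"提示、较优回答、较差回答"偏好三元组，$\\sigma$ 为 sigmoid 函数，$\\beta$ 为温度系数）：\n\n$$\\mathcal{L}_{\\mathrm{DPO}}(\\pi_{\\theta};\\pi_{\\mathrm{ref}}) = -\\mathbb{E}_{(x,y_w,y_l)\\sim\\mathcal{D}}\\left[\\log\\sigma\\left(\\beta\\log\\frac{\\pi_{\\theta}(y_w\\mid x)}{\\pi_{\\mathrm{ref}}(y_w\\mid x)} - \\beta\\log\\frac{\\pi_{\\theta}(y_l\\mid x)}{\\pi_{\\mathrm{ref}}(y_l\\mid x)}\\right)\\right]$$\n\n直觉上，它把"相对参考模型的对数概率比"当作隐式奖励直接做成对分类，从而省去显式奖励模型与 RL 训练环节；下一节中的许多偏好优化变体（sDPO、β-DPO、ORPO、SimPO 等）都可以看作对这一目标的改造。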
\n\n## 奖励\n\u003Cdiv style=\"display: flex; justify-content: space-between;\">\n    \u003Cimg src=\"https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002FFoundationAgents_awesome-foundation-agents_readme_b69f0704a2cd.png\" alt=\"奖励系统\" width=\"100%\">\n\u003C\u002Fdiv>\n\n### 外在奖励\n#### 密集奖励\n- **利用人类反馈训练语言模型遵循指令**，2022年，[[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2203.02155)] [[代码](https:\u002F\u002Fgithub.com\u002Fopenai\u002Ffollowing-instructions-human-feedback)]\n- **用于大型语言模型对齐的离线正则化强化学习**，2024年，[[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2405.19107)]\n- **sDPO：不要一次性用尽你的数据**，2024年，[[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2403.19270)]\n- **理解基于人类偏好学习的一般理论范式**，2024年，[[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2310.12036)]\n- **β-DPO：带有动态β的直接偏好优化**，2025年，[[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2407.08639)]\n- **ORPO：无需参考模型的整体式偏好优化**，2024年，[[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2403.07691)] [[代码](https:\u002F\u002Fgithub.com\u002Fxfactlab\u002Forpo)]\n- **直接纳什优化：用通用偏好教导语言模型自我改进**，2024年，[[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2404.03715)]\n- **超越反向KL：通过多样化的散度约束推广直接偏好优化**，2023年，[[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2309.16240)]\n- **有些东西比其他东西更让人尴尬：基于成对"尴尬"损失的迭代偏好优化**，2023年，[[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2312.16682)]\n- **从r到Q∗：你的语言模型其实是一个Q函数**，2024年，[[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2404.12358)]\n\n#### 稀疏奖励\n- **PAFT：一种用于高效微调LLM的并行训练范式**，2024年，[[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2406.17923)]\n- **SimPO：一种无需参考奖励的简单偏好优化**，2025年，[[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2405.14734)] [[代码](https:\u002F\u002Fgithub.com\u002Fprinceton-nlp\u002FSimPO)]\n- **LiPO：通过排序学习进行列表式偏好优化**，2024年，[[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2402.01878)]\n- **RRHF：无需泪水地通过人类反馈对齐语言模型的回答排名**，2023年，[[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2304.05302)] [[代码](https:\u002F\u002Fgithub.com\u002FGanjinZero\u002FRRHF)]\n- **用于人类对齐的偏好排序优化**，2024年，[[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2306.17492)] [[代码](https:\u002F\u002Fgithub.com\u002FAlibabaResearch\u002FDAMO-ConvAI\u002Ftree\u002Fmain\u002FPRO)]\n- **否定消极样本：通过分布式非偏好优化实现与人类负面样本对齐**，2024年，[[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2403.03419)]\n- **负向偏好优化：从灾难性崩溃到有效遗忘**，2024年，[[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2404.05868)] [[代码](https:\u002F\u002Fgithub.com\u002Flicong-lin\u002Fnegative-preference-optimization)]\n- **回归基础：重新审视LLM中基于人类反馈学习的REINFORCE风格优化**，2024年，[[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2402.14740)] [[代码](https:\u002F\u002Fgithub.com\u002Fopenai\u002Fsummarize-from-feedback)]\n\n#### 延迟奖励\n- **对比偏好优化：推动机器翻译中LLM性能的边界**，2024年，[[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2401.08417)] [[代码](https:\u002F\u002Fgithub.com\u002Ffe1ixxu\u002FALMA)]\n- **基于人类反馈的纳什学习**，2023年，[[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2312.00886)]\n- **一种极小极大主义的基于人类反馈的强化学习方法**，2024年，[[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2401.04056)]\n\n#### 自适应奖励\n- **利用人类反馈训练语言模型遵循指令**，2022年，[[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2203.02155)] [[代码](https:\u002F\u002Fgithub.com\u002Fopenai\u002Ffollowing-instructions-human-feedback)]\n- **用于大型语言模型对齐的离线正则化强化学习**，2024年，[[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2405.19107)]\n- **β-DPO：带有动态β的直接偏好优化**，2025年，[[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2407.08639)]\n- **ORPO：无需参考模型的整体式偏好优化**，2024年，[[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2403.07691)] [[代码](https:\u002F\u002Fgithub.com\u002Fxfactlab\u002Forpo)]\n- **PAFT：一种用于高效微调LLM的并行训练范式**，2024年，[[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2406.17923)]\n- **SimPO：一种无需参考奖励的简单偏好优化**，2025年，[[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2405.14734)] [[代码](https:\u002F\u002Fgithub.com\u002Fprinceton-nlp\u002FSimPO)]\n- **基于人类反馈的纳什学习**，2023年，[[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2312.00886)]\n- **一种极小极大主义的基于人类反馈的强化学习方法**，2024年，[[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2401.04056)]\n- **超越反向KL：通过多样化的散度约束推广直接偏好优化**，2023年，[[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2309.16240)]\n\n### 内在奖励\n#### 好奇心驱动的奖励\n- **基于自监督预测的好奇心驱动探索**, 2017, [[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F1705.05363)] [[代码](https:\u002F\u002Fgithub.com\u002Fpathak22\u002Fnoreward-rl)]\n- **通过分歧进行的自监督探索**, 2019, [[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F1906.04161)] [[代码](https:\u002F\u002Fgithub.com\u002Fpathak22\u002Fexploration-by-disagreement)]\n- **利用自监督世界模型规划探索**, 2020, [[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2005.05960)] [[代码](https:\u002F\u002Fgithub.com\u002Framanans1\u002Fplan2explore)]\n\n#### 多样性奖励\n- **LIIR：多智能体强化学习中的个体内在奖励学习**, 2019, 
[[论文](https:\u002F\u002Fproceedings.neurips.cc\u002Fpaper_files\u002Fpaper\u002F2019\u002Ffile\u002F07a9d3fed4c5ea6b17e80258dee231fa-Paper.pdf)]\n\n#### 能力基础奖励\n- **CURIOUS：内在动机的模块化多目标强化学习**, 2019, [[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F1810.06284)] [[代码](https:\u002F\u002Fgithub.com\u002Fflowersteam\u002Fcurious)]\n- **Skew-Fit：状态覆盖的自监督强化学习**, 2019, [[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F1903.03698)]\n- **DISCERN：基于多样性的质心选择，用于 k-估计与快速非随机聚类**, 2021, [[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F1910.05933)] [[代码](https:\u002F\u002Fgithub.com\u002Falihassanijr\u002FDISCERN)]\n- **自我博弈微调将弱语言模型转化为强语言模型**, 2024, [[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2401.01335)] [[代码](https:\u002F\u002Fgithub.com\u002Fuclaml\u002FSPIN)]\n- **KTO：作为前景理论优化的模型对齐**, 2024, [[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2402.01306)] [[代码](https:\u002F\u002Fgithub.com\u002FContextualAI\u002FHALOs)]\n\n#### 探索奖励\n- **自我博弈微调将弱语言模型转化为强语言模型**, 2024, [[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2401.01335)] [[代码](https:\u002F\u002Fgithub.com\u002Fuclaml\u002FSPIN)]\n- **随机网络蒸馏的探索**, 2018, [[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F1810.12894)] [[代码](https:\u002F\u002Fgithub.com\u002Fopenai\u002Frandom-network-distillation)]\n\n#### 信息增益奖励\n- **通过信息论理解大语言模型中的思维链**, 2024, [[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2411.11984)]\n- **VIME：变分信息最大化探索**, 2016, [[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F1605.09674)] [[代码](https:\u002F\u002Fgithub.com\u002Fopenai\u002Fvime)]\n- **EMI：基于互信息的探索**, 2019, [[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F1810.01176)] [[代码](https:\u002F\u002Fgithub.com\u002Fsnu-mllab\u002FEMI)]\n- **基于模型的主动探索**, 2019, [[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F1810.12162)] [[代码](https:\u002F\u002Fgithub.com\u002Fnnaisense\u002Fmax)]\n- **KTO：作为前景理论优化的模型对齐**, 2024, [[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2402.01306)] [[代码](https:\u002F\u002Fgithub.com\u002FContextualAI\u002FHALOs)]\n\n### 混合奖励\n#### 内在奖励与外在奖励的结合\n- **RLAIF vs. 
RLHF：通过 AI 反馈扩展人类反馈强化学习**, 2023, [[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2309.00267)]\n- **宪法式 AI：来自 AI 反馈的无害性**, 2022, [[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2212.08073)] [[代码](https:\u002F\u002Fgithub.com\u002Fanthropics\u002FConstitutionalHarmlessnessPaper)]\n- **基于人类反馈的迭代偏好学习：在 KL 约束下弥合理论与实践的 RLHF**, 2023, [[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2312.11456)]\n- **RLHF 工作流：从奖励建模到在线 RLHF**, 2024, [[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2405.07863)] [[代码](https:\u002F\u002Fgithub.com\u002FRLHFlow\u002FRLHF-Reward-Modeling)]\n\n### 层次奖励\n#### 层次奖励\n- **令牌级别的直接偏好优化**, 2024, [[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2404.11999)] [[代码](https:\u002F\u002Fgithub.com\u002FVance0124\u002FToken-level-Direct-Preference-Optimization)]\n\n## 情感\n\n# 智能体中的自我提升\n\u003Cdiv style=\"display: flex; justify-content: space-between;\">\n    \u003Cimg src=\"https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002FFoundationAgents_awesome-foundation-agents_readme_0ddb66b024d4.png\" alt=\"自我进化\" width=\"100%\">\n\u003C\u002Fdiv>\n\n### 优化空间\n\n#### 提示\n\n- **PROMST：多步任务中的提示优化——整合人类反馈与偏好对齐**, EMNLP 2024 [[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2402.08702)] \n\n- **StraGo：利用战略指导进行提示优化**, EMNLP 2024 [[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2410.08601)]\n\n- **大型语言模型与进化算法的结合可产生强大的提示优化器**, ICLR 2024 [[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2309.08532)]\n\n#### 工作流\n\n#### 工具\n\n### 优化算法\n\n#### 优化策略\n\n- **大型语言模型是人类级别的提示工程师**，ICLR 2023 [[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2211.01910)]\n\n- **基于"梯度下降"和束搜索的自动提示优化**，EMNLP 2023 [[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2305.03495)]\n\n- **GPTSwarm：可优化图结构的语言智能体**，ICML 2024 [[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2402.16823)]\n\n- **Promptbreeder：通过提示进化实现自指性的自我改进**，ICML 2024 [[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2309.16797)]\n\n- **教导大型语言模型进行自我调试**，ICLR 2024 [[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2304.05128)]\n\n- **大型语言模型作为优化器**，ICLR 2024 [[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2309.03409)]\n\n- **DSPy：将声明式语言模型调用编译为可自我改进的流水线**，ICLR 2024 [[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2310.03714)]\n\n- **为提示工程师设计提示工程**，ACL 2024 Findings [[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2311.05661)]\n\n- **PROMST：多步任务中的提示优化——整合人类反馈与偏好对齐**，EMNLP 2024 [[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2402.08702)]\n\n- **StraGo：利用战略指导进行提示优化**，EMNLP 2024 [[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2410.08601)]\n\n- **针对多阶段语言模型程序的指令与示范优化**，EMNLP 2024 [[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2406.11695)]\n\n- **Trace 是下一代 AutoDiff：结合丰富反馈、执行轨迹与大模型的生成式优化**，NeurIPS 2024 [[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2406.16218)]\n\n- **通过反向传播语言模型反馈优化生成式 AI**，Nature [[论文](https:\u002F\u002Fwww.nature.com\u002Farticles\u002Fs41586-025-08661-4)]\n\n- **大型语言模型是优秀的提示优化器吗？**，arXiv [[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2402.02101)]
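\n\n上面这些优化策略的搜索算子各不相同（进化、"文本梯度"、元提示等），但大多可以抽象为"生成候选提示—在验证集上打分—保留最优"的搜索循环。下面给出一个与任一具体方法无关的最小示意（`ask_llm` 与 `score` 均为假设接口，需自行实现）：\n\n```python\ndef optimize_prompt(ask_llm, score, seed_prompt: str, rounds: int = 5) -> str:\n    # score(prompt) -> float：在验证集上评测提示词效果的假设函数\n    best, best_score = seed_prompt, score(seed_prompt)\n    for _ in range(rounds):\n        # 让 LLM 基于当前最优提示改写出一个新候选（最简单的变异算子）\n        candidate = ask_llm(\n            f'下面是一条任务指令：\\n{best}\\n'\n            f'请改写出一条语义相同、但表述可能更有效的新指令：'\n        )\n        s = score(candidate)\n        if s > best_score:  # 贪心保留更优者\n            best, best_score = candidate, s\n    return best\n```\n\nPromptbreeder、GPTSwarm 等工作可视为把这里的单点贪心替换为种群进化或图结构优化。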
\n\n#### 理论视角\n\n- **将上下文学习解释为隐式贝叶斯推断**，ICLR 2022，[[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2111.02080)]\n\n- **重新思考示范的作用：是什么让上下文学习奏效？**，EMNLP 2022，[[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2202.12837)]\n\n- **Transformer 在上下文中能学习什么？以简单函数类为例**，NeurIPS 2022，[[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2208.01066)]\n\n- **上下文学习究竟是一种怎样的学习算法？基于线性模型的探究**，ICLR 2023，[[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2211.15661)]\n\n- **Transformer 通过梯度下降在上下文中学习**，ICML 2023，[[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2212.07677)]\n\n- **Transformer 学会实现上下文线性回归的二阶收敛速度**，NeurIPS 2024，[[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2310.17086)]\n\n### 应用场景\n\n#### 在线优化\n\n- **Reflexion：具备言语强化学习能力的语言智能体**，NeurIPS 2023，[[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2303.11366)]\n\n- **Self-refine：基于自我反馈的迭代精炼**，NeurIPS 2023，[[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2303.17651)]\n\n- **ReAct：在语言模型中协同推理与行动**，ICLR 2023，[[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2210.03629)]\n\n- **思维之树：利用大型语言模型进行深思熟虑的问题解决**，NeurIPS 2023，[[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2305.10601)]\n\n- **Voyager：一个基于大型语言模型的开放式具身智能体**，TMLR 2024，[[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2305.16291)]\n\n- **让我们逐步验证**，ICLR 2024，[[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2305.20050)]\n\n- **MetaGPT：用于多智能体协作框架的元编程**，ICLR 2024，[[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2308.00352)]\n\n- **Camel：用于探索大型语言模型社会"心智"的沟通型智能体**，NeurIPS 2023，[[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2303.17760)]\n\n- **ChatDev：用于软件开发的沟通型智能体**，ACL 2024，[[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2307.07924)]\n\n- **HuggingGPT：借助 ChatGPT 及其在 Hugging Face 中的伙伴解决 AI 任务**，NeurIPS 2023，[[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2303.17580)]\n\n- **自学优化器（STOP）：递归式自我改进的代码生成**，COLM 2024，[[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2310.02304)]\n\n- **Quiet-STaR：语言模型可以学会在开口前先思考**，CoRR 2024，[[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2403.09629)]\n\n- **Text2Reward：为强化学习自动生成密集奖励函数**，ICLR 2024，[[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2309.11489)]\n\n- **通过逆向 LLM 输出提取提示**，ACL 2024，[[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2405.15012)]\n\n- **通过自引导优化对齐大型语言模型**，arXiv 2024，[[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2410.17131)]\n\n#### 离线优化\n\n- **大型语言模型是优秀的统计学家吗？**，NeurIPS 2024，[[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2406.07815)]\n\n- **nvBench 2.0：面向模糊情境的自然语言到可视化基准测试**，arXiv 2025，[[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2503.12880)]\n\n- **SRAG：面向维基百科图谱的多实体问答的结构化检索增强生成**，arXiv 2025，[[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2503.01346)]\n\n- **面向视觉问答的细粒度检索增强生成**，arXiv 2025，[[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2502.20964)]\n\n- **xLAM：赋能 AI 智能体系统的大型动作模型家族**，arXiv 2024，[[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2409.03215)]\n\n- **智能体系统的自动化设计**，arXiv 2024，[[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2408.08435)]\n\n- **LIRE：用于偏好对齐的列表式奖励增强**，ACL 2024，[[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2405.13516)]\n\n### 科学知识发现\n\n#### 假设生成与验证\n\n- **大型语言模型能否生成新颖的研究思路？一项由100多名自然语言处理研究人员参与的大规模人类研究**，arXiv 2024年，[[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2409.04109)] \n\n- **SciAgents：通过生物启发的多智能体智能图推理实现科学发现自动化**，Advanced Materials 2024年，[[论文](https:\u002F\u002Fdoi.org\u002F10.1002\u002Fadma.202413523)] \n\n- **Genesis：迈向系统生物学研究的自动化**，arXiv 2024年，[[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2409.04109)] \n\n- **AI科学家：迈向完全自动化的开放式科学发现**，arXiv 2024年，[[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2408.06292)] \n\n- **Agent Laboratory：使用大型语言模型代理作为研究助理**，arXiv 2025年，[[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2501.04227)] \n\n- **ChemAgent：大型语言模型中的自我更新库提升化学推理能力**，arXiv 2025年，[[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2501.06590)] \n\n- **ChemOS 2.0：用于化学自动驾驶实验室的编排架构**，Matter 2024年，[[论文](https:\u002F\u002Fdoi.org\u002F10.1016\u002Fj.matt.2024.04.022)] \n\n- **迈向AI联合科学家**，arXiv 2025年，[[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2502.18864)] \n\n#### 实验方案规划与工具创新\n\n- **用于探索性合成化学的自主移动机器人**，Nature 
2024年，[[论文](https:\u002F\u002Fdoi.org\u002F10.1038\u002Fs41586-024-08173-7)] \n\n- **有机激光发射体的非局部、异步、闭环发现**，Science 2024年，[[论文](https:\u002F\u002Fdoi.org\u002F10.1126\u002Fscience.adk9227)] \n\n- **虚拟实验室：AI代理设计新型SARS-CoV-2纳米抗体并经实验验证**，bioRxiv 2024年，[[论文](https:\u002F\u002Fdoi.org\u002F10.1101\u002F2024.11.11.623004)] \n\n#### 数据分析与结论推导\n\n- **无需人类示范即可解决奥林匹克几何问题**，Nature 2024年，[[论文](https:\u002F\u002Fdoi.org\u002F10.1038\u002Fs41586-023-06747-5)] \n\n- **面向基因表达数据科学发现的AI科学家团队**，arXiv 2024年，[[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2402.12391)] \n\n- **数据解释器：用于数据科学的LLM代理**，arXiv 2024年，[[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2402.18679)] \n\n- **Curie：利用AI代理实现严谨且自动化的科学实验**，arXiv 2025年，[[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2502.16069)]，[[GitHub](https:\u002F\u002Fgithub.com\u002FJust-Curieous\u002FCurie)]\n\n# 协作与进化型智能系统\n\u003Cdiv style=\"display: flex; justify-content: space-between;\">\n    \u003Cimg src=\"https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002FFoundationAgents_awesome-foundation-agents_readme_b43af177b08b.png\" alt=\"基于LLM的多智能体系统\" width=\"100%\">\n\u003C\u002Fdiv>\n\n## 应用\n### 战略学习\n- RECONCILE（Chen等，2023）\n- LLM-Game-Agent（Lan等，2023）\n- BattleAgentBench（Wang等，2024）\n\n### 建模与仿真\n- Generative Agents（Park等，2023）\n- Agent Hospital（Li等，2024）\n- MedAgents（Tang等，2024）\n- MEDCO（Wei等，2024）\n\n### 协作式任务解决\n- MetaGPT（Hong等，2023）\n- ChatDev（Qian等，2024）\n- Agent Laboratory（Schmidgall等，2025）\n- The Virtual Lab（Swanson等，2024）\n\n## 组成与协议\n### 代理组成\n#### 同质性\n- CoELA（Zhang等，2023）\n- VillagerAgent（Dong等，2024）\n- LLM-Coordination（Agashe等，2024）\n\n#### 异质性\n- MetaGPT（Hong等，2023）\n- ChatDev（Qian等，2024）\n- Generative Agents（Park等，2023）\n- S-Agents（Chen等，2024）\n\n### 交互协议\n#### 消息类型\n- SciAgents（Ghafarollahi等，2024）\n- AppAgent（Chi等，2023）\n- MetaGPT（Hong等，2023）\n\n#### 通信接口\n- AgentBench（Liu等，2023）\n- VAB（Liu等，2024）\n- TaskWeaver（Qiao等，2024）\n- HULA（Takerngsaksiri等，2025）\n\n### 下一代协议\n- MCP（Anthropic）\n- Agora（Marro等，2024）\n- IoA（Chen等，2024）\n\n## 拓扑结构\n### 静态拓扑\n- MEDCO（Wei等，2024）\n- Agent Hospital（Li等，2024）\n- Welfare Diplomacy（Mukobi等，2023）\n- MedAgents（Tang等，2024）\n\n### 动态拓扑\n- DyLAN（Liu等，2023）\n- GPTSwarm（Zhuge等，2024）\n- CodeR（Chen等，2024）\n- Oasis（Yang等，2024）\n\n## 协作\n### 代理间协作\n#### 基于共识\n- Agent Laboratory（Schmidgall等，2025）\n- The Virtual Lab（Swanson等，2024）\n- OASIS（Yang等，2024）\n\n#### 协作学习\n- Generative Agents（Park等，2023）\n- Welfare Diplomacy（Mukobi等，2023）\n- LLM-Game-Agent（Lan等，2023）\n- BattleAgentBench（Wang等，2024）\n\n#### 教学\u002F指导\n- MEDCO（Wei等，2024）\n- Agent Hospital（Li等，2024）\n\n#### 任务导向\n- MedAgents（Tang等，2024）\n- S-Agents（Chen等，2024）\n\n### 人机协作\n- Dittos（Leong等，2024）\n- PRELUDE（Gao等，2024）\n\n## 进化\n### 集体智慧\n- Generative Agents（Park等，2023）\n- Welfare Diplomacy（Mukobi等，2023）\n- LLM-Game-Agent（Lan等，2023）\n- BattleAgentBench（Wang等，2024）\n\n### 个体适应性\n- Agent Hospital（Li等，2024）\n- Agent Laboratory（Schmidgall等，2025）\n- MEDCO（Wei等，2024）\n\n## 评估\n### 针对特定任务的基准测试\n- MBPP\n- HotpotQA\n- MATH\n- SVAMP\n- MultiArith\n\n### 针对MAS的基准测试\n- Collab-Overcooked（Sun等，2025）\n- REALM-Bench（Geng等，2025）\n- PARTNR（Chang等，2024）\n- VillagerBench（Dong等，2024）\n- AutoArena（Zhao等，2024）\n- MultiagentBench（Zhu等，2025）\n\n\n# 构建安全且有益的AI\n\u003Cdiv style=\"display: flex; justify-content: space-between;\">\n    \u003Cimg src=\"https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002FFoundationAgents_awesome-foundation-agents_readme_c4fe683b95ab.png\" alt=\"代理内在安全性\" width=\"100%\">\n\u003C\u002Fdiv>\n\n## 
安全威胁\n\n### 越狱\n\n#### 白盒越狱\n\n- **针对对齐语言模型的越狱攻击与防御：综述**, arXiv 2024, [[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2407.04295)]\n  \n- **对齐语言模型的通用且可迁移的对抗性攻击**, arXiv 2023, [[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2307.15043)]\n  \n- **利用动量增强越狱攻击**, arXiv 2024, [[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2405.01229)]\n  \n- **基于优化的大语言模型越狱技术改进**, arXiv 2024, [[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2405.21018)]\n  \n- **通过句末MLP重新加权实现指令微调后的LLM越狱**, arXiv 2024, [[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2410.10150)]\n  \n- **打开大语言模型的潘多拉魔盒：通过表征工程实现LLM越狱**, arXiv 2024, [[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2401.06824)]\n  \n- **DROJ：一种针对大型语言模型的提示驱动攻击**, arXiv 2024, [[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2411.09125)]\n  \n- **Autodan：在对齐的大语言模型上生成隐蔽的越狱提示**, arXiv 2023, [[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2310.04451)]\n  \n- **POEX：政策可执行的具身AI越狱攻击**, arXiv 2024, [[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2412.16633)]\n\n\n#### 黑盒越狱\n\n- **被越狱了：LLM的安全训练为何会失效？**, NeurIPS 2023, [[论文](https:\u002F\u002Fpapers.nips.cc\u002Fpaper_files\u002Fpaper\u002F2023\u002Fhash\u002F063b264250add1efdb3e3f7f5686b4e0-Abstract-Conference.html)]\n  \n- **在二十次查询内越狱黑盒大型语言模型**, arXiv 2023, [[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2310.08419)]\n  \n- **通过密码字符越狱大型语言模型的审核护栏**, NeurIPS 2024, [[论文](https:\u002F\u002Fpapers.nips.cc\u002Fpaper_files\u002Fpaper\u002F2025\u002Ffile\u002F59408-59435-Abstract-Conference.html)]\n  \n- **视觉对抗样本越狱对齐的大语言模型**, AAAI 2024, [[论文](https:\u002F\u002Fojs.aaai.org\u002Findex.php\u002FAAAI\u002Farticle\u002Fview\u002F29398)]\n  \n- **POEX：政策可执行的具身AI越狱攻击**, arXiv 2024, [[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2412.16633)]\n  \n- **Autodan：在对齐的大语言模型上生成隐蔽的越狱提示**, arXiv 2023, [[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2310.04451)]\n  \n- **Guard：通过角色扮演生成自然语言越狱提示以测试大型语言模型的准则遵守情况**, arXiv 2024, [[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2402.03299)]\n  \n- **启发式诱导的多模态风险分布越狱攻击：针对多模态大型语言模型**, arXiv 2024, [[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2412.05934)]\n  \n- **Rt攻击：通过随机令牌越狱文本到图像模型**, arXiv 2024, [[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2408.13896)]\n\n\n### 提示注入\n\n#### 直接提示注入\n\n- **这不是你所期望的：利用间接提示注入攻陷现实世界中集成LLM的应用程序**, AISec@CCS 2023, [[论文](https:\u002F\u002Fdl.acm.org\u002Fdoi\u002F10.1145\u002F3617415.3624610)]\n  \n- **针对大型语言模型的自动且通用的提示注入攻击**, arXiv 2024, [[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2403.04957)]\n  \n- **基于优化的提示注入攻击：针对作为评判者的LLM**, CCS 2024, [[论文](https:\u002F\u002Fdl.acm.org\u002Fdoi\u002F10.1145\u002F3636696.3654467)]\n  \n- **工具集成型大型语言模型代理中的间接提示注入基准测试**, arXiv 2024, [[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2403.02691)]\n  \n- **不要信任AI：沿着CIA安全三要素进行提示注入**, arXiv 2024, [[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2412.06090)]\n  \n- **大型视觉-语言模型在视觉提示注入导致的目标劫持方面的实证分析**, arXiv 2024, [[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2408.03554)]\n  \n- **2024年SaTML LLM夺旗竞赛的数据集及经验教训**, arXiv 2024, [[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2406.07954)]\n  \n- **忽略这个标题并HackAPrompt：通过全球提示黑客竞赛揭示LLM的系统性漏洞**, EMNLP 2023, [[论文](https:\u002F\u002Faclanthology.org\u002F2023.emnlp-main.316)]\n\n\n#### 间接提示注入\n\n- **这不是你所期望的：利用间接提示注入攻陷现实世界中集成LLM的应用程序**, AISec@CCS 2023, [[论文](https:\u002F\u002Fdl.acm.org\u002Fdoi\u002F10.1145\u002F3617415.3624610)]\n  \n- **HijackRAG：针对检索增强型大型语言模型的劫持攻击**, arXiv 2025, [[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2410.22832)]\n  \n- **用于检索增强生成大型语言模型提示注入攻击的后门检索器**, arXiv 2024, [[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2410.14479)]\n  \n- 
**提示感染：多智能体系统内的LLM到LLM提示注入**, arXiv 2024, [[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2410.07283)]\n  \n- **针对大型语言模型的对抗性搜索引擎优化**, arXiv 2024, [[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2406.18382)]\n\n### 幻觉\n\n#### 知识冲突型幻觉\n\n- **自然语言生成中的幻觉研究综述**, ACM Computing Surveys 2023年, [[论文](https:\u002F\u002Fdl.acm.org\u002Fdoi\u002F10.1145\u002F3571730)]\n  \n- **大型语言模型中幻觉的综述：原理、分类、挑战与开放问题**, arXiv 2023年, [[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2311.05232)]\n  \n- **DELUCIONQA：领域特定问答系统中的幻觉检测**, EMNLP 2023 Findings, [[论文](https:\u002F\u002Faclanthology.org\u002F2023.findings-emnlp.258)]\n  \n- **大型语言模型在金融领域的不足：幻觉的实证研究**, NeurIPS 2023年故障模式研讨会, [[论文](https:\u002F\u002Fopenreview.net\u002Fforum?id=ywJqHknZKf)]\n  \n- **MetaGPT：面向多智能体协作框架的元编程**, ICLR 2024年, [[论文](https:\u002F\u002Fopenreview.net\u002Fforum?id=6vBvP6uLTQU)]\n  \n- **幻觉不可避免：大型语言模型的先天性局限**, arXiv 2024年, [[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2401.11817)]\n  \n- **ERBench：基于实体关系的大型语言模型自动可验证幻觉基准测试集**, arXiv 2024年, [[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2403.05266)]\n\n\n#### 上下文冲突型幻觉\n\n- **真相感知的上下文选择：缓解大型语言模型因虚假上下文而产生的幻觉**, arXiv 2024年, [[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2403.07556)]\n  \n- **大型语言模型极易被误导：量化指标、安全影响及类型学分析**, arXiv 2024年, [[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2410.13237)]\n  \n- **HaluEval-Wild：评估大型语言模型在真实场景下的幻觉情况**, arXiv 2024年, [[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2403.04307)]\n  \n- **大型视觉-语言模型中目标幻觉的分析与缓解**, ICLR 2023年, [[论文](https:\u002F\u002Fopenreview.net\u002Fforum?id=2CeF2U2CFi)]\n  \n- **通过无分类器指导缓解大型视觉-语言模型中的目标幻觉**, arXiv 2024年, [[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2402.08680)]\n  \n- **当大型语言模型与人类相悖时？大型语言模型的阿谀奉承行为**, arXiv 2023年, [[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2311.09410)]\n  \n- **HallusionBench：用于检测大型视觉-语言模型中语言幻觉与视觉错觉交织现象的高级诊断工具包**, CVPR 2024年, [[论文](https:\u002F\u002Fopenaccess.thecvf.com\u002Fcontent\u002FCVPR2024\u002Fhtml\u002FGuan_HallusionBench_An_Advanced_Diagnostic_Suite_for_Entangled_Language_Hallucination_and_CVPR_2024_paper.html)]\n  \n- **DiaHalu：大型语言模型的对话级幻觉评估基准**, arXiv 2024年, [[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2403.00896)]\n\n\n### 偏差\n\n#### 目标偏离型偏差\n\n- **人工智能对齐：全面综述**, arXiv 2023年, [[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2310.19852)]\n  \n- **规范博弈：人工智能创造力的另一面**, DeepMind 博客 2020年, [[博客](https:\u002F\u002Fdeepmind.google\u002Fdiscover\u002Fblog\u002Fspecification-gaming-the-flip-side-of-ai-ingenuity\u002F)]\n  \n- **从深度学习视角看对齐问题**, arXiv 2022年, [[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2209.00626)]\n  \n- **模拟失调：大型语言模型的安全对齐可能适得其反！**, arXiv 2024年, [[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2402.12343)]\n  \n- **社会规范演变中的智能体对齐**, arXiv 2024年, [[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2401.04620)]\n  \n- **模型融合与安全对齐：一个不良模型会毁掉整个群体**, arXiv 2024年, [[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2406.14563)]\n\n\n#### 能力滥用型偏差\n\n- **可信的LLM：评估大型语言模型对齐程度的综述与指南**, arXiv 2023年, [[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2308.05374)]\n  \n- **通过剪枝和低秩修改评估安全对齐的脆弱性**, arXiv 2024年, [[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2402.05162)]\n  \n- **人工智能对齐：全面综述**, arXiv 2023年, [[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2310.19852)]\n  \n- **对对齐后的语言模型进行微调会损害安全性，即使用户并无此意图！**, arXiv 2023年, [[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2310.03693)]\n  \n- **大型语言模型中对齐的根本局限性**, arXiv 2023年, [[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2304.11082)]\n\n### 毒化攻击\n\n#### 模型毒化\n\n- **预训练模型上的权重毒化攻击**，ACL 2020，[[论文](https:\u002F\u002Faclanthology.org\u002F2020.acl-main.495)]\n  \n- 
**Badedit：通过模型编辑对大型语言模型进行后门植入**，arXiv 2024，[[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2403.13355)]\n  \n- **哲学家的石头：大型语言模型插件中的特洛伊木马**，arXiv 2023，[[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2312.00374)]\n  \n- **Obliviate：在参数高效微调范式中中和与任务无关的后门**，arXiv 2024，[[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2409.14119)]\n  \n- **被毒化的ChatGPT为无所事事的人找到工作：利用受污染AI模型的不安全建议探索开发者的编码实践**，IEEE S&P 2024，[[论文](https:\u002F\u002Fieeexplore.ieee.org\u002Fdocument\u002F10432536)]\n  \n- **生成式AI智能体之间的秘密合谋**，arXiv 2024，[[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2402.07510)]\n  \n- **通过防御感知的架构后门利用大型语言模型的漏洞**，arXiv 2024，[[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2409.01952)]\n\n\n#### 数据毒化\n\n- **指令微调过程中对语言模型的毒化**，ICML 2023，[[论文](https:\u002F\u002Fproceedings.mlr.press\u002Fv202\u002Fwan23b.html)]\n  \n- **Agentpoison：通过毒化记忆或知识库对LLM智能体进行红队测试**，NeurIPS 2025，[[论文](https:\u002F\u002Fpapers.nips.cc\u002Fpaper_files\u002Fpaper\u002F2025\u002Fhash\u002F130185-130213-Abstract-Conference.html)]\n  \n- **Poison-RAG：推荐系统中检索增强生成的对抗性数据毒化攻击**，arXiv 2025，[[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2501.11759)]\n  \n- **PoisonBench：评估大型语言模型对数据毒化的脆弱性**，arXiv 2024，[[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2410.08811)]\n  \n- **人类反馈的阴暗面：通过用户输入毒化大型语言模型**，arXiv 2024，[[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2409.00787)]\n  \n- **LLM中数据毒化的缩放定律**，arXiv 2024，[[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2408.02946)]\n  \n- **话太多：在令牌限制下毒化大型语言模型**，arXiv 2024，[[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2404.14795)]\n  \n- **最佳之毒：通过注入受污染的偏好数据攻击RLHF**，arXiv 2024，[[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2404.05530)]\n\n\n#### 后门注入\n\n- **潜伏者：训练能够经受安全训练仍存续的欺骗性LLM**，arXiv 2024，[[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2401.05566)]\n  \n- **Wipi：面向LLM驱动网络代理的新网络威胁**，arXiv 2024，[[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2403.09875)]\n  \n- **探索针对基于大型语言模型决策的后门攻击**，arXiv 2024，[[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2405.20774)]\n  \n- **当后门开口说话时：通过模型生成的解释理解LLM后门攻击**，arXiv 2024，[[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2411.12701)]\n  \n- **利用虚拟提示注入对指令微调后的大型语言模型进行后门植入**，NAACL 2024，[[论文](https:\u002F\u002Faclanthology.org\u002F2024.naacl-long.338)]\n\n\n## 隐私威胁\n\n### 训练数据推断\n\n#### 成员身份推断攻击\n\n- **针对机器学习模型的成员身份推断攻击**，IEEE S&P 2017，[[论文](https:\u002F\u002Fieeexplore.ieee.org\u002Fdocument\u002F7958568)]\n  \n- **秘密分享者：评估与测试神经网络中的意外记忆现象**，USENIX Security 2019，[[论文](https:\u002F\u002Fwww.usenix.org\u002Fconference\u002Fusenixsecurity19\u002Fpresentation\u002Fcarlini)]\n  \n- **仅标签成员身份推断攻击**，ICML 2021，[[论文](https:\u002F\u002Fproceedings.mlr.press\u002Fv139\u002Fchoquette-choo21a.html)]\n  \n- **通过自提示校准对微调后的大型语言模型实施实用成员身份推断攻击**，arXiv 2023，[[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2311.06062)]\n  \n- **从基本原理出发的成员身份推断攻击**，IEEE S&P 2022，[[论文](https:\u002F\u002Fieeexplore.ieee.org\u002Fdocument\u002F9833683)]\n  \n- **机器学习中的成员身份推断攻击：综述**，ACM Computing Surveys 2022，[[论文](https:\u002F\u002Fdl.acm.org\u002Fdoi\u002F10.1145\u002F3505244)]\n\n\n#### 数据提取攻击\n\n- **从大型语言模型中提取训练数据**，USENIX Security 2021，[[论文](https:\u002F\u002Fwww.usenix.org\u002Fconference\u002Fusenixsecurity21\u002Fpresentation\u002Fcarlini-extracting)]\n  \n- **特殊字符攻击：迈向可扩展的大型语言模型训练数据提取**，arXiv 2024，[[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2405.05990)]\n  \n- **伦理学家：通过损失平滑的软提示和校准置信度估计进行目标性训练数据提取**，arXiv 2023，[[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2307.04401)]\n  \n- **语言模型反演**，arXiv 2023，[[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2311.13647)]\n  \n- **通用语言模型的隐私风险**，IEEE S&P 
2020，[[论文](https:\u002F\u002Fieeexplore.ieee.org\u002Fdocument\u002F9152683)]\n  \n- **量化跨神经语言模型的记忆现象**，arXiv 2022，[[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2202.07646)]\n  \n- **窃取生产级语言模型的一部分**，arXiv 2024，[[论文](https:\u002F\u002Farxiv.org)]\n\n### 交互数据推断\n\n#### 系统提示窃取\n\n- **忽略先前提示：针对语言模型的攻击技术**，TSRML@NeurIPS 2022，[[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2211.09597)]\n  \n- **针对文本到图像生成模型的提示窃取攻击**，USENIX Security 2024，[[论文](https:\u002F\u002Fwww.usenix.org\u002Fconference\u002Fusenixsecurity24\u002Fpresentation\u002Fshen-xinyue)]\n  \n- **保护大语言模型的系统提示**，arXiv 2024，[[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2412.13426)]\n  \n- **InputSnatch：通过时序侧信道攻击窃取大语言模型服务中的输入**，arXiv 2024，[[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2411.18191)]\n  \n- **从语言模型中有效提取提示**，arXiv 2023，[[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2307.06865)]\n  \n- **最后的赢家：软提示调优、LoRA 和上下文学习的安全与隐私比较分析**，arXiv 2023，[[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2310.11397)]\n  \n- **大语言模型应用商店分析：愿景与路线图**，ACM TOSEM 2024，[[论文](https:\u002F\u002Fdl.acm.org\u002Fdoi\u002Fabs\u002F10.1145\u002F3657416)]\n\n\n#### 用户提示窃取\n\n- **PRSA：针对大型语言模型的提示反向窃取攻击**，arXiv 2024，[[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2402.07870)]\n  \n- **多轮大语言模型交互中的提示泄露效应及防御策略**，arXiv 2024，[[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2404.16251)]\n  \n- **探究多轮大语言模型交互中的提示泄露效应及黑盒防御方法**，arXiv 2024，[[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2402.06770)]\n  \n- **我的提示为何会被泄露？揭秘定制化大型语言模型中的提示提取威胁**，arXiv 2024，[[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2408.02416)]\n  \n- **PLeak：针对大型语言模型应用的提示泄露攻击**，CCS 2024，[[论文](https:\u002F\u002Fdl.acm.org\u002Fdoi\u002F10.1145\u002F3644473.3665630)]\n  \n- **从专家混合模型中窃取用户提示**，arXiv 2024，[[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2410.22884)]\n  \n- **通过反演大语言模型输出提取提示**，arXiv 2024，[[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2405.15012)]\n\n## 非大脑部分的威胁\n\n\u003Cdiv style=\"display: flex; justify-content: space-between;\">\n    \u003Cimg src=\"https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002FFoundationAgents_awesome-foundation-agents_readme_d3d7feb62500.jpg\" alt=\"LLM非大脑部分的威胁\" width=\"100%\">\n\u003C\u002Fdiv>\n\n### 感知安全威胁\n\n#### 对抗性攻击\n\n##### 文本类\n\n- **大语言模型可以自欺欺人：一种基于提示的对抗性攻击**，arXiv 2023，[[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2310.13345)]\n  \n- **重新审视语言模型的字符级对抗性攻击**，ICML 2024，[[论文](https:\u002F\u002Fopenreview.net\u002Fforum?id=Z9tyM3Rru1)]\n  \n- **让困难提示变得简单：基于梯度的离散优化用于提示调优与发现**，NeurIPS 2024，[[论文](https:\u002F\u002Fpapers.nips.cc\u002Fpaper_files\u002Fpaper\u002F2024\u002Fhash\u002F36-Hard-Prompts-Made-Easy-Abstract-Conference.html)]\n  \n- **对齐语言模型的通用且可迁移的对抗性攻击**，arXiv 2023，[[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2307.15043)]\n\n##### 视觉类\n\n- **图像劫持：对抗性图像可在运行时控制生成模型**，arXiv 2023，[[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2309.00236)]\n  \n- **基于图像的多模态模型作为入侵者：针对视频型多模态大语言模型的可迁移多模态攻击**，arXiv 2025，[[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2501.01042)]\n  \n- **剖析多模态语言模型代理的对抗鲁棒性**，ICLR 2025，[[论文](https:\u002F\u002Fopenreview.net\u002Fforum?id=zCPIQdjMkJ)]\n  \n- **波尔特盖斯特：针对摄像头和计算机视觉的声学对抗机器学习**，IEEE S&P 2021，[[论文](https:\u002F\u002Fieeexplore.ieee.org\u002Fdocument\u002F9474293)]\n\n\n##### 听觉类\n\n- **不可闻的对抗扰动：实时操纵用户语音识别**，arXiv 2023，[[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2308.01040)]\n  \n- **无声操控者：针对语音识别系统的实用且不可闻后门攻击**，ACM Multimedia 2023，[[论文](https:\u002F\u002Fdl.acm.org\u002Fdoi\u002F10.1145\u002F3581783.3612104)]\n  \n- **利用对抗性超声波在注册阶段对说话人识别系统进行后门攻击**，IEEE IoT Journal 
2023，[[论文](https:\u002F\u002Fieeexplore.ieee.org\u002Fdocument\u002F10162210)]\n  \n- **Ultrabd：通过对抗性超声波对自动说话人验证系统进行后门攻击**，ICPADS 2023，[[论文](https:\u002F\u002Fieeexplore.ieee.org\u002Fdocument\u002F10089999)]\n  \n- **海豚攻击：不可闻的语音命令**，CCS 2017，[[论文](https:\u002F\u002Fdl.acm.org\u002Fdoi\u002F10.1145\u002F3133956.3134052)]\n\n\n##### 其他模态\n\n- **自动驾驶车辆中基于LiDAR的机器学习感知的对抗鲁棒性综述**，arXiv 2024，[[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2411.13778)]\n  \n- **通过故意施加声音噪声干扰陀螺仪传感器来晃动无人机**，USENIX Security 2015，[[论文](https:\u002F\u002Fwww.usenix.org\u002Fconference\u002Fusenixsecurity15\u002Ftechnical-sessions\u002Fpresentation\u002Fson)]\n  \n- **多智能体通信中的对抗性攻击**，ICCV 2021，[[论文](https:\u002F\u002Fopenaccess.thecvf.com\u002Fcontent\u002FICCV2021\u002Fhtml\u002FTu_Adversarial_Attacks_on_Multi-Agent_Communication_ICCV_2021_paper.html)]\n  \n- **用于增强自动驾驶车辆安全性的一种GPS定位欺骗攻击检测方法**，IEEE VTC-Fall 2021，[[论文](https:\u002F\u002Fieeexplore.ieee.org\u002Fdocument\u002F9623147)]\n\n\n#### 错误感知问题\n\n- **利用在线强化学习将大型语言模型置于交互环境中**，ICML 2023，[[论文](https:\u002F\u002Fproceedings.mlr.press\u002Fv202\u002Fcarta23a.html)]\n  \n- **大型语言模型中的偏见与公平性：一项综述**，计算语言学 2024，[[论文](https:\u002F\u002Fdirect.mit.edu\u002Fcoli\u002Farticle\u002Fdoi\u002F10.1162\u002Fcoli_a_00491\u002F123804)]\n  \n- **利用因果匹配进行领域泛化**，ICML 2021，[[论文](https:\u002F\u002Fproceedings.mlr.press\u002Fv139\u002Fmahajan21a.html)]\n  \n- **GEM：无论强光还是昏暗，我都能看见你——端到端多模态目标检测**，IEEE RA-L 2021，[[论文](https:\u002F\u002Fieeexplore.ieee.org\u002Fdocument\u002F9513373)]\n  \n- **NPHardEval：通过复杂度类动态评估大型语言模型的推理能力**，arXiv 2023，[[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2312.14890)]\n  \n- **在线社交系统中意见错误感知及沉默现象的建模**，PLOS ONE 2024，[[论文](https:\u002F\u002Fjournals.plos.org\u002Fplosone\u002Farticle?id=10.1371\u002Fjournal.pone.0296075)]\n  \n- **弥合多智能体感知的领域差距**，ICRA 2023，[[论文](https:\u002F\u002Fieeexplore.ieee.org\u002Fdocument\u002F10161095)]\n  \n- **多智能体强化学习中的合作与竞争偏见**，arXiv 2021，[[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2101.06890)]\n  \n- **与模型无关的多智能体感知框架**，ICRA 2023，[[论文](https:\u002F\u002Fieeexplore.ieee.org\u002Fdocument\u002F10159709)]\n  \n- **多智能体通信游戏中语言与感知之间的相互影响**，PLOS 计算生物学 2022，[[论文](https:\u002F\u002Fjournals.plos.org\u002Fploscompbiol\u002Farticle?id=10.1371\u002Fjournal.pcbi.1010658)]\n\n### 行动安全威胁\n\n#### 供应链攻击\n\n- **LLM安全的新时代：探索真实世界中基于LLM系统的安全问题**，arXiv 2024，[[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2402.18649)]\n  \n- **Wipi：面向LLM驱动Web代理的新型网络威胁**，arXiv 2024，[[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2403.09875)]\n  \n- **利用LM模拟沙盒识别LM代理的风险**，arXiv 2023，[[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2309.15817)]\n  \n- **并非你所期望的：通过间接提示注入攻陷真实世界的LLM集成应用**，AISec@CCS 2023，[[论文](https:\u002F\u002Fdl.acm.org\u002Fdoi\u002F10.1145\u002F3617415.3624610)]\n  \n- **工具集成大型语言模型代理中的间接提示注入基准测试**，arXiv 2024，[[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2403.02691)]\n\n\n#### 工具使用风险\n\n- **利用LM模拟沙盒识别LM代理的风险**，arXiv 2023，[[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2309.15817)]\n  \n- **Toolsword：揭示大型语言模型在工具学习三个阶段中的安全问题**，arXiv 2024，[[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2402.10753)]\n  \n- **工具集成大型语言模型代理中的间接提示注入基准测试**，arXiv 2024，[[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2403.02691)]\n\n\n## 代理外部安全\n\n\u003Cdiv style=\"display: flex; justify-content: space-between;\">\n    \u003Cimg src=\"https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002FFoundationAgents_awesome-foundation-agents_readme_19ac5f4e4b85.jpg\" alt=\"Agent Extrinsic Safety:\" width=\"100%\">\n\u003C\u002Fdiv>\n\n### 代理-内存交互威胁\n\n#### 检索增强生成\n\n- 
**Agentpoison：通过污染记忆或知识库对LLM代理进行红队测试**，NeurIPS 2025，[[论文](https:\u002F\u002Fpapers.nips.cc\u002Fpaper_files\u002Fpaper\u002F2025\u002Fhash\u002F130185-130213-Abstract-Conference.html)]\n  \n- **ConfusedPilot：基于RAG的LLM中的混淆代理风险**，arXiv 2024，[[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2408.04870)]\n  \n- **PoisonedRAG：针对大型语言模型检索增强生成的知识污染攻击**，arXiv 2024，[[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2402.07867)]\n  \n- **机器对抗RAG：用阻断文档干扰检索增强生成**，arXiv 2024，[[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2406.05870)]\n  \n- **BadRAG：识别大型语言模型检索增强生成中的漏洞**，arXiv 2024，[[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2406.00083)]\n  \n- **TrojanRAG：检索增强生成可能成为大型语言模型中的后门驱动器**，arXiv 2024，[[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2405.13401)]\n  \n- **语法中的低语：注入隐蔽后门以攻陷密集检索系统**，arXiv 2024，[[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2402.13532)]\n\n\n### 代理-环境交互威胁\n\n#### 物理环境\n\n- **自动驾驶车辆：复杂攻击、安全问题、挑战、开放议题、区块链及未来方向**，JCP 2023，[[论文](https:\u002F\u002Fwww.mdpi.com\u002F2624-800X\u002F3\u002F3\u002F27)]\n  \n- **动态环境中机器人团队协作面临的工程挑战**，Applied Sciences 2020，[[论文](https:\u002F\u002Fwww.mdpi.com\u002F2076-3417\u002F10\u002F4\u002F1368)]\n  \n- **关于空中平台的GPS欺骗：威胁、挑战、方法论及未来研究方向综述**，PeerJ Computer Science 2021，[[论文](https:\u002F\u002Fpeerj.com\u002Farticles\u002Fcs-507)]\n  \n- **信息物理系统中的安全与隐私：综述**，IEEE Communications Surveys & Tutorials 2017，[[论文](https:\u002F\u002Fieeexplore.ieee.org\u002Fdocument\u002F7815411)]\n  \n- **针对基于LiDAR的自动驾驶系统的对抗性物体**，arXiv 2019，[[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F1907.05418)]\n  \n- **在现实世界中以最小的人力成本学会行走**，arXiv 2020，[[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2002.08550)]\n  \n- **优先保障而非自主：LLM代理对科学的风险**，arXiv 2024，[[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2402.04247)]\n\n#### 数字环境\n\n- **LLM安全的新时代：探索真实世界中基于LLM系统的安全问题**，arXiv 2024，[[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2402.18649)]\n  \n- **揭秘LLM集成应用中的RCE漏洞**，CCS 2024，[[论文](https:\u002F\u002Fdl.acm.org\u002Fdoi\u002F10.1145\u002F3618271.3630876)]\n  \n- **Wipi：面向LLM驱动Web代理的新型网络威胁**，arXiv 2024，[[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2403.09875)]\n  \n- **大型语言模型在DDoS攻击检测中的应用**，SPCPS 2023，[[论文](https:\u002F\u002Flink.springer.com\u002Fchapter\u002F10.1007\u002F978-3-031-45910-2_6)]\n  \n- **迫使LLM做并泄露（几乎）任何事情**，arXiv 2024，[[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2402.14020)]\n  \n- **优先保障而非自主：LLM代理对科学的风险**，arXiv 2024，[[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2402.04247)]\n  \n- **EIA：针对通用Web代理的环境注入攻击以实现隐私泄露**，arXiv 2024，[[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2409.11295)]\n  \n- **AdvWeb：针对VLM赋能Web代理的可控黑盒攻击**，arXiv 2024，[[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2410.17401)]\n  \n- **AGrail：具有有效且自适应安全检测功能的终身代理护栏**，arXiv 2025，[[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2502.11448)]\n\n### 代理-代理交互威胁\n\n#### 竞争性交互\n\n- **来自先进AI的多智能体风险**，arXiv 2025，[[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2502.14143)]\n  \n- **被蒙蔽：语言模型文本游戏中的欺骗与合作**，arXiv 2023，[[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2308.01404)]\n  \n- **用解耦式对抗策略攻击深度强化学习**，IEEE TDSC 2022，[[论文](https:\u002F\u002Fieeexplore.ieee.org\u002Fdocument\u002F9519422)]\n  \n- **拒绝服务攻击下多智能体系统的安全共识**，Asian Journal of Control 2023，[[论文](https:\u002F\u002Fonlinelibrary.wiley.com\u002Fdoi\u002Fabs\u002F10.1002\u002Fasjc.2921)]\n  \n- **完美合谋基准：如何以信息论不可检测的方式阻止AI代理合谋？**，NeurIPS 2023多智能体安全研讨会，[[论文](https:\u002F\u002Fopenreview.net\u002Fforum?id=NEURIPS-2023-perfect-collusion)]\n\n\n#### 合作性交互\n\n- **关于大型语言模型引发的虚假信息污染风险**，arXiv 2023，[[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2305.13661)]\n  \n- **Agent 
Smith：一张图片即可以指数级速度越狱一百万个多模态LLM代理**，arXiv 2024，[[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2402.08567)]","# Awesome-Foundation-Agents 快速上手指南\n\n**注意**：`awesome-foundation-agents` 并非一个可安装的软件库或框架，而是一个**精选论文与资源列表**。它旨在帮助开发者梳理“基础智能体（Foundation Agents）”的研究脉络、核心概念及前沿进展。因此，本指南将指导你如何获取该资源列表，并如何利用其中的内容开展学习与研究。\n\n## 环境准备\n\n本项目主要包含 Markdown 文档、图片资源及论文链接，无需复杂的运行环境。\n\n*   **系统要求**：Windows \u002F macOS \u002F Linux 均可。\n*   **前置依赖**：\n    *   `Git`：用于克隆仓库。\n    *   `Markdown` 阅读器（可选）：如 VS Code、Typora 或 GitHub 网页版，用于浏览整理后的文档。\n    *   科学上网环境（推荐）：部分论文链接指向 arXiv 或 GitHub，国内访问可能不稳定。\n\n## 安装步骤（获取资源）\n\n通过 Git 克隆仓库到本地，即可离线浏览所有分类整理的论文列表和资源链接。\n\n```bash\n# 1. 克隆仓库\ngit clone https:\u002F\u002Fgithub.com\u002FFoundationAgents\u002Fawesome-foundation-agents.git\n\n# 2. 进入目录\ncd awesome-foundation-agents\n\n# 3. (可选) 使用 VS Code 打开预览\ncode .\n```\n\n> **国内加速建议**：\n> 如果克隆速度较慢，可使用国内镜像源（如 Gitee 镜像，若有）或配置 Git 代理：\n> ```bash\n> git clone https:\u002F\u002Fghproxy.com\u002Fhttps:\u002F\u002Fgithub.com\u002FFoundationAgents\u002Fawesome-foundation-agents.git\n> ```\n\n## 基本使用\n\n获取资源后，主要通过阅读 `README.md` 文件来探索智能体研究的各个维度。以下是使用流程示例：\n\n### 1. 浏览核心分类\n打开根目录下的 `README.md` 文件，你将看到按以下核心模块分类的论文列表：\n*   **Core Components (核心组件)**：涵盖认知 (Cognition)、记忆 (Memory)、感知 (Perception)、世界模型 (World Model)、行动 (Action) 等。\n*   **Self-Enhancement (自我增强)**：智能体的自我进化与改进机制。\n*   **Collaborative Systems (协作系统)**：多智能体协作与演化。\n*   **Safety & Benefits (安全与有益)**：构建安全可靠的 AI。\n\n### 2. 查找特定领域论文\n假设你想研究 **“思维链 (Chain-of-Thought)\"** 或 **“推理 (Reasoning)\"** 相关的最新工作：\n1.  在 `README.md` 中搜索 `Reasoning` 或 `Cognition` 章节。\n2.  找到对应子类别（如 `Structured` -> `Dynamic`）。\n3.  点击论文标题链接（通常指向 arXiv）阅读原文。\n4.  点击 `[[code]]` 链接跳转至对应的 GitHub 开源项目复现代码。\n\n**示例路径**：\n> 阅读 `README.md` -> 定位到 `## Reasoning` -> 查看 `### Structured` -> 发现 **Tree of Thoughts** 论文及其代码实现链接。\n\n### 3. 贡献与更新\n该项目持续更新（Version 2 筹备中），如果你发现了有价值的相关工作，可以通过以下方式参与：\n*   提交 Pull Request (PR) 到仓库。\n*   关注项目发布的综述论文：[Advances and Challenges in Foundation Agents](https:\u002F\u002Fwww.arxiv.org\u002Fabs\u002F2504.01990)。\n\n通过以上步骤，你可以高效地利用该列表构建自己的智能体技术知识体系，并快速定位到需要深入研究的开源项目。","某 AI 初创团队正致力于研发一款能自主规划行程、处理突发状况的“全能旅行代理”，但在构建其核心认知与推理模块时陷入瓶颈。\n\n### 没有 awesome-foundation-agents 时\n- **技术选型迷茫**：面对海量零散的论文，团队难以区分哪些是真正提升 Agent“认知”与“记忆”的核心成果，哪些只是过时的实验，导致研发方向频繁摇摆。\n- **重复造轮子**：开发人员花费数周复现基础的思维链（Chain-of-Thought）或反射机制（Reflexion），却不知社区已有更先进的强化学习搜索方案（如 Search-R1）可直接复用。\n- **架构设计缺失**：缺乏对“世界模型”和“情感模块”等前沿组件的系统性理解，导致代理只能机械执行指令，无法像人类一样灵活应对航班取消等复杂情境。\n- **资源浪费严重**：团队在低效的试错中消耗了大量算力和时间，迟迟无法推出具备真正“自增强”能力的产品原型。\n\n### 使用 awesome-foundation-agents 后\n- **精准导航前沿**：团队利用其分类清晰的目录，迅速锁定了关于“认知”与“推理”的最新 SOTA 论文（如 SKY-T1），直接确立了基于强化学习的技术路线。\n- **高效代码复用**：通过仓库提供的代码链接，快速集成了 Voyager 和 ReAct 等成熟框架，将原本需要一个月的开发周期缩短至三天。\n- **系统架构升级**：参考其中关于“协作进化系统”和“安全有益 AI\"的综述，成功为代理添加了情绪反馈与多智能体协作模块，显著提升了用户体验。\n- **研发加速落地**：站在巨人肩膀上，团队得以跳过基础探索阶段，专注于业务逻辑创新，提前两个月完成了高智商旅行代理的 MVP 版本。\n\nawesome-foundation-agents 如同智能体研发的“高德地图”，让开发者从盲目摸索转向精准导航，极大缩短了从理论到落地的距离。","https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002FFoundationAgents_awesome-foundation-agents_95844c79.png","FoundationAgents","https:\u002F\u002Foss.gittoolsai.com\u002Favatars\u002FFoundationAgents_00912c20.png",null,"https:\u002F\u002Fgithub.com\u002FFoundationAgents",2008,196,"2026-04-06T10:19:30","MIT",1,"","未说明",{"notes":85,"python":83,"dependencies":86},"该项目是一个论文和资源列表（Awesome List），用于整理和展示关于基础智能体（Foundation Agents）的研究成果，本身不是一个可执行的软件工具或代码库，因此没有具体的运行环境、依赖库或硬件需求。用户需根据列表中引用的具体论文所对应的独立代码仓库去查询各自的运行要求。",[],[35,13,88],"其他","2026-03-27T02:49:30.150509","2026-04-07T08:18:53.369755",[],[]]