[{"data":1,"prerenderedAt":-1},["ShallowReactive",2],{"similar-reasoning-survey--Awesome-Reasoning-Foundation-Models":3,"tool-reasoning-survey--Awesome-Reasoning-Foundation-Models":61},[4,18,26,36,44,53],{"id":5,"name":6,"github_repo":7,"description_zh":8,"stars":9,"difficulty_score":10,"last_commit_at":11,"category_tags":12,"status":17},4358,"openclaw","openclaw\u002Fopenclaw","OpenClaw 是一款专为个人打造的本地化 AI 助手，旨在让你在自己的设备上拥有完全可控的智能伙伴。它打破了传统 AI 助手局限于特定网页或应用的束缚，能够直接接入你日常使用的各类通讯渠道，包括微信、WhatsApp、Telegram、Discord、iMessage 等数十种平台。无论你在哪个聊天软件中发送消息，OpenClaw 都能即时响应，甚至支持在 macOS、iOS 和 Android 设备上进行语音交互，并提供实时的画布渲染功能供你操控。\n\n这款工具主要解决了用户对数据隐私、响应速度以及“始终在线”体验的需求。通过将 AI 部署在本地，用户无需依赖云端服务即可享受快速、私密的智能辅助，真正实现了“你的数据，你做主”。其独特的技术亮点在于强大的网关架构，将控制平面与核心助手分离，确保跨平台通信的流畅性与扩展性。\n\nOpenClaw 非常适合希望构建个性化工作流的技术爱好者、开发者，以及注重隐私保护且不愿被单一生态绑定的普通用户。只要具备基础的终端操作能力（支持 macOS、Linux 及 Windows WSL2），即可通过简单的命令行引导完成部署。如果你渴望拥有一个懂你",349277,3,"2026-04-06T06:32:30",[13,14,15,16],"Agent","开发框架","图像","数据工具","ready",{"id":19,"name":20,"github_repo":21,"description_zh":22,"stars":23,"difficulty_score":10,"last_commit_at":24,"category_tags":25,"status":17},3808,"stable-diffusion-webui","AUTOMATIC1111\u002Fstable-diffusion-webui","stable-diffusion-webui 是一个基于 Gradio 构建的网页版操作界面，旨在让用户能够轻松地在本地运行和使用强大的 Stable Diffusion 图像生成模型。它解决了原始模型依赖命令行、操作门槛高且功能分散的痛点，将复杂的 AI 绘图流程整合进一个直观易用的图形化平台。\n\n无论是希望快速上手的普通创作者、需要精细控制画面细节的设计师，还是想要深入探索模型潜力的开发者与研究人员，都能从中获益。其核心亮点在于极高的功能丰富度：不仅支持文生图、图生图、局部重绘（Inpainting）和外绘（Outpainting）等基础模式，还独创了注意力机制调整、提示词矩阵、负向提示词以及“高清修复”等高级功能。此外，它内置了 GFPGAN 和 CodeFormer 等人脸修复工具，支持多种神经网络放大算法，并允许用户通过插件系统无限扩展能力。即使是显存有限的设备，stable-diffusion-webui 也提供了相应的优化选项，让高质量的 AI 艺术创作变得触手可及。",162132,"2026-04-05T11:01:52",[14,15,13],{"id":27,"name":28,"github_repo":29,"description_zh":30,"stars":31,"difficulty_score":32,"last_commit_at":33,"category_tags":34,"status":17},1381,"everything-claude-code","affaan-m\u002Feverything-claude-code","everything-claude-code 是一套专为 AI 编程助手（如 Claude Code、Codex、Cursor 等）打造的高性能优化系统。它不仅仅是一组配置文件，而是一个经过长期实战打磨的完整框架，旨在解决 AI 代理在实际开发中面临的效率低下、记忆丢失、安全隐患及缺乏持续学习能力等核心痛点。\n\n通过引入技能模块化、直觉增强、记忆持久化机制以及内置的安全扫描功能，everything-claude-code 能显著提升 AI 在复杂任务中的表现，帮助开发者构建更稳定、更智能的生产级 AI 代理。其独特的“研究优先”开发理念和针对 Token 消耗的优化策略，使得模型响应更快、成本更低，同时有效防御潜在的攻击向量。\n\n这套工具特别适合软件开发者、AI 研究人员以及希望深度定制 AI 工作流的技术团队使用。无论您是在构建大型代码库，还是需要 AI 协助进行安全审计与自动化测试，everything-claude-code 都能提供强大的底层支持。作为一个曾荣获 Anthropic 黑客大奖的开源项目，它融合了多语言支持与丰富的实战钩子（hooks），让 AI 真正成长为懂上",143909,2,"2026-04-07T11:33:18",[14,13,35],"语言模型",{"id":37,"name":38,"github_repo":39,"description_zh":40,"stars":41,"difficulty_score":32,"last_commit_at":42,"category_tags":43,"status":17},2271,"ComfyUI","Comfy-Org\u002FComfyUI","ComfyUI 是一款功能强大且高度模块化的视觉 AI 引擎，专为设计和执行复杂的 Stable Diffusion 图像生成流程而打造。它摒弃了传统的代码编写模式，采用直观的节点式流程图界面，让用户通过连接不同的功能模块即可构建个性化的生成管线。\n\n这一设计巧妙解决了高级 AI 绘图工作流配置复杂、灵活性不足的痛点。用户无需具备编程背景，也能自由组合模型、调整参数并实时预览效果，轻松实现从基础文生图到多步骤高清修复等各类复杂任务。ComfyUI 拥有极佳的兼容性，不仅支持 Windows、macOS 和 Linux 全平台，还广泛适配 NVIDIA、AMD、Intel 及苹果 Silicon 等多种硬件架构，并率先支持 SDXL、Flux、SD3 等前沿模型。\n\n无论是希望深入探索算法潜力的研究人员和开发者，还是追求极致创作自由度的设计师与资深 AI 绘画爱好者，ComfyUI 都能提供强大的支持。其独特的模块化架构允许社区不断扩展新功能，使其成为当前最灵活、生态最丰富的开源扩散模型工具之一，帮助用户将创意高效转化为现实。",107888,"2026-04-06T11:32:50",[14,15,13],{"id":45,"name":46,"github_repo":47,"description_zh":48,"stars":49,"difficulty_score":32,"last_commit_at":50,"category_tags":51,"status":17},4721,"markitdown","microsoft\u002Fmarkitdown","MarkItDown 是一款由微软 AutoGen 团队打造的轻量级 Python 工具，专为将各类文件高效转换为 Markdown 格式而设计。它支持 PDF、Word、Excel、PPT、图片（含 OCR）、音频（含语音转录）、HTML 乃至 YouTube 
链接等多种格式的解析，能够精准提取文档中的标题、列表、表格和链接等关键结构信息。\n\n在人工智能应用日益普及的今天，大语言模型（LLM）虽擅长处理文本，却难以直接读取复杂的二进制办公文档。MarkItDown 恰好解决了这一痛点，它将非结构化或半结构化的文件转化为模型“原生理解”且 Token 效率极高的 Markdown 格式，成为连接本地文件与 AI 分析 pipeline 的理想桥梁。此外，它还提供了 MCP（模型上下文协议）服务器，可无缝集成到 Claude Desktop 等 LLM 应用中。\n\n这款工具特别适合开发者、数据科学家及 AI 研究人员使用，尤其是那些需要构建文档检索增强生成（RAG）系统、进行批量文本分析或希望让 AI 助手直接“阅读”本地文件的用户。虽然生成的内容也具备一定可读性，但其核心优势在于为机器",93400,"2026-04-06T19:52:38",[52,14],"插件",{"id":54,"name":55,"github_repo":56,"description_zh":57,"stars":58,"difficulty_score":10,"last_commit_at":59,"category_tags":60,"status":17},4487,"LLMs-from-scratch","rasbt\u002FLLMs-from-scratch","LLMs-from-scratch 是一个基于 PyTorch 的开源教育项目，旨在引导用户从零开始一步步构建一个类似 ChatGPT 的大型语言模型（LLM）。它不仅是同名技术著作的官方代码库，更提供了一套完整的实践方案，涵盖模型开发、预训练及微调的全过程。\n\n该项目主要解决了大模型领域“黑盒化”的学习痛点。许多开发者虽能调用现成模型，却难以深入理解其内部架构与训练机制。通过亲手编写每一行核心代码，用户能够透彻掌握 Transformer 架构、注意力机制等关键原理，从而真正理解大模型是如何“思考”的。此外，项目还包含了加载大型预训练权重进行微调的代码，帮助用户将理论知识延伸至实际应用。\n\nLLMs-from-scratch 特别适合希望深入底层原理的 AI 开发者、研究人员以及计算机专业的学生。对于不满足于仅使用 API，而是渴望探究模型构建细节的技术人员而言，这是极佳的学习资源。其独特的技术亮点在于“循序渐进”的教学设计：将复杂的系统工程拆解为清晰的步骤，配合详细的图表与示例，让构建一个虽小但功能完备的大模型变得触手可及。无论你是想夯实理论基础，还是为未来研发更大规模的模型做准备",90106,"2026-04-06T11:19:32",[35,15,13,14],{"id":62,"github_repo":63,"name":64,"description_en":65,"description_zh":66,"ai_summary_zh":66,"readme_en":67,"readme_zh":68,"quickstart_zh":69,"use_case_zh":70,"hero_image_url":71,"owner_login":72,"owner_name":73,"owner_avatar_url":74,"owner_bio":73,"owner_company":73,"owner_location":73,"owner_email":73,"owner_twitter":73,"owner_website":73,"owner_url":75,"languages":73,"stars":76,"forks":77,"last_commit_at":78,"license":79,"difficulty_score":80,"env_os":81,"env_gpu":82,"env_ram":82,"env_deps":83,"category_tags":86,"github_topics":88,"view_count":32,"oss_zip_url":73,"oss_zip_packed_at":73,"status":17,"created_at":96,"updated_at":97,"faqs":98,"releases":99},5117,"reasoning-survey\u002FAwesome-Reasoning-Foundation-Models","Awesome-Reasoning-Foundation-Models","✨✨Latest Papers and Benchmarks in Reasoning with Foundation Models","Awesome-Reasoning-Foundation-Models 是一个专注于大模型推理能力的开源资源库，旨在系统梳理该领域的最新论文、基准测试与技术进展。随着人工智能从单纯的知识记忆向复杂逻辑推演进化，如何提升模型在数学、逻辑、因果及多模态场景下的“思考”能力成为关键挑战。该项目通过构建结构化的知识体系，有效解决了研究人员在面对海量碎片化文献时难以快速定位核心资源的痛点。\n\n它特别适合 AI 研究人员、算法工程师以及对大模型底层机制感兴趣的技术开发者使用。无论是希望追踪前沿学术动态，还是寻找特定任务（如常识推理、智能体决策）的解决方案，都能在此获得指引。其独特亮点在于不仅按语言、视觉、多模态等基础模型类型进行分类，还深度整合了预训练、微调、对齐训练、混合专家模型（MoE）及上下文学习等关键推理技术。此外，该资源库依托于高质量的综述论文《A Survey of Reasoning with Foundation Models》，确保了内容的权威性与前瞻性，是探索大模型推理边界不可或缺的参考指南。","# Awesome-Reasoning-Foundation-Models\n[![Awesome](https:\u002F\u002Fawesome.re\u002Fbadge.svg)](https:\u002F\u002Fawesome.re)\n[![DOI](https:\u002F\u002Fzenodo.org\u002Fbadge\u002FDOI\u002F10.5281\u002Fzenodo.10298864.svg)](https:\u002F\u002Fdoi.org\u002F10.5281\u002Fzenodo.10298864)\n[![arXiv](https:\u002F\u002Fimg.shields.io\u002Fbadge\u002FarXiv-arxiv.org\u002Fabs\u002F2312.11562-\u003CCOLOR>.svg)](https:\u002F\u002Farxiv.org\u002Fabs\u002F2312.11562)\n\n\n![overview](https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Freasoning-survey_Awesome-Reasoning-Foundation-Models_readme_5a69a0541029.jpg) \n\n[`survey.pdf`](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2312.11562.pdf) |\nA curated list of awesome **large AI models**, or **foundation models**, for **reasoning**.\n\nWe organize the current [foundation models](#2-foundation-models) into three categories: *language foundation models*, *vision foundation models*, and *multimodal foundation models*.\nFurther, we elaborate the foundation models in [reasoning tasks](#3-reasoning-tasks), including *commonsense*, 
*mathematical*, *logical*, *causal*, *visual*, *audio*, *multimodal*, *agent reasoning*, etc.\n[Reasoning techniques](#4-reasoning-techniques), including *pre-training*, *fine-tuning*, *alignment training*, *mixture of experts*, *in-context learning*, and *autonomous agent*, are also summarized.\n\nWe welcome contributions to this repository to add more resources. Please submit a pull request if you want to contribute! See [CONTRIBUTING](CONTRIBUTING.md).\n\n\u003C!-- ## News -->\n\n\n## Table of Contents\n\n\u003Cdetails open>\n\u003Csummary>table of contents\u003C\u002Fsummary>\n\n- [0 Survey](#0-survey)\n- [1 Relevant Surveys](#1-relevant-surveys-and-links)\n- [2 Foundation Models](#2-foundation-models)\n  - [2.1 Language Foundation Models](#21-language-foundation-models)\n  - [2.2 Vision Foundation Models](#22-vision-foundation-models)\n  - [2.3 Multimodal Foundation Models](#23-multimodal-foundation-models)\n  - [2.4 Reasoning Applications](#24-reasoning-applications)\n- [3 Reasoning Tasks](#3-reasoning-tasks)\n  - [3.1 Commonsense Reasoning](#31-commonsense-reasoning)\n  - [3.2 Mathematical Reasoning](#32-mathematical-reasoning)\n  - [3.3 Logical Reasoning](#33-logical-reasoning)\n  - [3.4 Causal Reasoning](#34-causal-reasoning)\n  - [3.5 Visual Reasoning](#35-visual-reasoning)\n  - [3.6 Audio Reasoning](#36-audio-reasoning)\n  - [3.7 Multimodal Reasoning](#37-multimodal-reasoning)\n  - [3.8 Agent Reasoning](#38-agent-reasoning)\n  - [3.9 Other Tasks and Applications](#39-other-tasks-and-applications)\n- [4 Reasoning Techniques](#4-reasoning-techniques)\n  - [4.1 Pre-Training](#41-pre-training)\n  - [4.2 Fine-Tuning](#42-fine-tuning)\n  - [4.3 Alignment Training](#43-alignment-training)\n  - [4.4 Mixture of Experts (MoE)](#44-mixture-of-experts-moe)\n  - [4.5 In-Context Learning](#45-in-context-learning)\n  - [4.6 Autonomous Agent](#46-autonomous-agent)\n\n\u003C\u002Fdetails>\n\n\n## 0 Survey\n\n![overview](https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Freasoning-survey_Awesome-Reasoning-Foundation-Models_readme_fa97ddc33c00.jpg)\n\nThis repository is primarily based on the following paper:\n\n>[**A Survey of Reasoning with Foundation Models: Concepts, Methodologies, and Outlook**](https:\u002F\u002Fdl.acm.org\u002Fdoi\u002F10.1145\u002F3729218) \u003Cbr>\n>\n> [[Paper](https:\u002F\u002Fdl.acm.org\u002Fdoi\u002F10.1145\u002F3729218)][[ArXiv](https:\u002F\u002Farxiv.org\u002Fabs\u002F2312.11562)]\n>\n> [Jiankai Sun](),\n[Chuanyang Zheng](https:\u002F\u002Fchuanyang-zheng.github.io\u002F),\n[Enze Xie](https:\u002F\u002Fxieenze.github.io\u002F),\n[Zhengying Liu](https:\u002F\u002Fscholar.google.com\u002Fcitations?user=DFme0joAAAAJ&hl=fr),\n[Ruihang Chu](),\n[Jianing Qiu](),\n[Jiaqi Xu](),\n[Mingyu Ding](),\n[Hongyang Li](https:\u002F\u002Flihongyang.info\u002F),\n[Mengzhe Geng](),\n[Yue Wu](),\n[Wenhai Wang](https:\u002F\u002Fwhai362.github.io\u002F),\n[Junsong Chen](),\n[Zhangyue Yin](),\n[Xiaozhe Ren](),\n[Jie Fu](https:\u002F\u002Fbigaidream.github.io\u002F),\n[Junxian He](https:\u002F\u002Fjxhe.github.io\u002F),\n[Wu Yuan](http:\u002F\u002Fwww.bme.cuhk.edu.hk\u002Fyuan\u002F),\n[Qi Liu](https:\u002F\u002Fleuchine.github.io\u002F),\n[Xihui Liu](https:\u002F\u002Fxh-liu.github.io\u002F),\n[Yu Li](https:\u002F\u002Fliyu95.com\u002F),\n[Hao Dong](https:\u002F\u002Fzsdonghao.github.io\u002F),\n[Yu Cheng](https:\u002F\u002Fscholar.google.com\u002Fcitations?user=ORPxbV4AAAAJ&hl=zh-CN),\n[Ming 
Zhang](https:\u002F\u002Fscholar.google.com\u002Fcitations?user=LbzoQBsAAAAJ&hl=en),\n[Pheng Ann Heng](http:\u002F\u002Fwww.cse.cuhk.edu.hk\u002F~pheng\u002F),\n[Jifeng Dai](https:\u002F\u002Fjifengdai.org\u002F),\n[Ping Luo](http:\u002F\u002Fluoping.me\u002F),\n[Jingdong Wang](https:\u002F\u002Fjingdongwang2017.github.io\u002F),\n[Ji-Rong Wen](https:\u002F\u002Fscholar.google.com\u002Fcitations?user=tbxCHJgAAAAJ&hl=zh-CN),\n[Xipeng Qiu](https:\u002F\u002Fxpqiu.github.io\u002F),\n[Yike Guo](https:\u002F\u002Fcse.hkust.edu.hk\u002Fadmin\u002Fpeople\u002Ffaculty\u002Fprofile\u002Fyikeguo),\n[Hui Xiong](https:\u002F\u002Fscholar.google.com\u002Fcitations?user=cVDF1tkAAAAJ&hl=en),\n[Qun Liu](https:\u002F\u002Fliuquncn.github.io\u002Findex_zh.html), and\n[Zhenguo Li](https:\u002F\u002Fscholar.google.com\u002Fcitations?user=XboZC1AAAAAJ&hl=en)\n\nIf you find this repository helpful, please consider citing:\n\n```bibtex\n@article{sun2025survey,\n  author = {Sun, Jiankai and Zheng, Chuanyang and Xie, Enze and Liu, Zhengying and Chu, Ruihang and Qiu, Jianing and Xu, Jiaqi and Ding, Mingyu and Li, Hongyang and Geng, Mengzhe and Wu, Yue and Wang, Wenhai and Chen, Junsong and Yin, Zhangyue and Ren, Xiaozhe and Fu, Jie and He, Junxian and Wu, Yuan and Liu, Qi and Liu, Xihui and Li, Yu and Dong, Hao and Cheng, Yu and Zhang, Ming and Heng, Pheng Ann and Dai, Jifeng and Luo, Ping and Wang, Jingdong and Wen, Ji-Rong and Qiu, Xipeng and Guo, Yike and Xiong, Hui and Liu, Qun and Li, Zhenguo},\n  title = {A Survey of Reasoning with Foundation Models: Concepts, Methodologies, and Outlook},\n  year = {2025},\n  publisher = {Association for Computing Machinery},\n  address = {New York, NY, USA},\n  issn = {0360-0300},\n  url = {https:\u002F\u002Fdoi.org\u002F10.1145\u002F3729218},\n  doi = {10.1145\u002F3729218},\n  abstract = {Reasoning, a crucial ability for complex problem-solving, plays a pivotal role in various real-world settings such as negotiation, medical diagnosis, and criminal investigation. It serves as a fundamental methodology in the field of Artificial General Intelligence (AGI). With the ongoing development of foundation models, there is a growing interest in exploring their abilities in reasoning tasks. In this paper, we introduce seminal foundation models proposed or adaptable for reasoning, highlighting the latest advancements in various reasoning tasks, methods, and benchmarks. We then delve into the potential future directions behind the emergence of reasoning abilities within foundation models. We also discuss the relevance of multimodal learning, autonomous agents, and super alignment in the context of reasoning. By discussing these future research directions, we hope to inspire researchers in their exploration of this field, stimulate further advancements in reasoning with foundation models, e.g. Large Language Models (LLMs), and contribute to the development of AGI.},\n  journal = {ACM Comput. 
Surv.},\n  month = apr,\n  keywords = {Reasoning, Foundation Models, Multimodal, AI Agent, Artificial General Intelligence, LLM}\n}\n```\n\n\n## 1 Relevant Surveys and Links\n\n\u003Cdetails open>\n\u003Csummary>relevant surveys\u003C\u002Fsummary>\n\n[(Back-to-Top)](#table-of-contents)\n\n- Combating Misinformation in the Age of LLMs: Opportunities and Challenges\n\\-\n[[arXiv](https:\u002F\u002Farxiv.org\u002Fabs\u002F2311.05656)]\n[[Link](https:\u002F\u002Fllm-misinformation.github.io\u002F)]\n\n- The Rise and Potential of Large Language Model Based Agents: A Survey\n\\-\n[[arXiv](https:\u002F\u002Farxiv.org\u002Fabs\u002F2309.07864)]\n[[Link](https:\u002F\u002Fgithub.com\u002FWooooDyy\u002FLLM-Agent-Paper-List)]\n\n- Multimodal Foundation Models: From Specialists to General-Purpose Assistants\n\\-\n[[arXiv](https:\u002F\u002Farxiv.org\u002Fabs\u002F2309.10020)]\n[[Tutorial](https:\u002F\u002Fvlp-tutorial.github.io\u002F2023\u002F)]\n\n- A Survey on Multimodal Large Language Models\n\\-\n[[arXiv](https:\u002F\u002Farxiv.org\u002Fabs\u002F2306.13549)]\n[[Link](https:\u002F\u002Fgithub.com\u002FBradyFU\u002FAwesome-Multimodal-Large-Language-Models)]\n\n- Interactive Natural Language Processing\n\\-\n[[arXiv](https:\u002F\u002Farxiv.org\u002Fabs\u002F2305.13246)]\n[[Link](https:\u002F\u002Fgithub.com\u002FInteractiveNLP-Team\u002Fawesome-InteractiveNLP-papers)]\n\n- A Survey of Large Language Models\n\\-\n[[arXiv](https:\u002F\u002Farxiv.org\u002Fabs\u002F2303.18223)]\n[[Link](https:\u002F\u002Fgithub.com\u002FRUCAIBox\u002FLLMSurvey)]\n\n- Self-Supervised Multimodal Learning: A Survey\n\\-\n[[arXiv](https:\u002F\u002Farxiv.org\u002Fabs\u002F2304.01008)]\n[[Link](https:\u002F\u002Fgithub.com\u002Fys-zong\u002Fawesome-self-supervised-multimodal-learning)]\n\n- Large AI Models in Health Informatics: Applications, Challenges, and the Future\n\\-\n[[arXiv](https:\u002F\u002Farxiv.org\u002Fabs\u002F2303.11568)]\n[[Paper](https:\u002F\u002Fieeexplore.ieee.org\u002Fdocument\u002F10261199)]\n[[Link](https:\u002F\u002Fgithub.com\u002FJianing-Qiu\u002FAwesome-Healthcare-Foundation-Models)]\n\n- Towards Reasoning in Large Language Models: A Survey\n\\-\n[[arXiv](https:\u002F\u002Farxiv.org\u002Fabs\u002F2212.10403)]\n[[Paper](https:\u002F\u002Faclanthology.org\u002F2023.findings-acl.67.pdf)]\n[[Link](https:\u002F\u002Fgithub.com\u002Fjeffhj\u002FLM-reasoning)]\n\n- Reasoning with Language Model Prompting: A Survey\n\\-\n[[arXiv](https:\u002F\u002Farxiv.org\u002Fabs\u002F2212.09597)]\n[[Paper](https:\u002F\u002Faclanthology.org\u002F2023.acl-long.294.pdf)]\n[[Link](https:\u002F\u002Fgithub.com\u002Fzjunlp\u002FPrompt4ReasoningPapers)]\n\n- Awesome Multimodal Reasoning\n\\-\n[[Link](https:\u002F\u002Fgithub.com\u002Fatfortes\u002FAwesome-Multimodal-Reasoning)]\n\n\u003C\u002Fdetails>\n\n\n## 2 Foundation Models\n\n\u003Cdetails open>\n\u003Csummary>foundation models\u003C\u002Fsummary>\n\n[(Back-to-Top)](#table-of-contents)\n\n![foundation_models](https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Freasoning-survey_Awesome-Reasoning-Foundation-Models_readme_76064674e424.jpg)\n\n### Table of Contents - 2\n\n\u003Cdetails open>\n\u003Csummary>foundation models (table of contents)\u003C\u002Fsummary>\n\n[(Back-to-Top)](#table-of-contents)\n\n- [2 Foundation Models](#2-foundation-models)\n  - [2.1 Language Foundation Models](#21-language-foundation-models)\n  - [2.2 Vision Foundation Models](#22-vision-foundation-models)\n  - [2.3 Multimodal Foundation Models](#23-multimodal-foundation-models)\n  - [2.4 
Reasoning Applications](#24-reasoning-applications)\n\n\u003C\u002Fdetails>\n\n### 2.1 Language Foundation Models\n\n\u003Cdetails open>\n\u003Csummary>LFMs\u003C\u002Fsummary>\n\n[Foundation Models (Back-to-Top)](#2-foundation-models)\n\n- `2023\u002F10` | `Mistral` | [Mistral 7B](https:\u002F\u002Farxiv.org\u002Fabs\u002F2310.06825)\n\\-\n[[Paper](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2310.06825.pdf)]\n[[Code](https:\u002F\u002Fgithub.com\u002Fmistralai\u002Fmistral-src?tab=readme-ov-file)]\n\n- `2023\u002F09` | `Qwen` | [Qwen Technical Report](https:\u002F\u002Farxiv.org\u002Fabs\u002F2309.16609)\n\\-\n[[Paper](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2309.16609.pdf)]\n[[Code](https:\u002F\u002Fgithub.com\u002FQwenLM\u002FQwen)]\n[[Project](https:\u002F\u002Ftongyi.aliyun.com\u002Fqianwen\u002F)]\n\n- `2023\u002F07` | `Llama 2` | [Llama 2: Open Foundation and Fine-Tuned Chat Models](https:\u002F\u002Farxiv.org\u002Fabs\u002F2307.09288)\n\\-\n[[Paper](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2307.09288.pdf)]\n[[Code](https:\u002F\u002Fgithub.com\u002Ffacebookresearch\u002Fllama)]\n[[Blog](https:\u002F\u002Fai.meta.com\u002Fllama\u002F)]\n\n- `2023\u002F07` | `InternLM` | InternLM: A Multilingual Language Model with Progressively Enhanced Capabilities\n\\-\n[[Paper](https:\u002F\u002Fgithub.com\u002FInternLM\u002FInternLM-techreport\u002Fblob\u002Fmain\u002FInternLM.pdf)]\n[[Code](https:\u002F\u002Fgithub.com\u002FInternLM\u002FInternLM)]\n[[Project](https:\u002F\u002Finternlm.intern-ai.org.cn)]\n\n- `2023\u002F05` | `PaLM 2` | [PaLM 2 Technical Report](https:\u002F\u002Farxiv.org\u002Fabs\u002F2305.10403)\n\\-\n\n- `2023\u002F03` | `PanGu-Σ` | [PanGu-Σ: Towards Trillion Parameter Language Model with Sparse Heterogeneous Computing](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2303.10845.pdf)\n\\-\n[[Paper](https:\u002F\u002Farxiv.org\u002Fabs\u002F2303.10845)]\n\n- `2023\u002F03` | `Vicuna` | Vicuna: An Open-Source Chatbot Impressing GPT-4 with 90%* ChatGPT Quality\n\\-\n[[Blog](https:\u002F\u002Flmsys.org\u002Fblog\u002F2023-03-30-vicuna\u002F)]\n[[Code](https:\u002F\u002Fgithub.com\u002Flm-sys\u002FFastChat)]\n\n- `2023\u002F03` | `GPT-4` | [GPT-4 Technical Report](https:\u002F\u002Farxiv.org\u002Fabs\u002F2303.08774)\n\\-\n[[Paper](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2303.08774.pdf)]\n[[Blog](https:\u002F\u002Fopenai.com\u002Fresearch\u002Fgpt-4)]\n\n- `2023\u002F02` | `LLaMA` | [LLaMA: Open and Efficient Foundation Language Models](https:\u002F\u002Farxiv.org\u002Fabs\u002F2302.13971)\n\\-\n[[Paper](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2302.13971.pdf)]\n[[Code](https:\u002F\u002Fgithub.com\u002Ffacebookresearch\u002Fllama)]\n[[Blog](https:\u002F\u002Fai.meta.com\u002Fblog\u002Flarge-language-model-llama-meta-ai\u002F)]\n\n- `2022\u002F11` | `ChatGPT` | Chatgpt: Optimizing language models for dialogue\n\\-\n[[Blog](https:\u002F\u002Fopenai.com\u002Fblog\u002Fchatgpt)]\n\n- `2022\u002F04` | `PaLM` | [PaLM: Scaling Language Modeling with Pathways](https:\u002F\u002Farxiv.org\u002Fabs\u002F2204.02311)\n\\-\n[[Paper](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2204.02311.pdf)]\n[[Blog](https:\u002F\u002Fblog.research.google\u002F2022\u002F04\u002Fpathways-language-model-palm-scaling-to.html)]\n\n- `2021\u002F09` | `FLAN` | [Finetuned Language Models Are Zero-Shot Learners](https:\u002F\u002Farxiv.org\u002Fabs\u002F2109.01652)\n\\-\n\n- `2021\u002F07` | `Codex` | [Evaluating Large Language Models Trained on Code](https:\u002F\u002Farxiv.org\u002Fabs\u002F2107.03374)\n\\-\n\n- 
`2020\u002F05` | `GPT-3` | [Language Models are Few-Shot Learners](https:\u002F\u002Farxiv.org\u002Fabs\u002F2005.14165)\n\\-\n[[Paper](https:\u002F\u002Fpapers.nips.cc\u002Fpaper\u002F2020\u002Ffile\u002F1457c0d6bfcb4967418bfb8ac142f64a-Paper.pdf)]\n[[Code](https:\u002F\u002Fgithub.com\u002Fopenai\u002Fgpt-3)]\n\n- `2021\u002F04` | `PanGu-α` | [PanGu-α: Large-scale Autoregressive Pretrained Chinese Language Models with Auto-parallel Computation](https:\u002F\u002Farxiv.org\u002Fabs\u002F2104.12369)\n\\-\n[[Paper](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2104.12369.pdf)]\n[[Code](https:\u002F\u002Fgithub.com\u002Fhuawei-noah\u002FPretrained-Language-Model)]\n\n- `2019\u002F08` | `Sentence-BERT` | [Sentence-BERT: Sentence Embeddings using Siamese BERT-Networks](https:\u002F\u002Farxiv.org\u002Fabs\u002F1908.10084)\n\\-\n\n- `2019\u002F07` | `RoBERTa` | [RoBERTa: A Robustly Optimized BERT Pretraining Approach](https:\u002F\u002Farxiv.org\u002Fabs\u002F1907.11692)\n\\-\n\n- `2018\u002F10` | `BERT` | [BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding](https:\u002F\u002Farxiv.org\u002Fabs\u002F1810.04805)\n\\-\n[[Paper](https:\u002F\u002Faclanthology.org\u002FN19-1423.pdf)]\n[[Code](https:\u002F\u002Fgithub.com\u002Fgoogle-research\u002Fbert)]\n[[Blog](https:\u002F\u002Fblog.research.google\u002F2018\u002F11\u002Fopen-sourcing-bert-state-of-art-pre.html)]\n\n---\n\n\u003C\u002Fdetails>\n\n\u003C!--  -->\n### 2.2 Vision Foundation Models\n\n\u003Cdetails open>\n\u003Csummary>VFMs\u003C\u002Fsummary>\n\n[Foundation Models (Back-to-Top)](#2-foundation-models)\n\n- `2024\u002F01` | `Depth Anything`\n| `Yang et al.`\n![citations](https:\u002F\u002Fimg.shields.io\u002Fbadge\u002Fdynamic\u002Fjson?url=https:\u002F\u002Fapi.semanticscholar.org\u002Fgraph\u002Fv1\u002Fpaper\u002FCorpusID:267061016?fields=citationCount&query=%24.citationCount&label=citations)\n![Star](https:\u002F\u002Fimg.shields.io\u002Fgithub\u002Fstars\u002FLiheYoung\u002FDepth-Anything.svg?style=social&label=Star) \u003Cbr>\nDepth Anything: Unleashing the Power of Large-Scale Unlabeled Data \u003Cbr>\n[[arXiv](https:\u002F\u002Farxiv.org\u002Fabs\u002F2401.10891)]\n[[paper](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2401.10891.pdf)]\n[[code](https:\u002F\u002Fgithub.com\u002FLiheYoung\u002FDepth-Anything)]\n[[project](https:\u002F\u002Fdepth-anything.github.io\u002F)]\n\n- `2023\u002F05` | `SAA+`\n| `Cao et al.`\n![citations](https:\u002F\u002Fimg.shields.io\u002Fbadge\u002Fdynamic\u002Fjson?url=https:\u002F\u002Fapi.semanticscholar.org\u002Fgraph\u002Fv1\u002Fpaper\u002FCorpusID:258762545?fields=citationCount&query=%24.citationCount&label=citations)\n![Star](https:\u002F\u002Fimg.shields.io\u002Fgithub\u002Fstars\u002Fcaoyunkang\u002FSegment-Any-Anomaly.svg?style=social&label=Star) \u003Cbr>\nSegment Any Anomaly without Training via Hybrid Prompt Regularization \u003Cbr>\n[[arXiv](https:\u002F\u002Farxiv.org\u002Fabs\u002F2305.10724)]\n[[paper](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2305.10724.pdf)]\n[[code](https:\u002F\u002Fgithub.com\u002Fcaoyunkang\u002FSegment-Any-Anomaly)]\n\n- `2023\u002F05` | `Explain Any Concept` | [Explain Any Concept: Segment Anything Meets Concept-Based Explanation](https:\u002F\u002Farxiv.org\u002Fabs\u002F2305.10289)\n\\-\n[[Paper](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2305.10289.pdf)]\n[[Code](https:\u002F\u002Fgithub.com\u002FJerry00917\u002Fsamshap)]\n\n- `2023\u002F05` | `SAM-Track` | [Segment and Track 
Anything](https:\u002F\u002Farxiv.org\u002Fabs\u002F2305.06558)\n\\-\n[[Paper](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2305.06558.pdf)]\n[[Code](https:\u002F\u002Fgithub.com\u002Fz-x-yang\u002FSegment-and-Track-Anything)]\n\n- `2023\u002F05` | `SAMRS` | [SAMRS: Scaling-up Remote Sensing Segmentation Dataset with Segment Anything Model](https:\u002F\u002Farxiv.org\u002Fabs\u002F2305.02034)\n\\-\n[[Paper](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2305.02034.pdf)]\n[[Code](https:\u002F\u002Fgithub.com\u002FViTAE-Transformer\u002FSAMRS)]\n\n- `2023\u002F04` | `Edit Everything` | [Edit Everything: A Text-Guided Generative System for Images Editing](https:\u002F\u002Farxiv.org\u002Fabs\u002F2304.14006)\n\\-\n[[Paper](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2304.14006.pdf)]\n[[Code](https:\u002F\u002Fgithub.com\u002FDefengXie\u002FEdit_Everything)]\n\n- `2023\u002F04` | `Inpaint Anything` | [Inpaint Anything: Segment Anything Meets Image Inpainting](https:\u002F\u002Farxiv.org\u002Fabs\u002F2304.06790)\n\\-\n[[Paper](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2304.06790.pdf)]\n[[Code](https:\u002F\u002Fgithub.com\u002Fgeekyutao\u002FInpaint-Anything)]\n\n- `2023\u002F04` | `SAM`\n| `Kirillov et al., ICCV 2023`\n![citations](https:\u002F\u002Fimg.shields.io\u002Fbadge\u002Fdynamic\u002Fjson?url=https:\u002F\u002Fapi.semanticscholar.org\u002Fgraph\u002Fv1\u002Fpaper\u002FCorpusID:257952310?fields=citationCount&query=%24.citationCount&label=citations)\n![Star](https:\u002F\u002Fimg.shields.io\u002Fgithub\u002Fstars\u002Ffacebookresearch\u002Fsegment-anything.svg?style=social&label=Star) \u003Cbr>\nSegment Anything \u003Cbr>\n[[arXiv](https:\u002F\u002Farxiv.org\u002Fabs\u002F2304.02643)]\n[[paper](https:\u002F\u002Fopenaccess.thecvf.com\u002Fcontent\u002FICCV2023\u002Fhtml\u002FKirillov_Segment_Anything_ICCV_2023_paper.html)]\n[[code](https:\u002F\u002Fgithub.com\u002Ffacebookresearch\u002Fsegment-anything)]\n[[blog](https:\u002F\u002Fsegment-anything.com\u002F)]\n\n- `2023\u002F03` | `VideoMAE V2` | [VideoMAE V2: Scaling Video Masked Autoencoders with Dual Masking](https:\u002F\u002Farxiv.org\u002Fabs\u002F2303.16727)\n\\-\n[[Paper](https:\u002F\u002Fopenaccess.thecvf.com\u002Fcontent\u002FCVPR2023\u002Fpapers\u002FWang_VideoMAE_V2_Scaling_Video_Masked_Autoencoders_With_Dual_Masking_CVPR_2023_paper.pdf)]\n[[Code](https:\u002F\u002Fgithub.com\u002FOpenGVLab\u002FVideoMAEv2)]\n\n- `2023\u002F03` | `Grounding DINO`\n| `Liu et al.`\n![citations](https:\u002F\u002Fimg.shields.io\u002Fbadge\u002Fdynamic\u002Fjson?url=https:\u002F\u002Fapi.semanticscholar.org\u002Fgraph\u002Fv1\u002Fpaper\u002FCorpusID:257427307?fields=citationCount&query=%24.citationCount&label=citations)\n![Star](https:\u002F\u002Fimg.shields.io\u002Fgithub\u002Fstars\u002FIDEA-Research\u002FGroundingDINO.svg?style=social&label=Star) \u003Cbr>\nGrounding DINO: Marrying DINO with Grounded Pre-Training for Open-Set Object Detection \u003Cbr>\n[[arXiv](https:\u002F\u002Farxiv.org\u002Fabs\u002F2303.05499)]\n[[paper](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2303.05499.pdf)]\n[[code](https:\u002F\u002Fgithub.com\u002FIDEA-Research\u002FGroundingDINO)]\n\n- `2022\u002F03` | `VideoMAE` | [VideoMAE: Masked Autoencoders are Data-Efficient Learners for Self-Supervised Video 
Pre-Training](https:\u002F\u002Farxiv.org\u002Fabs\u002F2203.12602)\n\\-\n[[Paper](https:\u002F\u002Fproceedings.neurips.cc\u002Fpaper_files\u002Fpaper\u002F2022\u002Fhash\u002F416f9cb3276121c42eebb86352a4354a-Abstract-Conference.html)]\n[[Code](https:\u002F\u002Fgithub.com\u002FMCG-NJU\u002FVideoMAE)]\n\n- `2021\u002F12` | `Stable Diffusion`\n| `Rombach et al., CVPR 2022`\n![citations](https:\u002F\u002Fimg.shields.io\u002Fbadge\u002Fdynamic\u002Fjson?url=https:\u002F\u002Fapi.semanticscholar.org\u002Fgraph\u002Fv1\u002Fpaper\u002FCorpusID:245335280?fields=citationCount&query=%24.citationCount&label=citations)\n![Star](https:\u002F\u002Fimg.shields.io\u002Fgithub\u002Fstars\u002FCompVis\u002Flatent-diffusion.svg?style=social&label=Star) \u003Cbr>\nHigh-Resolution Image Synthesis with Latent Diffusion Models \u003Cbr>\n[[arXiv](https:\u002F\u002Farxiv.org\u002Fabs\u002F2112.10752)]\n[[paper](https:\u002F\u002Fopenaccess.thecvf.com\u002Fcontent\u002FCVPR2022\u002Fhtml\u002FRombach_High-Resolution_Image_Synthesis_With_Latent_Diffusion_Models_CVPR_2022_paper.html)]\n[[code](https:\u002F\u002Fgithub.com\u002FCompVis\u002Flatent-diffusion)]\n[[stable diffusion](https:\u002F\u002Fgithub.com\u002FStability-AI\u002Fstablediffusion)\n![Star](https:\u002F\u002Fimg.shields.io\u002Fgithub\u002Fstars\u002FStability-AI\u002Fstablediffusion.svg?style=social&label=Star)]\n\n- `2021\u002F09` | `LaMa` | [Resolution-robust Large Mask Inpainting with Fourier Convolutions](https:\u002F\u002Farxiv.org\u002Fabs\u002F2109.07161)\n\\-\n[[Paper](https:\u002F\u002Fopenaccess.thecvf.com\u002Fcontent\u002FWACV2022\u002Fpapers\u002FSuvorov_Resolution-Robust_Large_Mask_Inpainting_With_Fourier_Convolutions_WACV_2022_paper.pdf)]\n[[Code](https:\u002F\u002Fgithub.com\u002Fadvimman\u002Flama)]\n\n- `2021\u002F03` | `Swin`\n| `Liu et al., ICCV 2021`\n![citations](https:\u002F\u002Fimg.shields.io\u002Fbadge\u002Fdynamic\u002Fjson?url=https:\u002F\u002Fapi.semanticscholar.org\u002Fgraph\u002Fv1\u002Fpaper\u002FCorpusID:232352874?fields=citationCount&query=%24.citationCount&label=citations)\n![Star](https:\u002F\u002Fimg.shields.io\u002Fgithub\u002Fstars\u002Fmicrosoft\u002FSwin-Transformer.svg?style=social&label=Star) \u003Cbr>\nSwin Transformer: Hierarchical Vision Transformer using Shifted Windows \u003Cbr>\n[[arXiv](https:\u002F\u002Farxiv.org\u002Fabs\u002F2103.14030)]\n[[paper](https:\u002F\u002Fopenaccess.thecvf.com\u002Fcontent\u002FICCV2021\u002Fhtml\u002FLiu_Swin_Transformer_Hierarchical_Vision_Transformer_Using_Shifted_Windows_ICCV_2021_paper.html)]\n[[code](https:\u002F\u002Fgithub.com\u002Fmicrosoft\u002FSwin-Transformer)]\n\n- `2020\u002F10` | `ViT`\n| `Dosovitskiy et al., ICLR 2021`\n![citations](https:\u002F\u002Fimg.shields.io\u002Fbadge\u002Fdynamic\u002Fjson?url=https:\u002F\u002Fapi.semanticscholar.org\u002Fgraph\u002Fv1\u002Fpaper\u002FCorpusID:225039882?fields=citationCount&query=%24.citationCount&label=citations) \u003Cbr>\nAn Image is Worth 16x16 Words: Transformers for Image Recognition at Scale \u003Cbr>\n[[arXiv](https:\u002F\u002Farxiv.org\u002Fabs\u002F2010.11929)]\n[[paper](https:\u002F\u002Fopenreview.net\u002Fforum?id=YicbFdNTTy)]\n[[Implementation](https:\u002F\u002Fgithub.com\u002Flucidrains\u002Fvit-pytorch)]\n\n---\n\n\u003C\u002Fdetails>\n\n\u003C!--  -->\n### 2.3 Multimodal Foundation Models\n\n\u003Cdetails open>\n\u003Csummary>MFMs\u003C\u002Fsummary>\n\n[Foundation Models (Back-to-Top)](#2-foundation-models)\n\n- `2024\u002F01` | `LLaVA-1.6`\n| `Liu et al.` \u003Cbr>\nLLaVA-1.6: Improved 
reasoning, OCR, and world knowledge \u003Cbr>\n[[code](https:\u002F\u002Fgithub.com\u002Fhaotian-liu\u002FLLaVA)]\n[[blog](https:\u002F\u002Fllava-vl.github.io\u002Fblog\u002F2024-01-30-llava-1-6\u002F)]\n\n- `2024\u002F01` | `MouSi`\n| `Fan et al.`\n![Star](https:\u002F\u002Fimg.shields.io\u002Fgithub\u002Fstars\u002FFudanNLPLAB\u002FMouSi.svg?style=social&label=Star) \u003Cbr>\nMouSi: Poly-Visual-Expert Vision-Language Models \u003Cbr>\n[[arXiv](https:\u002F\u002Farxiv.org\u002Fabs\u002F2401.17221)]\n[[paper](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2401.17221.pdf)]\n[[code](https:\u002F\u002Fgithub.com\u002FFudanNLPLAB\u002FMouSi)]\n\n- `2023\u002F12` | `InternVL`\n| `Chen et al.`\n![citations](https:\u002F\u002Fimg.shields.io\u002Fbadge\u002Fdynamic\u002Fjson?url=https:\u002F\u002Fapi.semanticscholar.org\u002Fgraph\u002Fv1\u002Fpaper\u002FCorpusID:266521410?fields=citationCount&query=%24.citationCount&label=citations)\n![Star](https:\u002F\u002Fimg.shields.io\u002Fgithub\u002Fstars\u002FOpenGVLab\u002FInternVL.svg?style=social&label=Star) \u003Cbr>\nInternVL: Scaling up Vision Foundation Models and Aligning for Generic Visual-Linguistic Tasks \u003Cbr>\n[[arXiv](https:\u002F\u002Farxiv.org\u002Fabs\u002F2312.14238)]\n[[paper](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2312.14238.pdf)]\n[[code](https:\u002F\u002Fgithub.com\u002FOpenGVLab\u002FInternVL)]\n\n- `2023\u002F12` | `Gemini` | [Gemini: A Family of Highly Capable Multimodal Models](https:\u002F\u002Farxiv.org\u002Fabs\u002F2312.11805)\n\\-\n[[Paper](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2312.11805.pdf)]\n[[Project](https:\u002F\u002Fdeepmind.google\u002Ftechnologies\u002Fgemini\u002F#introduction)]\n\n- `2023\u002F10` | `LLaVA-1.5`\n| `Liu et al.`\n![citations](https:\u002F\u002Fimg.shields.io\u002Fbadge\u002Fdynamic\u002Fjson?url=https:\u002F\u002Fapi.semanticscholar.org\u002Fgraph\u002Fv1\u002Fpaper\u002FCorpusID:263672058?fields=citationCount&query=%24.citationCount&label=citations) \u003Cbr>\nImproved Baselines with Visual Instruction Tuning \u003Cbr>[[arXiv](https:\u002F\u002Farxiv.org\u002Fabs\u002F2310.03744)]\n[[paper](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2310.03744.pdf)]\n[[code](https:\u002F\u002Fgithub.com\u002Fhaotian-liu\u002FLLaVA)]\n[[project](https:\u002F\u002Fllava-vl.github.io)]\n\n- `2023\u002F09` | `GPT-4V` | GPT-4V(ision) System Card\n\\-\n[[Paper](https:\u002F\u002Fcdn.openai.com\u002Fpapers\u002FGPTV_System_Card.pdf)]\n[[Blog](https:\u002F\u002Fopenai.com\u002Fresearch\u002Fgpt-4v-system-card)]\n\n- `2023\u002F08` | `Qwen-VL` | [Qwen-VL: A Versatile Vision-Language Model for Understanding, Localization, Text Reading, and Beyond](https:\u002F\u002Farxiv.org\u002Fabs\u002F2308.12966)\n\\-\n[[Paper](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2308.12966.pdf)]\n[[Code](https:\u002F\u002Fgithub.com\u002FQwenLM\u002FQwen-VL)]\n\n- `2023\u002F05` | `InstructBLIP` | [InstructBLIP: Towards General-purpose Vision-Language Models with Instruction Tuning](https:\u002F\u002Farxiv.org\u002Fabs\u002F2305.06500)\n\\-\n[[Paper](https:\u002F\u002Fopenreview.net\u002Fpdf?id=vvoWPYqZJA)]\n[[Code](https:\u002F\u002Fgithub.com\u002Fsalesforce\u002FLAVIS\u002Ftree\u002Fmain\u002Fprojects\u002Finstructblip)]\n\n- `2023\u002F05` | `Caption Anything` | [Caption Anything: Interactive Image Description with Diverse Multimodal 
Controls](https:\u002F\u002Farxiv.org\u002Fabs\u002F2305.02677)\n\\-\n[[Paper](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2305.02677.pdf)]\n[[Code](https:\u002F\u002Fgithub.com\u002Fttengwang\u002FCaption-Anything)]\n\n- `2023\u002F05` | `SAMText` | [Scalable Mask Annotation for Video Text Spotting](https:\u002F\u002Farxiv.org\u002Fabs\u002F2305.01443)\n\\-\n[[Paper](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2305.01443.pdf)]\n[[Code](https:\u002F\u002Fgithub.com\u002FViTAE-Transformer\u002FSAMText)]\n\n- `2023\u002F04` | `Text2Seg` | [Text2Seg: Remote Sensing Image Semantic Segmentation via Text-Guided Visual Foundation Models](https:\u002F\u002Farxiv.org\u002Fabs\u002F2304.10597)\n\\-\n[[Paper](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2304.10597.pdf)]\n\n- `2023\u002F04` | `MiniGPT-4` | [MiniGPT-4: Enhancing Vision-Language Understanding with Advanced Large Language Models](https:\u002F\u002Farxiv.org\u002Fabs\u002F2304.10592)\n\\-\n\n- `2023\u002F04` | `LLaVA` | [Visual Instruction Tuning](https:\u002F\u002Farxiv.org\u002Fabs\u002F2304.08485)\n\\-\n[[Paper](https:\u002F\u002Fopenreview.net\u002Fpdf?id=w0H2xGHlkw)]\n[[Code](https:\u002F\u002Fgithub.com\u002Fhaotian-liu\u002FLLaVA)]\n[[Project](https:\u002F\u002Fllava-vl.github.io)]\n\n- `2023\u002F04` | `CLIP Surgery` | [CLIP Surgery for Better Explainability with Enhancement in Open-Vocabulary Tasks](https:\u002F\u002Farxiv.org\u002Fabs\u002F2304.05653)\n\\-\n[[Paper](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2304.05653.pdf)]\n[[Code](https:\u002F\u002Fgithub.com\u002Fxmed-lab\u002FCLIP_Surgery)]\n\n- `2023\u002F03` | `UniDiffuser` | [One Transformer Fits All Distributions in Multi-Modal Diffusion at Scale](https:\u002F\u002Farxiv.org\u002Fabs\u002F2303.06555)\n\\-\n\n- `2023\u002F01` | `GALIP` | [GALIP: Generative Adversarial CLIPs for Text-to-Image Synthesis](https:\u002F\u002Farxiv.org\u002Fabs\u002F2301.12959)\n\\-\n[[Paper](https:\u002F\u002Fopenaccess.thecvf.com\u002Fcontent\u002FCVPR2023\u002Fpapers\u002FTao_GALIP_Generative_Adversarial_CLIPs_for_Text-to-Image_Synthesis_CVPR_2023_paper.pdf)]\n[[Code](https:\u002F\u002Fgithub.com\u002Ftobran\u002FGALIP)]\n\n- `2023\u002F01` | `BLIP-2` | [BLIP-2: Bootstrapping Language-Image Pre-training with Frozen Image Encoders and Large Language Models](https:\u002F\u002Farxiv.org\u002Fabs\u002F2301.12597)\n\\-\n[[Paper](https:\u002F\u002Fproceedings.mlr.press\u002Fv202\u002Fli23q.html)]\n[[Code](https:\u002F\u002Fgithub.com\u002Fsalesforce\u002FLAVIS\u002Ftree\u002Fmain\u002Fprojects\u002Fblip2)]\n\n- `2022\u002F12` | `Img2Prompt` | [From Images to Textual Prompts: Zero-shot VQA with Frozen Large Language Models](https:\u002F\u002Farxiv.org\u002Fabs\u002F2212.10846)\n\\-\n\n- `2022\u002F05` | `CoCa` | [CoCa: Contrastive Captioners are Image-Text Foundation Models](https:\u002F\u002Farxiv.org\u002Fabs\u002F2205.01917)\n\\-\n[[Paper](https:\u002F\u002Fopenreview.net\u002Fforum?id=Ee277P3AYC)]\n\n- `2022\u002F01` | `BLIP` | [BLIP: Bootstrapping Language-Image Pre-training for Unified Vision-Language Understanding and Generation](https:\u002F\u002Farxiv.org\u002Fabs\u002F2201.12086)\n\\-\n[[Paper](https:\u002F\u002Fproceedings.mlr.press\u002Fv162\u002Fli22n.html)]\n[[Code](https:\u002F\u002Fgithub.com\u002Fsalesforce\u002FBLIP)]\n\n- `2021\u002F09` | `CoOp` | [Learning to Prompt for Vision-Language 
Models](https:\u002F\u002Farxiv.org\u002Fabs\u002F2109.01134)\n\\-\n[[Paper](https:\u002F\u002Flink.springer.com\u002Farticle\u002F10.1007\u002Fs11263-022-01653-1)]\n[[Code](https:\u002F\u002Fgithub.com\u002FKaiyangZhou\u002FCoOp)]\n\n- `2021\u002F02` | `CLIP` | [Learning Transferable Visual Models From Natural Language Supervision](https:\u002F\u002Farxiv.org\u002Fabs\u002F2103.00020)\n\\-\n[[Paper](https:\u002F\u002Fproceedings.mlr.press\u002Fv139\u002Fradford21a\u002Fradford21a.pdf)]\n[[Code](https:\u002F\u002Fgithub.com\u002Fopenai\u002FCLIP)]\n[[Blog](https:\u002F\u002Fopenai.com\u002Fresearch\u002Fclip)]\n\n---\n\n\u003C\u002Fdetails>\n\n\u003C!--  -->\n### 2.4 Reasoning Applications\n\n\u003Cdetails open>\n\u003Csummary>reasoning applications\u003C\u002Fsummary>\n\n[Foundation Models (Back-to-Top)](#2-foundation-models)\n\n- `2022\u002F06` | `Minerva` | [Solving Quantitative Reasoning Problems with Language Models](https:\u002F\u002Farxiv.org\u002Fabs\u002F2206.14858)\n\\-\n[[Paper](https:\u002F\u002Fopenreview.net\u002Fpdf?id=IFXTZERXdM7)]\n[[Blog](https:\u002F\u002Fblog.research.google\u002F2022\u002F06\u002Fminerva-solving-quantitative-reasoning.html)]\n\n- `2022\u002F06` | `BIG-bench` | [Beyond the Imitation Game: Quantifying and extrapolating the capabilities of language models](https:\u002F\u002Farxiv.org\u002Fabs\u002F2206.04615)\n\\-\n[[Paper](https:\u002F\u002Fopenreview.net\u002Fpdf?id=uyTL5Bvosj)]\n[[Code](https:\u002F\u002Fgithub.com\u002Fgoogle\u002FBIG-bench)]\n\n- `2022\u002F05` | `Zero-shot-CoT` | [Large Language Models are Zero-Shot Reasoners](https:\u002F\u002Farxiv.org\u002Fabs\u002F2205.11916)\n\\-\n[[Paper](https:\u002F\u002Fopenreview.net\u002Fpdf?id=e2TBb5y0yFf)]\n[[Code](https:\u002F\u002Fgithub.com\u002Fkojima-takeshi188\u002Fzero_shot_cot)]\n\n- `2022\u002F03` | `STaR` | [STaR: Bootstrapping Reasoning With Reasoning](https:\u002F\u002Farxiv.org\u002Fabs\u002F2203.14465)\n\\-\n[[Paper](https:\u002F\u002Fopenreview.net\u002Fpdf?id=_3ELRdg2sgI)]\n[[Code](https:\u002F\u002Fgithub.com\u002Fezelikman\u002FSTaR)]\n\n- `2021\u002F07` |  `MWP-BERT` | [MWP-BERT: Numeracy-Augmented Pre-training for Math Word Problem Solving](https:\u002F\u002Farxiv.org\u002Fabs\u002F2107.13435)\n\\-\n[[Paper](https:\u002F\u002Faclanthology.org\u002F2022.findings-naacl.74.pdf)]\n[[Code](https:\u002F\u002Fgithub.com\u002FLZhenwen\u002FMWP-BERT)]\n\n- `2017\u002F05` | `AQUA-RAT` | [Program Induction by Rationale Generation : Learning to Solve and Explain Algebraic Word Problems](https:\u002F\u002Farxiv.org\u002Fabs\u002F1705.04146)\n\\-\n[[Paper](https:\u002F\u002Faclanthology.org\u002FP17-1015.pdf)]\n[[Code](https:\u002F\u002Fgithub.com\u002Fgoogle-deepmind\u002FAQuA)]\n\n---\n\n\u003C\u002Fdetails>\n\n\u003C\u002Fdetails>\n\n## 3 Reasoning Tasks\n\n\u003Cdetails open>\n\u003Csummary>reasoning tasks\u003C\u002Fsummary>\n\n[(Back-to-Top)](#table-of-contents)\n\n### Table of Contents - 3\n\n\u003Cdetails open>\n\u003Csummary>reasoning tasks (table of contents)\u003C\u002Fsummary>\n\n- [3 Reasoning Tasks](#3-reasoning-tasks)\n  - [3.1 Commonsense Reasoning](#31-commonsense-reasoning)\n    - [3.1.1 Commonsense Question and Answering (QA)](#311-commonsense-question-and-answering-qa)\n    - [3.1.2 Physical Commonsense Reasoning](#312-physical-commonsense-reasoning)\n    - [3.1.3 Spatial Commonsense Reasoning](#313-spatial-commonsense-reasoning)\n    - [3.1.x Benchmarks, Datasets, and Metrics](#31x-benchmarks-datasets-and-metrics)\n  - [3.2 Mathematical Reasoning](#32-mathematical-reasoning)\n 
   - [3.2.1 Arithmetic Reasoning](#321-arithmetic-reasoning)\n    - [3.2.2 Geometry Reasoning](#322-geometry-reasoning)\n    - [3.2.3 Theorem Proving](#323-theorem-proving)\n    - [3.2.4 Scientific Reasoning](#324-scientific-reasoning)\n    - [3.2.x Benchmarks, Datasets, and Metrics](#32x-benchmarks-datasets-and-metrics)\n  - [3.3 Logical Reasoning](#33-logical-reasoning)\n    - [3.3.1 Propositional Logic](#331-propositional-logic)\n    - [3.3.2 Predicate Logic](#332-predicate-logic)\n    - [3.3.x Benchmarks, Datasets, and Metrics](#33x-benchmarks-datasets-and-metrics)\n  - [3.4 Causal Reasoning](#34-causal-reasoning)\n    - [3.4.1 Counterfactual Reasoning](#341-counterfactual-reasoning)\n    - [3.4.x Benchmarks, Datasets, and Metrics](#34x-benchmarks-datasets-and-metrics)\n  - [3.5 Visual Reasoning](#35-visual-reasoning)\n    - [3.5.1 3D Reasoning](#351-3d-reasoning)\n    - [3.5.x Benchmarks, Datasets, and Metrics](#35x-benchmarks-datasets-and-metrics)\n  - [3.6 Audio Reasoning](#36-audio-reasoning)\n    - [3.6.1 Speech](#361-speech)\n    - [3.6.x Benchmarks, Datasets, and Metrics](#36x-benchmarks-datasets-and-metrics)\n  - [3.7 Multimodal Reasoning](#37-multimodal-reasoning)\n    - [3.7.1 Alignment](#371-alignment)\n    - [3.7.2 Generation](#372-generation)\n    - [3.7.3 Multimodal Understanding](#373-multimodal-understanding)\n    - [3.7.x Benchmarks, Datasets, and Metrics](#37x-benchmarks-datasets-and-metrics)\n  - [3.8 Agent Reasoning](#38-agent-reasoning)\n    - [3.8.1 Introspective Reasoning](#381-introspective-reasoning)\n    - [3.8.2 Extrospective Reasoning](#382-extrospective-reasoning)\n    - [3.8.3 Multi-agent Reasoning](#383-multi-agent-reasoning)\n    - [3.8.4 Driving Reasoning](#384-driving-reasoning)\n    - [3.8.x Benchmarks, Datasets, and Metrics](#38x-benchmarks-datasets-and-metrics)\n  - [3.9 Other Tasks and Applications](#39-other-tasks-and-applications)\n    - [3.9.1 Theory of Mind (ToM)](#391-theory-of-mind-tom)\n    - [3.9.2 LLMs for Weather Prediction](#392-llms-for-weather-prediction)\n    - [3.9.3 Abstract Reasoning](#393-abstract-reasoning)\n    - [3.9.4 Defeasible Reasoning](#394-defeasible-reasoning)\n    - [3.9.5 Medical Reasoning](#395-medical-reasoning)\n    - [3.9.6 Bioinformatics Reasoning](#396-bioinformatics-reasoning)\n    - [3.9.7 Long-Chain Reasoning](#397-long-chain-reasoning)\n\n\u003C\u002Fdetails>\n\n\u003C!--  -->\n### 3.1 Commonsense Reasoning\n\n\u003Cdetails open>\n\u003Csummary>commonsense reasoning\u003C\u002Fsummary>\n\n[Reasoning Tasks (Back-to-Top)](#3-reasoning-tasks)\n\n- [3.1 Commonsense Reasoning](#31-commonsense-reasoning)\n  - [3.1.1 Commonsense Question and Answering (QA)](#311-commonsense-question-and-answering-qa)\n  - [3.1.2 Physical Commonsense Reasoning](#312-physical-commonsense-reasoning)\n  - [3.1.3 Spatial Commonsense Reasoning](#313-spatial-commonsense-reasoning)\n  - [3.1.x Benchmarks, Datasets, and Metrics](#31x-benchmarks-datasets-and-metrics)\n\n\u003Cbr>\n\n- `2023\u002F12` | [Gemini in Reasoning: Unveiling Commonsense in Multimodal Large Language Models](https:\u002F\u002Farxiv.org\u002Fabs\u002F2312.17661)\n\\-\n[[Paper](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2312.17661.pdf)]\n[[Code](https:\u002F\u002Fgithub.com\u002FEternityYW\u002FGemini-Commonsense-Evaluation\u002F)]\n\n- `2023\u002F05` | `LLM-MCTS` | [Large Language Models as Commonsense Knowledge for Large-Scale Task 
Planning](https:\u002F\u002Farxiv.org\u002Fabs\u002F2305.14078)\n\\-\n[[Paper](https:\u002F\u002Fopenreview.net\u002Fpdf?id=Wjp1AYB8lH)]\n[[Code](https:\u002F\u002Fgithub.com\u002F1989Ryan\u002Fllm-mcts)]\n[[Project](https:\u002F\u002Fllm-mcts.github.io)]\n\n- `2023\u002F05` | Bridging the Gap between Pre-Training and Fine-Tuning for Commonsense Generation\n\\-\n[[Paper](https:\u002F\u002Faclanthology.org\u002F2023.findings-eacl.28.pdf)]\n[[Code](https:\u002F\u002Fgithub.com\u002FLHRYANG\u002FCommonGen)]\n\n- `2022\u002F11` | `DANCE` | [Improving Commonsense in Vision-Language Models via Knowledge Graph Riddles](https:\u002F\u002Farxiv.org\u002Fabs\u002F2211.16504)\n\\-\n[[Paper](https:\u002F\u002Fopenaccess.thecvf.com\u002Fcontent\u002FCVPR2023\u002Fpapers\u002FYe_Improving_Commonsense_in_Vision-Language_Models_via_Knowledge_Graph_Riddles_CVPR_2023_paper.pdf)]\n[[Code](https:\u002F\u002Fgithub.com\u002Fpleaseconnectwifi\u002FDANCE)]\n[[Project](https:\u002F\u002Fshuquanye.com\u002FDANCE_website)]\n\n- `2022\u002F10` | `CoCoGen` | [Language Models of Code are Few-Shot Commonsense Learners](https:\u002F\u002Farxiv.org\u002Fabs\u002F2210.07128)\n\\-\n[[Paper](https:\u002F\u002Faclanthology.org\u002F2022.emnlp-main.90.pdf)]\n[[Code](https:\u002F\u002Fgithub.com\u002Freasoning-machines\u002FCoCoGen)]\n\n- `2021\u002F10` | [A Systematic Investigation of Commonsense Knowledge in Large Language Models](https:\u002F\u002Farxiv.org\u002Fabs\u002F2111.00607)\n\\-\n[[Paper](https:\u002F\u002Faclanthology.org\u002F2022.emnlp-main.812.pdf)]\n\n- `2021\u002F05` | [Go Beyond Plain Fine-tuning: Improving Pretrained Models for Social Commonsense](https:\u002F\u002Farxiv.org\u002Fabs\u002F2105.05913)\n\\-\n[[Paper](https:\u002F\u002Fieeexplore.ieee.org\u002Fstamp\u002Fstamp.jsp?arnumber=9383453)]\n\n#### 3.1.1 Commonsense Question and Answering (QA)\n\n- `2019\u002F06` | `CoS-E` | [Explain Yourself! 
Leveraging Language Models for Commonsense Reasoning](https:\u002F\u002Farxiv.org\u002Fabs\u002F1906.02361)\n\\-\n[[Paper](https:\u002F\u002Faclanthology.org\u002FP19-1487.pdf)]\n[[Code](https:\u002F\u002Fgithub.com\u002Fsalesforce\u002Fcos-e)]\n\n- `2018\u002F11` | `CQA` | [CommonsenseQA: A Question Answering Challenge Targeting Commonsense Knowledge](https:\u002F\u002Farxiv.org\u002Fabs\u002F1811.00937)\n\\-\n[[Paper](https:\u002F\u002Faclanthology.org\u002FN19-1421.pdf)]\n[[Code](https:\u002F\u002Fgithub.com\u002Fjonathanherzig\u002Fcommonsenseqa)]\n[[Project](https:\u002F\u002Fwww.tau-nlp.sites.tau.ac.il\u002Fcommonsenseqa)]\n\n- `2016\u002F12` | `ConceptNet` | [ConceptNet 5.5: An Open Multilingual Graph of General Knowledge](https:\u002F\u002Farxiv.org\u002Fabs\u002F1612.03975)\n\\-\n[[Paper](https:\u002F\u002Fojs.aaai.org\u002Findex.php\u002FAAAI\u002Farticle\u002Fview\u002F11164)]\n[[Project](https:\u002F\u002Fconceptnet.io)]\n\n#### 3.1.2 Physical Commonsense Reasoning\n\n- `2025\u002F05` | `PhyX` | [PhyX: Does Your Model Have the \"Wits\" for Physical Reasoning?](https:\u002F\u002Farxiv.org\u002Fabs\u002F2505.15929)\n\\-\n[[Paper](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2505.15929.pdf)]\n[[Code](https:\u002F\u002Fgithub.com\u002FNastyMarcus\u002FPhyX)]\n[[Project](https:\u002F\u002Fphyx-bench.github.io\u002F)]\n\n- `2023\u002F10` | `NEWTON` | [NEWTON: Are Large Language Models Capable of Physical Reasoning?](https:\u002F\u002Farxiv.org\u002Fabs\u002F2310.07018)\n\\-\n[[Paper](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2310.07018.pdf)]\n[[Code](https:\u002F\u002Fgithub.com\u002FNewtonReasoning\u002FNewton)]\n[[Project](https:\u002F\u002Fnewtonreasoning.github.io)]\n\n- `2022\u002F03` | `PACS` | [PACS: A Dataset for Physical Audiovisual CommonSense Reasoning](https:\u002F\u002Farxiv.org\u002Fabs\u002F2203.11130)\n\\-\n[[Paper](https:\u002F\u002Fwww.ecva.net\u002Fpapers\u002Feccv_2022\u002Fpapers_ECCV\u002Fpapers\u002F136970286.pdf)]\n[[Code](https:\u002F\u002Fgithub.com\u002Fsamuelyu2002\u002FPACS)]\n\n- `2021\u002F10` | `VRDP` | [Dynamic Visual Reasoning by Learning Differentiable Physics Models from Video and Language](https:\u002F\u002Farxiv.org\u002Fabs\u002F2110.15358)\n\\-\n[[Paper](https:\u002F\u002Fopenreview.net\u002Fpdf?id=lk1ORT35tbi)]\n[[Code](https:\u002F\u002Fgithub.com\u002Fdingmyu\u002FVRDP)]\n\n- `2020\u002F05` | `ESPRIT` | [ESPRIT: Explaining Solutions to Physical Reasoning Tasks](https:\u002F\u002Farxiv.org\u002Fabs\u002F2005.00730)\n\\-\n[[Paper](https:\u002F\u002Faclanthology.org\u002F2020.acl-main.706.pdf)]\n[[Code](https:\u002F\u002Fgithub.com\u002Fsalesforce\u002Fesprit)]\n\n- `2019\u002F11` | `PIQA` | [PIQA: Reasoning about Physical Commonsense in Natural Language](https:\u002F\u002Farxiv.org\u002Fabs\u002F1911.11641)\n\\-\n[[Paper](https:\u002F\u002Fojs.aaai.org\u002Findex.php\u002FAAAI\u002Farticle\u002Fview\u002F6239)]\n[[Project](https:\u002F\u002Fleaderboard.allenai.org\u002Fphysicaliqa\u002Fsubmissions\u002Fpublic)]\n\n#### 3.1.3 Spatial Commonsense Reasoning\n\n- `2024\u002F01` | `SpatialVLM`\n| `Chen et al.`\n![citations](https:\u002F\u002Fimg.shields.io\u002Fbadge\u002Fdynamic\u002Fjson?url=https:\u002F\u002Fapi.semanticscholar.org\u002Fgraph\u002Fv1\u002Fpaper\u002FCorpusID:267069344?fields=citationCount&query=%24.citationCount&label=citations) \u003Cbr>\nSpatialVLM: Endowing Vision-Language Models with Spatial Reasoning Capabilities 
\u003Cbr>\n[[arXiv](https:\u002F\u002Farxiv.org\u002Fabs\u002F2401.12168)]\n[[paper](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2401.12168.pdf)]\n[[project](https:\u002F\u002Fspatial-vlm.github.io\u002F)]\n\n- `2022\u002F03` | [Things not Written in Text: Exploring Spatial Commonsense from Visual Signals](https:\u002F\u002Farxiv.org\u002Fabs\u002F2203.08075)\n\\-\n[[Paper](https:\u002F\u002Faclanthology.org\u002F2022.acl-long.168.pdf)]\n[[Code](https:\u002F\u002Fgithub.com\u002Fxxxiaol\u002Fspatial-commonsense)]\n\n- `2021\u002F06` | `PROST` | [PROST: Physical Reasoning of Objects through Space and Time](https:\u002F\u002Farxiv.org\u002Fabs\u002F2106.03634)\n\\-\n[[Paper](https:\u002F\u002Faclanthology.org\u002F2021.findings-acl.404.pdf)]\n[[Code](https:\u002F\u002Fgithub.com\u002Fnala-cub\u002Fprost)]\n\n- `2019\u002F02` | `GQA` | [GQA: A New Dataset for Real-World Visual Reasoning and Compositional Question Answering](https:\u002F\u002Farxiv.org\u002Fabs\u002F1902.09506)\n\\-\n[[Paper](https:\u002F\u002Fopenaccess.thecvf.com\u002Fcontent_CVPR_2019\u002Fpapers\u002FHudson_GQA_A_New_Dataset_for_Real-World_Visual_Reasoning_and_Compositional_CVPR_2019_paper.pdf)]\n[[Project](https:\u002F\u002Fcs.stanford.edu\u002Fpeople\u002Fdorarad\u002Fgqa\u002Findex.html)]\n\n#### 3.1.x Benchmarks, Datasets, and Metrics\n\n- `2023\u002F06` | `CConS` | [Probing Physical Reasoning with Counter-Commonsense Context](https:\u002F\u002Farxiv.org\u002Fabs\u002F2306.02258)\n\\-\n\n- `2023\u002F05` | `SummEdits` | [LLMs as Factual Reasoners: Insights from Existing Benchmarks and Beyond](https:\u002F\u002Farxiv.org\u002Fabs\u002F2305.14540)\n\\-\n[[Paper](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2305.14540.pdf)]\n[[Code](https:\u002F\u002Fgithub.com\u002Fsalesforce\u002FfactualNLG)]\n\n- `2021\u002F03` | `RAINBOW` | [UNICORN on RAINBOW: A Universal Commonsense Reasoning Model on a New Multitask Benchmark](https:\u002F\u002Farxiv.org\u002Fabs\u002F2103.13009)\n\\-\n\n- `2020\u002F11` | `ProtoQA` | ProtoQA: A Question Answering Dataset for Prototypical Common-Sense Reasoning\n\\-\n[[Paper](https:\u002F\u002Faclanthology.org\u002F2020.emnlp-main.85.pdf)]\n\n- `2020\u002F10` | `DrFact` | [Differentiable Open-Ended Commonsense Reasoning](https:\u002F\u002Farxiv.org\u002Fabs\u002F2010.14439)\n\n- `2019\u002F11` | `CommonGen` | [CommonGen: A Constrained Text Generation Challenge for Generative Commonsense Reasoning](https:\u002F\u002Farxiv.org\u002Fabs\u002F1911.03705)\n\n- `2019\u002F08` | `Cosmos QA` | [Cosmos QA: Machine Reading Comprehension with Contextual Commonsense Reasoning](https:\u002F\u002Farxiv.org\u002Fabs\u002F1909.00277)\n\n- `2019\u002F08` | `αNLI` | [Abductive Commonsense Reasoning](https:\u002F\u002Farxiv.org\u002Fabs\u002F1908.05739)\n\\-\n\n- `2019\u002F08` | `PHYRE` | [PHYRE: A New Benchmark for Physical Reasoning](https:\u002F\u002Farxiv.org\u002Fabs\u002F1908.05656)\n\\-\n\n- `2019\u002F07` | `WinoGrande` | [WinoGrande: An Adversarial Winograd Schema Challenge at Scale](https:\u002F\u002Farxiv.org\u002Fabs\u002F1907.10641)\n\\-\n\n- `2019\u002F05` | `MathQA` | [MathQA: Towards Interpretable Math Word Problem Solving with Operation-Based Formalisms](https:\u002F\u002Farxiv.org\u002Fabs\u002F1905.13319)\n\\-\n\n- `2019\u002F05` | `HellaSwag` | [HellaSwag: Can a Machine Really Finish Your Sentence?](https:\u002F\u002Farxiv.org\u002Fabs\u002F1905.07830)\n\\-\n\n- `2019\u002F04` | `Social IQa` | [SocialIQA: Commonsense Reasoning about Social 
Interactions](https:\u002F\u002Farxiv.org\u002Fabs\u002F1904.09728)\n\\-\n[[Paper](https:\u002F\u002Faclanthology.org\u002FD19-1454.pdf)]\n\n- `2018\u002F08` | `SWAG` | [SWAG: A Large-Scale Adversarial Dataset for Grounded Commonsense Inference](https:\u002F\u002Farxiv.org\u002Fabs\u002F1808.05326)\n\\-\n\n- `2002\u002F07` | `BLEU` | BLEU: a Method for Automatic Evaluation of Machine Translation\n\\-\n[[Paper](https:\u002F\u002Faclanthology.org\u002FP02-1040.pdf)]\n\n---\n\n\u003C\u002Fdetails>\n\n\u003C!--  -->\n### 3.2 Mathematical Reasoning\n\n\u003Cdetails open>\n\u003Csummary>mathematical reasoning\u003C\u002Fsummary>\n\n[Reasoning Tasks (Back-to-Top)](#3-reasoning-tasks)\n\n- [3.2 Mathematical Reasoning](#32-mathematical-reasoning)\n  - [3.2.1 Arithmetic Reasoning](#321-arithmetic-reasoning)\n  - [3.2.2 Geometry Reasoning](#322-geometry-reasoning)\n  - [3.2.3 Theorem Proving](#323-theorem-proving)\n  - [3.2.4 Scientific Reasoning](#324-scientific-reasoning)\n  - [3.2.x Benchmarks, Datasets, and Metrics](#32x-benchmarks-datasets-and-metrics)\n\n\u003Cbr>\n\n\n- `2023\u002F10` | `MathVista` | [MathVista: Evaluating Math Reasoning in Visual Contexts with GPT-4V, Bard, and Other Large Multimodal Models](https:\u002F\u002Farxiv.org\u002Fabs\u002F2310.02255)\n\\-\n[[Paper](https:\u002F\u002Fopenreview.net\u002Fforum?id=KUNzEQMWU7)]\n[[Code](https:\u002F\u002Fgithub.com\u002Flupantech\u002FMathVista)]\n[[Project](https:\u002F\u002Fmathvista.github.io\u002F)]\n| `Lu et al., ICLR 2024`\n\n\n\n- `2022\u002F11` | Tokenization in the Theory of Knowledge\n\\-\n[[Paper](https:\u002F\u002Fwww.mdpi.com\u002F2673-8392\u002F3\u002F1\u002F24)]\n\n- `2022\u002F06` | `MultiHiertt` | [MultiHiertt: Numerical Reasoning over Multi Hierarchical Tabular and Textual Data](https:\u002F\u002Farxiv.org\u002Fabs\u002F2206.01347)\n\n- `2021\u002F04` | `MultiModalQA` | [MultiModalQA: Complex Question Answering over Text, Tables and Images](https:\u002F\u002Farxiv.org\u002Fabs\u002F2104.06039)\n\n- `2017\u002F05` | [Program Induction by Rationale Generation : Learning to Solve and Explain Algebraic Word Problems](https:\u002F\u002Farxiv.org\u002Fabs\u002F1705.04146)\n\n- `2014\u002F04` | [Deep Learning in Neural Networks: An Overview](https:\u002F\u002Farxiv.org\u002Fabs\u002F1404.7828)\n\\-\n[[Paper](https:\u002F\u002Fwww.sciencedirect.com\u002Fscience\u002Farticle\u002Fpii\u002FS0893608014002135)]\n\n- `2004` | Wittgenstein on philosophy of logic and mathematics\n\\-\n[[Paper](https:\u002F\u002Fwww.pdcnet.org\u002Fgfpj\u002Fcontent\u002Fgfpj_2004_0025_0002_0227_0288)]\n\n- `1989` | `CLP` | Connectionist Learning Procedures\n\\-\n[[Paper](https:\u002F\u002Fwww.sciencedirect.com\u002Fscience\u002Farticle\u002Fpii\u002F0004370289900490)]\n\n#### 3.2.1 Arithmetic Reasoning\n\n[Mathematical Reasoning (Back-to-Top)](#32-mathematical-reasoning)\n\n- `2022\u002F09` | `PromptPG` | [Dynamic Prompt Learning via Policy Gradient for Semi-structured Mathematical Reasoning](https:\u002F\u002Farxiv.org\u002Fabs\u002F2209.14610)\n\n- `2022\u002F01` | [Chain-of-Thought Prompting Elicits Reasoning in Large Language Models](https:\u002F\u002Farxiv.org\u002Fabs\u002F2201.11903)\n\\-\n\n- `2021\u002F03` | `SVAMP` | [Are NLP Models really able to Solve Simple Math Word Problems?](https:\u002F\u002Farxiv.org\u002Fabs\u002F2103.07191)\n\\-\n[[Paper](https:\u002F\u002Faclanthology.org\u002F2021.naacl-main.168.pdf)]\n[[Code](https:\u002F\u002Fgithub.com\u002Farkilpatel\u002FSVAMP)]\n\n- `2021\u002F03` | `MATH` | [Measuring Mathematical 
Problem Solving With the MATH Dataset](https:\u002F\u002Farxiv.org\u002Fabs\u002F2103.03874)\n\\-\n\n- `2016\u002F08` | [How well do Computers Solve Math Word Problems? Large-Scale Dataset Construction and Evaluation](https:\u002F\u002Faclanthology.org\u002FP16-1084\u002F)\n\\-\n[[Paper](https:\u002F\u002Faclanthology.org\u002FP16-1084.pdf)]\n\n- `2015\u002F09` | [Learn to Solve Algebra Word Problems Using Quadratic Programming](https:\u002F\u002Faclanthology.org\u002FD15-1096\u002F)\n\\-\n[[Paper](https:\u002F\u002Faclanthology.org\u002FD15-1096.pdf)]\n\n- `2014\u002F06` | `Alg514` | [Learning to Automatically Solve Algebra Word Problems](https:\u002F\u002Faclanthology.org\u002FP14-1026\u002F)\n\\-\n[[Paper](https:\u002F\u002Faclanthology.org\u002FP14-1026.pdf)]\n\n#### 3.2.2 Geometry Reasoning\n\n[Mathematical Reasoning (Back-to-Top)](#32-mathematical-reasoning)\n\n- `2024\u002F01` | `AlphaGeometry` | Solving olympiad geometry without human demonstrations\n\\-\n[[Paper](https:\u002F\u002Fwww.nature.com\u002Farticles\u002Fs41586-023-06747-5)]\n[[Code](https:\u002F\u002Fgithub.com\u002Fgoogle-deepmind\u002Falphageometry)]\n[[Blog](https:\u002F\u002Fdeepmind.google\u002Fdiscover\u002Fblog\u002Falphageometry-an-olympiad-level-ai-system-for-geometry\u002F)]\n| `Trinh et al., Nature`\n\n- `2022\u002F12` | `UniGeo` \u002F `Geoformer` | [UniGeo: Unifying Geometry Logical Reasoning via Reformulating Mathematical Expression](https:\u002F\u002Farxiv.org\u002Fabs\u002F2212.02746)\n\n- `2021\u002F05` | `GeoQA` \u002F `NGS` | [GeoQA: A Geometric Question Answering Benchmark Towards Multimodal Numerical Reasoning](https:\u002F\u002Farxiv.org\u002Fabs\u002F2105.14517)\n\n- `2021\u002F05` | `Geometry3K` \u002F `Inter-GPS` | [Inter-GPS: Interpretable Geometry Problem Solving with Formal Language and Symbolic Reasoning](https:\u002F\u002Farxiv.org\u002Fabs\u002F2105.04165)\n\n- `2015\u002F09` | `GeoS` | [Solving Geometry Problems: Combining Text and Diagram Interpretation](https:\u002F\u002Faclanthology.org\u002FD15-1171\u002F)\n\\-\n[[Paper](https:\u002F\u002Faclanthology.org\u002FD15-1171.pdf)]\n\n#### 3.2.3 Theorem Proving\n\n[Mathematical Reasoning (Back-to-Top)](#32-mathematical-reasoning)\n\n- `2023\u002F10` | `LEGO-Prover` | [LEGO-Prover: Neural Theorem Proving with Growing Libraries](https:\u002F\u002Farxiv.org\u002Fabs\u002F2310.00656)\n\\-\n\n- `2023\u002F09` | `Lyra` | [Lyra: Orchestrating Dual Correction in Automated Theorem Proving](https:\u002F\u002Farxiv.org\u002Fabs\u002F2309.15806)\n\n- `2023\u002F06` | `DT-Solver` | [DT-Solver: Automated Theorem Proving with Dynamic-Tree Sampling Guided by Proof-level Value Function](https:\u002F\u002Faclanthology.org\u002F2023.acl-long.706\u002F)\n\\-\n[[Paper](https:\u002F\u002Faclanthology.org\u002F2023.acl-long.706.pdf)]\n\n- `2023\u002F05` | [Decomposing the Enigma: Subgoal-based Demonstration Learning for Formal Theorem Proving](https:\u002F\u002Farxiv.org\u002Fabs\u002F2305.16366)\n\n- `2023\u002F03` | `Magnushammer` | [Magnushammer: A Transformer-based Approach to Premise Selection](https:\u002F\u002Farxiv.org\u002Fabs\u002F2303.04488)\n\n- `2022\u002F10` | `DSP` | [Draft, Sketch, and Prove: Guiding Formal Theorem Provers with Informal Proofs](https:\u002F\u002Farxiv.org\u002Fabs\u002F2210.12283)\n\\-\n\n- `2022\u002F05` | [Learning to Find Proofs and Theorems by Learning to Refine Search Strategies: The Case of Loop Invariant Synthesis](https:\u002F\u002Farxiv.org\u002Fabs\u002F2205.14229)\n\n- `2022\u002F05` | [Autoformalization with Large Language 
Models](https:\u002F\u002Farxiv.org\u002Fabs\u002F2205.12615)\n\\-\n[[Paper](https:\u002F\u002Fproceedings.neurips.cc\u002Fpaper_files\u002Fpaper\u002F2022\u002Ffile\u002Fd0c6bc641a56bebee9d985b937307367-Paper-Conference.pdf)]\n\n- `2022\u002F05` | `HTPS` | [HyperTree Proof Search for Neural Theorem Proving](https:\u002F\u002Farxiv.org\u002Fabs\u002F2205.11491)\n\n- `2022\u002F05` | `Thor` | [Thor: Wielding Hammers to Integrate Language Models and Automated Theorem Provers](https:\u002F\u002Farxiv.org\u002Fabs\u002F2205.10893)\n\\-\n\n- `2022\u002F02` | [Formal Mathematics Statement Curriculum Learning](https:\u002F\u002Farxiv.org\u002Fabs\u002F2202.01344)\n\\-\n\n- `2021\u002F07` | `Lean 4` | [The Lean 4 Theorem Prover and Programming Language](https:\u002F\u002Flink.springer.com\u002Fchapter\u002F10.1007\u002F978-3-030-79876-5_37)\n\\-\n\n- `2021\u002F02` | `TacticZero` | [TacticZero: Learning to Prove Theorems from Scratch with Deep Reinforcement Learning](https:\u002F\u002Farxiv.org\u002Fabs\u002F2102.09756)\n\\-\n\n- `2021\u002F02` | `PACT` | [Proof Artifact Co-training for Theorem Proving with Language Models](https:\u002F\u002Farxiv.org\u002Fabs\u002F2102.06203)\n\\-\n\n- `2020\u002F09` | `GPT-f` |[Generative Language Modeling for Automated Theorem Proving](https:\u002F\u002Farxiv.org\u002Fabs\u002F2009.03393)\n\\-\n\n- `2019\u002F07` | [Formal Verification of Hardware Components in Critical Systems](https:\u002F\u002Fwww.hindawi.com\u002Fjournals\u002Fwcmc\u002F2020\u002F7346763\u002F)\n\\-\n[[Paper](https:\u002F\u002Fdownloads.hindawi.com\u002Fjournals\u002Fwcmc\u002F2020\u002F7346763.pdf?_gl=1*1yjtq1u*_ga*MjA3MTczMzQzOC4xNjk5NjE3NDI1*_ga_NF5QFMJT5V*MTY5OTYxNzQyNC4xLjEuMTY5OTYxNzQ2Ni4xOC4wLjA.&_ga=2.180805351.1310949615.1699617425-2071733438.1699617425)]\n\n- `2019\u002F06` | `Metamath` | A Computer Language for Mathematical Proofs\n\\-\n[[Paper](http:\u002F\u002Fde.metamath.org\u002Fdownloads\u002Fmetamath.pdf)]\n\n- `2019\u002F05` | `CoqGym` | [Learning to Prove Theorems via Interacting with Proof Assistants](https:\u002F\u002Farxiv.org\u002Fabs\u002F1905.09381)\n\n- `2018\u002F12` | `AlphaZero` | [A general reinforcement learning algorithm that masters chess, shogi, and Go through self-play](https:\u002F\u002Fwww.science.org\u002Fdoi\u002F10.1126\u002Fscience.aar6404)\n\\-\n[[Paper](https:\u002F\u002Fwww.science.org\u002Fdoi\u002Fpdf\u002F10.1126\u002Fscience.aar6404)]\n\n- `2018\u002F04` | `TacticToe` | [TacticToe: Learning to Prove with Tactics](https:\u002F\u002Farxiv.org\u002Fabs\u002F1804.00596)\n\n- `2015\u002F08` | `Lean` | The Lean Theorem Prover (system description)\n\\-\n[[Paper](https:\u002F\u002Flean-lang.org\u002Fpapers\u002Fsystem.pdf)]\n\n- `2010\u002F07` | Three Years of Experience with Sledgehammer, a Practical Link between Automatic and Interactive Theorem Provers\n\\-\n[[Paper](https:\u002F\u002Fwww.cl.cam.ac.uk\u002F~lp15\u002Fpapers\u002FAutomation\u002Fpaar.pdf)]\n\n- `2010\u002F04` | Formal Methods at Intel - An Overview\n\\-\n[[Slides](https:\u002F\u002Fshemesh.larc.nasa.gov\u002FNFM2010\u002Ftalks\u002Fharrison.pdf)]\n\n- `2005\u002F07` | Combining Simulation and Formal Verification for Integrated Circuit Design Validation\n\\-\n[[Paper](https:\u002F\u002Fs2.smu.edu\u002F~mitch\u002Fftp_dir\u002Fpubs\u002Fwmsci05.pdf)]\n\n- `2003` | Extracting a Formally Verified, Fully Executable\nCompiler from a Proof 
Assistant\n\\-\n[[Paper](https:\u002F\u002Fwww.sciencedirect.com\u002Fscience\u002Farticle\u002Fpii\u002FS1571066105825988\u002Fpdf?md5=10b884badea7fe0e46c38b9419fbcca6&pid=1-s2.0-S1571066105825988-main.pdf&_valck=1)]\n\n- `1996` | `Coq` | The Coq Proof Assistant-Reference Manual\n\\-\n[[Project](https:\u002F\u002Fcoq.inria.fr\u002Fdocumentation)]\n\n- `1994` | `Isabelle` | Isabelle: A Generic Theorem Prover\n\\-\n[[Paper](https:\u002F\u002Flink.springer.com\u002Fcontent\u002Fpdf\u002F10.1007\u002Fbfb0030558.pdf)]\n\n#### 3.2.4 Scientific Reasoning\n\n[Mathematical Reasoning (Back-to-Top)](#32-mathematical-reasoning)\n\n- `2023\u002F07` | `SciBench` | [SciBench: Evaluating College-Level Scientific Problem-Solving Abilities of Large Language Models](https:\u002F\u002Farxiv.org\u002Fabs\u002F2307.10635)\n\\-\n\n- `2022\u002F09` | `ScienceQA` | [Learn to Explain: Multimodal Reasoning via Thought Chains for Science Question Answering](https:\u002F\u002Farxiv.org\u002Fabs\u002F2209.09513)\n\n- `2022\u002F03` | `ScienceWorld` | [ScienceWorld: Is your Agent Smarter than a 5th Grader?](https:\u002F\u002Farxiv.org\u002Fabs\u002F2203.07540)\n\n- `2012` | Current Topics in Children's Learning and Cognition\n\\-\n[[Book](https:\u002F\u002Fwww.intechopen.com\u002Fbooks\u002F654)]\n\n#### 3.2.x Benchmarks, Datasets, and Metrics\n\n[Mathematical Reasoning (Back-to-Top)](#32-mathematical-reasoning)\n\n- `2024\u002F01` | `MathBench`\n![Star](https:\u002F\u002Fimg.shields.io\u002Fgithub\u002Fstars\u002Fopen-compass\u002FMathBench.svg?style=social&label=Star) \u003Cbr>\nMathBench: A Comprehensive Multi-Level Difficulty Mathematics Evaluation Dataset \u003Cbr>\n[[code](https:\u002F\u002Fgithub.com\u002Fopen-compass\u002FMathBench)]\n\n- `2023\u002F08` | `Math23K-F` \u002F `MAWPS-F` \u002F `FOMAS` | [Guiding Mathematical Reasoning via Mastering Commonsense Formula Knowledge](https:\u002F\u002Fdl.acm.org\u002Fdoi\u002Fabs\u002F10.1145\u002F3580305.3599375)\n\\-\n[[Paper](https:\u002F\u002Fdl.acm.org\u002Fdoi\u002Fpdf\u002F10.1145\u002F3580305.3599375)]\n\n- `2023\u002F07` | `ARB` | [ARB: Advanced Reasoning Benchmark for Large Language Models](https:\u002F\u002Farxiv.org\u002Fabs\u002F2307.13692)\n\\-\n\n- `2023\u002F05` | `SwiftSage` | [SwiftSage: A Generative Agent with Fast and Slow Thinking for Complex Interactive Tasks](https:\u002F\u002Farxiv.org\u002Fabs\u002F2305.17390)\n\\-\n\n- `2023\u002F05` | `TheoremQA` | [TheoremQA: A Theorem-driven Question Answering dataset](https:\u002F\u002Farxiv.org\u002Fabs\u002F2305.12524)\n\\-\n\n- `2022\u002F10` | `MGSM` | [Language Models are Multilingual Chain-of-Thought Reasoners](https:\u002F\u002Farxiv.org\u002Fabs\u002F2210.03057)\n\\-\n[[Paper](https:\u002F\u002Fopenreview.net\u002Fpdf?id=fR3wGCk-IXp)]\n[[Code](https:\u002F\u002Fgithub.com\u002Fgoogle-research\u002Furl-nlp)]\n\n- `2021\u002F10` | `GSM8K` | [Training Verifiers to Solve Math Word Problems](https:\u002F\u002Farxiv.org\u002Fabs\u002F2110.14168)\n\\-\n[[Paper](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2110.14168.pdf)]\n[[Code](https:\u002F\u002Fgithub.com\u002Fopenai\u002Fgrade-school-math)]\n[[Blog](https:\u002F\u002Fopenai.com\u002Fresearch\u002Fsolving-math-word-problems)]\n\n- `2021\u002F10` | `IconQA` | [IconQA: A New Benchmark for Abstract Diagram Understanding and Visual Language Reasoning](https:\u002F\u002Farxiv.org\u002Fabs\u002F2110.13214)\n\\-\n\n- `2021\u002F09` | `FinQA` | [FinQA: A Dataset of Numerical Reasoning over Financial 
Data](https:\u002F\u002Farxiv.org\u002Fabs\u002F2109.00122)\n\\-\n\n- `2021\u002F08` | `MBPP` \u002F `MathQA-Python` | [Program Synthesis with Large Language Models](https:\u002F\u002Farxiv.org\u002Fabs\u002F2108.07732)\n\n- `2021\u002F08` | `HiTab` \u002F `EA` | [HiTab: A Hierarchical Table Dataset for Question Answering and Natural Language Generation](https:\u002F\u002Farxiv.org\u002Fabs\u002F2108.06712)\n\n- `2021\u002F07` | `HumanEval` \u002F `Codex` | [Evaluating Large Language Models Trained on Code](https:\u002F\u002Farxiv.org\u002Fabs\u002F2107.03374)\n\\-\n\n- `2021\u002F06` | `ASDiv` \u002F `CLD` | [A Diverse Corpus for Evaluating and Developing English Math Word Problem Solvers](https:\u002F\u002Farxiv.org\u002Fabs\u002F2106.15772)\n\\-\n\n- `2021\u002F06` | `AIT-QA` | [AIT-QA: Question Answering Dataset over Complex Tables in the Airline Industry](https:\u002F\u002Farxiv.org\u002Fabs\u002F2106.12944)\n\\-\n\n- `2021\u002F05` | `APPS` | [Measuring Coding Challenge Competence With APPS](https:\u002F\u002Farxiv.org\u002Fabs\u002F2105.09938)\n\\-\n\n- `2021\u002F05` | `TAT-QA` | [TAT-QA: A Question Answering Benchmark on a Hybrid of Tabular and Textual Content in Finance](https:\u002F\u002Farxiv.org\u002Fabs\u002F2105.07624)\n\n- `2021\u002F03` | `SVAMP` | [Are NLP Models really able to Solve Simple Math Word Problems?](https:\u002F\u002Farxiv.org\u002Fabs\u002F2103.07191)\n\\-\n\n- `2021\u002F01` | `TSQA` \u002F `MAP` \u002F `MRR` | [TSQA: Tabular Scenario Based Question Answering](https:\u002F\u002Farxiv.org\u002Fabs\u002F2101.11429)\n\n- `2020\u002F10` | `HMWP` | [Semantically-Aligned Universal Tree-Structured Solver for Math Word Problems](https:\u002F\u002Farxiv.org\u002Fabs\u002F2010.06823)\n\\-\n\n- `2020\u002F04` | `HybridQA` | [HybridQA: A Dataset of Multi-Hop Question Answering over Tabular and Textual Data](https:\u002F\u002Farxiv.org\u002Fabs\u002F2004.07347)\n\n- `2019\u002F03` | `DROP` | [DROP: A Reading Comprehension Benchmark Requiring Discrete Reasoning Over Paragraphs](https:\u002F\u002Farxiv.org\u002Fabs\u002F1903.00161)\n\\-\n\n- `2019` | `NaturalQuestions` | [Natural Questions: A Benchmark for Question Answering Research](https:\u002F\u002Faclanthology.org\u002FQ19-1026\u002F)\n\\-\n[[Paper](https:\u002F\u002Faclanthology.org\u002FQ19-1026.pdf)]\n\n- `2018\u002F09` | `HotpotQA` | [HotpotQA: A Dataset for Diverse, Explainable Multi-hop Question Answering](https:\u002F\u002Farxiv.org\u002Fabs\u002F1809.09600)\n\\-\n\n- `2018\u002F09` | `Spider` | [Spider: A Large-Scale Human-Labeled Dataset for Complex and Cross-Domain Semantic Parsing and Text-to-SQL Task](https:\u002F\u002Farxiv.org\u002Fabs\u002F1809.08887)\n\\-\n\n- `2018\u002F03` | `ComplexWebQuestions` | [The Web as a Knowledge-base for Answering Complex Questions](https:\u002F\u002Farxiv.org\u002Fabs\u002F1803.06643)\n\\-\n\n- `2017\u002F12` | `MetaQA` | [Variational Reasoning for Question Answering with Knowledge Graph](https:\u002F\u002Farxiv.org\u002Fabs\u002F1709.04071)\n\\-\n\n- `2017\u002F09` | `GEOS++` | [From Textbooks to Knowledge: A Case Study in Harvesting Axiomatic Knowledge from Textbooks to Solve Geometry Problems](https:\u002F\u002Faclanthology.org\u002FD17-1081\u002F)\n\\-\n[[Paper](https:\u002F\u002Faclanthology.org\u002FD17-1081.pdf)]\n\n- `2017\u002F09` | `Math23k` | [Deep Neural Solver for Math Word Problems](https:\u002F\u002Faclanthology.org\u002FD17-1088\u002F)\n\\-\n[[Paper](https:\u002F\u002Faclanthology.org\u002FD17-1088.pdf)]\n\n- `2017\u002F08` | `WikiSQL` \u002F `Seq2SQL` | 
[Seq2SQL: Generating Structured Queries from Natural Language using Reinforcement Learning](https:\u002F\u002Farxiv.org\u002Fabs\u002F1709.00103)\n\\-\n\n- `2017\u002F08` | [Learning to Solve Geometry Problems from Natural Language Demonstrations in Textbooks](https:\u002F\u002Faclanthology.org\u002FS17-1029\u002F)\n\\-\n[[Paper](https:\u002F\u002Faclanthology.org\u002FS17-1029.pdf)]\n\n- `2017\u002F05` | `TriviaQA` | [TriviaQA: A Large Scale Distantly Supervised Challenge Dataset for Reading Comprehension](https:\u002F\u002Farxiv.org\u002Fabs\u002F1705.03551)\n\\-\n\n- `2017\u002F05` | `GeoShader` | Synthesis of Solutions for Shaded Area Geometry Problems\n\\-\n[[Paper](https:\u002F\u002Fcdn.aaai.org\u002Focs\u002F15416\u002F15416-68619-1-PB.pdf)]\n\n- `2016\u002F09` | `DRAW-1K` | [Annotating Derivations: A New Evaluation Strategy and Dataset for Algebra Word Problems](https:\u002F\u002Farxiv.org\u002Fabs\u002F1609.07197)\n\\-\n\n- `2016\u002F08` | `WebQSP` | [The Value of Semantic Parse Labeling for Knowledge Base Question Answering](https:\u002F\u002Faclanthology.org\u002FP16-2033\u002F)\n\\-\n[[Paper](https:\u002F\u002Faclanthology.org\u002FP16-2033.pdf)]\n\n- `2016\u002F06` | `SQuAD` | [SQuAD: 100,000+ Questions for Machine Comprehension of Text](https:\u002F\u002Farxiv.org\u002Fabs\u002F1606.05250)\n\\-\n\n- `2016\u002F06` | `WikiMovies` | [Key-Value Memory Networks for Directly Reading Documents](https:\u002F\u002Farxiv.org\u002Fabs\u002F1606.03126)\n\\-\n\n- `2016\u002F06` | `MAWPS` | [MAWPS: A Math Word Problem Repository](https:\u002F\u002Faclanthology.org\u002FN16-1136\u002F)\n\\-\n[[Paper](https:\u002F\u002Faclanthology.org\u002FN16-1136.pdf)]\n\n- `2015\u002F09` | `Dolphin1878` | [Automatically Solving Number Word Problems by Semantic Parsing and Reasoning](https:\u002F\u002Faclanthology.org\u002FD15-1135\u002F)\n\\-\n[[Paper](https:\u002F\u002Faclanthology.org\u002FD15-1135.pdf)]\n\n- `2015\u002F08` | `WikiTableQA` | [Compositional Semantic Parsing on Semi-Structured Tables](https:\u002F\u002Farxiv.org\u002Fabs\u002F1508.00305)\n\\-\n\n- `2015` | `SingleEQ` | [Parsing Algebraic Word Problems into Equations](https:\u002F\u002Faclanthology.org\u002FQ15-1042\u002F)\n\\-\n[[Paper](https:\u002F\u002Faclanthology.org\u002FQ15-1042.pdf)]\n\n- `2015` | `DRAW` | DRAW: A Challenging and Diverse Algebra Word Problem Set\n\\-\n[[Paper](https:\u002F\u002Fwww.microsoft.com\u002Fen-us\u002Fresearch\u002Fwp-content\u002Fuploads\u002F2016\u002F02\u002Ftech_rep.pdf)]\n\n- `2014\u002F10` | `Verb395` | [Learning to Solve Arithmetic Word Problems with Verb Categorization](https:\u002F\u002Faclanthology.org\u002FD14-1058\u002F)\n\\-\n[[Paper](https:\u002F\u002Faclanthology.org\u002FD14-1058.pdf)]\n\n- `2013\u002F10` | `WebQuestions` | [Semantic Parsing on Freebase from Question-Answer Pairs](https:\u002F\u002Faclanthology.org\u002FD13-1160\u002F)\n\\-\n[[Paper](https:\u002F\u002Faclanthology.org\u002FD13-1160.pdf)]\n\n- `2013\u002F08` | `Free917` | [Large-scale Semantic Parsing via Schema Matching and Lexicon Extension](https:\u002F\u002Faclanthology.org\u002FP13-1042\u002F)\n\\-\n[[Paper](https:\u002F\u002Faclanthology.org\u002FP13-1042.pdf)]\n\n- `2002\u002F04` | `NMI` | [Cluster Ensembles - A Knowledge Reuse Framework for Combining Multiple Partitions](https:\u002F\u002Fdl.acm.org\u002Fdoi\u002F10.1162\u002F153244303321897735)\n\\-\n[[Paper](https:\u002F\u002Fwww.jmlr.org\u002Fpapers\u002Fvolume3\u002Fstrehl02a\u002Fstrehl02a.pdf)]\n\n- `1990` | `ATIS` | [The ATIS Spoken Language Systems Pilot 
Corpus](https:\u002F\u002Faclanthology.org\u002FH90-1021\u002F)\n\\-\n[[Paper](https:\u002F\u002Faclanthology.org\u002FH90-1021.pdf)]\n\n---\n\n\u003C\u002Fdetails>\n\n\u003C!--  -->\n### 3.3 Logical Reasoning\n\n\u003Cdetails open>\n\u003Csummary>logical reasoning\u003C\u002Fsummary>\n\n[Reasoning Tasks (Back-to-Top)](#3-reasoning-tasks)\n\n- [3.3 Logical Reasoning](#33-logical-reasoning)\n  - [3.3.1 Propositional Logic](#331-propositional-logic)\n  - [3.3.2 Predicate Logic](#332-predicate-logic)\n  - [3.3.x Benchmarks, Datasets, and Metrics](#33x-benchmarks-datasets-and-metrics)\n\n\u003Cbr>\n\n- `2024\u002F12` | `FLDx2` | [Enhancing Reasoning Capabilities of LLMs via Principled Synthetic Logic Corpus](https:\u002F\u002Farxiv.org\u002Fabs\u002F2411.12498)\n\\-\n\n- `2023\u002F11` | `FLD` | [Learning Deductive Reasoning from Synthetic Corpus based on Formal Logic](https:\u002F\u002Farxiv.org\u002Fabs\u002F2308.07336)\n\\-\n\n- `2023\u002F10` | `LogiGLUE` | [Towards LogiGLUE: A Brief Survey and A Benchmark for Analyzing Logical Reasoning Capabilities of Language Models](https:\u002F\u002Farxiv.org\u002Fabs\u002F2310.00836)\n\\-\n\n- `2023\u002F05` | `LogicLLM` | [LogicLLM: Exploring Self-supervised Logic-enhanced Training for Large Language Models](https:\u002F\u002Farxiv.org\u002Fabs\u002F2305.13718)\n\\-\n\n- `2023\u002F05` | `Logic-LM` | [Logic-LM: Empowering Large Language Models with Symbolic Solvers for Faithful Logical Reasoning](https:\u002F\u002Farxiv.org\u002Fabs\u002F2305.12295)\n\\-\n\n- `2023\u002F03` | `LEAP` | [Explicit Planning Helps Language Models in Logical Reasoning](https:\u002F\u002Farxiv.org\u002Fabs\u002F2303.15714)\n\\-\n\n- `2023\u002F03` | [Sparks of Artificial General Intelligence: Early experiments with GPT-4](https:\u002F\u002Farxiv.org\u002Fabs\u002F2303.12712)\n\\-\n\n- `2022\u002F10` | `Entailer` | [Entailer: Answering Questions with Faithful and Truthful Chains of Reasoning](https:\u002F\u002Farxiv.org\u002Fabs\u002F2210.12217)\n\\-\n\n- `2022\u002F06` | `NeSyL` | [Weakly Supervised Neural Symbolic Learning for Cognitive Tasks](https:\u002F\u002Fojs.aaai.org\u002Findex.php\u002FAAAI\u002Farticle\u002Fview\u002F20533)\n\\-\n[[Paper](https:\u002F\u002Fojs.aaai.org\u002Findex.php\u002FAAAI\u002Farticle\u002Fview\u002F20533\u002F20292)]\n\n- `2022\u002F05` | `NeuPSL` | [NeuPSL: Neural Probabilistic Soft Logic](https:\u002F\u002Farxiv.org\u002Fabs\u002F2205.14268)\n\\-\n\n- `2022\u002F05` | `NLProofS` | [Generating Natural Language Proofs with Verifier-Guided Search](https:\u002F\u002Farxiv.org\u002Fabs\u002F2205.12443)\n\\-\n\n- `2022\u002F05` | `Least-to-Most Prompting` | [Least-to-Most Prompting Enables Complex Reasoning in Large Language Models](https:\u002F\u002Farxiv.org\u002Fabs\u002F2205.10625)\n\\-\n\n- `2022\u002F05` | `SI` | [Selection-Inference: Exploiting Large Language Models for Interpretable Logical Reasoning](https:\u002F\u002Farxiv.org\u002Fabs\u002F2205.09712)\n\\-\n\n- `2022\u002F05` | `MERIt` | [MERIt: Meta-Path Guided Contrastive Learning for Logical Reasoning](https:\u002F\u002Faclanthology.org\u002F2022.findings-acl.276\u002F)\n\\-\n\n- `2022\u002F03` | [Self-Consistency Improves Chain of Thought Reasoning in Language Models](https:\u002F\u002Farxiv.org\u002Fabs\u002F2203.11171)\n\\-\n\n- `2021\u002F11` | `NSPS` | [Neuro-Symbolic Program Search for Autonomous Driving Decision Module 
Design](https:\u002F\u002Fproceedings.mlr.press\u002Fv155\u002Fsun21a.html)\n\\-\n[[Paper](https:\u002F\u002Fproceedings.mlr.press\u002Fv155\u002Fsun21a\u002Fsun21a.pdf)]\n\n- `2021\u002F09` | `DeepProbLog` | [Neural probabilistic logic programming in DeepProbLog](https:\u002F\u002Fwww.sciencedirect.com\u002Fscience\u002Farticle\u002Fpii\u002FS0004370221000552)\n\\-\n[[Paper](https:\u002F\u002Fwww.sciencedirect.com\u002Fscience\u002Farticle\u002Fpii\u002FS0004370221000552\u002Fpdfft?md5=1e6b82d50854f317478e487da9e75473&pid=1-s2.0-S0004370221000552-main.pdf)]\n\n- `2021\u002F08` | `GABL` | [Abductive Learning with Ground Knowledge Base](https:\u002F\u002Fwww.ijcai.org\u002Fproceedings\u002F2021\u002F250)\n\\-\n[[Paper](https:\u002F\u002Fwww.ijcai.org\u002Fproceedings\u002F2021\u002F0250.pdf)]\n\n- `2021\u002F05` | `LReasoner` | [Logic-Driven Context Extension and Data Augmentation for Logical Reasoning of Text](https:\u002F\u002Farxiv.org\u002Fabs\u002F2105.03659)\n\\-\n\n- `2020\u002F02` | `RuleTakers` | [Transformers as Soft Reasoners over Language](https:\u002F\u002Farxiv.org\u002Fabs\u002F2002.05867)\n\\-\n\n- `2019\u002F12` | `NMN-Drop` | [Neural Module Networks for Reasoning over Text](https:\u002F\u002Farxiv.org\u002Fabs\u002F1912.04971)\n\\-\n\n- `2019\u002F04` | `NS-CL` | [The Neuro-Symbolic Concept Learner: Interpreting Scenes, Words, and Sentences From Natural Supervision](https:\u002F\u002Farxiv.org\u002Fabs\u002F1904.12584)\n\\-\n\n- `2012` | Logical Reasoning and Learning\n\\-\n[[Paper](https:\u002F\u002Flink.springer.com\u002Freferenceworkentry\u002F10.1007\u002F978-1-4419-1428-6_790#:~:text=Logical%20reasoning%20is%20a%20form,of%20science%20and%20artificial%20intelligence.)]\n\n#### 3.3.1 Propositional Logic\n\n- `2022\u002F09` | Propositional Reasoning via Neural Transformer\nLanguage Models\n\\-\n[[Paper](https:\u002F\u002Fwww.cs.cmu.edu\u002F~oscarr\u002Fpdf\u002Fpublications\u002F2022_nesy.pdf)]\n\n#### 3.3.2 Predicate Logic\n\n- `2021\u002F06` | `ILP` | [Inductive logic programming at 30](https:\u002F\u002Flink.springer.com\u002Farticle\u002F10.1007\u002Fs10994-021-06089-1)\n\\-\n[[Paper](https:\u002F\u002Flink.springer.com\u002Fcontent\u002Fpdf\u002F10.1007\u002Fs10994-021-06089-1.pdf)]\n\n- `2011` | Statistical Relational Learning\n\\-\n[[Paper](https:\u002F\u002Flink.springer.com\u002Freferenceworkentry\u002F10.1007\u002F978-0-387-30164-8_786)]\n\n#### 3.3.x Benchmarks, Datasets, and Metrics\n\n- `2022\u002F10` | `PrOntoQA` | [Language Models Are Greedy Reasoners: A Systematic Formal Analysis of Chain-of-Thought](https:\u002F\u002Farxiv.org\u002Fabs\u002F2210.01240)\n\\-\n\n- `2022\u002F09` | `FOLIO` | [FOLIO: Natural Language Reasoning with First-Order Logic](https:\u002F\u002Farxiv.org\u002Fabs\u002F2209.00840)\n\\-\n\n- `2022\u002F06` | `BIG-bench` | [Beyond the Imitation Game: Quantifying and extrapolating the capabilities of language models](https:\u002F\u002Farxiv.org\u002Fabs\u002F2206.04615)\n\\-\n[[Paper](https:\u002F\u002Fopenreview.net\u002Fpdf?id=uyTL5Bvosj)]\n[[Code](https:\u002F\u002Fgithub.com\u002Fgoogle\u002FBIG-bench)]\n\n- `2021\u002F04` | `AR-LSAT` | [AR-LSAT: Investigating Analytical Reasoning of Text](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2104.06598)\n\\-\n[[Paper](https:\u002F\u002Faclanthology.org\u002F2022.findings-naacl.177\u002F)]\n[[Code](https:\u002F\u002Fgithub.com\u002Fzhongwanjun\u002FAR-LSAT)]\n\n- `2020\u002F12` | `ProofWriter` [ProofWriter: Generating Implications, Proofs, and Abductive Statements over Natural 
Language](https:\u002F\u002Farxiv.org\u002Fabs\u002F2012.13048)\n\\-\n\n---\n\n\u003C\u002Fdetails>\n\n\u003C!--  -->\n### 3.4 Causal Reasoning\n\n\u003Cdetails open>\n\u003Csummary>causal reasoning\u003C\u002Fsummary>\n\n[Reasoning Tasks (Back-to-Top)](#3-reasoning-tasks)\n\n- [3.4 Causal Reasoning](#34-causal-reasoning)\n  - [3.4.1 Counterfactual Reasoning](#341-counterfactual-reasoning)\n  - [3.4.x Benchmarks, Datasets, and Metrics](#34x-benchmarks-datasets-and-metrics)\n\n\u003Cbr>\n\n- `2023\u002F08` | [Causal Parrots: Large Language Models May Talk Causality But Are Not Causal](https:\u002F\u002Farxiv.org\u002Fabs\u002F2308.13067)\n\n- `2023\u002F07` | [Causal Discovery with Language Models as Imperfect Experts](https:\u002F\u002Farxiv.org\u002Fabs\u002F2307.02390)\n\\-\n\n- `2023\u002F06` | [From Query Tools to Causal Architects: Harnessing Large Language Models for Advanced Causal Discovery from Data](https:\u002F\u002Farxiv.org\u002Fabs\u002F2306.16902)\n\\-\n\n- `2023\u002F06` | `Corr2Cause` | [Can Large Language Models Infer Causation from Correlation?](https:\u002F\u002Farxiv.org\u002Fabs\u002F2306.05836)\n\\-\n\n- `2023\u002F05` | `Code-LLMs` | [The Magic of IF: Investigating Causal Reasoning Abilities in Large Language Models of Code](https:\u002F\u002Farxiv.org\u002Fabs\u002F2305.19213)\n\\-\n\n- `2023\u002F04` | [Understanding Causality with Large Language Models: Feasibility and Opportunities](https:\u002F\u002Farxiv.org\u002Fabs\u002F2304.05524)\n\\-\n\n- `2023\u002F04` | [Causal Reasoning and Large Language Models: Opening a New Frontier for Causality](https:\u002F\u002Farxiv.org\u002Fabs\u002F2305.00050)\n\\-\n\n- `2023\u002F03` | [Can large language models build causal graphs?](https:\u002F\u002Farxiv.org\u002Fabs\u002F2303.05279)\n\\-\n\n- `2023\u002F01` | [Causal-Discovery Performance of ChatGPT in the context of Neuropathic Pain Diagnosis](https:\u002F\u002Farxiv.org\u002Fabs\u002F2301.13819)\n\\-\n\n- `2022\u002F09` | [Probing for Correlations of Causal Facts: Large Language Models and Causality](https:\u002F\u002Fopenreview.net\u002Fforum?id=UPwzqPOs4-)\n\\-\n[[Paper](https:\u002F\u002Fopenreview.net\u002Fpdf?id=UPwzqPOs4-)]\n\n- `2022\u002F07` | [Can Large Language Models Distinguish Cause from Effect?](https:\u002F\u002Fopenreview.net\u002Fforum?id=ucHh-ytUkOH&referrer=%5Bthe%20profile%20of%20Mrinmaya%20Sachan%5D(%2Fprofile%3Fid%3D~Mrinmaya_Sachan3))\n\\-\n[[Paper](https:\u002F\u002Fopenreview.net\u002Fpdf?id=ucHh-ytUkOH)]\n\n- `2021\u002F08` | [Learning Faithful Representations of Causal Graphs](https:\u002F\u002Faclanthology.org\u002F2021.acl-long.69\u002F)\n\\-\n[[Paper](https:\u002F\u002Faclanthology.org\u002F2021.acl-long.69.pdf)]\n\n- `2021\u002F05` | `InferBERT` | [InferBERT: A Transformer-Based Causal Inference Framework for Enhancing Pharmacovigilance](https:\u002F\u002Fwww.frontiersin.org\u002Farticles\u002F10.3389\u002Ffrai.2021.659622\u002Ffull)\n\\-\n[[Paper](https:\u002F\u002Fwww.frontiersin.org\u002Farticles\u002F10.3389\u002Ffrai.2021.659622\u002Fpdf?isPublishedV2=False)]\n\n- `2021\u002F02` | [Towards Causal Representation Learning](https:\u002F\u002Farxiv.org\u002Fabs\u002F2102.11107)\n\\-\n\n- `2020\u002F05` | `CausaLM` | [CausaLM: Causal Model Explanation Through Counterfactual Language Models](https:\u002F\u002Farxiv.org\u002Fabs\u002F2005.13407)\n\\-\n\n- `2019\u002F06` | [Neuropathic Pain Diagnosis Simulator for Causal Discovery Algorithm Evaluation](https:\u002F\u002Farxiv.org\u002Fabs\u002F1906.01732)\n\\-\n\n- `2017` | [Elements of 
Causal Inference: Foundations and Learning Algorithms](https:\u002F\u002Fmitpress.mit.edu\u002F9780262037310\u002Felements-of-causal-inference\u002F)\n\\-\n[[Book](https:\u002F\u002Flibrary.oapen.org\u002Fbitstream\u002Fid\u002F056a11be-ce3a-44b9-8987-a6c68fce8d9b\u002F11283.pdf)]\n\n- `2016` | Actual Causality\n\\-\n[[Book](https:\u002F\u002Fdirect.mit.edu\u002Fbooks\u002Foa-monograph\u002F3451\u002FActual-Causality)]\n\n- `2013` | Causal Reasoning\n\\-\n[[Paper](https:\u002F\u002Fpsycnet.apa.org\u002Frecord\u002F2012-26298-046)]\n\n#### 3.4.1 Counterfactual Reasoning\n\n- `2023\u002F07` | [Reasoning or Reciting? Exploring the Capabilities and Limitations of Language Models Through Counterfactual Tasks](https:\u002F\u002Farxiv.org\u002Fabs\u002F2307.02477)\n\\-\n\n- `2023\u002F05` | [Counterfactual reasoning: Testing language models' understanding of hypothetical scenarios](https:\u002F\u002Farxiv.org\u002Fabs\u002F2305.16572)\n\\-\n\n- `2007` | The Rational Imagination: How People Create Alternatives to Reality\n\\-\n[[Paper](https:\u002F\u002Fscholar.archive.org\u002Fwork\u002Fzjwdgk7r6vefxaole362qftqji\u002Faccess\u002Fwayback\u002Fhttp:\u002F\u002Fwww.tara.tcd.ie\u002Fbitstream\u002Fhandle\u002F2262\u002F39428\u002FPrecis%20of%20The%20Rational%20Imagination%20-%20How%20People%20Create%20Alternatives%20to%20Reality.pdf?sequence=1)]\n\n- `1986` | Norm theory: Comparing reality to its alternatives\n\\-\n[[Paper](https:\u002F\u002Fpsycnet.apa.org\u002Frecord\u002F1986-21899-001)]\n\n#### 3.4.x Benchmarks, Datasets, and Metrics\n\n- `2021\u002F12` | `CRASS` | [CRASS: A Novel Data Set and Benchmark to Test Counterfactual Reasoning of Large Language Models](https:\u002F\u002Farxiv.org\u002Fabs\u002F2112.11941)\n\\-\n\n- `2021\u002F08` | `Arctic sea ice` | [Benchmarking of Data-Driven Causality Discovery Approaches in the Interactions of Arctic Sea Ice and Atmosphere](https:\u002F\u002Fwww.frontiersin.org\u002Farticles\u002F10.3389\u002Ffdata.2021.642182\u002Ffull)\n\\-\n[[Paper](https:\u002F\u002Fwww.frontiersin.org\u002Farticles\u002F10.3389\u002Ffdata.2021.642182\u002Fpdf?isPublishedV2=False)]\n\n- `2014\u002F12` | `CauseEffectPairs` | [Distinguishing cause from effect using observational data: methods and benchmarks](https:\u002F\u002Farxiv.org\u002Fabs\u002F1412.3773)\n\\-\n\n---\n\n\u003C\u002Fdetails>\n\n\u003C!--  -->\n### 3.5 Visual Reasoning\n\n\u003Cdetails open>\n\u003Csummary>visual reasoning\u003C\u002Fsummary>\n\n[Reasoning Tasks (Back-to-Top)](#3-reasoning-tasks)\n\n- [3.5 Visual Reasoning](#35-visual-reasoning)\n  - [3.5.1 3D Reasoning](#351-3d-reasoning)\n  - [3.5.x Benchmarks, Datasets, and Metrics](#35x-benchmarks-datasets-and-metrics)\n\n\u003Cbr>\n\n- `2025\u002F02` | `VPT` | [Introducing Visual Perception Token into Multimodal Large Language Model](https:\u002F\u002Farxiv.org\u002Fabs\u002F2502.17425) - [[Paper](https:\u002F\u002Farxiv.org\u002Fabs\u002F2502.17425)] - [[Code](https:\u002F\u002Fgithub.com\u002Fyu-rp\u002FVisualPerceptionToken)] - [[Models](https:\u002F\u002Fhuggingface.co\u002Fcollections\u002Frp-yu\u002Fvpt-models-67b6afdc8679a05a2876f07a)] - [[Datasets](https:\u002F\u002Fhuggingface.co\u002Fdatasets\u002Frp-yu\u002FVPT_Datasets)]\n\n- `2022\u002F11` | `G-VUE` | [Perceive, Ground, Reason, and Act: A Benchmark for General-purpose Visual Representation](https:\u002F\u002Farxiv.org\u002Fabs\u002F2211.15402)\n\\-\n\n- `2021\u002F03` | `VLGrammar` | [VLGrammar: Grounded Grammar Induction of Vision and 
Language](https:\u002F\u002Farxiv.org\u002Fabs\u002F2103.12975)\n\\-\n\n- `2020\u002F12` | [Attention over learned object embeddings enables complex visual reasoning](https:\u002F\u002Farxiv.org\u002Fabs\u002F2012.08508)\n\\-\n\n#### 3.5.1 3D Reasoning\n\n- `2023\u002F08` | `PointLLM` | [PointLLM: Empowering Large Language Models to Understand Point Clouds](https:\u002F\u002Farxiv.org\u002Fabs\u002F2308.16911)\n\\-\n\n- `2023\u002F08` | `3D-VisTA` | [3D-VisTA: Pre-trained Transformer for 3D Vision and Text Alignment](https:\u002F\u002Farxiv.org\u002Fabs\u002F2308.04352)\n\\-\n\n- `2023\u002F07` | `3D-LLM` | [3D-LLM: Injecting the 3D World into Large Language Models](https:\u002F\u002Farxiv.org\u002Fabs\u002F2307.12981)\n\\-\n\n- `2022\u002F10` | `SQA3D` | [SQA3D: Situated Question Answering in 3D Scenes](https:\u002F\u002Farxiv.org\u002Fabs\u002F2210.07474)\n\\-\n\n#### 3.5.x Benchmarks, Datasets, and Metrics\n- `2025\u002F04` | `VisuLogic` | [VisuLogic: A Benchmark for Evaluating Visual Reasoning in Multi-modal Large Language Models](https:\u002F\u002Farxiv.org\u002Fabs\u002F2504.15279)\n\\-\n\n- `2021\u002F12` | `PTR` | [PTR: A Benchmark for Part-based Conceptual, Relational, and Physical Reasoning](https:\u002F\u002Farxiv.org\u002Fabs\u002F2112.05136)\n\\-\n\n- `2019\u002F05` | `OK-VQA` | [OK-VQA: A Visual Question Answering Benchmark Requiring External Knowledge](https:\u002F\u002Farxiv.org\u002Fabs\u002F1906.00067)\n\\-\n\n- `2016\u002F12` | `CLEVR` | [CLEVR: A Diagnostic Dataset for Compositional Language and Elementary Visual Reasoning](https:\u002F\u002Farxiv.org\u002Fabs\u002F1612.06890)\n\\-\n\n---\n\n\u003C\u002Fdetails>\n\n\u003C!--  -->\n### 3.6 Audio Reasoning\n\n\u003Cdetails open>\n\u003Csummary>audio reasoning\u003C\u002Fsummary>\n\n[Reasoning Tasks (Back-to-Top)](#3-reasoning-tasks)\n\n- [3.6 Audio Reasoning](#36-audio-reasoning)\n  - [3.6.1 Speech](#361-speech)\n  - [3.6.x Benchmarks, Datasets, and Metrics](#36x-benchmarks-datasets-and-metrics)\n\n\u003Cbr>\n\n- `2023\u002F11` | `M2UGen` | [M2UGen: Multi-modal Music Understanding and Generation with the Power of Large Language Models](https:\u002F\u002Farxiv.org\u002Fabs\u002F2311.11255)\n\\-\n[[Paper](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2311.11255.pdf)]\n[[Code](https:\u002F\u002Fgithub.com\u002Fcrypto-code\u002FM2UGen)]\n\n- `2023\u002F08` | `MU-LLaMA` | [Music Understanding LLaMA: Advancing Text-to-Music Generation with Question Answering and Captioning](https:\u002F\u002Farxiv.org\u002Fabs\u002F2308.11276)\n\\-\n[[Paper](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2308.11276.pdf)]\n[[Code](https:\u002F\u002Fgithub.com\u002Fcrypto-code\u002FMU-LLaMA)]\n\n- `2022\u002F05` | [Self-Supervised Speech Representation Learning: A Review](https:\u002F\u002Farxiv.org\u002Fabs\u002F2205.10643)\n\\-\n\n#### 3.6.1 Speech\n\n- `2022\u002F03` | `SUPERB-SG` | [SUPERB-SG: Enhanced Speech processing Universal PERformance Benchmark for Semantic and Generative Capabilities](https:\u002F\u002Farxiv.org\u002Fabs\u002F2203.06849)\n\\-\n\n- `2022\u002F02` | `Data2Vec` | [data2vec: A General Framework for Self-supervised Learning in Speech, Vision and Language](https:\u002F\u002Farxiv.org\u002Fabs\u002F2202.03555)\n\\-\n\n- `2021\u002F10` | `WavLM` | [WavLM: Large-Scale Self-Supervised Pre-Training for Full Stack Speech Processing](https:\u002F\u002Farxiv.org\u002Fabs\u002F2110.13900)\n\\-\n\n- `2021\u002F06` | `HuBERT` | [HuBERT: Self-Supervised Speech Representation Learning by Masked Prediction of Hidden 
Units](https:\u002F\u002Farxiv.org\u002Fabs\u002F2106.07447)\n\\-\n\n- `2021\u002F05` | `SUPERB` | [SUPERB: Speech processing Universal PERformance Benchmark](https:\u002F\u002Farxiv.org\u002Fabs\u002F2105.01051)\n\\-\n\n- `2020\u002F10` | `Speech SIMCLR` | [Speech SIMCLR: Combining Contrastive and Reconstruction Objective for Self-supervised Speech Representation Learning](https:\u002F\u002Farxiv.org\u002Fabs\u002F2010.13991)\n\\-\n\n- `2020\u002F06` | `Wav2Vec 2.0` | [wav2vec 2.0: A Framework for Self-Supervised Learning of Speech Representations](https:\u002F\u002Farxiv.org\u002Fabs\u002F2006.11477)\n\\-\n\n- `2020\u002F05` | `Conformer` | [Conformer: Convolution-augmented Transformer for Speech Recognition](https:\u002F\u002Farxiv.org\u002Fabs\u002F2005.08100)\n\\-\n\n- `2019\u002F10` | `Mockingjay` | [Mockingjay: Unsupervised Speech Representation Learning with Deep Bidirectional Transformer Encoders](https:\u002F\u002Farxiv.org\u002Fabs\u002F1910.12638)\n\\-\n\n- `2019\u002F04` | `APC` | [An Unsupervised Autoregressive Model for Speech Representation Learning](https:\u002F\u002Farxiv.org\u002Fabs\u002F1904.03240)\n\\-\n\n- `2018\u002F07` | `CPC` | [Representation Learning with Contrastive Predictive Coding](https:\u002F\u002Farxiv.org\u002Fabs\u002F1807.03748)\n\\-\n\n- `2018\u002F04` | `Speech-Transformer` | Speech-Transformer: A No-Recurrence Sequence-to-Sequence Model for Speech Recognition\n\\-\n[[Paper](https:\u002F\u002Fieeexplore.ieee.org\u002Fdocument\u002F8462506)]\n\n- `2017\u002F11` | `VQ-VAE` | [Neural Discrete Representation Learning](https:\u002F\u002Farxiv.org\u002Fabs\u002F1711.00937)\n\\-\n\n- `2017\u002F08` | [Large-Scale Domain Adaptation via Teacher-Student Learning](https:\u002F\u002Farxiv.org\u002Fabs\u002F1708.05466)\n\\-\n\n#### 3.6.x Benchmarks, Datasets, and Metrics\n\n- `2022\u002F03` | `SUPERB-SG` | [SUPERB-SG: Enhanced Speech processing Universal PERformance Benchmark for Semantic and Generative Capabilities](https:\u002F\u002Farxiv.org\u002Fabs\u002F2203.06849)\n\\-\n\n- `2021\u002F11` | `VoxPopuli` \u002F `XLS-R` | [XLS-R: Self-supervised Cross-lingual Speech Representation Learning at Scale](https:\u002F\u002Farxiv.org\u002Fabs\u002F2111.09296)\n\\-\n\n- `2021\u002F05` | `SUPERB` | [SUPERB: Speech processing Universal PERformance Benchmark](https:\u002F\u002Farxiv.org\u002Fabs\u002F2105.01051)\n\\-\n\n- `2020\u002F12` | `Multilingual LibriSpeech` | [MLS: A Large-Scale Multilingual Dataset for Speech Research](https:\u002F\u002Farxiv.org\u002Fabs\u002F2012.03411)\n\\-\n\n- `2020\u002F05` | `Didi Dictation` \u002F `Didi Callcenter` | [A Further Study of Unsupervised Pre-training for Transformer Based Speech Recognition](https:\u002F\u002Farxiv.org\u002Fabs\u002F2005.09862)\n\\-\n\n- `2019\u002F12` | `Libri-Light` | [Libri-Light: A Benchmark for ASR with Limited or No Supervision](https:\u002F\u002Farxiv.org\u002Fabs\u002F1912.07875)\n\\-\n\n- `2019\u002F12` | `Common Voice` | [Common Voice: A Massively-Multilingual Speech Corpus](https:\u002F\u002Farxiv.org\u002Fabs\u002F1912.06670)\n\\-\n\n---\n\n\u003C\u002Fdetails>\n\n\u003C!--  -->\n### 3.7 Multimodal Reasoning\n\n\u003Cdetails open>\n\u003Csummary>multimodal reasoning\u003C\u002Fsummary>\n\n[Reasoning Tasks (Back-to-Top)](#3-reasoning-tasks)\n\n- [3.7 Multimodal Reasoning](#37-multimodal-reasoning)\n  - [3.7.1 Alignment](#371-alignment)\n  - [3.7.2 Generation](#372-generation)\n  - [3.7.3 Multimodal Understanding](#373-multimodal-understanding)\n  - [3.7.x Benchmarks, Datasets, and 
Metrics](#37x-benchmarks-datasets-and-metrics)\n\n\u003Cbr>\n\n- `2023\u002F12` | [A Challenger to GPT-4V? Early Explorations of Gemini in Visual Expertise](https:\u002F\u002Farxiv.org\u002Fabs\u002F2312.12436)\n\\-\n[[Paper](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2312.12436.pdf)]\n[[Project](https:\u002F\u002Fgithub.com\u002FBradyFU\u002FAwesome-Multimodal-Large-Language-Models)]\n\n#### 3.7.1 Alignment\n\n- `2023\u002F01` | `BLIP-2` | [BLIP-2: Bootstrapping Language-Image Pre-training with Frozen Image Encoders and Large Language Models](https:\u002F\u002Farxiv.org\u002Fabs\u002F2301.12597)\n\\-\n[[Paper](https:\u002F\u002Fproceedings.mlr.press\u002Fv202\u002Fli23q.html)]\n[[Code](https:\u002F\u002Fgithub.com\u002Fsalesforce\u002FLAVIS\u002Ftree\u002Fmain\u002Fprojects\u002Fblip2)]\n\n#### 3.7.2 Generation\n\n- `2023\u002F10` | `DALL·E 3` | Improving Image Generation with Better Captions\n\\-\n[[Paper](https:\u002F\u002Fcdn.openai.com\u002Fpapers\u002Fdall-e-3.pdf)]\n[[Project](https:\u002F\u002Fopenai.com\u002Fdall-e-3)]\n\n- `2023\u002F06` | `Kosmos-2` | [Kosmos-2: Grounding Multimodal Large Language Models to the World](https:\u002F\u002Farxiv.org\u002Fabs\u002F2306.14824)\n\\-\n\n- `2023\u002F05` | `BiomedGPT` | [BiomedGPT: A Unified and Generalist Biomedical Generative Pre-trained Transformer for Vision, Language, and Multimodal Tasks](https:\u002F\u002Farxiv.org\u002Fabs\u002F2305.17100)\n\\-\n\n- `2023\u002F03` | `Visual ChatGPT` | [Visual ChatGPT: Talking, Drawing and Editing with Visual Foundation Models](https:\u002F\u002Farxiv.org\u002Fabs\u002F2303.04671)\n\\-\n\n- `2023\u002F02` | `Kosmos-1` | [Language Is Not All You Need: Aligning Perception with Language Models](https:\u002F\u002Farxiv.org\u002Fabs\u002F2302.14045)\n\\-\n\n- `2022\u002F07` | `Midjourney`\n\\-\n[[Project](https:\u002F\u002Fwww.midjourney.com\u002Fhome)]\n\n- `2022\u002F04` | `Flamingo` | [Flamingo: a Visual Language Model for Few-Shot Learning](https:\u002F\u002Farxiv.org\u002Fabs\u002F2204.14198)\n\\-\n\n- `2021\u002F12` | `MAGMA` | [MAGMA -- Multimodal Augmentation of Generative Models through Adapter-based Finetuning](https:\u002F\u002Farxiv.org\u002Fabs\u002F2112.05253)\n\\-\n\n#### 3.7.3 Multimodal Understanding\n\n- `2023\u002F09` | `Q-Bench` | [Q-Bench: A Benchmark for General-Purpose Foundation Models on Low-level Vision](https:\u002F\u002Farxiv.org\u002Fabs\u002F2309.14181)\n\\-\n[[Paper](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2309.14181.pdf)]\n[[Code](https:\u002F\u002Fgithub.com\u002FQ-Future\u002FQ-Bench)]\n\n- `2023\u002F05` | `DetGPT` | [DetGPT: Detect What You Need via Reasoning](https:\u002F\u002Farxiv.org\u002Fabs\u002F2305.14167)\n\\-\n\n- `2023\u002F03` | `Vicuna` | Vicuna: An Open-Source Chatbot Impressing GPT-4 with 90%* ChatGPT Quality\n\\-\n[[Blog](https:\u002F\u002Flmsys.org\u002Fblog\u002F2023-03-30-vicuna\u002F)]\n[[Code](https:\u002F\u002Fgithub.com\u002Flm-sys\u002FFastChat)]\n\n- `2022\u002F12` | `DePlot` | [DePlot: One-shot visual language reasoning by plot-to-table translation](https:\u002F\u002Farxiv.org\u002Fabs\u002F2212.10505)\n\\-\n\n- `2022\u002F12` | `MatCha` | [MatCha: Enhancing Visual Language Pretraining with Math Reasoning and Chart Derendering](https:\u002F\u002Farxiv.org\u002Fabs\u002F2212.09662)\n\\-\n\n#### 3.7.x Benchmarks, Datasets, and Metrics\n\n- `2023\u002F06` | `LVLM-eHub` | [LVLM-eHub: A Comprehensive Evaluation Benchmark for Large Vision-Language Models](https:\u002F\u002Farxiv.org\u002Fabs\u002F2306.09265)\n\\-\n\n- `2023\u002F06` | `LAMM` | [LAMM: Language-Assisted Multi-Modal Instruction-Tuning 
Dataset, Framework, and Benchmark](https:\u002F\u002Farxiv.org\u002Fabs\u002F2306.06687)\n\\-\n\n- `2023\u002F05` | `AttackVLM` | [On Evaluating Adversarial Robustness of Large Vision-Language Models](https:\u002F\u002Farxiv.org\u002Fabs\u002F2305.16934)\n\\-\n\n- `2023\u002F05` | `POPE` | [Evaluating Object Hallucination in Large Vision-Language Models](https:\u002F\u002Farxiv.org\u002Fabs\u002F2305.10355)\n\\-\n\n- `2023\u002F05` | `MultimodalOCR` | [On the Hidden Mystery of OCR in Large Multimodal Models](https:\u002F\u002Farxiv.org\u002Fabs\u002F2305.07895)\n\\-\n\n- `2022\u002F10` | `ObjMLM` | [Plausible May Not Be Faithful: Probing Object Hallucination in Vision-Language Pre-training](https:\u002F\u002Farxiv.org\u002Fabs\u002F2210.07688)\n\n- `2022\u002F06` | `RAVEN` \u002F `ARC` | [Evaluating Understanding on Conceptual Abstraction Benchmarks](https:\u002F\u002Farxiv.org\u002Fabs\u002F2206.14187)\n\\-\n\n- `2021\u002F06` | `LARC` | [Communicating Natural Programs to Humans and Machines](https:\u002F\u002Farxiv.org\u002Fabs\u002F2106.07824)\n\\-\n\n- `2014\u002F11` | `CIDEr` \u002F `PASCAL-50S` \u002F `ABSTRACT-50S` | [CIDEr: Consensus-based Image Description Evaluation](https:\u002F\u002Farxiv.org\u002Fabs\u002F1411.5726)\n\\-\n\n---\n\n\u003C\u002Fdetails>\n\n\u003C!--  -->\n### 3.8 Agent Reasoning\n\n\u003Cdetails open>\n\u003Csummary>agent reasoning\u003C\u002Fsummary>\n\n[Reasoning Tasks (Back-to-Top)](#3-reasoning-tasks)\n\n- [3.8 Agent Reasoning](#38-agent-reasoning)\n  - [3.8.1 Introspective Reasoning](#381-introspective-reasoning)\n  - [3.8.2 Extrospective Reasoning](#382-extrospective-reasoning)\n  - [3.8.3 Multi-agent Reasoning](#383-multi-agent-reasoning)\n  - [3.8.4 Driving Reasoning](#384-driving-reasoning)\n  - [3.8.x Benchmarks, Datasets, and Metrics](#38x-benchmarks-datasets-and-metrics)\n\n\u003Cbr>\n\n- `2024\u002F01` | `AutoRT`\n| `Ahn et al.`\n![citations](https:\u002F\u002Fimg.shields.io\u002Fbadge\u002Fdynamic\u002Fjson?url=https:\u002F\u002Fapi.semanticscholar.org\u002Fgraph\u002Fv1\u002Fpaper\u002FCorpusID:266906759?fields=citationCount&query=%24.citationCount&label=citations) \u003Cbr>\nAutoRT: Embodied Foundation Models for Large Scale Orchestration of Robotic Agents \u003Cbr>\n[[arXiv](https:\u002F\u002Farxiv.org\u002Fabs\u002F2401.12963)]\n[[paper](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2401.12963.pdf)]\n[[project](https:\u002F\u002Fauto-rt.github.io\u002F)]\n\n- `2023\u002F11` | `OpenFlamingo` | [Vision-Language Foundation Models as Effective Robot Imitators](https:\u002F\u002Farxiv.org\u002Fabs\u002F2311.01378)\n\\-\n\n- `2023\u002F07` | `RT-2` | [RT-2: Vision-Language-Action Models Transfer Web Knowledge to Robotic Control](https:\u002F\u002Farxiv.org\u002Fabs\u002F2307.15818)\n\\-\n\n- `2023\u002F05` | `RAP` | [Reasoning with Language Model is Planning with World Model](https:\u002F\u002Farxiv.org\u002Fabs\u002F2305.14992)\n\\-\n\n- `2023\u002F03` | `PaLM-E`\n| `Driess et al., ICML 2023`\n![citations](https:\u002F\u002Fimg.shields.io\u002Fbadge\u002Fdynamic\u002Fjson?url=https:\u002F\u002Fapi.semanticscholar.org\u002Fgraph\u002Fv1\u002Fpaper\u002FCorpusID:257364842?fields=citationCount&query=%24.citationCount&label=citations) \u003Cbr>\nPaLM-E: An Embodied Multimodal Language Model \u003Cbr>\n[[arXiv](https:\u002F\u002Farxiv.org\u002Fabs\u002F2303.03378)]\n[[paper](https:\u002F\u002Ficml.cc\u002Fvirtual\u002F2023\u002Fposter\u002F23969)]\n[[project](https:\u002F\u002Fpalm-e.github.io\u002F)]\n\n- `2022\u002F12` | `RT-1` | [RT-1: Robotics 
Transformer for Real-World Control at Scale](https:\u002F\u002Farxiv.org\u002Fabs\u002F2212.06817)\n\n- `2022\u002F10` | [Skill Induction and Planning with Latent Language](https:\u002F\u002Farxiv.org\u002Fabs\u002F2110.01517)\n\\-\n\n- `2022\u002F05` | `Gato` | [A Generalist Agent](https:\u002F\u002Farxiv.org\u002Fabs\u002F2205.06175)\n\\-\n\n- `2022\u002F04` | `SMs` | [Socratic Models: Composing Zero-Shot Multimodal Reasoning with Language](https:\u002F\u002Farxiv.org\u002Fabs\u002F2204.00598)\n\\-\n\n- `2022\u002F02` | | [Pre-Trained Language Models for Interactive Decision-Making](https:\u002F\u002Farxiv.org\u002Fabs\u002F2202.01771)\n\\-\n\n- `2022\u002F01` | `Language-Planner` | [Language Models as Zero-Shot Planners: Extracting Actionable Knowledge for Embodied Agents](https:\u002F\u002Farxiv.org\u002Fabs\u002F2201.07207)\n\\-\n\n- `2021\u002F11` | [Value Function Spaces: Skill-Centric State Abstractions for Long-Horizon Reasoning](https:\u002F\u002Farxiv.org\u002Fabs\u002F2111.03189)\n\\-\n\n- `2020\u002F09` | [Visually-Grounded Planning without Vision: Language Models Infer Detailed Plans from High-level Instructions](https:\u002F\u002Farxiv.org\u002Fabs\u002F2009.14259)\n\\-\n\n- `2016\u002F01` | `AlphaGo` | Mastering the game of Go with deep neural networks and tree search\n\\-\n[[Paper](https:\u002F\u002Fwww.nature.com\u002Farticles\u002Fnature16961)]\n\n- `2014\u002F05` | Gesture in reasoning: An embodied perspective\n\\-\n[[Paper](https:\u002F\u002Fwww.taylorfrancis.com\u002Fchapters\u002Fedit\u002F10.4324\u002F9781315775845-19\u002Fgesture-reasoning-martha-alibali-rebecca-boncoddo-autumn-hostetter)]\n\n#### 3.8.1 Introspective Reasoning\n\n- `2022\u002F11` | `PAL` | [PAL: Program-aided Language Models](https:\u002F\u002Farxiv.org\u002Fabs\u002F2211.10435)\n\\-\n\n- `2022\u002F09` | `ProgPrompt` | [ProgPrompt: Generating Situated Robot Task Plans using Large Language Models](https:\u002F\u002Farxiv.org\u002Fabs\u002F2209.11302)\n\\-\n\n- `2022\u002F09` | `Code as Policies` | [Code as Policies: Language Model Programs for Embodied Control](https:\u002F\u002Farxiv.org\u002Fabs\u002F2209.07753)\n\\-\n\n- `2022\u002F04` | `SayCan` | [Do As I Can, Not As I Say: Grounding Language in Robotic Affordances](https:\u002F\u002Farxiv.org\u002Fabs\u002F2204.01691)\n\\-\n\n- `2012` | Introspective Learning and Reasoning\n\\-\n[[Paper](https:\u002F\u002Flink.springer.com\u002Freferenceworkentry\u002F10.1007\u002F978-1-4419-1428-6_1802)]\n\n#### 3.8.2 Extrospective Reasoning\n\n- `2023\u002F06` | `Statler` | [Statler: State-Maintaining Language Models for Embodied Reasoning](https:\u002F\u002Farxiv.org\u002Fabs\u002F2306.17840)\n\\-\n\n- `2023\u002F02` | `Planner-Actor-Reporter` | [Collaborating with language models for embodied reasoning](https:\u002F\u002Farxiv.org\u002Fabs\u002F2302.00763)\n\\-\n\n- `2023\u002F02` | `Toolformer` | [Toolformer: Language Models Can Teach Themselves to Use Tools](https:\u002F\u002Farxiv.org\u002Fabs\u002F2302.04761)\n\\-\n\n- `2022\u002F12` | `LLM-Planner` | [LLM-Planner: Few-Shot Grounded Planning for Embodied Agents with Large Language Models](https:\u002F\u002Farxiv.org\u002Fabs\u002F2212.04088)\n\\-\n\n- `2022\u002F10` | `ReAct` | [ReAct: Synergizing Reasoning and Acting in Language Models](https:\u002F\u002Farxiv.org\u002Fabs\u002F2210.03629)\n\\-\n\n- `2022\u002F10` | `Self-Ask` | [Measuring and Narrowing the Compositionality Gap in Language Models](https:\u002F\u002Farxiv.org\u002Fabs\u002F2210.03350)\n\\-\n\n- `2022\u002F07` | `Inner Monologue` | 
[Inner Monologue: Embodied Reasoning through Planning with Language Models](https:\u002F\u002Farxiv.org\u002Fabs\u002F2207.05608)\n\\-\n\n#### 3.8.3 Multi-agent Reasoning\n\n- `2023\u002F07` | `Federated LLM` | [Federated Large Language Model: A Position Paper](https:\u002F\u002Farxiv.org\u002Fabs\u002F2307.08925)\n\\-\n\n- `2023\u002F07` | [Self-Adaptive Large Language Model (LLM)-Based Multiagent Systems](https:\u002F\u002Farxiv.org\u002Fabs\u002F2307.06187)\n\\-\n\n- `2023\u002F07` | `Co-LLM-Agents` | [Building Cooperative Embodied Agents Modularly with Large Language Models](https:\u002F\u002Farxiv.org\u002Fabs\u002F2307.02485)\n\\-\n\n- `2023\u002F05` | [Improving Factuality and Reasoning in Language Models through Multiagent Debate](https:\u002F\u002Farxiv.org\u002Fabs\u002F2305.14325)\n\\-\n\n- `2017\u002F02` | `FIoT` | FIoT: An agent-based framework for self-adaptive and self-organizing applications based on the Internet of Things\n\\-\n[[Paper](https:\u002F\u002Fwww.sciencedirect.com\u002Fscience\u002Farticle\u002Fpii\u002FS0020025516313664)]\n\n- `2004` | A Practical Guide to the IBM Autonomic Computing Toolkit\n\\-\n[[Book](https:\u002F\u002Fbooks.google.com.hk\u002Fbooks\u002Fabout\u002FA_Practical_Guide_to_the_IBM_Autonomic_C.html?id=XHeoSgAACAAJ&redir_esc=y)]\n\n#### 3.8.4 Driving Reasoning\n\n- `2023\u002F12` | `DriveLM` | [DriveLM: Driving with Graph Visual Question Answering](https:\u002F\u002Farxiv.org\u002Fabs\u002F2312.14150)\n\\-\n[[Paper](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2312.14150.pdf)]\n[[Code](https:\u002F\u002Fgithub.com\u002FOpenDriveLab\u002FDriveLM)]\n\n- `2023\u002F12` | `LiDAR-LLM` | [LiDAR-LLM: Exploring the Potential of Large Language Models for 3D LiDAR Understanding](https:\u002F\u002Farxiv.org\u002Fabs\u002F2312.14074)\n\\-\n[[Paper](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2312.14074.pdf)]\n[[Project](https:\u002F\u002Fsites.google.com\u002Fview\u002Flidar-llm)]\n\n- `2023\u002F12` | `DriveMLM` | [DriveMLM: Aligning Multi-Modal Large Language Models with Behavioral Planning States for Autonomous Driving](https:\u002F\u002Farxiv.org\u002Fabs\u002F2312.09245)\n\\-\n[[Paper](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2312.09245.pdf)]\n[[Code](https:\u002F\u002Fgithub.com\u002FOpenGVLab\u002FDriveMLM)]\n\n- `2023\u002F12` | `LMDrive` | [LMDrive: Closed-Loop End-to-End Driving with Large Language Models](https:\u002F\u002Farxiv.org\u002Fabs\u002F2312.07488)\n\\-\n[[Paper](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2312.07488.pdf)]\n[[Code](https:\u002F\u002Fgithub.com\u002Fopendilab\u002FLMDrive)]\n\n- `2023\u002F10` | [Driving through the Concept Gridlock: Unraveling Explainability Bottlenecks in Automated Driving](https:\u002F\u002Farxiv.org\u002Fabs\u002F2310.16639)\n\\-\n\n- `2023\u002F10` | [Vision Language Models in Autonomous Driving and Intelligent Transportation Systems](https:\u002F\u002Farxiv.org\u002Fabs\u002F2310.14414)\n\\-\n\n- `2023\u002F10` | `DriveGPT4` | [DriveGPT4: Interpretable End-to-end Autonomous Driving via Large Language Model](https:\u002F\u002Farxiv.org\u002Fabs\u002F2310.01412)\n\\-\n\n- `2023\u002F09` | `MotionLM` | [MotionLM: Multi-Agent Motion Forecasting as Language Modeling](https:\u002F\u002Farxiv.org\u002Fabs\u002F2309.16534)\n\\-\n\n- `2023\u002F06` | [End-to-end Autonomous Driving: Challenges and Frontiers](https:\u002F\u002Farxiv.org\u002Fabs\u002F2306.16927)\n\\-\n\n- `2023\u002F04` | [Graph-based Topology Reasoning for Driving Scenes](https:\u002F\u002Farxiv.org\u002Fabs\u002F2304.05277)\n\\-\n\n- 
`2022\u002F09` | [Delving into the Devils of Bird's-eye-view Perception: A Review, Evaluation and Recipe](https:\u002F\u002Farxiv.org\u002Fabs\u002F2209.05324)\n\\-\n\n- `2021\u002F11` | Artificial intelligence: A powerful paradigm for scientific research\n\\-\n[[Paper](https:\u002F\u002Fwww.sciencedirect.com\u002Fscience\u002Farticle\u002Fpii\u002FS2666675821001041)]\n\n#### 3.8.x Benchmarks, Datasets, and Metrics\n\n- `2023\u002F12` | `DriveLM` | [DriveLM: Driving with Graph Visual Question Answering](https:\u002F\u002Farxiv.org\u002Fabs\u002F2312.14150)\n\\-\n[[Paper](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2312.14150.pdf)]\n[[Code](https:\u002F\u002Fgithub.com\u002FOpenDriveLab\u002FDriveLM)]\n\n- `2023\u002F09` | `NuPrompt` \u002F `PromptTrack` | [Language Prompt for Autonomous Driving](https:\u002F\u002Farxiv.org\u002Fabs\u002F2309.04379)\n\\-\n\n- `2023\u002F07` | `LCTGen` | [Language Conditioned Traffic Generation](https:\u002F\u002Farxiv.org\u002Fabs\u002F2307.07947)\n\n- `2023\u002F05` | `NuScenes-QA` | [NuScenes-QA: A Multi-modal Visual Question Answering Benchmark for Autonomous Driving Scenario](https:\u002F\u002Farxiv.org\u002Fabs\u002F2305.14836)\n\\-\n\n- `2022\u002F06` | `BEHAVIOR-1K` | [BEHAVIOR-1K: A Benchmark for Embodied AI with 1,000 Everyday Activities and Realistic Simulation](https:\u002F\u002Fproceedings.mlr.press\u002Fv205\u002Fli23a.html)\n\\-\n\n- `2021\u002F08` | `iGibson` | [iGibson 2.0: Object-Centric Simulation for Robot Learning of Everyday Household Tasks](https:\u002F\u002Farxiv.org\u002Fabs\u002F2108.03272)\n\\-\n\n- `2021\u002F06` | `Habitat 2.0` | [Habitat 2.0: Training Home Assistants to Rearrange their Habitat](https:\u002F\u002Farxiv.org\u002Fabs\u002F2106.14405)\n\\-\n\n- `2020\u002F04` | `RoboTHOR` | [RoboTHOR: An Open Simulation-to-Real Embodied AI Platform](https:\u002F\u002Farxiv.org\u002Fabs\u002F2004.06799)\n\\-\n\n- `2019\u002F11` | `HAD` | [Grounding Human-to-Vehicle Advice for Self-driving Vehicles](https:\u002F\u002Farxiv.org\u002Fabs\u002F1911.06978)\n\\-\n\n- `2019\u002F04` | `Habitat` | [Habitat: A Platform for Embodied AI Research](https:\u002F\u002Farxiv.org\u002Fabs\u002F1904.01201)\n\\-\n\n- `2018\u002F08` | `Gibson` | [Gibson Env: Real-World Perception for Embodied Agents](https:\u002F\u002Farxiv.org\u002Fabs\u002F1808.10654)\n\\-\n\n- `2018\u002F06` | `VirtualHome` | [VirtualHome: Simulating Household Activities via Programs](https:\u002F\u002Farxiv.org\u002Fabs\u002F1806.07011)\n\\-\n\n---\n\n\u003C\u002Fdetails>\n\n\u003C!--  -->\n### 3.9 Other Tasks and Applications\n\n\u003Cdetails open>\n\u003Csummary>other tasks and applications\u003C\u002Fsummary>\n\n[Reasoning Tasks (Back-to-Top)](#3-reasoning-tasks)\n\n- [3.9 Other Tasks and Applications](#39-other-tasks-and-applications)\n  - [3.9.1 Theory of Mind (ToM)](#391-theory-of-mind-tom)\n  - [3.9.2 LLMs for Weather Prediction](#392-llms-for-weather-prediction)\n  - [3.9.3 Abstract Reasoning](#393-abstract-reasoning)\n  - [3.9.4 Defeasible Reasoning](#394-defeasible-reasoning)\n  - [3.9.5 Medical Reasoning](#395-medical-reasoning)\n  - [3.9.6 Bioinformatics Reasoning](#396-bioinformatics-reasoning)\n  - [3.9.7 Long-Chain Reasoning](#397-long-chain-reasoning)\n\n#### 3.9.1 Theory of Mind (ToM)\n\n- `2023\u002F02` | `ToM` | [Theory of Mind Might Have Spontaneously Emerged in Large Language Models](https:\u002F\u002Farxiv.org\u002Fabs\u002F2302.02083)\n\\-\n\n#### 3.9.2 LLMs for Weather Prediction\n\n- `2022\u002F09` | `MetNet-2` | Deep learning for twelve hour precipitation 
forecasts\n\\-\n[[Paper](https:\u002F\u002Fwww.nature.com\u002Farticles\u002Fs41467-022-32483-x)]\n\n- `2023\u002F07` | `Pangu-Weather` | Accurate medium-range global weather forecasting with 3D neural networks\n\\-\n[[Paper](https:\u002F\u002Fwww.nature.com\u002Farticles\u002Fs41586-023-06185-3)]\n\n#### 3.9.3 Abstract Reasoning\n\n- `2023\u002F05` | [Large Language Models Are Not Strong Abstract Reasoners](https:\u002F\u002Farxiv.org\u002Fabs\u002F2305.19555)\n\\-\n\n#### 3.9.4 Defeasible Reasoning\n\n- `2023\u002F06` | `BoardgameQA` | [BoardgameQA: A Dataset for Natural Language Reasoning with Contradictory Information](https:\u002F\u002Farxiv.org\u002Fabs\u002F2306.07934)\n\\-\n\n- `2021\u002F10` | `CURIOUS` | [Think about it! Improving defeasible reasoning by first modeling the question scenario](https:\u002F\u002Farxiv.org\u002Fabs\u002F2110.12349)\n\\-\n\n- `2020\u002F11` | `Defeasible NLI` \u002F `δ-NLI` | [Thinking Like a Skeptic: Defeasible Inference in Natural Language](https:\u002F\u002Faclanthology.org\u002F2020.findings-emnlp.418\u002F)\n\\-\n[[Paper](https:\u002F\u002Faclanthology.org\u002F2020.findings-emnlp.418.pdf)]\n\n- `2020\u002F04` | `KACC` | [KACC: A Multi-task Benchmark for Knowledge Abstraction, Concretization and Completion](https:\u002F\u002Farxiv.org\u002Fabs\u002F2004.13631)\n\\-\n\n- `2009\u002F01` | A Recursive Semantics for Defeasible Reasoning\n\\-\n[[Paper](https:\u002F\u002Flink.springer.com\u002Fchapter\u002F10.1007\u002F978-0-387-98197-0_9)]\n\n#### 3.9.5 Medical Reasoning\n\n- `2024\u002F01` | `CheXagent` \u002F `CheXinstruct` \u002F `CheXbench`\n| `Chen et al.`\n![citations](https:\u002F\u002Fimg.shields.io\u002Fbadge\u002Fdynamic\u002Fjson?url=https:\u002F\u002Fapi.semanticscholar.org\u002Fgraph\u002Fv1\u002Fpaper\u002FCorpusID:267069358?fields=citationCount&query=%24.citationCount&label=citations)\n![Star](https:\u002F\u002Fimg.shields.io\u002Fgithub\u002Fstars\u002FStanford-AIMI\u002FCheXagent.svg?style=social&label=Star) \u003Cbr>\nCheXagent: Towards a Foundation Model for Chest X-Ray Interpretation \u003Cbr>\n[[arXiv](https:\u002F\u002Farxiv.org\u002Fabs\u002F2401.12208)]\n[[paper](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2401.12208.pdf)]\n[[code](https:\u002F\u002Fgithub.com\u002FStanford-AIMI\u002FCheXagent)]\n[[project](https:\u002F\u002Fstanford-aimi.github.io\u002Fchexagent.html)]\n[[huggingface](https:\u002F\u002Fhuggingface.co\u002Fstanford-crfm\u002FBioMedLM)]\n\n- `2024\u002F01` | `EchoGPT`\n| `Chao et al.`\n![citations](https:\u002F\u002Fimg.shields.io\u002Fbadge\u002Fdynamic\u002Fjson?url=https:\u002F\u002Fapi.semanticscholar.org\u002Fgraph\u002Fv1\u002Fpaper\u002FCorpusID:267042212?fields=citationCount&query=%24.citationCount&label=citations) \u003Cbr>\nEchoGPT: A Large Language Model for Echocardiography Report Summarization \u003Cbr>\n[[medRxiv](https:\u002F\u002Fwww.medrxiv.org\u002Fcontent\u002F10.1101\u002F2024.01.18.24301503)]\n[[paper](https:\u002F\u002Fwww.medrxiv.org\u002Fcontent\u002F10.1101\u002F2024.01.18.24301503v1.full.pdf)]\n\n- `2023\u002F10` | `GPT4V-Medical-Report`\n| `Yan et al.`\n![citations](https:\u002F\u002Fimg.shields.io\u002Fbadge\u002Fdynamic\u002Fjson?url=https:\u002F\u002Fapi.semanticscholar.org\u002Fgraph\u002Fv1\u002Fpaper\u002FCorpusID:264805701?fields=citationCount&query=%24.citationCount&label=citations)\n![Star](https:\u002F\u002Fimg.shields.io\u002Fgithub\u002Fstars\u002FZhilingYan\u002FGPT4V-Medical-Report.svg?style=social&label=Star) \u003Cbr>\nMultimodal ChatGPT for Medical Applications: an 
Experimental Study of GPT-4V \u003Cbr>\n[[arXiv](https:\u002F\u002Farxiv.org\u002Fabs\u002F2310.19061)]\n[[paper](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2310.19061.pdf)]\n[[code](https:\u002F\u002Fgithub.com\u002FZhilingYan\u002FGPT4V-Medical-Report)]\n\n- `2023\u002F10` | `VisionFM`\n| `Qiu et al.`\n![citations](https:\u002F\u002Fimg.shields.io\u002Fbadge\u002Fdynamic\u002Fjson?url=https:\u002F\u002Fapi.semanticscholar.org\u002Fgraph\u002Fv1\u002Fpaper\u002FCorpusID:263828921?fields=citationCount&query=%24.citationCount&label=citations) \u003Cbr>\nVisionFM: a Multi-Modal Multi-Task Vision Foundation Model for Generalist Ophthalmic Artificial Intelligence \u003Cbr>\n[[arXiv](https:\u002F\u002Farxiv.org\u002Fabs\u002F2310.04992)]\n[[paper](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2310.04992.pdf)]\n\n- `2023\u002F09`\n| `Yang et al.`\n![citations](https:\u002F\u002Fimg.shields.io\u002Fbadge\u002Fdynamic\u002Fjson?url=https:\u002F\u002Fapi.semanticscholar.org\u002Fgraph\u002Fv1\u002Fpaper\u002FCorpusID:263310951?fields=citationCount&query=%24.citationCount&label=citations) \u003Cbr>\nThe Dawn of LMMs: Preliminary Explorations with GPT-4V(ision) \u003Cbr>\n[[arXiv](https:\u002F\u002Farxiv.org\u002Fabs\u002F2309.17421)]\n[[paper](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2309.17421.pdf)]\n\n- `2023\u002F09` | `RETFound`\n| `Zhou et al., Nature`\n![citations](https:\u002F\u002Fimg.shields.io\u002Fbadge\u002Fdynamic\u002Fjson?url=https:\u002F\u002Fapi.semanticscholar.org\u002Fgraph\u002Fv1\u002Fpaper\u002FCorpusID:264168236?fields=citationCount&query=%24.citationCount&label=citations)\n![Star](https:\u002F\u002Fimg.shields.io\u002Fgithub\u002Fstars\u002Frmaphoh\u002FRETFound_MAE.svg?style=social&label=Star) \u003Cbr>\nA foundation model for generalizable disease detection from retinal images \u003Cbr>\n[[paper](https:\u002F\u002Fwww.nature.com\u002Farticles\u002Fs41586-023-06555-x)]\n[[code](https:\u002F\u002Fgithub.com\u002Frmaphoh\u002FRETFound_MAE)]\n\n- `2023\u002F08` | `ELIXR`\n| `Xu et al.`\n![citations](https:\u002F\u002Fimg.shields.io\u002Fbadge\u002Fdynamic\u002Fjson?url=https:\u002F\u002Fapi.semanticscholar.org\u002Fgraph\u002Fv1\u002Fpaper\u002FCorpusID:260378981?fields=citationCount&query=%24.citationCount&label=citations) \u003Cbr>\nELIXR: Towards a general purpose X-ray artificial intelligence system through alignment of large language models and radiology vision encoders \u003Cbr>\n[[arXiv](https:\u002F\u002Farxiv.org\u002Fabs\u002F2308.01317)]\n[[paper](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2308.01317.pdf)]\n\n- `2023\u002F07` | `Med-Flamingo`\n| `Moor et al.`\n![citations](https:\u002F\u002Fimg.shields.io\u002Fbadge\u002Fdynamic\u002Fjson?url=https:\u002F\u002Fapi.semanticscholar.org\u002Fgraph\u002Fv1\u002Fpaper\u002FCorpusID:260316059?fields=citationCount&query=%24.citationCount&label=citations)\n![Star](https:\u002F\u002Fimg.shields.io\u002Fgithub\u002Fstars\u002Fsnap-stanford\u002Fmed-flamingo.svg?style=social&label=Star) \u003Cbr>\nMed-Flamingo: a Multimodal Medical Few-shot Learner \u003Cbr>\n[[arXiv](https:\u002F\u002Farxiv.org\u002Fabs\u002F2307.15189)]\n[[paper](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2307.15189.pdf)]\n[[code](https:\u002F\u002Fgithub.com\u002Fsnap-stanford\u002Fmed-flamingo)]\n\n- `2023\u002F07` | `Med-PaLM M`\n| `Tu et 
al.`\n![citations](https:\u002F\u002Fimg.shields.io\u002Fbadge\u002Fdynamic\u002Fjson?url=https:\u002F\u002Fapi.semanticscholar.org\u002Fgraph\u002Fv1\u002Fpaper\u002FCorpusID:260164663?fields=citationCount&query=%24.citationCount&label=citations)\n![Star](https:\u002F\u002Fimg.shields.io\u002Fgithub\u002Fstars\u002Fkyegomez\u002FMed-PaLM.svg?style=social&label=Star) \u003Cbr>\nTowards Generalist Biomedical AI \u003Cbr>\n[[arXiv](https:\u002F\u002Farxiv.org\u002Fabs\u002F2307.14334)]\n[[paper](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2307.14334.pdf)]\n[[code](https:\u002F\u002Fgithub.com\u002Fkyegomez\u002FMed-PaLM)]\n\n- `2023\u002F06` | `Endo-FM`\n| `Wang et al., MICCAI 2023`\n![citations](https:\u002F\u002Fimg.shields.io\u002Fbadge\u002Fdynamic\u002Fjson?url=https:\u002F\u002Fapi.semanticscholar.org\u002Fgraph\u002Fv1\u002Fpaper\u002FCorpusID:259287248?fields=citationCount&query=%24.citationCount&label=citations)\n![Star](https:\u002F\u002Fimg.shields.io\u002Fgithub\u002Fstars\u002Fmed-air\u002FEndo-FM.svg?style=social&label=Star) \u003Cbr>\nFoundation Model for Endoscopy Video Analysis via Large-scale Self-supervised Pre-train \u003Cbr>\n[[arXiv](https:\u002F\u002Farxiv.org\u002Fabs\u002F2306.16741)]\n[[paper](https:\u002F\u002Flink.springer.com\u002Fchapter\u002F10.1007\u002F978-3-031-43996-4_10)]\n[[code](https:\u002F\u002Fgithub.com\u002Fmed-air\u002FEndo-FM)]\n\n- `2023\u002F06` | `XrayGPT`\n| `Thawkar et al.`\n![citations](https:\u002F\u002Fimg.shields.io\u002Fbadge\u002Fdynamic\u002Fjson?url=https:\u002F\u002Fapi.semanticscholar.org\u002Fgraph\u002Fv1\u002Fpaper\u002FCorpusID:259145194?fields=citationCount&query=%24.citationCount&label=citations)\n![Star](https:\u002F\u002Fimg.shields.io\u002Fgithub\u002Fstars\u002Fmbzuai-oryx\u002FXrayGPT.svg?style=social&label=Star) \u003Cbr>\nXrayGPT: Chest Radiographs Summarization using Medical Vision-Language Models \u003Cbr>\n\\-\n[[arXiv](https:\u002F\u002Farxiv.org\u002Fabs\u002F2306.07971)]\n[[paper](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2306.07971.pdf)]\n[[code](https:\u002F\u002Fgithub.com\u002Fmbzuai-oryx\u002FXrayGPT)]\n\n- `2023\u002F06` | `LLaVA-Med`\n| `Li et al., NeurIPS 2023`\n![citations](https:\u002F\u002Fimg.shields.io\u002Fbadge\u002Fdynamic\u002Fjson?url=https:\u002F\u002Fapi.semanticscholar.org\u002Fgraph\u002Fv1\u002Fpaper\u002FCorpusID:258999820?fields=citationCount&query=%24.citationCount&label=citations)\n![Star](https:\u002F\u002Fimg.shields.io\u002Fgithub\u002Fstars\u002Fmicrosoft\u002FLLaVA-Med.svg?style=social&label=Star) \u003Cbr>\nLLaVA-Med: Training a Large Language-and-Vision Assistant for Biomedicine in One Day \u003Cbr>\n[[arXiv](https:\u002F\u002Farxiv.org\u002Fabs\u002F2306.00890)]\n[[paper](https:\u002F\u002Fneurips.cc\u002Fvirtual\u002F2023\u002Fposter\u002F73643)]\n[[code](https:\u002F\u002Fgithub.com\u002Fmicrosoft\u002FLLaVA-Med)]\n\n- `2023\u002F05` | `HuatuoGPT`\n| `Zhang et al., Findings of EMNLP 2023`\n![citations](https:\u002F\u002Fimg.shields.io\u002Fbadge\u002Fdynamic\u002Fjson?url=https:\u002F\u002Fapi.semanticscholar.org\u002Fgraph\u002Fv1\u002Fpaper\u002FCorpusID:258865566?fields=citationCount&query=%24.citationCount&label=citations)\n![Star](https:\u002F\u002Fimg.shields.io\u002Fgithub\u002Fstars\u002FFreedomIntelligence\u002FHuatuoGPT.svg?style=social&label=Star) \u003Cbr>\nHuatuoGPT, Towards Taming Language Model to Be a Doctor 
\u003Cbr>\n[[arXiv](https:\u002F\u002Farxiv.org\u002Fabs\u002F2305.15075)]\n[[paper](https:\u002F\u002Faclanthology.org\u002F2023.findings-emnlp.725\u002F)]\n[[code](https:\u002F\u002Fgithub.com\u002FFreedomIntelligence\u002FHuatuoGPT)]\n\n- `2023\u002F05` | `Med-PaLM 2`\n| `Singhal et al.`\n![citations](https:\u002F\u002Fimg.shields.io\u002Fbadge\u002Fdynamic\u002Fjson?url=https:\u002F\u002Fapi.semanticscholar.org\u002Fgraph\u002Fv1\u002Fpaper\u002FCorpusID:258715226?fields=citationCount&query=%24.citationCount&label=citations) \u003Cbr>\nTowards Expert-Level Medical Question Answering with Large Language Models \u003Cbr>\n[[arXiv](https:\u002F\u002Farxiv.org\u002Fabs\u002F2305.09617)]\n[[paper](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2305.09617.pdf)]\n\n- `2022\u002F12` | ` Med-PaLM` \u002F `MultiMedQA` \u002F `HealthSearchQA`\n| `Singhal et al., Nature`\n![citations](https:\u002F\u002Fimg.shields.io\u002Fbadge\u002Fdynamic\u002Fjson?url=https:\u002F\u002Fapi.semanticscholar.org\u002Fgraph\u002Fv1\u002Fpaper\u002FCorpusID:255124952?fields=citationCount&query=%24.citationCount&label=citations) \u003Cbr>\nLarge Language Models Encode Clinical Knowledge \u003Cbr>\n[[arXiv](https:\u002F\u002Farxiv.org\u002Fabs\u002F2212.13138)]\n[[paper](https:\u002F\u002Fwww.nature.com\u002Farticles\u002Fs41586-023-06291-2)]\n\n#### 3.9.6 Bioinformatics Reasoning\n\n- `2023\u002F07` | `Prot2Text` | [Prot2Text: Multimodal Protein's Function Generation with GNNs and Transformers](https:\u002F\u002Farxiv.org\u002Fabs\u002F2307.14367)\n\\-\n\n- `2023\u002F07` | `Uni-RNA` | [Uni-RNA: Universal Pre-Trained Models Revolutionize RNA Research](https:\u002F\u002Fwww.biorxiv.org\u002Fcontent\u002F10.1101\u002F2023.07.11.548588v1)\n\\-\n\n- `2023\u002F07` | `RFdiffusion` | De novo design of protein structure and function with RFdiffusion\n\\-\n[[Paper](https:\u002F\u002Fwww.nature.com\u002Farticles\u002Fs41586-023-06415-8)]\n\n- `2023\u002F06` | `HyenaDNA` | [HyenaDNA: Long-Range Genomic Sequence Modeling at Single Nucleotide Resolution](https:\u002F\u002Farxiv.org\u002Fabs\u002F2306.15794)\n\\-\n\n- `2023\u002F06` | `DrugGPT` | [DrugGPT: A GPT-based Strategy for Designing Potential Ligands Targeting Specific Proteins](https:\u002F\u002Fwww.biorxiv.org\u002Fcontent\u002F10.1101\u002F2023.06.29.543848v1)\n\\-\n\n- `2023\u002F04` | `GeneGPT` | [GeneGPT: Augmenting Large Language Models with Domain Tools for Improved Access to Biomedical Information](https:\u002F\u002Farxiv.org\u002Fabs\u002F2304.09667)\n\\-\n\n- `2023\u002F04` | Drug discovery companies are customizing ChatGPT: here’s how\n\\-\n[[News](https:\u002F\u002Fwww.nature.com\u002Farticles\u002Fs41587-023-01788-7)]\n\n- `2023\u002F01` | `ProGen` | Large language models generate functional protein sequences across diverse families\n\\-\n[[Paper](https:\u002F\u002Fwww.nature.com\u002Farticles\u002Fs41587-022-01618-2)]\n\n- `2022\u002F06` | `ProGen2` | [ProGen2: Exploring the Boundaries of Protein Language Models](https:\u002F\u002Farxiv.org\u002Fabs\u002F2206.13517)\n\\-\n\n- `2021\u002F07` | `AlphaFold` | Highly accurate protein structure prediction with AlphaFold\n\\-\n[[Paper](https:\u002F\u002Fwww.nature.com\u002Farticles\u002Fs41586-021-03819-2)]\n\n#### 3.9.7 Long-Chain Reasoning\n\n- `2022\u002F12` | `Fine-tune-CoT` | [Large Language Models Are Reasoning Teachers](https:\u002F\u002Farxiv.org\u002Fabs\u002F2212.10071)\n\\-\n\n- `2021\u002F09` | `PlaTe` | [PlaTe: Visually-Grounded Planning with Transformers in Procedural 
Tasks](https:\u002F\u002Farxiv.org\u002Fabs\u002F2109.04869)\n\\-\n\n---\n\n\u003C\u002Fdetails>\n\n\u003C\u002Fdetails>\n\n\n## 4 Reasoning Techniques\n\n\u003Cdetails open>\n\u003Csummary>reasoning techniques\u003C\u002Fsummary>\n\n[(Back-to-Top)](#table-of-contents)\n\n### Table of Contents - 4\n\n\u003Cdetails open>\n\u003Csummary>reasoning techniques (table of contents)\u003C\u002Fsummary>\n\n- [4 Reasoning Techniques](#4-reasoning-techniques)\n  - [4.1 Pre-Training](#41-pre-training)\n    - [4.1.1 Data](#411-data)\n      - [a. Data - Text](#a-data---text)\n      - [b. Data - Image](#b-data---image)\n      - [c. Data - Multimodality](#c-data---multimodality)\n    - [4.1.2 Network Architecture](#412-network-architecture)\n      - [a. Encoder-Decoder](#a-encoder-decoder)\n      - [b. Decoder-Only](#b-decoder-only)\n      - [c. CLIP Variants](#c-clip-variants)\n      - [d. Others](#d-others)\n  - [4.2 Fine-Tuning](#42-fine-tuning)\n    - [4.2.1 Data](#421-data)\n    - [4.2.2 Parameter-Efficient Fine-tuning](#422-parameter-efficient-fine-tuning)\n      - [a. Adapter Tuning](#a-adapter-tuning)\n      - [b. Low-Rank Adaptation](#b-low-rank-adaptation)\n      - [c. Prompt Tuning](#c-prompt-tuning)\n      - [d. Partial Parameter Tuning](#d-partial-parameter-tuning)\n      - [e. Mixture-of-Modality Adaption](#e-mixture-of-modality-adaption)\n  - [4.3 Alignment Training](#43-alignment-training)\n    - [4.3.1 Data](#431-data)\n      - [a. Data - Human](#a-data---human)\n      - [b. Data - Synthesis](#b-data---synthesis)\n    - [4.3.2 Training Pipeline](#432-training-pipeline)\n      - [a. Online Human Preference Training](#a-online-human-preference-training)\n      - [b. Offline Human Preference Training](#b-offline-human-preference-training)\n  - [4.4 Mixture of Experts (MoE)](#44-mixture-of-experts-moe)\n  - [4.5 In-Context Learning](#45-in-context-learning)\n    - [4.5.1 Demonstration Example Selection](#451-demonstration-example-selection)\n      - [a. Prior-Knowledge Approach](#a-prior-knowledge-approach)\n      - [b. Retrieval Approach](#b-retrieval-approach)\n    - [4.5.2 Chain-of-Thought](#452-chain-of-thought)\n      - [a. Zero-Shot CoT](#a-zero-shot-cot)\n      - [b. Few-Shot CoT](#b-few-shot-cot)\n      - [c. Multiple Paths Aggregation](#c-multiple-paths-aggregation)\n    - [4.5.3 Multi-Round Prompting](#453-multi-round-prompting)\n      - [a. Learned Refiners](#a-learned-refiners)\n      - [b. Prompted Refiners](#b-prompted-refiners)\n  - [4.6 Autonomous Agent](#46-autonomous-agent)\n\n\u003C\u002Fdetails>\n\n\u003C!--  -->\n### 4.1 Pre-Training\n\n\u003Cdetails open>\n\u003Csummary>pre-training\u003C\u002Fsummary>\n\n[Reasoning Techniques (Back-to-Top)](#4-reasoning-techniques)\n\n- [4.1 Pre-Training](#41-pre-training)\n  - [4.1.1 Data](#411-data)\n    - [a. Data - Text](#a-data---text)\n    - [b. Data - Image](#b-data---image)\n    - [c. Data - Multimodality](#c-data---multimodality)\n  - [4.1.2 Network Architecture](#412-network-architecture)\n    - [a. Encoder-Decoder](#a-encoder-decoder)\n    - [b. Decoder-Only](#b-decoder-only)\n    - [c. CLIP Variants](#c-clip-variants)\n    - [d. Others](#d-others)\n\n#### 4.1.1 Data\n\n##### a. 
Data - Text\n\n- `2023\u002F07` | `peS2o` | peS2o (Pretraining Efficiently on S2ORC) Dataset\n\\-\n[[Code](https:\u002F\u002Fgithub.com\u002Fallenai\u002Fpes2o)]\n\n- `2023\u002F05` | `ROOTS` \u002F `BLOOM` | [The BigScience ROOTS Corpus: A 1.6TB Composite Multilingual Dataset](https:\u002F\u002Farxiv.org\u002Fabs\u002F2303.03915)\n\\-\n\n- `2023\u002F04` | `RedPajama` | RedPajama: an Open Dataset for Training Large Language Models\n\\-\n[[Code](https:\u002F\u002Fgithub.com\u002Ftogethercomputer\u002FRedPajama-Data)]\n\n- `2020\u002F12` | `The Pile` | [The Pile: An 800GB Dataset of Diverse Text for Language Modeling](https:\u002F\u002Farxiv.org\u002Fabs\u002F2101.00027)\n\\-\n\n- `2020\u002F04` | `Reddit` | [Recipes for building an open-domain chatbot](https:\u002F\u002Farxiv.org\u002Fabs\u002F2004.13637)\n\\-\n\n- `2020\u002F04` | `CLUE` | [CLUE: A Chinese Language Understanding Evaluation Benchmark](https:\u002F\u002Farxiv.org\u002Fabs\u002F2004.05986)\n\\-\n\n- `2019\u002F10` | `C4` | [Exploring the Limits of Transfer Learning with a Unified Text-to-Text Transformer](https:\u002F\u002Farxiv.org\u002Fabs\u002F1910.10683)\n\\-\n\n- `2013\u002F10` | `Gutenberg` | [Complexity of Word Collocation Networks: A Preliminary Structural Analysis](https:\u002F\u002Farxiv.org\u002Fabs\u002F1310.5111)\n\\-\n\n##### b. Data - Image\n\n- `2023\u002F06` | `I2E` \u002F `MOFI` | [MOFI: Learning Image Representations from Noisy Entity Annotated Images](https:\u002F\u002Farxiv.org\u002Fabs\u002F2306.07952)\n\\-\n\n- `2022\u002F01` | `SWAG` | [Revisiting Weakly Supervised Pre-Training of Visual Perception Models](https:\u002F\u002Farxiv.org\u002Fabs\u002F2201.08371)\n\\-\n\n- `2021\u002F04` | `ImageNet-21K` | [ImageNet-21K Pretraining for the Masses](https:\u002F\u002Farxiv.org\u002Fabs\u002F2104.10972)\n\\-\n\n- `2017\u002F07` | `JFT` | [Revisiting Unreasonable Effectiveness of Data in Deep Learning Era](https:\u002F\u002Farxiv.org\u002Fabs\u002F1707.02968)\n\\-\n\n- `2014\u002F09` | `ImageNet` | [ImageNet Large Scale Visual Recognition Challenge](https:\u002F\u002Farxiv.org\u002Fabs\u002F1409.0575)\n\\-\n\n##### c. 
Data - Multimodality\n\n- `2023\u002F09` | `Point-Bind` | [Point-Bind & Point-LLM: Aligning Point Cloud with Multi-modality for 3D Understanding, Generation, and Instruction Following](https:\u002F\u002Farxiv.org\u002Fabs\u002F2309.00615)\n\\-\n\n- `2023\u002F05` | `ImageBind` | [ImageBind: One Embedding Space To Bind Them All](https:\u002F\u002Farxiv.org\u002Fabs\u002F2305.05665)\n\\-\n\n- `2023\u002F04` | `DataComp` | [DataComp: In search of the next generation of multimodal datasets](https:\u002F\u002Farxiv.org\u002Fabs\u002F2304.14108)\n\\-\n\n- `2022\u002F10` | `LAION-5B` | [LAION-5B: An open large-scale dataset for training next generation image-text models](https:\u002F\u002Farxiv.org\u002Fabs\u002F2210.08402)\n\\-\n\n- `2022\u002F08` | `Shutterstock` | [Quality Not Quantity: On the Interaction between Dataset Design and Robustness of CLIP](https:\u002F\u002Farxiv.org\u002Fabs\u002F2208.05516)\n\\-\n\n- `2022\u002F08` | `COYO-700M` | COYO-700M: Image-Text Pair Dataset\n\\-\n[[Code](https:\u002F\u002Fgithub.com\u002Fkakaobrain\u002Fcoyo-dataset)]\n\n- `2022\u002F04` | `M3W` | [Flamingo: a Visual Language Model for Few-Shot Learning](https:\u002F\u002Farxiv.org\u002Fabs\u002F2204.14198)\n\\-\n\n- `2021\u002F11` | `RedCaps` | [RedCaps: web-curated image-text data created by the people, for the people](https:\u002F\u002Farxiv.org\u002Fabs\u002F2111.11431)\n\\-\n\n- `2021\u002F11` | `LAION-400M` | [LAION-400M: Open Dataset of CLIP-Filtered 400 Million Image-Text Pairs](https:\u002F\u002Farxiv.org\u002Fabs\u002F2111.02114)\n\\-\n\n- `2021\u002F03` | `WIT` | [WIT: Wikipedia-based Image Text Dataset for Multimodal Multilingual Machine Learning](https:\u002F\u002Farxiv.org\u002Fabs\u002F2103.01913)\n\\-\n\n- `2011\u002F12` | `Im2Text` \u002F `SBU` | [Im2Text: Describing Images Using 1 Million Captioned Photographs](https:\u002F\u002Fpapers.nips.cc\u002Fpaper_files\u002Fpaper\u002F2011\u002Fhash\u002F5dd9db5e033da9c6fb5ba83c7a7ebea9-Abstract.html)\n\\-\n\n#### 4.1.2 Network Architecture\n\n- `2023\u002F04` | [Decoder-Only or Encoder-Decoder? Interpreting Language Model as a Regularized Encoder-Decoder](https:\u002F\u002Farxiv.org\u002Fabs\u002F2304.04052)\n\\-\n\n##### a. Encoder-Decoder\n\n- `2019\u002F10` | `BART` | [BART: Denoising Sequence-to-Sequence Pre-training for Natural Language Generation, Translation, and Comprehension](https:\u002F\u002Farxiv.org\u002Fabs\u002F1910.13461)\n\\-\n\n- `2019\u002F10` | `T5` | [Exploring the Limits of Transfer Learning with a Unified Text-to-Text Transformer](https:\u002F\u002Farxiv.org\u002Fabs\u002F1910.10683)\n\\-\n\n- `2018\u002F10` | `BERT` | [BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding](https:\u002F\u002Farxiv.org\u002Fabs\u002F1810.04805)\n\\-\n[[Paper](https:\u002F\u002Faclanthology.org\u002FN19-1423.pdf)]\n[[Code](https:\u002F\u002Fgithub.com\u002Fgoogle-research\u002Fbert)]\n[[Blog](https:\u002F\u002Fblog.research.google\u002F2018\u002F11\u002Fopen-sourcing-bert-state-of-art-pre.html)]\n\n- `2017\u002F06` | `Transformer` | [Attention Is All You Need](https:\u002F\u002Farxiv.org\u002Fabs\u002F1706.03762)\n\\-\n\n##### b. 
Decoder-Only\n\n- `2023\u002F07` | `Llama 2` | [Llama 2: Open Foundation and Fine-Tuned Chat Models](https:\u002F\u002Farxiv.org\u002Fabs\u002F2307.09288)\n\\-\n[[Paper](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2307.09288.pdf)]\n[[Code](https:\u002F\u002Fgithub.com\u002Ffacebookresearch\u002Fllama)]\n[[Blog](https:\u002F\u002Fai.meta.com\u002Fllama\u002F)]\n\n- `2023\u002F02` | `LLaMA` | [LLaMA: Open and Efficient Foundation Language Models](https:\u002F\u002Farxiv.org\u002Fabs\u002F2302.13971)\n\\-\n[[Paper](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2302.13971.pdf)]\n[[Code](https:\u002F\u002Fgithub.com\u002Ffacebookresearch\u002Fllama)]\n[[Blog](https:\u002F\u002Fai.meta.com\u002Fblog\u002Flarge-language-model-llama-meta-ai\u002F)]\n\n- `2022\u002F11` | `BLOOM` | [BLOOM: A 176B-Parameter Open-Access Multilingual Language Model](https:\u002F\u002Farxiv.org\u002Fabs\u002F2211.05100)\n\\-\n\n- `2022\u002F10` | `GLM` | [GLM-130B: An Open Bilingual Pre-trained Model](https:\u002F\u002Farxiv.org\u002Fabs\u002F2210.02414)\n\\-\n\n- `2022\u002F05` | `OPT` | [OPT: Open Pre-trained Transformer Language Models](https:\u002F\u002Farxiv.org\u002Fabs\u002F2205.01068)\n\\-\n\n- `2021\u002F12` | `Gopher` | [Scaling Language Models: Methods, Analysis & Insights from Training Gopher](https:\u002F\u002Farxiv.org\u002Fabs\u002F2112.11446)\n\\-\n\n- `2021\u002F05` | `GPT-3` | [Language Models are Few-Shot Learners](https:\u002F\u002Farxiv.org\u002Fabs\u002F2005.14165)\n\\-\n[[Paper](https:\u002F\u002Fpapers.nips.cc\u002Fpaper\u002F2020\u002Ffile\u002F1457c0d6bfcb4967418bfb8ac142f64a-Paper.pdf)]\n[[Code](https:\u002F\u002Fgithub.com\u002Fopenai\u002Fgpt-3)]\n\n- `2019\u002F02` | `GPT-2` | Language Models are Unsupervised Multitask Learners\n\\-\n[[Paper](https:\u002F\u002Fcdn.openai.com\u002Fbetter-language-models\u002Flanguage_models_are_unsupervised_multitask_learners.pdf)]\n\n- `2018\u002F06` | `GPT-1` | Improving Language Understanding by Generative Pre-Training\n\\-\n[[Paper](https:\u002F\u002Fcdn.openai.com\u002Fresearch-covers\u002Flanguage-unsupervised\u002Flanguage_understanding_paper.pdf)]\n\n##### c. CLIP Variants\n\n- `2023\u002F05` | `LaCLIP` | [Improving CLIP Training with Language Rewrites](https:\u002F\u002Farxiv.org\u002Fabs\u002F2305.20088)\n\\-\n\n- `2023\u002F04` | `DetCLIPv2` | [DetCLIPv2: Scalable Open-Vocabulary Object Detection Pre-training via Word-Region Alignment](https:\u002F\u002Farxiv.org\u002Fabs\u002F2304.04514)\n\\-\n\n- `2022\u002F12` | `FLIP` | [Scaling Language-Image Pre-training via Masking](https:\u002F\u002Farxiv.org\u002Fabs\u002F2212.00794)\n\\-\n\n- `2022\u002F09` | `DetCLIP` | [DetCLIP: Dictionary-Enriched Visual-Concept Paralleled Pre-training for Open-world Detection](https:\u002F\u002Farxiv.org\u002Fabs\u002F2209.09407)\n\\-\n\n- `2022\u002F04` | `K-LITE` | [K-LITE: Learning Transferable Visual Models with External Knowledge](https:\u002F\u002Farxiv.org\u002Fabs\u002F2204.09222)\n\\-\n\n- `2021\u002F11` | `FILIP` | [FILIP: Fine-grained Interactive Language-Image Pre-Training](https:\u002F\u002Farxiv.org\u002Fabs\u002F2111.07783)\n\\-\n\n- `2021\u002F02` | `CLIP` | [Learning Transferable Visual Models From Natural Language Supervision](https:\u002F\u002Farxiv.org\u002Fabs\u002F2103.00020)\n\\-\n[[Paper](https:\u002F\u002Fproceedings.mlr.press\u002Fv139\u002Fradford21a\u002Fradford21a.pdf)]\n[[Code](https:\u002F\u002Fgithub.com\u002Fopenai\u002FCLIP)]\n[[Blog](https:\u002F\u002Fopenai.com\u002Fresearch\u002Fclip)]\n\n
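As a rough illustration of what the CLIP variants above share (a minimal sketch, not the implementation of any listed paper: the symmetric image-text contrastive objective, assuming two encoders have already produced batch-aligned features; the function name and toy inputs are invented for this example):\n\n```python\n# Minimal sketch of a CLIP-style symmetric contrastive (InfoNCE) loss.\n# Assumes image_feats and text_feats are encoder outputs for the same batch,\n# where row i of each tensor corresponds to the same image-text pair.\nimport torch\nimport torch.nn.functional as F\n\ndef clip_style_loss(image_feats, text_feats, temperature=0.07):\n    # Normalize so the dot product becomes a cosine similarity\n    image_feats = F.normalize(image_feats, dim=-1)\n    text_feats = F.normalize(text_feats, dim=-1)\n    # Pairwise similarity matrix, scaled by the temperature\n    logits = image_feats @ text_feats.t() \u002F temperature\n    # Matched pairs sit on the diagonal\n    targets = torch.arange(logits.size(0))\n    # Cross-entropy in both directions: image-to-text and text-to-image\n    loss_i2t = F.cross_entropy(logits, targets)\n    loss_t2i = F.cross_entropy(logits.t(), targets)\n    return 0.5 * (loss_i2t + loss_t2i)\n\n# Toy usage with random features standing in for real encoder outputs\nloss = clip_style_loss(torch.randn(8, 512), torch.randn(8, 512))\n```\n\n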
##### d. Others\n\n- `2023\u002F09` | `StreamingLLM` | [Efficient Streaming Language Models with Attention Sinks](https:\u002F\u002Farxiv.org\u002Fabs\u002F2309.17453)\n\\-\n\n- `2023\u002F07` | `RetNet` | [Retentive Network: A Successor to Transformer for Large Language Models](https:\u002F\u002Farxiv.org\u002Fabs\u002F2307.08621)\n\\-\n\n- `2023\u002F07` | `LongNet` | [LongNet: Scaling Transformers to 1,000,000,000 Tokens](https:\u002F\u002Farxiv.org\u002Fabs\u002F2307.02486)\n\\-\n\n- `2023\u002F05` | `RWKV` | [RWKV: Reinventing RNNs for the Transformer Era](https:\u002F\u002Farxiv.org\u002Fabs\u002F2305.13048)\n\\-\n\n- `2023\u002F02` | `Hyena` | [Hyena Hierarchy: Towards Larger Convolutional Language Models](https:\u002F\u002Farxiv.org\u002Fabs\u002F2302.10866)\n\\-\n\n- `2022\u002F12` | `H3` | [Hungry Hungry Hippos: Towards Language Modeling with State Space Models](https:\u002F\u002Farxiv.org\u002Fabs\u002F2212.14052)\n\\-\n\n- `2022\u002F06` | `GSS` | [Long Range Language Modeling via Gated State Spaces](https:\u002F\u002Farxiv.org\u002Fabs\u002F2206.13947)\n\\-\n\n- `2022\u002F03` | `DSS` | [Diagonal State Spaces are as Effective as Structured State Spaces](https:\u002F\u002Farxiv.org\u002Fabs\u002F2203.14343)\n\\-\n\n- `2021\u002F10` | `S4` | [Efficiently Modeling Long Sequences with Structured State Spaces](https:\u002F\u002Farxiv.org\u002Fabs\u002F2111.00396)\n\\-\n\n---\n\n\u003C\u002Fdetails>\n\n\u003C!--  -->\n### 4.2 Fine-Tuning\n\n\u003Cdetails open>\n\u003Csummary>fine-tuning\u003C\u002Fsummary>\n\n[Reasoning Techniques (Back-to-Top)](#4-reasoning-techniques)\n\n- [4.2 Fine-Tuning](#42-fine-tuning)\n  - [4.2.1 Data](#421-data)\n  - [4.2.2 Parameter-Efficient Fine-tuning](#422-parameter-efficient-fine-tuning)\n    - [a. Adapter Tuning](#a-adapter-tuning)\n    - [b. Low-Rank Adaptation](#b-low-rank-adaptation)\n    - [c. Prompt Tuning](#c-prompt-tuning)\n    - [d. Partial Parameter Tuning](#d-partial-parameter-tuning)\n    - [e. Mixture-of-Modality Adaption](#e-mixture-of-modality-adaption)\n\n#### 4.2.1 Data\n\n- `2023\u002F09` | `MetaMath` | [MetaMath: Bootstrap Your Own Mathematical Questions for Large Language Models](https:\u002F\u002Farxiv.org\u002Fabs\u002F2309.12284)\n\\-\n\n- `2023\u002F09` | `MAmmoTH` | [MAmmoTH: Building Math Generalist Models through Hybrid Instruction Tuning](https:\u002F\u002Farxiv.org\u002Fabs\u002F2309.05653)\n\\-\n\n- `2023\u002F08` | `WizardMath` | [WizardMath: Empowering Mathematical Reasoning for Large Language Models via Reinforced Evol-Instruct](https:\u002F\u002Farxiv.org\u002Fabs\u002F2308.09583)\n\\-\n\n- `2023\u002F08` | `RFT` | [Scaling Relationship on Learning Mathematical Reasoning with Large Language Models](https:\u002F\u002Farxiv.org\u002Fabs\u002F2308.01825)\n\\-\n\n- `2023\u002F05` | `PRM800K` | [Let's Verify Step by Step](https:\u002F\u002Farxiv.org\u002Fabs\u002F2305.20050)\n\\-\n\n- `2023\u002F05` | `Distilling Step-by-Step` | [Distilling Step-by-Step! 
Outperforming Larger Language Models with Less Training Data and Smaller Model Sizes](https:\u002F\u002Farxiv.org\u002Fabs\u002F2305.02301)\n\\-\n\n- `2023\u002F01` | [Specializing Smaller Language Models towards Multi-Step Reasoning](https:\u002F\u002Farxiv.org\u002Fabs\u002F2301.12726)\n\\-\n\n- `2022\u002F12` | `Fine-tune-CoT` | [Large Language Models Are Reasoning Teachers](https:\u002F\u002Farxiv.org\u002Fabs\u002F2212.10071)\n\\-\n\n- `2022\u002F12` | [Teaching Small Language Models to Reason](https:\u002F\u002Farxiv.org\u002Fabs\u002F2212.08410)\n\\-\n\n- `2022\u002F10` | [Large Language Models Can Self-Improve](https:\u002F\u002Farxiv.org\u002Fabs\u002F2210.11610)\n\\-\n\n- `2022\u002F10` | [Explanations from Large Language Models Make Small Reasoners Better](https:\u002F\u002Farxiv.org\u002Fabs\u002F2210.06726)\n\\-\n\n#### 4.2.2 Parameter-Efficient Fine-tuning\n\n##### a. Adapter Tuning\n\n- `2023\u002F03` | `LLaMA-Adapter` | [LLaMA-Adapter: Efficient Fine-tuning of Language Models with Zero-init Attention](https:\u002F\u002Farxiv.org\u002Fabs\u002F2303.16199)\n\\-\n\n- `2022\u002F05` | `AdaMix` | [AdaMix: Mixture-of-Adaptations for Parameter-efficient Model Tuning](https:\u002F\u002Farxiv.org\u002Fabs\u002F2205.12410)\n\\-\n\n- `2021\u002F10` | [Towards a Unified View of Parameter-Efficient Transfer Learning](https:\u002F\u002Farxiv.org\u002Fabs\u002F2110.04366)\n\\-\n\n- `2021\u002F06` | `Compacter` | [Compacter: Efficient Low-Rank Hypercomplex Adapter Layers](https:\u002F\u002Farxiv.org\u002Fabs\u002F2106.04647)\n\\-\n\n- `2020\u002F04` | `MAD-X` | [MAD-X: An Adapter-Based Framework for Multi-Task Cross-Lingual Transfer](https:\u002F\u002Farxiv.org\u002Fabs\u002F2005.00052)\n\\-\n\n- `2019\u002F02` | `Adapter` | [Parameter-Efficient Transfer Learning for NLP](https:\u002F\u002Farxiv.org\u002Fabs\u002F1902.00751)\n\\-\n\n##### b. Low-Rank Adaptation\n\n- `2023\u002F09` | `LongLoRA` \u002F `LongAlpaca-12k`\n| `Chen et al., ICLR 2024`\n![citations](https:\u002F\u002Fimg.shields.io\u002Fbadge\u002Fdynamic\u002Fjson?url=https:\u002F\u002Fapi.semanticscholar.org\u002Fgraph\u002Fv1\u002Fpaper\u002FCorpusID:262084134?fields=citationCount&query=%24.citationCount&label=citations)\n![Star](https:\u002F\u002Fimg.shields.io\u002Fgithub\u002Fstars\u002Fdvlab-research\u002FLongLoRA.svg?style=social&label=Star) \u003Cbr>\nLongLoRA: Efficient Fine-tuning of Long-Context Large Language Models \u003Cbr>\n[[arXiv](https:\u002F\u002Farxiv.org\u002Fabs\u002F2309.12307)]\n[[paper](https:\u002F\u002Fopenreview.net\u002Fforum?id=6PmJoRfdaK)]\n[[code](https:\u002F\u002Fgithub.com\u002Fdvlab-research\u002FLongLoRA)]\n\n- `2023\u002F05` | `QLoRA` | [QLoRA: Efficient Finetuning of Quantized LLMs](https:\u002F\u002Farxiv.org\u002Fabs\u002F2305.14314)\n\\-\n\n- `2023\u002F03` | `AdaLoRA` | [Adaptive Budget Allocation for Parameter-Efficient Fine-Tuning](https:\u002F\u002Farxiv.org\u002Fabs\u002F2303.10512)\n\\-\n\n- `2022\u002F12` | `KronA` | [KronA: Parameter Efficient Tuning with Kronecker Adapter](https:\u002F\u002Farxiv.org\u002Fabs\u002F2212.10650)\n\\-\n\n- `2022\u002F10` | `DyLoRA` | [DyLoRA: Parameter Efficient Tuning of Pre-trained Models using Dynamic Search-Free Low-Rank Adaptation](https:\u002F\u002Farxiv.org\u002Fabs\u002F2210.07558)\n\\-\n\n- `2021\u002F06` | `LoRA` | [LoRA: Low-Rank Adaptation of Large Language Models](https:\u002F\u002Farxiv.org\u002Fabs\u002F2106.09685)\n\\-\n\n##### c. 
Prompt Tuning\n\n- `2021\u002F10` | `P-Tuning v2` | [P-Tuning v2: Prompt Tuning Can Be Comparable to Fine-tuning Universally Across Scales and Tasks](https:\u002F\u002Farxiv.org\u002Fabs\u002F2110.07602)\n\\-\n\n- `2021\u002F04` | `Prompt Tuning` | [The Power of Scale for Parameter-Efficient Prompt Tuning](https:\u002F\u002Farxiv.org\u002Fabs\u002F2104.08691)\n\\-\n\n- `2021\u002F04` | `OptiPrompt` | [Factual Probing Is [MASK]: Learning vs. Learning to Recall](https:\u002F\u002Farxiv.org\u002Fabs\u002F2104.05240)\n\\-\n\n- `2021\u002F03` | `P-Tuning` | [GPT Understands, Too](https:\u002F\u002Farxiv.org\u002Fabs\u002F2103.10385)\n\\-\n\n- `2021\u002F01` | `Prefix-Tuning` | [Prefix-Tuning: Optimizing Continuous Prompts for Generation](https:\u002F\u002Farxiv.org\u002Fabs\u002F2101.00190)\n\\-\n\n##### d. Partial Parameter Tuning\n\n- `2023\u002F04` | `DiffFit` | [DiffFit: Unlocking Transferability of Large Diffusion Models via Simple Parameter-Efficient Fine-Tuning](https:\u002F\u002Farxiv.org\u002Fabs\u002F2304.06648)\n\\-\n\n- `2022\u002F10` | `SSF` | [Scaling & Shifting Your Features: A New Baseline for Efficient Model Tuning](https:\u002F\u002Farxiv.org\u002Fabs\u002F2210.08823)\n\\-\n\n- `2021\u002F09` | `Child-Tuning` | [Raise a Child in Large Language Model: Towards Effective and Generalizable Fine-tuning](https:\u002F\u002Farxiv.org\u002Fabs\u002F2109.05687)\n\\-\n\n- `2021\u002F06` | `BitFit` | [BitFit: Simple Parameter-efficient Fine-tuning for Transformer-based Masked Language-models](https:\u002F\u002Farxiv.org\u002Fabs\u002F2106.10199)\n\\-\n\n##### e. Mixture-of-Modality Adaption\n\n- `2023\u002F10` | `LLaVA-1.5` | [Improved Baselines with Visual Instruction Tuning](https:\u002F\u002Farxiv.org\u002Fabs\u002F2310.03744)\n\\-\n\n- `2023\u002F05` | `MMA` \u002F `LaVIN` | [Cheap and Quick: Efficient Vision-Language Instruction Tuning for Large Language Models](https:\u002F\u002Farxiv.org\u002Fabs\u002F2305.15023)\n\\-\n\n- `2023\u002F04` | `LLaMA-Adapter V2` | [LLaMA-Adapter V2: Parameter-Efficient Visual Instruction Model](https:\u002F\u002Farxiv.org\u002Fabs\u002F2304.15010)\n\\-\n\n- `2023\u002F04` | `LLaVA` | [Visual Instruction Tuning](https:\u002F\u002Farxiv.org\u002Fabs\u002F2304.08485)\n\\-\n\n- `2023\u002F02` | `RepAdapter` | [Towards Efficient Visual Adaption via Structural Re-parameterization](https:\u002F\u002Farxiv.org\u002Fabs\u002F2302.08106)\n\\-\n\n---\n\n\u003C\u002Fdetails>\n\n\u003C!--  -->\n### 4.3 Alignment Training\n\n\u003Cdetails open>\n\u003Csummary>alignment training\u003C\u002Fsummary>\n\n[Reasoning Techniques (Back-to-Top)](#4-reasoning-techniques)\n\n- [4.3 Alignment Training](#43-alignment-training)\n  - [4.3.1 Data](#431-data)\n    - [a. Data - Human](#a-data---human)\n    - [b. Data - Synthesis](#b-data---synthesis)\n  - [4.3.2 Training Pipeline](#432-training-pipeline)\n    - [a. Online Human Preference Training](#a-online-human-preference-training)\n    - [b. Offline Human Preference Training](#b-offline-human-preference-training)\n\n#### 4.3.1 Data\n\n##### a. 
Data - Human\n\n- `2023\u002F06` | `Dolly` | Free Dolly: Introducing the World's First Truly Open Instruction-Tuned LLM\n\\-\n[[Code](https:\u002F\u002Fgithub.com\u002Fdatabrickslabs\u002Fdolly)]\n\n- `2023\u002F04` | `LongForm` | [LongForm: Optimizing Instruction Tuning for Long Text Generation with Corpus Extraction](https:\u002F\u002Farxiv.org\u002Fabs\u002F2304.08460)\n\\-\n\n- `2023\u002F04` | `COIG` | [Chinese Open Instruction Generalist: A Preliminary Release](https:\u002F\u002Farxiv.org\u002Fabs\u002F2304.07987)\n\\-\n\n- `2023\u002F04` | `OpenAssistant Conversations` | [OpenAssistant Conversations -- Democratizing Large Language Model Alignment](https:\u002F\u002Farxiv.org\u002Fabs\u002F2304.07327)\n\\-\n\n- `2023\u002F01` | `Flan 2022` | [The Flan Collection: Designing Data and Methods for Effective Instruction Tuning](https:\u002F\u002Farxiv.org\u002Fabs\u002F2301.13688)\n\\-\n\n- `2022\u002F11` | `xP3` | [Crosslingual Generalization through Multitask Finetuning](https:\u002F\u002Farxiv.org\u002Fabs\u002F2211.01786)\n\\-\n\n- `2022\u002F04` | `Super-NaturalInstructions` | [Super-NaturalInstructions: Generalization via Declarative Instructions on 1600+ NLP Tasks](https:\u002F\u002Farxiv.org\u002Fabs\u002F2204.07705)\n\\-\n\n- `2021\u002F11` | `ExT5` | [ExT5: Towards Extreme Multi-Task Scaling for Transfer Learning](https:\u002F\u002Farxiv.org\u002Fabs\u002F2111.10952)\n\\-\n\n- `2021\u002F10` | `MetaICL` | [MetaICL: Learning to Learn In Context](https:\u002F\u002Farxiv.org\u002Fabs\u002F2110.15943)\n\\-\n\n- `2021\u002F10` | `P3` | [Multitask Prompted Training Enables Zero-Shot Task Generalization](https:\u002F\u002Farxiv.org\u002Fabs\u002F2110.08207)\n\\-\n\n- `2021\u002F04` | `CrossFit` | [CrossFit: A Few-shot Learning Challenge for Cross-task Generalization in NLP](https:\u002F\u002Farxiv.org\u002Fabs\u002F2104.08835)\n\\-\n\n- `2021\u002F04` | `NATURAL INSTRUCTIONS` | [Cross-Task Generalization via Natural Language Crowdsourcing Instructions](https:\u002F\u002Farxiv.org\u002Fabs\u002F2104.08773)\n\\-\n\n- `2020\u002F05` | `UnifiedQA` | [UnifiedQA: Crossing Format Boundaries With a Single QA System](https:\u002F\u002Farxiv.org\u002Fabs\u002F2005.00700)\n\\-\n\n##### b. 
Data - Synthesis\n\n- `2023\u002F08` | `Instruction Backtranslation` | [Self-Alignment with Instruction Backtranslation](https:\u002F\u002Farxiv.org\u002Fabs\u002F2308.06259)\n\\-\n\n- `2023\u002F05` | `Dynosaur` | [Dynosaur: A Dynamic Growth Paradigm for Instruction-Tuning Data Curation](https:\u002F\u002Farxiv.org\u002Fabs\u002F2305.14327)\n\\-\n\n- `2023\u002F05` | `UltraChat` | [Enhancing Chat Language Models by Scaling High-quality Instructional Conversations](https:\u002F\u002Farxiv.org\u002Fabs\u002F2305.14233)\n\\-\n\n- `2023\u002F05` | `CoT Collection` | [The CoT Collection: Improving Zero-shot and Few-shot Learning of Language Models via Chain-of-Thought Fine-Tuning](https:\u002F\u002Farxiv.org\u002Fabs\u002F2305.14045)\n\\-\n\n- `2023\u002F05` | `CoEdIT` | [CoEdIT: Text Editing by Task-Specific Instruction Tuning](https:\u002F\u002Farxiv.org\u002Fabs\u002F2305.09857)\n\\-\n\n- `2023\u002F04` | `LaMini-LM` | [LaMini-LM: A Diverse Herd of Distilled Models from Large-Scale Instructions](https:\u002F\u002Farxiv.org\u002Fabs\u002F2304.14402)\n\\-\n\n- `2023\u002F04` | `GPT-4-LLM` | [Instruction Tuning with GPT-4](https:\u002F\u002Farxiv.org\u002Fabs\u002F2304.03277)\n\\-\n\n- `2023\u002F04` | `Koala` | Koala: A Dialogue Model for Academic Research\n\\-\n[[Blog](https:\u002F\u002Fbair.berkeley.edu\u002Fblog\u002F2023\u002F04\u002F03\u002Fkoala\u002F)]\n\n- `2023\u002F03` | `Alpaca` | Alpaca: A Strong, Replicable Instruction-Following Model\n\\-\n[[Blog](https:\u002F\u002Fcrfm.stanford.edu\u002F2023\u002F03\u002F13\u002Falpaca.html)]\n\n- `2023\u002F03` | `GPT4All` | GPT4All: Training an Assistant-style Chatbot with Large Scale Data Distillation from GPT-3.5-Turbo\n\\-\n[[Code](https:\u002F\u002Fgithub.com\u002Fnomic-ai\u002Fgpt4all)]\n\n- `2022\u002F12` | `OPT-IML` \u002F `OPT-IML Bench` | [OPT-IML: Scaling Language Model Instruction Meta Learning through the Lens of Generalization](https:\u002F\u002Farxiv.org\u002Fabs\u002F2212.12017)\n\\-\n\n- `2022\u002F12` | `Self-Instruct` | [Self-Instruct: Aligning Language Models with Self-Generated Instructions](https:\u002F\u002Farxiv.org\u002Fabs\u002F2212.10560)\n\\-\n\n- `2022\u002F12` | `Unnatural Instructions` | [Unnatural Instructions: Tuning Language Models with (Almost) No Human Labor](https:\u002F\u002Farxiv.org\u002Fabs\u002F2212.09689)\n\\-\n\n#### 4.3.2 Training Pipeline\n\n##### a. Online Human Preference Training\n\n- `2023\u002F06` | `APA` | [Fine-Tuning Language Models with Advantage-Induced Policy Alignment](https:\u002F\u002Farxiv.org\u002Fabs\u002F2306.02231)\n\\-\n\n- `2023\u002F04` | `RAFT` | [RAFT: Reward rAnked FineTuning for Generative Foundation Model Alignment](https:\u002F\u002Farxiv.org\u002Fabs\u002F2304.06767)\n\\-\n\n- `2022\u002F03` | `InstructGPT` \u002F `RLHF` | [Training language models to follow instructions with human feedback](https:\u002F\u002Farxiv.org\u002Fabs\u002F2203.02155)\n\\-\n\n##### b. 
Offline Human Preference Training\n\n- `2023\u002F06` | `PRO` | [Preference Ranking Optimization for Human Alignment](https:\u002F\u002Farxiv.org\u002Fabs\u002F2306.17492)\n\\-\n\n- `2023\u002F05` | `DPO` | [Direct Preference Optimization: Your Language Model is Secretly a Reward Model](https:\u002F\u002Farxiv.org\u002Fabs\u002F2305.18290)\n\\-\n\n- `2023\u002F04` | `RRHF` | [RRHF: Rank Responses to Align Language Models with Human Feedback without tears](https:\u002F\u002Farxiv.org\u002Fabs\u002F2304.05302)\n\\-\n\n- `2022\u002F09` | `SLiC` | [Calibrating Sequence likelihood Improves Conditional Language Generation](https:\u002F\u002Farxiv.org\u002Fabs\u002F2210.00045)\n\\-\n\n---\n\n\u003C\u002Fdetails>\n\n\u003C!--  -->\n### 4.4 Mixture of Experts (MoE)\n\n\u003Cdetails open>\n\u003Csummary>mixture of experts\u003C\u002Fsummary>\n\n[Reasoning Techniques (Back-to-Top)](#4-reasoning-techniques)\n\n- `2024\u002F01` | `MoE-LLaVA`\n| `Lin et al.`\n![citations](https:\u002F\u002Fimg.shields.io\u002Fbadge\u002Fdynamic\u002Fjson?url=https:\u002F\u002Fapi.semanticscholar.org\u002Fgraph\u002Fv1\u002Fpaper\u002FCorpusID:267311517?fields=citationCount&query=%24.citationCount&label=citations)\n![Star](https:\u002F\u002Fimg.shields.io\u002Fgithub\u002Fstars\u002FPKU-YuanGroup\u002FMoE-LLaVA.svg?style=social&label=Star) \u003Cbr>\nMoE-LLaVA: Mixture of Experts for Large Vision-Language Models \u003Cbr>\n[[arXiv](https:\u002F\u002Farxiv.org\u002Fabs\u002F2401.15947)]\n[[paper](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2401.15947.pdf)]\n[[code](https:\u002F\u002Fgithub.com\u002FPKU-YuanGroup\u002FMoE-LLaVA)]\n\n- `2023\u002F06` | [An Efficient General-Purpose Modular Vision Model via Multi-Task Heterogeneous Training](https:\u002F\u002Farxiv.org\u002Fabs\u002F2306.17165)\n\\-\n\n- `2023\u002F03` | `MixedAE` | [Mixed Autoencoder for Self-supervised Visual Representation Learning](https:\u002F\u002Farxiv.org\u002Fabs\u002F2303.17152)\n\\-\n\n- `2022\u002F12` | `Mod-Squad` | [Mod-Squad: Designing Mixture of Experts As Modular Multi-Task Learners](https:\u002F\u002Farxiv.org\u002Fabs\u002F2212.08066)\n\\-\n\n- `2022\u002F04` | `MoEBERT` | [MoEBERT: from BERT to Mixture-of-Experts via Importance-Guided Adaptation](https:\u002F\u002Farxiv.org\u002Fabs\u002F2204.07675)\n\\-\n\n- `2021\u002F12` | `GLaM` | [GLaM: Efficient Scaling of Language Models with Mixture-of-Experts](https:\u002F\u002Farxiv.org\u002Fabs\u002F2112.06905)\n\\-\n\n- `2021\u002F07` | `WideNet` | [Go Wider Instead of Deeper](https:\u002F\u002Farxiv.org\u002Fabs\u002F2107.11817)\n\\-\n\n- `2021\u002F01` | `Switch Transformers` | [Switch Transformers: Scaling to Trillion Parameter Models with Simple and Efficient Sparsity](https:\u002F\u002Farxiv.org\u002Fabs\u002F2101.03961)\n\\-\n\n- `2020\u002F06` | `GShard` | [GShard: Scaling Giant Models with Conditional Computation and Automatic Sharding](https:\u002F\u002Farxiv.org\u002Fabs\u002F2006.16668)\n\\-\n\n- `2017\u002F01` | `Sparsely-Gated Mixture-of-Experts` | [Outrageously Large Neural Networks: The Sparsely-Gated Mixture-of-Experts Layer](https:\u002F\u002Farxiv.org\u002Fabs\u002F1701.06538)\n\\-\n\n- `1991\u002F03` | Adaptive Mixtures of Local Experts\n\\-\n[[Paper](https:\u002F\u002Fieeexplore.ieee.org\u002Fdocument\u002F6797059)]\n\n---\n\n\u003C\u002Fdetails>\n\n\u003C!--  -->\n### 4.5 In-Context Learning\n\n\u003Cdetails open>\n\u003Csummary>in-context learning\u003C\u002Fsummary>\n\n[Reasoning Techniques (Back-to-Top)](#4-reasoning-techniques)\n\n- [4.5 In-Context 
Learning](#45-in-context-learning)\n  - [4.5.1 Demonstration Example Selection](#451-demonstration-example-selection)\n    - [a. Prior-Knowledge Approach](#a-prior-knowledge-approach)\n    - [b. Retrieval Approach](#b-retrieval-approach)\n  - [4.5.2 Chain-of-Thought](#452-chain-of-thought)\n    - [a. Zero-Shot CoT](#a-zero-shot-cot)\n    - [b. Few-Shot CoT](#b-few-shot-cot)\n    - [c. Multiple Paths Aggregation](#c-multiple-paths-aggregation)\n  - [4.5.3 Multi-Round Prompting](#453-multi-round-prompting)\n    - [a. Learned Refiners](#a-learned-refiners)\n    - [b. Prompted Refiners](#b-prompted-refiners)\n\n\u003Cbr>\n\n- `2022\u002F10` | `FLAN-T5` | [Scaling Instruction-Finetuned Language Models](https:\u002F\u002Farxiv.org\u002Fabs\u002F2210.11416)\n\\-\n\n- `2021\u002F05` | `GPT-3` | [Language Models are Few-Shot Learners](https:\u002F\u002Farxiv.org\u002Fabs\u002F2005.14165)\n\\-\n[[Paper](https:\u002F\u002Fpapers.nips.cc\u002Fpaper\u002F2020\u002Ffile\u002F1457c0d6bfcb4967418bfb8ac142f64a-Paper.pdf)]\n[[Code](https:\u002F\u002Fgithub.com\u002Fopenai\u002Fgpt-3)]\n\n#### 4.5.1 Demonstration Example Selection\n\n##### a. Prior-Knowledge Approach\n\n- `2022\u002F12` | [Diverse Demonstrations Improve In-context Compositional Generalization](https:\u002F\u002Farxiv.org\u002Fabs\u002F2212.06800)\n\\-\n\n- `2022\u002F11` | [Complementary Explanations for Effective In-Context Learning](https:\u002F\u002Farxiv.org\u002Fabs\u002F2211.13892)\n\\-\n\n- `2022\u002F10` | `Auto-CoT` | [Automatic Chain of Thought Prompting in Large Language Models](https:\u002F\u002Farxiv.org\u002Fabs\u002F2210.03493)\n\\-\n\n- `2022\u002F10` | `Complex CoT` | [Complexity-Based Prompting for Multi-Step Reasoning](https:\u002F\u002Farxiv.org\u002Fabs\u002F2210.00720)\n\\-\n\n- `2022\u002F10` | `EmpGPT-3` | [Does GPT-3 Generate Empathetic Dialogues? A Novel In-Context Example Selection Method and Automatic Evaluation Metric for Empathetic Dialogue Generation](https:\u002F\u002Faclanthology.org\u002F2022.coling-1.56\u002F)\n\\-\n[[Code](https:\u002F\u002Fgithub.com\u002Fpassing2961\u002FEmpGPT-3)]\n\n- `2022\u002F09` | [Selective Annotation Makes Language Models Better Few-Shot Learners](https:\u002F\u002Farxiv.org\u002Fabs\u002F2209.01975)\n\\-\n\n- `2021\u002F01` | [What Makes Good In-Context Examples for GPT-3?](https:\u002F\u002Farxiv.org\u002Fabs\u002F2101.06804)\n\\-\n\n##### b. Retrieval Approach\n\n- `2023\u002F10` | `DQ-LoRe` | [DQ-LoRe: Dual Queries with Low Rank Approximation Re-ranking for In-Context Learning](https:\u002F\u002Farxiv.org\u002Fabs\u002F2310.02954)\n\\-\n\n- `2023\u002F07` | `LLM-R` | [Learning to Retrieve In-Context Examples for Large Language Models](https:\u002F\u002Farxiv.org\u002Fabs\u002F2307.07164)\n\\-\n\n- `2023\u002F05` | `Dr.ICL` | [Dr.ICL: Demonstration-Retrieved In-context Learning](https:\u002F\u002Farxiv.org\u002Fabs\u002F2305.14128)\n\\-\n\n- `2023\u002F02` | `LENS` | [Finding Support Examples for In-Context Learning](https:\u002F\u002Farxiv.org\u002Fabs\u002F2302.13539)\n\\-\n\n- `2023\u002F02` | `CEIL` | [Compositional Exemplars for In-context Learning](https:\u002F\u002Farxiv.org\u002Fabs\u002F2302.05698)\n\\-\n\n- `2021\u002F12` | [Learning To Retrieve Prompts for In-Context Learning](https:\u002F\u002Farxiv.org\u002Fabs\u002F2112.08633)\n\\-\n\n
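As a rough, self-contained illustration of the retrieval approach surveyed above (a minimal sketch under simplified assumptions, not the method of any particular paper: a bag-of-words cosine similarity stands in for a learned dense retriever, and all names and data are invented):\n\n```python\n# Minimal sketch of retrieval-based demonstration selection for in-context\n# learning: rank a pool of (question, answer) pairs by similarity to the\n# test query and place the top-k into the few-shot prompt.\nfrom collections import Counter\nimport math\n\ndef cosine(a, b):\n    # Cosine similarity between two bag-of-words Counters\n    dot = sum(a[t] * b[t] for t in a)\n    na = math.sqrt(sum(v * v for v in a.values()))\n    nb = math.sqrt(sum(v * v for v in b.values()))\n    return dot \u002F (na * nb) if na and nb else 0.0\n\ndef select_demonstrations(query, pool, k=2):\n    # Keep the k pool examples whose questions look most like the query\n    qv = Counter(query.lower().split())\n    ranked = sorted(pool, key=lambda qa: cosine(qv, Counter(qa[0].lower().split())), reverse=True)\n    return ranked[:k]\n\n# Toy usage: retrieved demonstrations are prepended to the test question\npool = [('What is 2 + 2?', '4'),\n        ('Name the capital of France.', 'Paris'),\n        ('What is 3 + 5?', '8')]\ndemos = select_demonstrations('What is 7 + 2?', pool)\nlines = []\nfor q, a in demos:\n    lines.extend(['Q: ' + q, 'A: ' + a, ''])\nlines.extend(['Q: What is 7 + 2?', 'A:'])\nprompt = chr(10).join(lines)  # chr(10) is a newline character\n```\n\n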
#### 4.5.2 Chain-of-Thought\n\n##### a. Zero-Shot CoT\n\n- `2023\u002F09` | `LoT` | [Enhancing Zero-Shot Chain-of-Thought Reasoning in Large Language Models through Logic](https:\u002F\u002Farxiv.org\u002Fabs\u002F2309.13339)\n\\-\n[[Paper](https:\u002F\u002Farxiv.org\u002Fabs\u002F2309.13339)]\n[[Code](https:\u002F\u002Fgithub.com\u002Fxf-zhao\u002FLoT)]\n\n- `2023\u002F05` | `Plan-and-Solve` | [Plan-and-Solve Prompting: Improving Zero-Shot Chain-of-Thought Reasoning by Large Language Models](https:\u002F\u002Farxiv.org\u002Fabs\u002F2305.04091)\n\\-\n\n- `2022\u002F05` | `Zero-shot-CoT` | [Large Language Models are Zero-Shot Reasoners](https:\u002F\u002Farxiv.org\u002Fabs\u002F2205.11916)\n\\-\n[[Paper](https:\u002F\u002Fopenreview.net\u002Fpdf?id=e2TBb5y0yFf)]\n[[Code](https:\u002F\u002Fgithub.com\u002Fkojima-takeshi188\u002Fzero_shot_cot)]\n\n##### b. Few-Shot CoT\n\n- `2023\u002F07` | `SoT` | [Skeleton-of-Thought: Large Language Models Can Do Parallel Decoding](https:\u002F\u002Farxiv.org\u002Fabs\u002F2307.15337)\n\\-\n\n- `2023\u002F05` | `Code Prompting` | [Code Prompting: a Neural Symbolic Method for Complex Reasoning in Large Language Models](https:\u002F\u002Farxiv.org\u002Fabs\u002F2305.18507)\n\\-\n\n- `2023\u002F05` | `GoT` | [Beyond Chain-of-Thought, Effective Graph-of-Thought Reasoning in Large Language Models](https:\u002F\u002Farxiv.org\u002Fabs\u002F2305.16582)\n\\-\n\n- `2023\u002F05` | `ToT` | [Tree of Thoughts: Deliberate Problem Solving with Large Language Models](https:\u002F\u002Farxiv.org\u002Fabs\u002F2305.10601)\n\\-\n\n- `2023\u002F03` | `MathPrompter` | [MathPrompter: Mathematical Reasoning using Large Language Models](https:\u002F\u002Farxiv.org\u002Fabs\u002F2303.05398)\n\\-\n\n- `2022\u002F11` | `PoT` | [Program of Thoughts Prompting: Disentangling Computation from Reasoning for Numerical Reasoning Tasks](https:\u002F\u002Farxiv.org\u002Fabs\u002F2211.12588)\n\\-\n\n- `2022\u002F11` | `PAL` | [PAL: Program-aided Language Models](https:\u002F\u002Farxiv.org\u002Fabs\u002F2211.10435)\n\\-\n\n- `2022\u002F10` | `Auto-CoT` | [Automatic Chain of Thought Prompting in Large Language Models](https:\u002F\u002Farxiv.org\u002Fabs\u002F2210.03493)\n\\-\n\n- `2022\u002F10` | `Complex CoT` | [Complexity-Based Prompting for Multi-Step Reasoning](https:\u002F\u002Farxiv.org\u002Fabs\u002F2210.00720)\n\\-\n\n- `2022\u002F05` | `Least-to-Most Prompting` | [Least-to-Most Prompting Enables Complex Reasoning in Large Language Models](https:\u002F\u002Farxiv.org\u002Fabs\u002F2205.10625)\n\\-\n\n- `2022\u002F01` | [Chain-of-Thought Prompting Elicits Reasoning in Large Language Models](https:\u002F\u002Farxiv.org\u002Fabs\u002F2201.11903)\n\\-\n\n##### c. 
Multiple Paths Aggregation\n\n- `2023\u002F05` | `RAP` | [Reasoning with Language Model is Planning with World Model](https:\u002F\u002Farxiv.org\u002Fabs\u002F2305.14992)\n\\-\n\n- `2023\u002F05` | [Automatic Model Selection with Large Language Models for Reasoning](https:\u002F\u002Farxiv.org\u002Fabs\u002F2305.14333)\n\\-\n\n- `2023\u002F05` | `AdaptiveConsistency` | [Let's Sample Step by Step: Adaptive-Consistency for Efficient Reasoning and Coding with LLMs](https:\u002F\u002Farxiv.org\u002Fabs\u002F2305.11860)\n\\-\n\n- `2023\u002F05` | `ToT` | [Tree of Thoughts: Deliberate Problem Solving with Large Language Models](https:\u002F\u002Farxiv.org\u002Fabs\u002F2305.10601)\n\\-\n\n- `2023\u002F05` | `ToT` | [Large Language Model Guided Tree-of-Thought](https:\u002F\u002Farxiv.org\u002Fabs\u002F2305.08291)\n\\-\n\n- `2023\u002F05` | [Self-Evaluation Guided Beam Search for Reasoning](https:\u002F\u002Farxiv.org\u002Fabs\u002F2305.00633)\n\\-\n\n- `2022\u002F10` | `Complex CoT` | [Complexity-Based Prompting for Multi-Step Reasoning](https:\u002F\u002Farxiv.org\u002Fabs\u002F2210.00720)\n\\-\n\n- `2022\u002F06` | `DIVERSE` | [Making Large Language Models Better Reasoners with Step-Aware Verifier](https:\u002F\u002Farxiv.org\u002Fabs\u002F2206.02336)\n\\-\n\n- `2022\u002F03` | [Self-Consistency Improves Chain of Thought Reasoning in Language Models](https:\u002F\u002Farxiv.org\u002Fabs\u002F2203.11171)\n\\-\n\n#### 4.5.3 Multi-Round Prompting\n\n##### a. Learned Refiners\n\n- `2023\u002F02` | `LLM-Augmenter` | [Check Your Facts and Try Again: Improving Large Language Models with External Knowledge and Automated Feedback](https:\u002F\u002Farxiv.org\u002Fabs\u002F2302.12813)\n\\-\n\n- `2022\u002F10` | `Self-Correction` | [Generating Sequences by Learning to Self-Correct](https:\u002F\u002Farxiv.org\u002Fabs\u002F2211.00053)\n\\-\n\n- `2022\u002F08` | `PEER` | [PEER: A Collaborative Language Model](https:\u002F\u002Farxiv.org\u002Fabs\u002F2208.11663)\n\\-\n\n- `2022\u002F04` | `R3` | [Read, Revise, Repeat: A System Demonstration for Human-in-the-loop Iterative Text Revision](https:\u002F\u002Farxiv.org\u002Fabs\u002F2204.03685)\n\\-\n\n- `2021\u002F10` | `CURIOUS` | [Think about it! Improving defeasible reasoning by first modeling the question scenario](https:\u002F\u002Farxiv.org\u002Fabs\u002F2110.12349)\n\\-\n\n- `2020\u002F05` | `DrRepair` | [Graph-based, Self-Supervised Program Repair from Diagnostic Feedback](https:\u002F\u002Farxiv.org\u002Fabs\u002F2005.10636)\n\\-\n\n##### b. 
Prompted Refiners\n\n- `2023\u002F06` | `InterCode` | [InterCode: Standardizing and Benchmarking Interactive Coding with Execution Feedback](https:\u002F\u002Farxiv.org\u002Fabs\u002F2306.14898)\n\\-\n\n- `2023\u002F06` | [Is Self-Repair a Silver Bullet for Code Generation?](https:\u002F\u002Farxiv.org\u002Fabs\u002F2306.09896)\n\\-\n\n- `2023\u002F05` | [Improving Factuality and Reasoning in Language Models through Multiagent Debate](https:\u002F\u002Farxiv.org\u002Fabs\u002F2305.14325)\n\\-\n\n- `2023\u002F05` | `CRITIC` | [CRITIC: Large Language Models Can Self-Correct with Tool-Interactive Critiquing](https:\u002F\u002Farxiv.org\u002Fabs\u002F2305.11738)\n\\-\n\n- `2023\u002F05` | `GPT-Bargaining` | [Improving Language Model Negotiation with Self-Play and In-Context Learning from AI Feedback](https:\u002F\u002Farxiv.org\u002Fabs\u002F2305.10142)\n\\-\n\n- `2023\u002F05` | `Self-Edit` | [Self-Edit: Fault-Aware Code Editor for Code Generation](https:\u002F\u002Farxiv.org\u002Fabs\u002F2305.04087)\n\\-\n\n- `2023\u002F04` | `PHP` | [Progressive-Hint Prompting Improves Reasoning in Large Language Models](https:\u002F\u002Farxiv.org\u002Fabs\u002F2304.09797)\n\\-\n\n- `2023\u002F04` | `Self-collaboration` | [Self-collaboration Code Generation via ChatGPT](https:\u002F\u002Farxiv.org\u002Fabs\u002F2304.07590)\n\\-\n\n- `2023\u002F04` | `Self-Debugging` | [Teaching Large Language Models to Self-Debug](https:\u002F\u002Farxiv.org\u002Fabs\u002F2304.05128)\n\\-\n\n- `2023\u002F04` | `REFINER` | [REFINER: Reasoning Feedback on Intermediate Representation](https:\u002F\u002Farxiv.org\u002Fabs\u002F2304.01904)\n\\-\n\n- `2023\u002F03` | `Self-Refine` | [Self-Refine: Iterative Refinement with Self-Feedback](https:\u002F\u002Farxiv.org\u002Fabs\u002F2303.17651)\n\\-\n\n---\n\n\u003C\u002Fdetails>\n\n\u003C!--  -->\n### 4.6 Autonomous Agent\n\n\u003Cdetails open>\n\u003Csummary>autonomous agent\u003C\u002Fsummary>\n\n[Reasoning Techniques (Back-to-Top)](#4-reasoning-techniques)\n\n- `2023\u002F10` | `planning tokens` | [Guiding Language Model Reasoning with Planning Tokens](https:\u002F\u002Faps.arxiv.org\u002Fabs\u002F2310.05707)\n\\-\n\n- `2023\u002F09` | `AutoAgents` | [AutoAgents: A Framework for Automatic Agent Generation](https:\u002F\u002Farxiv.org\u002Fabs\u002F2309.17288)\n\\-\n\n- `2023\u002F06` | `AssistGPT` | [AssistGPT: A General Multi-modal Assistant that can Plan, Execute, Inspect, and Learn](https:\u002F\u002Farxiv.org\u002Fabs\u002F2306.08640)\n\\-\n\n- `2023\u002F05` | `SwiftSage` | [SwiftSage: A Generative Agent with Fast and Slow Thinking for Complex Interactive Tasks](https:\u002F\u002Farxiv.org\u002Fabs\u002F2305.17390)\n\\-\n\n- `2023\u002F05` | `MultiTool-CoT` | [MultiTool-CoT: GPT-3 Can Use Multiple External Tools with Chain of Thought Prompting](https:\u002F\u002Farxiv.org\u002Fabs\u002F2305.16896)\n\\-\n\n- `2023\u002F05` | `Voyager` | [Voyager: An Open-Ended Embodied Agent with Large Language Models](https:\u002F\u002Farxiv.org\u002Fabs\u002F2305.16291)\n\\-\n\n- `2023\u002F05` | `ChatCoT` | [ChatCoT: Tool-Augmented Chain-of-Thought Reasoning on Chat-based Large Language Models](https:\u002F\u002Farxiv.org\u002Fabs\u002F2305.14323)\n\\-\n\n- `2023\u002F05` | `CREATOR` | [CREATOR: Tool Creation for Disentangling Abstract and Concrete Reasoning of Large Language Models](https:\u002F\u002Farxiv.org\u002Fabs\u002F2305.14318)\n\\-\n\n- `2023\u002F05` | `TRICE` | [Making Language Models Better Tool Learners with Execution 
Feedback](https:\u002F\u002Farxiv.org\u002Fabs\u002F2305.13068)\n\\-\n\n- `2023\u002F05` | `ToolkenGPT` | [ToolkenGPT: Augmenting Frozen Language Models with Massive Tools via Tool Embeddings](https:\u002F\u002Farxiv.org\u002Fabs\u002F2305.11554)\n\\-\n\n- `2023\u002F04` | `Chameleon` | [Chameleon: Plug-and-Play Compositional Reasoning with Large Language Models](https:\u002F\u002Farxiv.org\u002Fabs\u002F2304.09842)\n\\-\n\n- `2023\u002F04` | `OpenAGI` | [OpenAGI: When LLM Meets Domain Experts](https:\u002F\u002Farxiv.org\u002Fabs\u002F2304.04370)\n\\-\n\n- `2023\u002F03` | `CAMEL` | [CAMEL: Communicative Agents for \"Mind\" Exploration of Large Language Model Society](https:\u002F\u002Farxiv.org\u002Fabs\u002F2303.17760)\n\\-\n\n- `2023\u002F03` | `HuggingGPT` | [HuggingGPT: Solving AI Tasks with ChatGPT and its Friends in Hugging Face](https:\u002F\u002Farxiv.org\u002Fabs\u002F2303.17580)\n\\-\n\n- `2023\u002F03` | `Reflexion` | [Reflexion: Language Agents with Verbal Reinforcement Learning](https:\u002F\u002Farxiv.org\u002Fabs\u002F2303.11366)\n\\-\n\n- `2023\u002F03` | `ART` | [ART: Automatic multi-step reasoning and tool-use for large language models](https:\u002F\u002Farxiv.org\u002Fabs\u002F2303.09014)\n\\-\n\n- `2023\u002F03` | `Auto-GPT` | Auto-GPT: An Autonomous GPT-4 Experiment\n\\-\n[[Code](https:\u002F\u002Fgithub.com\u002Fantony0596\u002Fauto-gpt)]\n\n- `2023\u002F02` | `Toolformer` | [Toolformer: Language Models Can Teach Themselves to Use Tools](https:\u002F\u002Farxiv.org\u002Fabs\u002F2302.04761)\n\\-\n\n- `2022\u002F11` | `VISPROG` | [Visual Programming: Compositional visual reasoning without training](https:\u002F\u002Farxiv.org\u002Fabs\u002F2211.11559)\n\\-\n\n- `2022\u002F10` | `ReAct` | [ReAct: Synergizing Reasoning and Acting in Language Models](https:\u002F\u002Farxiv.org\u002Fabs\u002F2210.03629)\n\\-\n\n---\n\n\u003C\u002Fdetails>\n\n\u003C\u002Fdetails>\n","# 令人惊叹的推理基础模型\n[![Awesome](https:\u002F\u002Fawesome.re\u002Fbadge.svg)](https:\u002F\u002Fawesome.re)\n[![DOI](https:\u002F\u002Fzenodo.org\u002Fbadge\u002FDOI\u002F10.5281\u002Fzenodo.10298864.svg)](https:\u002F\u002Fdoi.org\u002F10.5281\u002Fzenodo.10298864)\n[![arXiv](https:\u002F\u002Fimg.shields.io\u002Fbadge\u002FarXiv-arxiv.org\u002Fabs\u002F2312.11562-\u003CCOLOR>.svg)](https:\u002F\u002Farxiv.org\u002Fabs\u002F2312.11562)\n\n\n![overview](https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Freasoning-survey_Awesome-Reasoning-Foundation-Models_readme_5a69a0541029.jpg) \n\n[`survey.pdf`](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2312.11562.pdf) |\n一个精心整理的、关于用于**推理**的**大型AI模型**或**基础模型**的精彩列表。\n\n我们把当前的[基础模型](#2-foundation-models)分为三类：*语言基础模型*、*视觉基础模型*和*多模态基础模型*。\n此外，我们还详细介绍了这些基础模型在[推理任务](#3-reasoning-tasks)中的应用，包括*常识推理*、*数学推理*、*逻辑推理*、*因果推理*、*视觉推理*、*音频推理*、*多模态推理*、*智能体推理*等。\n[推理技术](#4-reasoning-techniques)，如*预训练*、*微调*、*对齐训练*、*专家混合模型*、*上下文学习*和*自主智能体*，也被总结在此。\n\n我们欢迎为本仓库贡献更多资源。如果您想贡献力量，请提交拉取请求！详情请参阅[CONTRIBUTING](CONTRIBUTING.md)。\n\n\u003C!-- ## 新闻 -->\n\n\n## 目录\n\n\u003Cdetails open>\n\u003Csummary>目录\u003C\u002Fsummary>\n\n- [0 概述](#0-survey)\n- [1 相关综述](#1-relevant-surveys-and-links)\n- [2 基础模型](#2-foundation-models)\n  - [2.1 语言基础模型](#21-language-foundation-models)\n  - [2.2 视觉基础模型](#22-vision-foundation-models)\n  - [2.3 多模态基础模型](#23-multimodal-foundation-models)\n  - [2.4 推理应用](#24-reasoning-applications)\n- [3 推理任务](#3-reasoning-tasks)\n  - [3.1 常识推理](#31-commonsense-reasoning)\n  - [3.2 数学推理](#32-mathematical-reasoning)\n  - [3.3 逻辑推理](#33-logical-reasoning)\n  - [3.4 
因果推理](#34-causal-reasoning)\n  - [3.5 视觉推理](#35-visual-reasoning)\n  - [3.6 音频推理](#36-audio-reasoning)\n  - [3.7 多模态推理](#37-multimodal-reasoning)\n  - [3.8 智能体推理](#38-agent-reasoning)\n  - [3.9 其他任务和应用](#39-other-tasks-and-applications)\n- [4 推理技术](#4-reasoning-techniques)\n  - [4.1 预训练](#41-pre-training)\n  - [4.2 微调](#42-fine-tuning)\n  - [4.3 对齐训练](#43-alignment-training)\n  - [4.4 专家混合模型 (MoE)](#44-mixture-of-experts-moe)\n  - [4.5 上下文学习](#45-in-context-learning)\n  - [4.6 自主智能体](#46-autonomous-agent)\n\n\u003C\u002Fdetails>\n\n\n## 0 概述\n\n![overview](https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Freasoning-survey_Awesome-Reasoning-Foundation-Models_readme_fa97ddc33c00.jpg)\n\n本仓库主要基于以下论文：\n\n>[**基础模型推理综述：概念、方法与展望**](https:\u002F\u002Fdl.acm.org\u002Fdoi\u002F10.1145\u002F3729218) \u003Cbr>\n>\n> [[论文](https:\u002F\u002Fdl.acm.org\u002Fdoi\u002F10.1145\u002F3729218)][[ArXiv](https:\u002F\u002Farxiv.org\u002Fabs\u002F2312.11562)]\n>\n> 孙建凯()、郑传扬()、谢恩泽()、刘正颖()、楚睿航()、邱佳宁()、徐嘉琪()、丁明宇()、李洪洋()、耿孟哲()、吴岳()、王文海()、陈俊松()、尹章悦()、任晓哲()、傅杰()、何俊贤()、吴源()、刘奇()、刘希辉()、李宇()、董浩()、程宇()、张明()、彭安恒()、戴继峰()、罗平()、王京东()、温继荣()、邱锡鹏()、郭义克()、熊辉()、刘群()和李振国()\n\n如果您觉得本仓库有所帮助，请考虑引用：\n\n```bibtex\n@article{sun2025survey,\n  author = {孙建凯、郑传扬、谢恩泽、刘正颖、楚睿航、邱佳宁、徐嘉琪、丁明宇、李洪洋、耿孟哲、吴岳、王文海、陈俊松、尹章悦、任晓哲、傅杰、何俊贤、吴源、刘奇、刘希辉、李宇、董浩、程宇、张明、彭安恒、戴继峰、罗平、王京东、温继荣、邱锡鹏、郭义克、熊辉、刘群、李振国},\n  title = {基础模型推理综述：概念、方法与展望},\n  year = {2025},\n  publisher = {美国计算机协会},\n  address = {纽约, 美国},\n  issn = {0360-0300},\n  url = {https:\u002F\u002Fdoi.org\u002F10.1145\u002F3729218},\n  doi = {10.1145\u002F3729218},\n  abstract = {推理是解决复杂问题的关键能力，在谈判、医学诊断和刑事侦查等多种现实场景中发挥着核心作用。它也是通用人工智能（AGI）领域的基本方法论。随着基础模型的不断发展，人们对其在推理任务中的能力越来越感兴趣。本文介绍了可用于或可适配于推理的代表性基础模型，并重点展示了各类推理任务、方法和基准测试的最新进展。随后，我们探讨了基础模型中推理能力出现的潜在未来方向。同时，我们也讨论了多模态学习、自主智能体和超级对齐在推理背景下的相关性。通过探讨这些未来的研究方向，我们希望激励研究人员进一步探索这一领域，推动基础模型（例如大型语言模型LLM）在推理方面的更多进展，并为AGI的发展做出贡献。},\n  journal = {ACM 计算机科学评论},\n  month = apr,\n  keywords = {推理、基础模型、多模态、AI智能体、通用人工智能、LLM}\n}\n```\n\n## 1 相关综述与链接\n\n\u003Cdetails open>\n\u003Csummary>相关综述\u003C\u002Fsummary>\n\n[(返回顶部)](#table-of-contents)\n\n- 在大语言模型时代对抗虚假信息：机遇与挑战\n\\-\n[[arXiv](https:\u002F\u002Farxiv.org\u002Fabs\u002F2311.05656)]\n[[链接](https:\u002F\u002Fllm-misinformation.github.io\u002F)]\n\n- 基于大语言模型的智能体的兴起与潜力：综述\n\\-\n[[arXiv](https:\u002F\u002Farxiv.org\u002Fabs\u002F2309.07864)]\n[[链接](https:\u002F\u002Fgithub.com\u002FWooooDyy\u002FLLM-Agent-Paper-List)]\n\n- 多模态基础模型：从专家系统到通用助手\n\\-\n[[arXiv](https:\u002F\u002Farxiv.org\u002Fabs\u002F2309.10020)]\n[[教程](https:\u002F\u002Fvlp-tutorial.github.io\u002F2023\u002F)]\n\n- 多模态大语言模型综述\n\\-\n[[arXiv](https:\u002F\u002Farxiv.org\u002Fabs\u002F2306.13549)]\n[[链接](https:\u002F\u002Fgithub.com\u002FBradyFU\u002FAwesome-Multimodal-Large-Language-Models)]\n\n- 交互式自然语言处理\n\\-\n[[arXiv](https:\u002F\u002Farxiv.org\u002Fabs\u002F2305.13246)]\n[[链接](https:\u002F\u002Fgithub.com\u002FInteractiveNLP-Team\u002Fawesome-InteractiveNLP-papers)]\n\n- 大语言模型综述\n\\-\n[[arXiv](https:\u002F\u002Farxiv.org\u002Fabs\u002F2303.18223)]\n[[链接](https:\u002F\u002Fgithub.com\u002FRUCAIBox\u002FLLMSurvey)]\n\n- 自监督多模态学习：综述\n\\-\n[[arXiv](https:\u002F\u002Farxiv.org\u002Fabs\u002F2304.01008)]\n[[链接](https:\u002F\u002Fgithub.com\u002Fys-zong\u002Fawesome-self-supervised-multimodal-learning)]\n\n- 
大型人工智能模型在健康信息学中的应用、挑战与未来\n\\-\n[[arXiv](https:\u002F\u002Farxiv.org\u002Fabs\u002F2303.11568)]\n[[论文](https:\u002F\u002Fieeexplore.ieee.org\u002Fdocument\u002F10261199)]\n[[链接](https:\u002F\u002Fgithub.com\u002FJianing-Qiu\u002FAwesome-Healthcare-Foundation-Models)]\n\n- 向大语言模型推理迈进：综述\n\\-\n[[arXiv](https:\u002F\u002Farxiv.org\u002Fabs\u002F2212.10403)]\n[[论文](https:\u002F\u002Faclanthology.org\u002F2023.findings-acl.67.pdf)]\n[[链接](https:\u002F\u002Fgithub.com\u002Fjeffhj\u002FLM-reasoning)]\n\n- 使用语言模型提示进行推理：综述\n\\-\n[[arXiv](https:\u002F\u002Farxiv.org\u002Fabs\u002F2212.09597)]\n[[论文](https:\u002F\u002Faclanthology.org\u002F2023.acl-long.294.pdf)]\n[[链接](https:\u002F\u002Fgithub.com\u002Fzjunlp\u002FPrompt4ReasoningPapers)]\n\n- 优秀多模态推理资源\n\\-\n[[链接](https:\u002F\u002Fgithub.com\u002Fatfortes\u002FAwesome-Multimodal-Reasoning)]\n\n\u003C\u002Fdetails>\n\n\n## 2 基础模型\n\n\u003Cdetails open>\n\u003Csummary>基础模型\u003C\u002Fsummary>\n\n[(返回顶部)](#table-of-contents)\n\n![foundation_models](https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Freasoning-survey_Awesome-Reasoning-Foundation-Models_readme_76064674e424.jpg)\n\n### 目录 - 2\n\n\u003Cdetails open>\n\u003Csummary>基础模型（目录）\u003C\u002Fsummary>\n\n[(返回顶部)](#table-of-contents)\n\n- [2 基础模型](#2-foundation-models)\n  - [2.1 语言基础模型](#21-language-foundation-models)\n  - [2.2 视觉基础模型](#22-vision-foundation-models)\n  - [2.3 多模态基础模型](#23-multimodal-foundation-models)\n  - [2.4 推理应用](#24-reasoning-applications)\n\n\u003C\u002Fdetails>\n\n### 2.1 语言基础模型\n\n\u003Cdetails open>\n\u003Csummary>LFMs\u003C\u002Fsummary>\n\n[基础模型（返回顶部）](#2-foundation-models)\n\n- `2023\u002F10` | `Mistral` | [Mistral 7B](https:\u002F\u002Farxiv.org\u002Fabs\u002F2310.06825)\n\\-\n[[论文](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2310.06825.pdf)]\n[[代码](https:\u002F\u002Fgithub.com\u002Fmistralai\u002Fmistral-src?tab=readme-ov-file)]\n\n- `2023\u002F09` | `Qwen` | [Qwen技术报告](https:\u002F\u002Farxiv.org\u002Fabs\u002F2309.16609)\n\\-\n[[论文](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2309.16609.pdf)]\n[[代码](https:\u002F\u002Fgithub.com\u002FQwenLM\u002FQwen)]\n[[项目](https:\u002F\u002Ftongyi.aliyun.com\u002Fqianwen\u002F)]\n\n- `2023\u002F07` | `Llama 2` | [Llama 2：开放的基础及微调聊天模型](https:\u002F\u002Farxiv.org\u002Fabs\u002F2307.09288)\n\\-\n[[论文](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2307.09288.pdf)]\n[[代码](https:\u002F\u002Fgithub.com\u002Ffacebookresearch\u002Fllama)]\n[[博客](https:\u002F\u002Fai.meta.com\u002Fllama\u002F)]\n\n- `2023\u002F07` | `InternLM` | InternLM：一款逐步增强能力的多语言语言模型\n\\-\n[[论文](https:\u002F\u002Fgithub.com\u002FInternLM\u002FInternLM-techreport\u002Fblob\u002Fmain\u002FInternLM.pdf)]\n[[代码](https:\u002F\u002Fgithub.com\u002FInternLM\u002FInternLM)]\n[[项目](https:\u002F\u002Finternlm.intern-ai.org.cn)]\n\n- `2023\u002F05` | `PaLM 2` | [PaLM 2技术报告](https:\u002F\u002Farxiv.org\u002Fabs\u002F2305.10403)\n\\-\n\n- `2023\u002F03` | `PanGu-Σ` | [PanGu-Σ：迈向采用稀疏异构计算的万亿参数语言模型](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2303.10845.pdf)\n\\-\n[[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2303.10845)]\n\n- `2023\u002F03` | `Vicuna` | Vicuna：一款以90%* ChatGPT质量惊艳GPT-4的开源聊天机器人\n\\-\n[[博客](https:\u002F\u002Flmsys.org\u002Fblog\u002F2023-03-30-vicuna\u002F)]\n[[代码](https:\u002F\u002Fgithub.com\u002Flm-sys\u002FFastChat)]\n\n- `2023\u002F03` | `GPT-4` | [GPT-4技术报告](https:\u002F\u002Farxiv.org\u002Fabs\u002F2303.08774)\n\\-\n[[论文](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2303.08774.pdf)]\n[[博客](https:\u002F\u002Fopenai.com\u002Fresearch\u002Fgpt-4)]\n\n- `2023\u002F02` | 
`LLaMA` | [LLaMA：开放且高效的语言基础模型](https:\u002F\u002Farxiv.org\u002Fabs\u002F2302.13971)\n\\-\n[[论文](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2302.13971.pdf)]\n[[代码](https:\u002F\u002Fgithub.com\u002Ffacebookresearch\u002Fllama)]\n[[博客](https:\u002F\u002Fai.meta.com\u002Fblog\u002Flarge-language-model-llama-meta-ai\u002F)]\n\n- `2022\u002F11` | `ChatGPT` | ChatGPT：优化语言模型以用于对话\n\\-\n[[博客](https:\u002F\u002Fopenai.com\u002Fblog\u002Fchatgpt)]\n\n- `2022\u002F04` | `PaLM` | [PaLM：通过Pathways扩展语言建模能力](https:\u002F\u002Farxiv.org\u002Fabs\u002F2204.02311)\n\\-\n[[论文](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2204.02311.pdf)]\n[[博客](https:\u002F\u002Fblog.research.google\u002F2022\u002F04\u002Fpathways-language-model-palm-scaling-to.html)]\n\n- `2021\u002F09` | `FLAN` | [微调后的语言模型是零样本学习者](https:\u002F\u002Farxiv.org\u002Fabs\u002F2109.01652)\n\\-\n\n- `2021\u002F07` | `Codex` | [评估基于代码训练的大语言模型](https:\u002F\u002Farxiv.org\u002Fabs\u002F2107.03374)\n\\-\n\n- `2021\u002F04` | `PanGu-α` | [PanGu-α：大规模自回归预训练中文语言模型，采用自动并行计算](https:\u002F\u002Farxiv.org\u002Fabs\u002F2104.12369)\n\\-\n[[论文](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2104.12369.pdf)]\n[[代码](https:\u002F\u002Fgithub.com\u002Fhuawei-noah\u002FPretrained-Language-Model)]\n\n- `2020\u002F05` | `GPT-3` | [语言模型是少样本学习者](https:\u002F\u002Farxiv.org\u002Fabs\u002F2005.14165)\n\\-\n[[论文](https:\u002F\u002Fpapers.nips.cc\u002Fpaper\u002F2020\u002Ffile\u002F1457c0d6bfcb4967418bfb8ac142f64a-Paper.pdf)]\n[[代码](https:\u002F\u002Fgithub.com\u002Fopenai\u002Fgpt-3)]\n\n- `2019\u002F08` | `Sentence-BERT` | [Sentence-BERT：使用孪生BERT网络生成句子嵌入](https:\u002F\u002Farxiv.org\u002Fabs\u002F1908.10084)\n\\-\n[[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F1908.10084)]\n\n- `2019\u002F07` | `RoBERTa` | [RoBERTa：一种鲁棒优化的BERT预训练方法](https:\u002F\u002Farxiv.org\u002Fabs\u002F1907.11692)\n\\-\n\n- `2018\u002F10` | `BERT` | [BERT：用于语言理解的深度双向Transformer预训练](https:\u002F\u002Farxiv.org\u002Fabs\u002F1810.04805)\n\\-\n[[论文](https:\u002F\u002Faclanthology.org\u002FN19-1423.pdf)]\n[[代码](https:\u002F\u002Fgithub.com\u002Fgoogle-research\u002Fbert)]\n[[博客](https:\u002F\u002Fblog.research.google\u002F2018\u002F11\u002Fopen-sourcing-bert-state-of-art-pre.html)]\n\n---\n\n\u003C\u002Fdetails>\n\n\u003C!--  -->\n\n### 2.2 视觉基础模型\n\n\u003Cdetails open>\n\u003Csummary>VFMs\u003C\u002Fsummary>\n\n[基础模型（返回顶部）](#2-基础模型)\n\n- `2024\u002F01` | `Depth Anything`\n| `Yang et al.`\n![citations](https:\u002F\u002Fimg.shields.io\u002Fbadge\u002Fdynamic\u002Fjson?url=https:\u002F\u002Fapi.semanticscholar.org\u002Fgraph\u002Fv1\u002Fpaper\u002FCorpusID:267061016?fields=citationCount&query=%24.citationCount&label=citations)\n![Star](https:\u002F\u002Fimg.shields.io\u002Fgithub\u002Fstars\u002FLiheYoung\u002FDepth-Anything.svg?style=social&label=Star) \u003Cbr>\nDepth Anything：释放大规模无标注数据的力量 \u003Cbr>\n[[arXiv](https:\u002F\u002Farxiv.org\u002Fabs\u002F2401.10891)]\n[[paper](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2401.10891.pdf)]\n[[code](https:\u002F\u002Fgithub.com\u002FLiheYoung\u002FDepth-Anything)]\n[[project](https:\u002F\u002Fdepth-anything.github.io\u002F)]\n\n- `2023\u002F05` | `SAA+`\n| `Cao et al.`\n![citations](https:\u002F\u002Fimg.shields.io\u002Fbadge\u002Fdynamic\u002Fjson?url=https:\u002F\u002Fapi.semanticscholar.org\u002Fgraph\u002Fv1\u002Fpaper\u002FCorpusID:258762545?fields=citationCount&query=%24.citationCount&label=citations)\n![Star](https:\u002F\u002Fimg.shields.io\u002Fgithub\u002Fstars\u002Fcaoyunkang\u002FSegment-Any-Anomaly.svg?style=social&label=Star) \u003Cbr>\n无需训练的任意异常分割：基于混合提示正则化的方案 
\u003Cbr>\n[[arXiv](https:\u002F\u002Farxiv.org\u002Fabs\u002F2305.10724)]\n[[paper](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2305.10724.pdf)]\n[[code](https:\u002F\u002Fgithub.com\u002Fcaoyunkang\u002FSegment-Any-Anomaly)]\n\n- `2023\u002F05` | `Explain Any Concept` | [解释任意概念：Segment Anything 配合基于概念的解释方法](https:\u002F\u002Farxiv.org\u002Fabs\u002F2305.10289)\n\\-\n[[Paper](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2305.10289.pdf)]\n[[Code](https:\u002F\u002Fgithub.com\u002FJerry00917\u002Fsamshap)]\n\n- `2023\u002F05` | `SAM-Track` | [分割并跟踪任意对象](https:\u002F\u002Farxiv.org\u002Fabs\u002F2305.06558)\n\\-\n[[Paper](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2305.06558.pdf)]\n[[Code](https:\u002F\u002Fgithub.com\u002Fz-x-yang\u002FSegment-and-Track-Anything)]\n\n- `2023\u002F05` | `SAMRS` | [SAMRS：利用 Segment Anything 模型扩展遥感分割数据集](https:\u002F\u002Farxiv.org\u002Fabs\u002F2305.02034)\n\\-\n[[Paper](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2305.02034.pdf)]\n[[Code](https:\u002F\u002Fgithub.com\u002FViTAE-Transformer\u002FSAMRS)]\n\n- `2023\u002F04` | `Edit Everything` | [编辑一切：一种文本引导的图像生成系统](https:\u002F\u002Farxiv.org\u002Fabs\u002F2304.14006)\n\\-\n[[Paper](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2304.14006.pdf)]\n[[Code](https:\u002F\u002Fgithub.com\u002FDefengXie\u002FEdit_Everything)]\n\n- `2023\u002F04` | `Inpaint Anything` | [修复任意内容：Segment Anything 结合图像修复技术](https:\u002F\u002Farxiv.org\u002Fabs\u002F2304.06790)\n\\-\n[[Paper](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2304.06790.pdf)]\n[[Code](https:\u002F\u002Fgithub.com\u002Fgeekyutao\u002FInpaint-Anything)]\n\n- `2023\u002F04` | `SAM`\n| `Kirillov et al., ICCV 2023`\n![citations](https:\u002F\u002Fimg.shields.io\u002Fbadge\u002Fdynamic\u002Fjson?url=https:\u002F\u002Fapi.semanticscholar.org\u002Fgraph\u002Fv1\u002Fpaper\u002FCorpusID:257952310?fields=citationCount&query=%24.citationCount&label=citations)\n![Star](https:\u002F\u002Fimg.shields.io\u002Fgithub\u002Fstars\u002Ffacebookresearch\u002Fsegment-anything.svg?style=social&label=Star) \u003Cbr>\n分割任意对象 \u003Cbr>\n[[arXiv](https:\u002F\u002Farxiv.org\u002Fabs\u002F2304.02643)]\n[[paper](https:\u002F\u002Fopenaccess.thecvf.com\u002Fcontent\u002FICCV2023\u002Fhtml\u002FKirillov_Segment_Anything_ICCV_2023_paper.html)]\n[[code](https:\u002F\u002Fgithub.com\u002Ffacebookresearch\u002Fsegment-anything)]\n[[blog](https:\u002F\u002Fsegment-anything.com\u002F)]\n\n- `2023\u002F03` | `VideoMAE V2` | [VideoMAE V2：通过双重掩码扩展视频掩码自编码器](https:\u002F\u002Farxiv.org\u002Fabs\u002F2303.16727)\n\\-\n[[Paper](https:\u002F\u002Fopenaccess.thecvf.com\u002Fcontent\u002FCVPR2023\u002Fpapers\u002FWang_VideoMAE_V2_Scaling_Video_Masked_Autoencoders_With_Dual_Masking_CVPR_2023_paper.pdf)]\n[[Code](https:\u002F\u002Fgithub.com\u002FOpenGVLab\u002FVideoMAEv2)]\n\n- `2023\u002F03` | `Grounding DINO`\n| `Liu et al.`\n![citations](https:\u002F\u002Fimg.shields.io\u002Fbadge\u002Fdynamic\u002Fjson?url=https:\u002F\u002Fapi.semanticscholar.org\u002Fgraph\u002Fv1\u002Fpaper\u002FCorpusID:257427307?fields=citationCount&query=%24.citationCount&label=citations)\n![Star](https:\u002F\u002Fimg.shields.io\u002Fgithub\u002Fstars\u002FIDEA-Research\u002FGroundingDINO.svg?style=social&label=Star) \u003Cbr>\nGrounding DINO：将 DINO 与接地预训练结合用于开放集目标检测 \u003Cbr>\n[[arXiv](https:\u002F\u002Farxiv.org\u002Fabs\u002F2303.05499)]\n[[paper](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2303.05499.pdf)]\n[[code](https:\u002F\u002Fgithub.com\u002FIDEA-Research\u002FGroundingDINO)]\n\n- `2022\u002F03` | `VideoMAE` | 
[VideoMAE：掩码自编码器是自监督视频预训练中高效的数据学习者](https:\u002F\u002Farxiv.org\u002Fabs\u002F2203.12602)\n\\-\n[[Paper](https:\u002F\u002Fproceedings.neurips.cc\u002Fpaper_files\u002Fpaper\u002F2022\u002Fhash\u002F416f9cb3276121c42eebb86352a4354a-Abstract-Conference.html)]\n[[Code](https:\u002F\u002Fgithub.com\u002FMCG-NJU\u002FVideoMAE)]\n\n- `2021\u002F12` | `Stable Diffusion`\n| `Rombach et al., CVPR 2022`\n![citations](https:\u002F\u002Fimg.shields.io\u002Fbadge\u002Fdynamic\u002Fjson?url=https:\u002F\u002Fapi.semanticscholar.org\u002Fgraph\u002Fv1\u002Fpaper\u002FCorpusID:245335280?fields=citationCount&query=%24.citationCount&label=citations)\n![Star](https:\u002F\u002Fimg.shields.io\u002Fgithub\u002Fstars\u002FCompVis\u002Flatent-diffusion.svg?style=social&label=Star) \u003Cbr>\n使用潜在扩散模型进行高分辨率图像合成 \u003Cbr>\n[[arXiv](https:\u002F\u002Farxiv.org\u002Fabs\u002F2112.10752)]\n[[paper](https:\u002F\u002Fopenaccess.thecvf.com\u002Fcontent\u002FCVPR2022\u002Fhtml\u002FRombach_High-Resolution_Image_Synthesis_With_Latent_Diffusion_Models_CVPR_2022_paper.html)]\n[[code](https:\u002F\u002Fgithub.com\u002FCompVis\u002Flatent-diffusion)]\n[[stable diffusion](https:\u002F\u002Fgithub.com\u002FStability-AI\u002Fstablediffusion)\n![Star](https:\u002F\u002Fimg.shields.io\u002Fgithub\u002Fstars\u002FStability-AI\u002Fstablediffusion.svg?style=social&label=Star)]\n\n- `2021\u002F09` | `LaMa` | [具有傅里叶卷积的分辨率鲁棒性大范围遮罩修复](https:\u002F\u002Farxiv.org\u002Fabs\u002F2109.07161)\n\\-\n[[Paper](https:\u002F\u002Fopenaccess.thecvf.com\u002Fcontent\u002FWACV2022\u002Fpapers\u002FSuvorov_Resolution-Robust_Large_Mask_Inpainting_With_Fourier_Convolutions_WACV_2022_paper.pdf)]\n[[Code](https:\u002F\u002Fgithub.com\u002Fadvimman\u002Flama)]\n\n- `2021\u002F03` | `Swin`\n| `Liu et al., ICCV 2021`\n![citations](https:\u002F\u002Fimg.shields.io\u002Fbadge\u002Fdynamic\u002Fjson?url=https:\u002F\u002Fapi.semanticscholar.org\u002Fgraph\u002Fv1\u002Fpaper\u002FCorpusID:232352874?fields=citationCount&query=%24.citationCount&label=citations)\n![Star](https:\u002F\u002Fimg.shields.io\u002Fgithub\u002Fstars\u002Fmicrosoft\u002FSwin-Transformer.svg?style=social&label=Star) \u003Cbr>\nSwin Transformer：使用移位窗口的分层视觉 Transformer \u003Cbr>\n[[arXiv](https:\u002F\u002Farxiv.org\u002Fabs\u002F2103.14030)]\n[[paper](https:\u002F\u002Fopenaccess.thecvf.com\u002Fcontent\u002FICCV2021\u002Fhtml\u002FLiu_Swin_Transformer_Hierarchical_Vision_Transformer_Using_Shifted_Windows_ICCV_2021_paper.html)]\n[[code](https:\u002F\u002Fgithub.com\u002Fmicrosoft\u002FSwin-Transformer)]\n\n- `2020\u002F10` | `ViT`\n| `Dosovitskiy et al., ICLR 2021`\n![citations](https:\u002F\u002Fimg.shields.io\u002Fbadge\u002Fdynamic\u002Fjson?url=https:\u002F\u002Fapi.semanticscholar.org\u002Fgraph\u002Fv1\u002Fpaper\u002FCorpusID:225039882?fields=citationCount&query=%24.citationCount&label=citations) \u003Cbr>\n一张图胜过 16x16 个词：大规模图像识别中的 Transformer \u003Cbr>\n[[arXiv](https:\u002F\u002Farxiv.org\u002Fabs\u002F2010.11929)]\n[[paper](https:\u002F\u002Fopenreview.net\u002Fforum?id=YicbFdNTTy)]\n[[Implementation](https:\u002F\u002Fgithub.com\u002Flucidrains\u002Fvit-pytorch)]\n\n---\n\n\u003C\u002Fdetails>\n\n\u003C!--  -->\n\n### 2.3 多模态基础模型\n\n\u003Cdetails open>\n\u003Csummary>MFMs\u003C\u002Fsummary>\n\n[基础模型（返回顶部）](#2-基础模型)\n\n- `2024\u002F01` | `LLaVA-1.6`\n| `Liu et al.` \u003Cbr>\nLLaVA-1.6：改进了推理能力、OCR 和世界知识 \u003Cbr>\n[[代码](https:\u002F\u002Fgithub.com\u002Fhaotian-liu\u002FLLaVA)]\n[[博客](https:\u002F\u002Fllava-vl.github.io\u002Fblog\u002F2024-01-30-llava-1-6\u002F)]\n\n- 
`2024\u002F01` | `MouSi`\n| `Fan et al.`\n![Star](https:\u002F\u002Fimg.shields.io\u002Fgithub\u002Fstars\u002FFudanNLPLAB\u002FMouSi.svg?style=social&label=Star) \u003Cbr>\nMouSi：多视觉专家视觉-语言模型 \u003Cbr>\n[[arXiv](https:\u002F\u002Farxiv.org\u002Fabs\u002F2401.17221)]\n[[论文](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2401.17221.pdf)]\n[[代码](https:\u002F\u002Fgithub.com\u002FFudanNLPLAB\u002FMouSi)]\n\n- `2023\u002F12` | `InternVL`\n| `Chen et al.`\n![citations](https:\u002F\u002Fimg.shields.io\u002Fbadge\u002Fdynamic\u002Fjson?url=https:\u002F\u002Fapi.semanticscholar.org\u002Fgraph\u002Fv1\u002Fpaper\u002FCorpusID:266521410?fields=citationCount&query=%24.citationCount&label=citations)\n![Star](https:\u002F\u002Fimg.shields.io\u002Fgithub\u002Fstars\u002FOpenGVLab\u002FInternVL.svg?style=social&label=Star) \u003Cbr>\nInternVL：扩展视觉基础模型并对其对齐，以应对通用的视觉-语言任务 \u003Cbr>\n[[arXiv](https:\u002F\u002Farxiv.org\u002Fabs\u002F2312.14238)]\n[[论文](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2312.14238.pdf)]\n[[代码](https:\u002F\u002Fgithub.com\u002FOpenGVLab\u002FInternVL)]\n\n- `2023\u002F12` | `Gemini` | [Gemini：一个高度强大的多模态模型家族](https:\u002F\u002Farxiv.org\u002Fabs\u002F2312.11805)\n\\-\n[[论文](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2312.11805.pdf)]\n[[项目](https:\u002F\u002Fdeepmind.google\u002Ftechnologies\u002Fgemini\u002F#introduction)]\n\n- `2023\u002F10` | `LLaVA-1.5`\n| `Liu et al.`\n![citations](https:\u002F\u002Fimg.shields.io\u002Fbadge\u002Fdynamic\u002Fjson?url=https:\u002F\u002Fapi.semanticscholar.org\u002Fgraph\u002Fv1\u002Fpaper\u002FCorpusID:263672058?fields=citationCount&query=%24.citationCount&label=citations) \u003Cbr>\n通过视觉指令微调改进基准模型 \u003Cbr>[[arXiv](https:\u002F\u002Farxiv.org\u002Fabs\u002F2310.03744)]\n[[论文](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2310.03744.pdf)]\n[[代码](https:\u002F\u002Fgithub.com\u002Fhaotian-liu\u002FLLaVA)]\n[[项目](https:\u002F\u002Fllava-vl.github.io)]\n\n- `2023\u002F09` | `GPT-4V` | GPT-4V(ision) 系统卡片\n\\-\n[[论文](https:\u002F\u002Fcdn.openai.com\u002Fpapers\u002FGPTV_System_Card.pdf)]\n[[博客](https:\u002F\u002Fopenai.com\u002Fresearch\u002Fgpt-4v-system-card)]\n\n- `2023\u002F08` | `Qwen-VL` | [Qwen-VL：一种多功能的视觉-语言模型，用于理解、定位、文本阅读等](https:\u002F\u002Farxiv.org\u002Fabs\u002F2308.12966)\n\\-\n[[论文](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2308.12966.pdf)]\n[[代码](https:\u002F\u002Fgithub.com\u002FQwenLM\u002FQwen-VL)]\n\n- `2023\u002F05` | `InstructBLIP` | [InstructBLIP：通过指令微调迈向通用型视觉-语言模型](https:\u002F\u002Farxiv.org\u002Fabs\u002F2305.06500)\n\\-\n[[论文](https:\u002F\u002Fopenreview.net\u002Fpdf?id=vvoWPYqZJA)]\n[[代码](https:\u002F\u002Fgithub.com\u002Fsalesforce\u002FLAVIS\u002Ftree\u002Fmain\u002Fprojects\u002Finstructblip)]\n\n- `2023\u002F05` | `Caption Anything` | [Caption Anything：具有多样化多模态控制的交互式图像描述](https:\u002F\u002Farxiv.org\u002Fabs\u002F2305.02677)\n\\-\n[[论文](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2305.02677.pdf)]\n[[代码](https:\u002F\u002Fgithub.com\u002Fttengwang\u002FCaption-Anything)]\n\n- `2023\u002F05` | `SAMText` | [视频文本检测的可扩展掩码标注](https:\u002F\u002Farxiv.org\u002Fabs\u002F2305.01443)\n\\-\n[[论文](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2305.01443.pdf)]\n[[代码](https:\u002F\u002Fgithub.com\u002FViTAE-Transformer\u002FSAMText)]\n\n- `2023\u002F04` | `Text2Seg` | [Text2Seg：通过文本引导的视觉基础模型进行遥感图像语义分割](https:\u002F\u002Farxiv.org\u002Fabs\u002F2304.10597)\n\\-\n[[论文](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2304.10597.pdf)]\n\n- `2023\u002F04` | `MiniGPT-4` | 
[MiniGPT-4：利用先进的大型语言模型增强视觉-语言理解能力](https:\u002F\u002Farxiv.org\u002Fabs\u002F2304.10592)\n\\-\n\n- `2023\u002F04` | `LLaVA` | [视觉指令微调](https:\u002F\u002Farxiv.org\u002Fabs\u002F2304.08485)\n\\-\n[[论文](https:\u002F\u002Fopenreview.net\u002Fpdf?id=w0H2xGHlkw)]\n[[代码](https:\u002F\u002Fgithub.com\u002Fhaotian-liu\u002FLLaVA)]\n[[项目](https:\u002F\u002Fllava-vl.github.io)]\n\n- `2023\u002F04` | `CLIP Surgery` | [CLIP 手术：在开放词汇任务中提升可解释性](https:\u002F\u002Farxiv.org\u002Fabs\u002F2304.05653)\n\\-\n[[论文](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2304.05653.pdf)]\n[[代码](https:\u002F\u002Fgithub.com\u002Fxmed-lab\u002FCLIP_Surgery)]\n\n- `2023\u002F03` | `UniDiffuser` | [一个 Transformer 适用于大规模多模态扩散中的所有分布](https:\u002F\u002Farxiv.org\u002Fabs\u002F2303.06555)\n\\-\n\n- `2023\u002F01` | `GALIP` | [GALIP：用于文本到图像合成的生成对抗 CLIP](https:\u002F\u002Farxiv.org\u002Fabs\u002F2301.12959)\n\\-\n[[论文](https:\u002F\u002Fopenaccess.thecvf.com\u002Fcontent\u002FCVPR2023\u002Fpapers\u002FTao_GALIP_Generative_Adversarial_CLIPs_for_Text-to-Image_Synthesis_CVPR_2023_paper.pdf)]\n[[代码](https:\u002F\u002Fgithub.com\u002Ftobran\u002FGALIP)]\n\n- `2023\u002F01` | `BLIP-2` | [BLIP-2：使用冻结的图像编码器和大型语言模型进行语言-图像预训练的自举](https:\u002F\u002Farxiv.org\u002Fabs\u002F2301.12597)\n\\-\n[[论文](https:\u002F\u002Fproceedings.mlr.press\u002Fv202\u002Fli23q.html)]\n[[代码](https:\u002F\u002Fgithub.com\u002Fsalesforce\u002FLAVIS\u002Ftree\u002Fmain\u002Fprojects\u002Fblip2)]\n\n- `2022\u002F12` | `Img2Prompt` | [从图像到文本提示：使用冻结的大语言模型进行零样本 VQA](https:\u002F\u002Farxiv.org\u002Fabs\u002F2212.10846)\n\\-\n\n- `2022\u002F05` | `CoCa` | [CoCa：对比式标题生成器是图像-文本的基础模型](https:\u002F\u002Farxiv.org\u002Fabs\u002F2205.01917)\n\\-\n[[论文](https:\u002F\u002Fopenreview.net\u002Fforum?id=Ee277P3AYC)]\n\n- `2022\u002F01` | `BLIP` | [BLIP：为统一的视觉-语言理解和生成而进行的语言-图像预训练自举](https:\u002F\u002Farxiv.org\u002Fabs\u002F2201.12086)\n\\-\n[[论文](https:\u002F\u002Fproceedings.mlr.press\u002Fv162\u002Fli22n.html)]\n[[代码](https:\u002F\u002Fgithub.com\u002Fsalesforce\u002FBLIP)]\n\n- `2021\u002F09` | `CoOp` | [学习如何为视觉-语言模型设计提示](https:\u002F\u002Farxiv.org\u002Fabs\u002F2109.01134)\n\\-\n[[论文](https:\u002F\u002Flink.springer.com\u002Farticle\u002F10.1007\u002Fs11263-022-01653-1)]\n[[代码](https:\u002F\u002Fgithub.com\u002FKaiyangZhou\u002FCoOp)]\n\n- `2021\u002F02` | `CLIP` | [从自然语言监督中学习可迁移的视觉模型](https:\u002F\u002Farxiv.org\u002Fabs\u002F2103.00020)\n\\-\n[[论文](https:\u002F\u002Fproceedings.mlr.press\u002Fv139\u002Fradford21a\u002Fradford21a.pdf)]\n[[代码](https:\u002F\u002Fgithub.com\u002Fopenai\u002FCLIP)]\n[[博客](https:\u002F\u002Fopenai.com\u002Fresearch\u002Fclip)]\n\n---\n\n\u003C\u002Fdetails>\n\n\u003C!--  -->\n\n### 2.4 推理应用\n\n\u003Cdetails open>\n\u003Csummary>推理应用\u003C\u002Fsummary>\n\n[基础模型（返回顶部）](#2-基础模型)\n\n- `2022\u002F06` | `Minerva` | [使用语言模型解决定量推理问题](https:\u002F\u002Farxiv.org\u002Fabs\u002F2206.14858)\n\\-\n[[论文](https:\u002F\u002Fopenreview.net\u002Fpdf?id=IFXTZERXdM7)]\n[[博客](https:\u002F\u002Fblog.research.google\u002F2022\u002F06\u002Fminerva-solving-quantitative-reasoning.html)]\n\n- `2022\u002F06` | `BIG-bench` | [超越模仿游戏：量化并外推语言模型的能力](https:\u002F\u002Farxiv.org\u002Fabs\u002F2206.04615)\n\\-\n[[论文](https:\u002F\u002Fopenreview.net\u002Fpdf?id=uyTL5Bvosj)]\n[[代码](https:\u002F\u002Fgithub.com\u002Fgoogle\u002FBIG-bench)]\n\n- `2022\u002F05` | `Zero-shot-CoT` | 
[大型语言模型是零样本推理者](https:\u002F\u002Farxiv.org\u002Fabs\u002F2205.11916)\n\\-\n[[论文](https:\u002F\u002Fopenreview.net\u002Fpdf?id=e2TBb5y0yFf)]\n[[代码](https:\u002F\u002Fgithub.com\u002Fkojima-takeshi188\u002Fzero_shot_cot)]\n\n- `2022\u002F03` | `STaR` | [STaR：通过推理来启动推理](https:\u002F\u002Farxiv.org\u002Fabs\u002F2203.14465)\n\\-\n[[论文](https:\u002F\u002Fopenreview.net\u002Fpdf?id=_3ELRdg2sgI)]\n[[代码](https:\u002F\u002Fgithub.com\u002Fezelikman\u002FSTaR)]\n\n- `2021\u002F07` | `MWP-BERT` | [MWP-BERT：用于数学文字题求解的数值增强预训练](https:\u002F\u002Farxiv.org\u002Fabs\u002F2107.13435)\n\\-\n[[论文](https:\u002F\u002Faclanthology.org\u002F2022.findings-naacl.74.pdf)]\n[[代码](https:\u002F\u002Fgithub.com\u002FLZhenwen\u002FMWP-BERT)]\n\n- `2017\u002F05` | `AQUA-RAT` | [基于推理生成的程序归纳：学习解决并解释代数文字题](https:\u002F\u002Farxiv.org\u002Fabs\u002F1705.04146)\n\\-\n[[论文](https:\u002F\u002Faclanthology.org\u002FP17-1015.pdf)]\n[[代码](https:\u002F\u002Fgithub.com\u002Fgoogle-deepmind\u002FAQuA)]\n\n---\n\n\u003C\u002Fdetails>\n\n\u003C\u002Fdetails>\n\n## 3 推理任务\n\n\u003Cdetails open>\n\u003Csummary>推理任务\u003C\u002Fsummary>\n\n[(返回顶部)](#table-of-contents)\n\n### 目录 - 3\n\n\u003Cdetails open>\n\u003Csummary>推理任务（目录）\u003C\u002Fsummary>\n\n- [3 推理任务](#3-reasoning-tasks)\n  - [3.1 常识推理](#31-commonsense-reasoning)\n    - [3.1.1 常识问答（QA）](#311-commonsense-question-and-answering-qa)\n    - [3.1.2 物理常识推理](#312-physical-commonsense-reasoning)\n    - [3.1.3 空间常识推理](#313-spatial-commonsense-reasoning)\n    - [3.1.x 基准、数据集和指标](#31x-benchmarks-datasets-and-metrics)\n  - [3.2 数学推理](#32-mathematical-reasoning)\n    - [3.2.1 算术推理](#321-arithmetic-reasoning)\n    - [3.2.2 几何推理](#322-geometry-reasoning)\n    - [3.2.3 定理证明](#323-theorem-proving)\n    - [3.2.4 科学推理](#324-scientific-reasoning)\n    - [3.2.x 基准、数据集和指标](#32x-benchmarks-datasets-and-metrics)\n  - [3.3 逻辑推理](#33-logical-reasoning)\n    - [3.3.1 命题逻辑](#331-propositional-logic)\n    - [3.3.2 谓词逻辑](#332-predicate-logic)\n    - [3.3.x 基准、数据集和指标](#33x-benchmarks-datasets-and-metrics)\n  - [3.4 因果推理](#34-causal-reasoning)\n    - [3.4.1 反事实推理](#341-counterfactual-reasoning)\n    - [3.4.x 基准、数据集和指标](#34x-benchmarks-datasets-and-metrics)\n  - [3.5 视觉推理](#35-visual-reasoning)\n    - [3.5.1 3D推理](#351-3d-reasoning)\n    - [3.5.x 基准、数据集和指标](#35x-benchmarks-datasets-and-metrics)\n  - [3.6 音频推理](#36-audio-reasoning)\n    - [3.6.1 语音](#361-speech)\n    - [3.6.x 基准、数据集和指标](#36x-benchmarks-datasets-and-metrics)\n  - [3.7 多模态推理](#37-multimodal-reasoning)\n    - [3.7.1 对齐](#371-alignment)\n    - [3.7.2 生成](#372-generation)\n    - [3.7.3 多模态理解](#373-multimodal-understanding)\n    - [3.7.x 基准、数据集和指标](#37x-benchmarks-datasets-and-metrics)\n  - [3.8 代理推理](#38-agent-reasoning)\n    - [3.8.1 内省式推理](#381-introspective-reasoning)\n    - [3.8.2 外省式推理](#382-extrospective-reasoning)\n    - [3.8.3 多智能体推理](#383-multi-agent-reasoning)\n    - [3.8.4 自动驾驶推理](#384-driving-reasoning)\n    - [3.8.x 基准、数据集和指标](#38x-benchmarks-datasets-and-metrics)\n  - [3.9 其他任务和应用](#39-other-tasks-and-applications)\n    - [3.9.1 心智理论（ToM）](#391-theory-of-mind-tom)\n    - [3.9.2 用于天气预报的LLM](#392-llms-for-weather-prediction)\n    - [3.9.3 抽象推理](#393-abstract-reasoning)\n    - [3.9.4 可废止推理](#394-defeasible-reasoning)\n    - [3.9.5 医疗推理](#395-medical-reasoning)\n    - [3.9.6 生物信息学推理](#396-bioinformatics-reasoning)\n    - [3.9.7 长链推理](#397-long-chain-reasoning)\n\n\u003C\u002Fdetails>\n\n\u003C!--  -->\n### 3.1 常识推理\n\n\u003Cdetails open>\n\u003Csummary>常识推理\u003C\u002Fsummary>\n\n[推理任务（返回顶部）](#3-reasoning-tasks)\n\n- [3.1 
常识推理](#31-commonsense-reasoning)\n  - [3.1.1 常识问答（QA）](#311-commonsense-question-and-answering-qa)\n  - [3.1.2 物理常识推理](#312-physical-commonsense-reasoning)\n  - [3.1.3 空间常识推理](#313-spatial-commonsense-reasoning)\n  - [3.1.x 基准、数据集和指标](#31x-benchmarks-datasets-and-metrics)\n\n\u003Cbr>\n\n- `2023\u002F12` | [Gemini在推理中的应用：揭示多模态大型语言模型中的常识](https:\u002F\u002Farxiv.org\u002Fabs\u002F2312.17661)\n\\-\n[[论文](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2312.17661.pdf)]\n[[代码](https:\u002F\u002Fgithub.com\u002FEternityYW\u002FGemini-Commonsense-Evaluation\u002F)]\n\n- `2023\u002F05` | `LLM-MCTS` | [大型语言模型作为大规模任务规划的常识知识](https:\u002F\u002Farxiv.org\u002Fabs\u002F2305.14078)\n\\-\n[[论文](https:\u002F\u002Fopenreview.net\u002Fpdf?id=Wjp1AYB8lH)]\n[[代码](https:\u002F\u002Fgithub.com\u002F1989Ryan\u002Fllm-mcts)]\n[[项目](https:\u002F\u002Fllm-mcts.github.io)]\n\n- `2023\u002F05` | 桥接预训练与微调之间的鸿沟，用于常识生成\n\\-\n[[论文](https:\u002F\u002Faclanthology.org\u002F2023.findings-eacl.28.pdf)]\n[[代码](https:\u002F\u002Fgithub.com\u002FLHRYANG\u002FCommonGen)]\n\n- `2022\u002F11` | `DANCE` | [通过知识图谱谜题提升视觉-语言模型中的常识](https:\u002F\u002Farxiv.org\u002Fabs\u002F2211.16504)\n\\-\n[[论文](https:\u002F\u002Fopenaccess.thecvf.com\u002Fcontent\u002FCVPR2023\u002Fpapers\u002FYe_Improving_Commonsense_in_Vision-Language_Models_via_Knowledge_Graph_Riddles_CVPR_2023_paper.pdf)]\n[[代码](https:\u002F\u002Fgithub.com\u002Fpleaseconnectwifi\u002FDANCE)]\n[[项目](https:\u002F\u002Fshuquanye.com\u002FDANCE_website)]\n\n- `2022\u002F10` | `CoCoGen` | [代码语言模型是少数样本常识学习者](https:\u002F\u002Farxiv.org\u002Fabs\u002F2210.07128)\n\\-\n[[论文](https:\u002F\u002Faclanthology.org\u002F2022.emnlp-main.90.pdf)]\n[[代码](https:\u002F\u002Fgithub.com\u002Freasoning-machines\u002FCoCoGen)]\n\n- `2021\u002F10` | [对大型语言模型中常识知识的系统性研究](https:\u002F\u002Farxiv.org\u002Fabs\u002F2111.00607)\n\\-\n[[论文](https:\u002F\u002Faclanthology.org\u002F2022.emnlp-main.812.pdf)]\n\n- `2021\u002F05` | [超越简单的微调：改进面向社会常识的预训练模型](https:\u002F\u002Farxiv.org\u002Fabs\u002F2105.05913)\n\\-\n[[论文](https:\u002F\u002Fieeexplore.ieee.org\u002Fstamp\u002Fstamp.jsp?arnumber=9383453)]\n\n#### 3.1.1 常识问答（QA）\n\n- `2019\u002F06` | `CoS-E` | [解释你自己！利用语言模型进行常识推理](https:\u002F\u002Farxiv.org\u002Fabs\u002F1906.02361)\n\\-\n[[论文](https:\u002F\u002Faclanthology.org\u002FP19-1487.pdf)]\n[[代码](https:\u002F\u002Fgithub.com\u002Fsalesforce\u002Fcos-e)]\n\n- `2018\u002F11` | `CQA` | [常识QA：一项针对常识知识的问答挑战](https:\u002F\u002Farxiv.org\u002Fabs\u002F1811.00937)\n\\-\n[[论文](https:\u002F\u002Faclanthology.org\u002FN19-1421.pdf)]\n[[代码](https:\u002F\u002Fgithub.com\u002Fjonathanherzig\u002Fcommonsenseqa)]\n[[项目](https:\u002F\u002Fwww.tau-nlp.sites.tau.ac.il\u002Fcommonsenseqa)]\n\n- `2016\u002F12` | `ConceptNet` | [ConceptNet 5.5：一个开放的多语言通用知识图谱](https:\u002F\u002Farxiv.org\u002Fabs\u002F1612.03975)\n\\-\n[[论文](https:\u002F\u002Fojs.aaai.org\u002Findex.php\u002FAAAI\u002Farticle\u002Fview\u002F11164)]\n[[项目](https:\u002F\u002Fconceptnet.io)]\n\n#### 3.1.2 物理常识推理\n\n- `2025\u002F05` | `PhyX` | [PhyX：你的模型具备物理推理所需的“智慧”吗？](https:\u002F\u002Farxiv.org\u002Fabs\u002F2505.15929)\n\\-\n[[论文](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2505.15929.pdf)]\n[[代码](https:\u002F\u002Fgithub.com\u002FNastyMarcus\u002FPhyX)]\n[[项目](https:\u002F\u002Fphyx-bench.github.io\u002F)]\n\n- `2023\u002F10` | `NEWTON` | 
[NEWTON：大型语言模型是否具备物理推理能力？](https:\u002F\u002Farxiv.org\u002Fabs\u002F2310.07018)\n\\-\n[[论文](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2310.07018.pdf)]\n[[代码](https:\u002F\u002Fgithub.com\u002FNewtonReasoning\u002FNewton)]\n[[项目](https:\u002F\u002Fnewtonreasoning.github.io)]\n\n- `2022\u002F03` | `PACS` | [PACS：用于物理视听常识推理的数据集](https:\u002F\u002Farxiv.org\u002Fabs\u002F2203.11130)\n\\-\n[[论文](https:\u002F\u002Fwww.ecva.net\u002Fpapers\u002Feccv_2022\u002Fpapers_ECCV\u002Fpapers\u002F136970286.pdf)]\n[[代码](https:\u002F\u002Fgithub.com\u002Fsamuelyu2002\u002FPACS)]\n\n- `2021\u002F10` | `VRDP` | [通过从视频和语言中学习可微分物理模型实现动态视觉推理](https:\u002F\u002Farxiv.org\u002Fabs\u002F2110.15358)\n\\-\n[[论文](https:\u002F\u002Fopenreview.net\u002Fpdf?id=lk1ORT35tbi)]\n[[代码](https:\u002F\u002Fgithub.com\u002Fdingmyu\u002FVRDP)]\n\n- `2020\u002F05` | `ESPRIT` | [ESPRIT：解释物理推理任务的解决方案](https:\u002F\u002Farxiv.org\u002Fabs\u002F2005.00730)\n\\-\n[[论文](https:\u002F\u002Faclanthology.org\u002F2020.acl-main.706.pdf)]\n[[代码](https:\u002F\u002Fgithub.com\u002Fsalesforce\u002Fesprit)]\n\n- `2019\u002F11` | `PIQA` | [PIQA：自然语言中的物理常识推理](https:\u002F\u002Farxiv.org\u002Fabs\u002F1911.11641)\n\\-\n[[论文](https:\u002F\u002Fojs.aaai.org\u002Findex.php\u002FAAAI\u002Farticle\u002Fview\u002F6239)]\n[[项目](https:\u002F\u002Fleaderboard.allenai.org\u002Fphysicaliqa\u002Fsubmissions\u002Fpublic)]\n\n#### 3.1.3 空间常识推理\n\n- `2024\u002F01` | `SpatialVLM`\n| `Chen et al.`\n![citations](https:\u002F\u002Fimg.shields.io\u002Fbadge\u002Fdynamic\u002Fjson?url=https:\u002F\u002Fapi.semanticscholar.org\u002Fgraph\u002Fv1\u002Fpaper\u002FCorpusID:267069344?fields=citationCount&query=%24.citationCount&label=citations) \u003Cbr>\nSpatialVLM：赋予视觉-语言模型空间推理能力 \u003Cbr>\n[[arXiv](https:\u002F\u002Farxiv.org\u002Fabs\u002F2401.12168)]\n[[论文](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2401.12168.pdf)]\n[[项目](https:\u002F\u002Fspatial-vlm.github.io\u002F)]\n\n- `2022\u002F03` | [文本中未提及的事物：从视觉信号中探索空间常识](https:\u002F\u002Farxiv.org\u002Fabs\u002F2203.08075)\n\\-\n[[论文](https:\u002F\u002Faclanthology.org\u002F2022.acl-long.168.pdf)]\n[[代码](https:\u002F\u002Fgithub.com\u002Fxxxiaol\u002Fspatial-commonsense)]\n\n- `2021\u002F06` | `PROST` | [PROST：通过时空对物体进行物理推理](https:\u002F\u002Farxiv.org\u002Fabs\u002F2106.03634)\n\\-\n[[论文](https:\u002F\u002Faclanthology.org\u002F2021.findings-acl.404.pdf)]\n[[代码](https:\u002F\u002Fgithub.com\u002Fnala-cub\u002Fprost)]\n\n- `2019\u002F02` | `GQA` | [GQA：一个新的用于现实世界视觉推理和组合式问答的数据集](https:\u002F\u002Farxiv.org\u002Fabs\u002F1902.09506)\n\\-\n[[论文](https:\u002F\u002Fopenaccess.thecvf.com\u002Fcontent_CVPR_2019\u002Fpapers\u002FHudson_GQA_A_New_Dataset_for_Real-World_Visual_Reasoning_and_Compositional_CVPR_2019_paper.pdf)]\n[[项目](https:\u002F\u002Fcs.stanford.edu\u002Fpeople\u002Fdorarad\u002Fgqa\u002Findex.html)]\n\n#### 3.1.x 基准、数据集和指标\n\n- `2023\u002F06` | `CConS` | [用反常识情境探测物理推理](https:\u002F\u002Farxiv.org\u002Fabs\u002F2306.02258)\n\\-\n\n- `2023\u002F05` | `SummEdits` | [LLMs作为事实推理者：来自现有基准及其他方面的洞见](https:\u002F\u002Farxiv.org\u002Fabs\u002F2305.14540)\n\\-\n[[论文](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2305.14540.pdf)]\n[[代码](https:\u002F\u002Fgithub.com\u002Fsalesforce\u002FfactualNLG)]\n\n- `2021\u002F03` | `RAINBOW` | [UNICORN在RAINBOW上：一种基于新型多任务基准的通用常识推理模型](https:\u002F\u002Farxiv.org\u002Fabs\u002F2103.13009)\n\\-\n\n- `2020\u002F11` | `ProtoQA` | ProtoQA：一个用于原型式常识推理的问答数据集\n\\-\n[[论文](https:\u002F\u002Faclanthology.org\u002F2020.emnlp-main.85.pdf)]\n\n- `2020\u002F10` | `DrFact` | 
[可微分的开放式常识推理](https:\u002F\u002Farxiv.org\u002Fabs\u002F2010.14439)\n\n- `2019\u002F11` | `CommonGen` | [CommonGen：一项针对生成式常识推理的受限文本生成挑战](https:\u002F\u002Farxiv.org\u002Fabs\u002F1911.03705)\n\n- `2019\u002F08` | `Cosmos QA` | [Cosmos QA：带有上下文常识推理的机器阅读理解](https:\u002F\u002Farxiv.org\u002Fabs\u002F1909.00277)\n\n- `2019\u002F08` | `αNLI` | [溯因式常识推理](https:\u002F\u002Farxiv.org\u002Fabs\u002F1908.05739)\n\\-\n\n- `2019\u002F08` | `PHYRE` | [PHYRE：一个新的物理推理基准](https:\u002F\u002Farxiv.org\u002Fabs\u002F1908.05656)\n\\-\n\n- `2019\u002F07` | `WinoGrande` | [WinoGrande：大规模的对抗性维诺格拉德模式挑战](https:\u002F\u002Farxiv.org\u002Fabs\u002F1907.10641)\n\\-\n\n- `2019\u002F05` | `MathQA` | [MathQA：借助基于运算的形式化方法实现可解释的数学应用题求解](https:\u002F\u002Farxiv.org\u002Fabs\u002F1905.13319)\n\\-\n\n- `2019\u002F05` | `HellaSwag` | [HellaSwag：机器真的能帮你完成句子吗？](https:\u002F\u002Farxiv.org\u002Fabs\u002F1905.07830)\n\\-\n\n- `2019\u002F04` | `Social IQa` | [SocialIQA：关于社会互动的常识推理](https:\u002F\u002Farxiv.org\u002Fabs\u002F1904.09728)\n\\-\n[[论文](https:\u002F\u002Faclanthology.org\u002FD19-1454.pdf)]\n\n- `2018\u002F08` | `SWAG` | [SWAG：一个大规模的对抗性数据集，用于 grounded常识推理](https:\u002F\u002Farxiv.org\u002Fabs\u002F1808.05326)\n\\-\n\n- `2002\u002F07` | `BLEU` | BLEU：一种自动评估机器翻译的方法\n\\-\n[[论文](https:\u002F\u002Faclanthology.org\u002FP02-1040.pdf)]\n\n---\n\n\u003C\u002Fdetails>\n\n\u003C!--  -->\n\n\n### 3.2 数学推理\n\n\u003Cdetails open>\n\u003Csummary>数学推理\u003C\u002Fsummary>\n\n[推理任务（返回顶部）](#3-reasoning-tasks)\n\n- [3.2 数学推理](#32-mathematical-reasoning)\n  - [3.2.1 算术推理](#321-arithmetic-reasoning)\n  - [3.2.2 几何推理](#322-geometry-reasoning)\n  - [3.2.3 定理证明](#323-theorem-proving)\n  - [3.2.4 科学推理](#324-scientific-reasoning)\n  - [3.2.x 基准、数据集和指标](#32x-benchmarks-datasets-and-metrics)\n\n\u003Cbr>\n\n- `2023\u002F10` | `MathVista` | [MathVista: 利用GPT-4V、Bard及其他多模态大模型评估视觉情境下的数学推理能力](https:\u002F\u002Farxiv.org\u002Fabs\u002F2310.02255)\n\\-\n[[论文](https:\u002F\u002Fopenreview.net\u002Fforum?id=KUNzEQMWU7)]\n[[代码](https:\u002F\u002Fgithub.com\u002Flupantech\u002FMathVista)]\n[[项目主页](https:\u002F\u002Fmathvista.github.io\u002F)]\n| `Lu等人，ICLR 2024`\n\n\n\n- `2022\u002F11` | 知识理论中的分词\n\\-\n[[论文](https:\u002F\u002Fwww.mdpi.com\u002F2673-8392\u002F3\u002F1\u002F24)]\n\n- `2022\u002F06` | `MultiHiertt` | [MultiHiertt：面向多层级表格与文本数据的数值推理](https:\u002F\u002Farxiv.org\u002Fabs\u002F2206.01347)\n\n- `2021\u002F04` | `MultiModalQA` | [MultiModalQA：跨文本、表格和图像的复杂问答任务](https:\u002F\u002Farxiv.org\u002Fabs\u002F2104.06039)\n\n- `2017\u002F05` | [通过生成理由进行程序归纳：学习解决并解释代数应用题](https:\u002F\u002Farxiv.org\u002Fabs\u002F1705.04146)\n\n- `2014\u002F04` | [神经网络中的深度学习：综述](https:\u002F\u002Farxiv.org\u002Fabs\u002F1404.7828)\n\\-\n[[论文](https:\u002F\u002Fwww.sciencedirect.com\u002Fscience\u002Farticle\u002Fpii\u002FS0893608014002135)]\n\n- `2004` | 维特根斯坦论逻辑与数学哲学\n\\-\n[[论文](https:\u002F\u002Fwww.pdcnet.org\u002Fgfpj\u002Fcontent\u002Fgfpj_2004_0025_0002_0227_0288)]\n\n- `1989` | `CLP` | 连接主义学习算法\n\\-\n[[论文](https:\u002F\u002Fwww.sciencedirect.com\u002Fscience\u002Farticle\u002Fpii\u002F0004370289900490)]\n\n#### 3.2.1 算术推理\n\n[数学推理（返回顶部）](#32-数学推理)\n\n- `2022\u002F09` | `PromptPG` | [基于策略梯度的动态提示学习用于半结构化数学推理](https:\u002F\u002Farxiv.org\u002Fabs\u002F2209.14610)\n\n- `2022\u002F01` | [思维链提示能够激发大型语言模型的推理能力](https:\u002F\u002Farxiv.org\u002Fabs\u002F2201.11903)\n\\-\n\n- `2021\u002F03` | `SVAMP` | 
[NLP模型真的能解决简单的数学应用题吗？](https:\u002F\u002Farxiv.org\u002Fabs\u002F2103.07191)\n\\-\n[[论文](https:\u002F\u002Faclanthology.org\u002F2021.naacl-main.168.pdf)]\n[[代码](https:\u002F\u002Fgithub.com\u002Farkilpatel\u002FSVAMP)]\n\n- `2021\u002F03` | `MATH` | [利用MATH数据集衡量数学问题解决能力](https:\u002F\u002Farxiv.org\u002Fabs\u002F2103.03874)\n\\-\n\n- `2016\u002F08` | [计算机在解决数学应用题方面表现如何？大规模数据集构建与评估](https:\u002F\u002Faclanthology.org\u002FP16-1084\u002F)\n\\-\n[[论文](https:\u002F\u002Faclanthology.org\u002FP16-1084.pdf)]\n\n- `2015\u002F09` | [使用二次规划学习解决代数应用题](https:\u002F\u002Faclanthology.org\u002FD15-1096\u002F)\n\\-\n[[论文](https:\u002F\u002Faclanthology.org\u002FD15-1096.pdf)]\n\n- `2014\u002F06` | `Alg514` | [学习自动解决代数应用题](https:\u002F\u002Faclanthology.org\u002FP14-1026\u002F)\n\\-\n[[论文](https:\u002F\u002Faclanthology.org\u002FP14-1026.pdf)]\n\n#### 3.2.2 几何推理\n\n[数学推理（返回顶部）](#32-数学推理)\n\n- `2024\u002F01` | `AlphaGeometry` | 在无需人类示范的情况下解决奥林匹克几何问题\n\\-\n[[论文](https:\u002F\u002Fwww.nature.com\u002Farticles\u002Fs41586-023-06747-5)]\n[[代码](https:\u002F\u002Fgithub.com\u002Fgoogle-deepmind\u002Falphageometry)]\n[[博客](https:\u002F\u002Fdeepmind.google\u002Fdiscover\u002Fblog\u002Falphageometry-an-olympiad-level-ai-system-for-geometry\u002F)]\n| `Trinh等人，Nature`\n\n- `2022\u002F12` | `UniGeo` \u002F `Geoformer` | [UniGeo：通过重写数学表达式统一几何逻辑推理](https:\u002F\u002Farxiv.org\u002Fabs\u002F2212.02746)\n\n- `2021\u002F05` | `GeoQA` \u002F `NGS` | [GeoQA：迈向多模态数值推理的几何问答基准](https:\u002F\u002Farxiv.org\u002Fabs\u002F2105.14517)\n\n- `2021\u002F05` | `Geometry3K` \u002F `Inter-GPS` | [Inter-GPS：利用形式语言和符号推理实现可解释的几何问题求解](https:\u002F\u002Farxiv.org\u002Fabs\u002F2105.04165)\n\n- `2015\u002F09` | `GeoS` | [解决几何问题：结合文本与图形的解读](https:\u002F\u002Faclanthology.org\u002FD15-1171\u002F)\n\\-\n[[论文](https:\u002F\u002Faclanthology.org\u002FD15-1171.pdf)]\n\n#### 3.2.3 定理证明\n\n[数学推理（返回顶部）](#32-数学推理)\n\n- `2023\u002F10` | `LEGO-Prover` | [LEGO-Prover：具有不断增长库的神经定理证明系统](https:\u002F\u002Farxiv.org\u002Fabs\u002F2310.00656)\n\\-\n\n- `2023\u002F09` | `Lyra` | [Lyra：在自动化定理证明中协调双重校正](https:\u002F\u002Farxiv.org\u002Fabs\u002F2309.15806)\n\n- `2023\u002F06` | `DT-Solver` | [DT-Solver：基于证明级价值函数引导的动态树采样自动化定理证明](https:\u002F\u002Faclanthology.org\u002F2023.acl-long.706\u002F)\n\\-\n[[论文](https:\u002F\u002Faclanthology.org\u002F2023.acl-long.706.pdf)]\n\n- `2023\u002F05` | [分解谜题：基于子目标的演示学习用于形式定理证明](https:\u002F\u002Farxiv.org\u002Fabs\u002F2305.16366)\n\n- `2023\u002F03` | `Magnushammer` | [Magnushammer：一种基于Transformer的前提选择方法](https:\u002F\u002Farxiv.org\u002Fabs\u002F2303.04488)\n\n- `2022\u002F10` | `DSP` | [草稿、草图与证明：以非正式证明指导形式定理证明者](https:\u002F\u002Farxiv.org\u002Fabs\u002F2210.12283)\n\\-\n\n- `2022\u002F05` | [通过优化搜索策略来学习寻找证明与定理：以循环不变式合成为例](https:\u002F\u002Farxiv.org\u002Fabs\u002F2205.14229)\n\n- `2022\u002F05` | [利用大型语言模型进行自动形式化](https:\u002F\u002Farxiv.org\u002Fabs\u002F2205.12615)\n\\-\n[[论文](https:\u002F\u002Fproceedings.neurips.cc\u002Fpaper_files\u002Fpaper\u002F2022\u002Ffile\u002Fd0c6bc641a56bebee9d985b937307367-Paper-Conference.pdf)]\n\n- `2022\u002F05` | `HTPS` | [超树证明搜索用于神经定理证明](https:\u002F\u002Farxiv.org\u002Fabs\u002F2205.11491)\n\n- `2022\u002F05` | `Thor` | [Thor：运用“锤子”整合语言模型与自动化定理证明器](https:\u002F\u002Farxiv.org\u002Fabs\u002F2205.10893)\n\\-\n\n- `2022\u002F02` | [形式数学语句课程学习](https:\u002F\u002Farxiv.org\u002Fabs\u002F2202.01344)\n\\-\n\n- `2021\u002F07` | `Lean 4` | [Lean 4定理证明器与编程语言](https:\u002F\u002Flink.springer.com\u002Fchapter\u002F10.1007\u002F978-3-030-79876-5_37)\n\\-\n\n- `2021\u002F02` | `TacticZero` | 
[TacticZero：利用深度强化学习从零开始学习证明定理](https:\u002F\u002Farxiv.org\u002Fabs\u002F2102.09756)\n\\-\n\n- `2021\u002F02` | `PACT` | [与语言模型协同训练证明工件以用于定理证明](https:\u002F\u002Farxiv.org\u002Fabs\u002F2102.06203)\n\\-\n\n- `2020\u002F09` | `GPT-f` | [用于自动化定理证明的生成式语言建模](https:\u002F\u002Farxiv.org\u002Fabs\u002F2009.03393)\n\\-\n\n- `2019\u002F07` | [关键系统中硬件组件的形式验证](https:\u002F\u002Fwww.hindawi.com\u002Fjournals\u002Fwcmc\u002F2020\u002F7346763\u002F)\n\\-\n[[论文](https:\u002F\u002Fdownloads.hindawi.com\u002Fjournals\u002Fwcmc\u002F2020\u002F7346763.pdf?_gl=1*1yjtq1u*_ga*MjA3MTczMzQzOC4xNjk5NjE3NDI1*_ga_NF5QFMJT5V*MTY5OTYxNzQyNC4xLjEuMTY5OTYxNzQ2Ni4xOC4wLjA.&_ga=2.180805351.1310949615.1699617425-2071733438.1699617425)]\n\n- `2019\u002F06` | `Metamath` | 一种用于数学证明的计算机语言\n\\-\n[[论文](http:\u002F\u002Fde.metamath.org\u002Fdownloads\u002Fmetamath.pdf)]\n\n- `2019\u002F05` | `CoqGym` | [通过与证明助手交互学习证明定理](https:\u002F\u002Farxiv.org\u002Fabs\u002F1905.09381)\n\n- `2018\u002F12` | `AlphaZero` | [一种通用的强化学习算法，可通过自我对弈掌握国际象棋、将棋和围棋](https:\u002F\u002Fwww.science.org\u002Fdoi\u002F10.1126\u002Fscience.aar6404)\n\\-\n[[论文](https:\u002F\u002Fwww.science.org\u002Fdoi\u002Fpdf\u002F10.1126\u002Fscience.aar6404)]\n\n- `2018\u002F04` | `TacticToe` | [TacticToe: 通过战术学习进行证明](https:\u002F\u002Farxiv.org\u002Fabs\u002F1804.00596)\n\n- `2015\u002F08` | `Lean` | Lean 定理证明器（系统说明）\n\\-\n[[论文](https:\u002F\u002Flean-lang.org\u002Fpapers\u002Fsystem.pdf)]\n\n- `2010\u002F07` | 使用 Sledgehammer 的三年经验：自动与交互式定理证明器之间的实用桥梁\n\\-\n[[论文](https:\u002F\u002Fwww.cl.cam.ac.uk\u002F~lp15\u002Fpapers\u002FAutomation\u002Fpaar.pdf)]\n\n- `2010\u002F04` | 英特尔的形式化方法——概述\n\\-\n[[幻灯片](https:\u002F\u002Fshemesh.larc.nasa.gov\u002FNFM2010\u002Ftalks\u002Fharrison.pdf)]\n\n- `2005\u002F07` | 将仿真与形式验证相结合用于集成电路设计验证\n\\-\n[[论文](https:\u002F\u002Fs2.smu.edu\u002F~mitch\u002Fftp_dir\u002Fpubs\u002Fwmsci05.pdf)]\n\n- `2003` | 从证明助手提取一个经过形式化验证且完全可执行的编译器\n\\-\n[[论文](https:\u002F\u002Fwww.sciencedirect.com\u002Fscience\u002Farticle\u002Fpii\u002FS1571066105825988\u002Fpdf?md5=10b884badea7fe0e46c38b9419fbcca6&pid=1-s2.0-S1571066105825988-main.pdf&_valck=1)]\n\n- `1996` | `Coq` | Coq 证明助手参考手册\n\\-\n[[项目](https:\u002F\u002Fcoq.inria.fr\u002Fdocumentation)]\n\n- `1994` | `Isabelle` | Isabelle：一种通用的定理证明器\n\\-\n[[论文](https:\u002F\u002Flink.springer.com\u002Fcontent\u002Fpdf\u002F10.1007\u002Fbfb0030558.pdf)]\n\n#### 3.2.4 科学推理\n\n[数学推理（返回顶部）](#32-数学推理)\n\n- `2023\u002F07` | `SciBench` | [SciBench：评估大型语言模型的大学水平科学问题解决能力](https:\u002F\u002Farxiv.org\u002Fabs\u002F2307.10635)\n\\-\n\n- `2022\u002F09` | `ScienceQA` | [学会解释：基于思维链的多模态推理在科学问答中的应用](https:\u002F\u002Farxiv.org\u002Fabs\u002F2209.09513)\n\n- `2022\u002F03` | `ScienceWorld` | [ScienceWorld：你的智能体比五年级学生更聪明吗？](https:\u002F\u002Farxiv.org\u002Fabs\u002F2203.07540)\n\n- `2012` | 儿童学习与认知的前沿课题\n\\-\n[[书籍](https:\u002F\u002Fwww.intechopen.com\u002Fbooks\u002F654)]\n\n#### 3.2.x 基准、数据集和指标\n\n[数学推理（返回顶部）](#32-数学推理)\n\n- `2024\u002F01` | `MathBench`\n![Star](https:\u002F\u002Fimg.shields.io\u002Fgithub\u002Fstars\u002Fopen-compass\u002FMathBench.svg?style=social&label=Star) \u003Cbr>\nMathBench：一个全面的多级别难度数学评估数据集 \u003Cbr>\n[[代码](https:\u002F\u002Fgithub.com\u002Fopen-compass\u002FMathBench)]\n\n- `2023\u002F08` | `Math23K-F` \u002F `MAWPS-F` \u002F `FOMAS` | [通过掌握常识公式知识引导数学推理](https:\u002F\u002Fdl.acm.org\u002Fdoi\u002Fabs\u002F10.1145\u002F3580305.3599375)\n\\-\n[[论文](https:\u002F\u002Fdl.acm.org\u002Fdoi\u002Fpdf\u002F10.1145\u002F3580305.3599375)]\n\n- `2023\u002F07` | `ARB` | 
[ARB：面向大型语言模型的高级推理基准](https:\u002F\u002Farxiv.org\u002Fabs\u002F2307.13692)\n\\-\n\n- `2023\u002F05` | `SwiftSage` | [SwiftSage：一种具有快慢思维的生成式智能体，适用于复杂交互任务](https:\u002F\u002Farxiv.org\u002Fabs\u002F2305.17390)\n\\-\n\n- `2023\u002F05` | `TheoremQA` | [TheoremQA：一个以定理为导向的问答数据集](https:\u002F\u002Farxiv.org\u002Fabs\u002F2305.12524)\n\\-\n\n- `2022\u002F10` | `MGSM` | [语言模型是多语言的思维链推理者](https:\u002F\u002Farxiv.org\u002Fabs\u002F2210.03057)\n\\-\n[[论文](https:\u002F\u002Fopenreview.net\u002Fpdf?id=fR3wGCk-IXp)]\n[[代码](https:\u002F\u002Fgithub.com\u002Fgoogle-research\u002Furl-nlp)]\n\n- `2021\u002F10` | `GSM8K` | [训练验证器解决数学文字题](https:\u002F\u002Farxiv.org\u002Fabs\u002F2110.14168)\n\\-\n[[论文](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2110.14168.pdf)]\n[[代码](https:\u002F\u002Fgithub.com\u002Fopenai\u002Fgrade-school-math)]\n[[博客](https:\u002F\u002Fopenai.com\u002Fresearch\u002Fsolving-math-word-problems)]\n\n- `2021\u002F10` | `IconQA` | [IconQA：一个新的抽象图表理解和视觉语言推理基准](https:\u002F\u002Farxiv.org\u002Fabs\u002F2110.13214)\n\\-\n\n- `2021\u002F09` | `FinQA` | [FinQA：一个关于金融数据的数值推理数据集](https:\u002F\u002Farxiv.org\u002Fabs\u002F2109.00122)\n\\-\n\n- `2021\u002F08` | `MBPP` \u002F `MathQA-Python` | [使用大型语言模型进行程序合成](https:\u002F\u002Farxiv.org\u002Fabs\u002F2108.07732)\n\n- `2021\u002F08` | `HiTab` \u002F `EA` | [HiTab：一个用于问答和自然语言生成的层次化表格数据集](https:\u002F\u002Farxiv.org\u002Fabs\u002F2108.06712)\n\n- `2021\u002F07` | `HumanEval` \u002F `Codex` | [评估基于代码训练的大型语言模型](https:\u002F\u002Farxiv.org\u002Fabs\u002F2107.03374)\n\\-\n\n- `2021\u002F06` | `ASDiv` \u002F `CLD` | [一个多样化的语料库，用于评估和开发英语数学文字题求解器](https:\u002F\u002Farxiv.org\u002Fabs\u002F2106.15772)\n\\-\n\n- `2021\u002F06` | `AIT-QA` | [AIT-QA：一个关于航空业复杂表格的问答数据集](https:\u002F\u002Farxiv.org\u002Fabs\u002F2106.12944)\n\\-\n\n- `2021\u002F05` | `APPS` | [用 APPS 衡量编码挑战能力](https:\u002F\u002Farxiv.org\u002Fabs\u002F2105.09938)\n\\-\n\n- `2021\u002F05` | `TAT-QA` | [TAT-QA：一个关于金融领域表格与文本混合内容的问答基准](https:\u002F\u002Farxiv.org\u002Fabs\u002F2105.07624)\n\n- `2021\u002F03` | `SVAMP` | [NLP 模型真的能解决简单的数学文字题吗？](https:\u002F\u002Farxiv.org\u002Fabs\u002F2103.07191)\n\\-\n\n- `2021\u002F01` | `TSQA` \u002F `MAP` \u002F `MRR` | [TSQA：基于表格场景的问答](https:\u002F\u002Farxiv.org\u002Fabs\u002F2101.11429)\n\n- `2020\u002F10` | `HMWP` | [语义对齐的通用树状结构数学文字题求解器](https:\u002F\u002Farxiv.org\u002Fabs\u002F2010.06823)\n\\-\n\n- `2020\u002F04` | `HybridQA` | [HybridQA：一个关于表格和文本数据的多跳问答数据集](https:\u002F\u002Farxiv.org\u002Fabs\u002F2004.07347)\n\n- `2019\u002F03` | `DROP` | [DROP：一个需要对段落进行离散推理的阅读理解基准](https:\u002F\u002Farxiv.org\u002Fabs\u002F1903.00161)\n\\-\n\n- `2019` | `NaturalQuestions` | [自然问题：一个用于问答研究的基准](https:\u002F\u002Faclanthology.org\u002FQ19-1026\u002F)\n\\-\n[[论文](https:\u002F\u002Faclanthology.org\u002FQ19-1026.pdf)]\n\n- `2018\u002F09` | `HotpotQA` | [HotpotQA：一个用于多样化、可解释的多跳问答的数据集](https:\u002F\u002Farxiv.org\u002Fabs\u002F1809.09600)\n\\-\n\n- `2018\u002F09` | `Spider` | [Spider：一个大规模的人工标注数据集，用于复杂且跨领域的语义解析和文本到 SQL 任务](https:\u002F\u002Farxiv.org\u002Fabs\u002F1809.08887)\n\\-\n\n- `2018\u002F03` | `ComplexWebQuestions` | [网络作为解答复杂问题的知识库](https:\u002F\u002Farxiv.org\u002Fabs\u002F1803.06643)\n\\-\n\n- `2017\u002F12` | `MetaQA` | [基于知识图谱的变分推理用于问答](https:\u002F\u002Farxiv.org\u002Fabs\u002F1709.04071)\n\\-\n\n- `2017\u002F09` | `GEOS++` | [从教科书到知识：以教科书中的公理化知识为例，用于解决几何问题](https:\u002F\u002Faclanthology.org\u002FD17-1081\u002F)\n\\-\n[[论文](https:\u002F\u002Faclanthology.org\u002FD17-1081.pdf)]\n\n- `2017\u002F09` | `Math23k` | 
[深度神经网络求解数学文字题](https:\u002F\u002Faclanthology.org\u002FD17-1088\u002F)\n\\-\n[[论文](https:\u002F\u002Faclanthology.org\u002FD17-1088.pdf)]\n\n- `2017\u002F08` | `WikiSQL` \u002F `Seq2SQL` | [Seq2SQL：利用强化学习从自然语言生成结构化查询](https:\u002F\u002Farxiv.org\u002Fabs\u002F1709.00103)\n\\-\n\n- `2017\u002F08` | [从教科书中的自然语言演示中学习如何解决几何问题](https:\u002F\u002Faclanthology.org\u002FS17-1029\u002F)\n\\-\n[[论文](https:\u002F\u002Faclanthology.org\u002FS17-1029.pdf)]\n\n- `2017\u002F05` | `TriviaQA` | [TriviaQA：用于阅读理解的大规模远距离监督挑战数据集](https:\u002F\u002Farxiv.org\u002Fabs\u002F1705.03551)\n\\-\n\n- `2017\u002F05` | `GeoShader` | 阴影区域几何问题解法的综合\n\\-\n[[论文](https:\u002F\u002Fcdn.aaai.org\u002Focs\u002F15416\u002F15416-68619-1-PB.pdf)]\n\n- `2016\u002F09` | `DRAW-1K` | [标注推导过程：代数文字题的新评估策略与数据集](https:\u002F\u002Farxiv.org\u002Fabs\u002F1609.07197)\n\\-\n\n- `2016\u002F08` | `WebQSP` | [语义解析标注在知识库问答中的价值](https:\u002F\u002Faclanthology.org\u002FP16-2033\u002F)\n\\-\n[[论文](https:\u002F\u002Faclanthology.org\u002FP16-2033.pdf)]\n\n- `2016\u002F06` | `SQuAD` | [SQuAD：超过10万个用于机器阅读理解的问题](https:\u002F\u002Farxiv.org\u002Fabs\u002F1606.05250)\n\\-\n\n- `2016\u002F06` | `WikiMovies` | [用于直接阅读文档的键值记忆网络](https:\u002F\u002Farxiv.org\u002Fabs\u002F1606.03126)\n\\-\n\n- `2016\u002F06` | `MAWPS` | [MAWPS：数学文字题库](https:\u002F\u002Faclanthology.org\u002FN16-1136\u002F)\n\\-\n[[论文](https:\u002F\u002Faclanthology.org\u002FN16-1136.pdf)]\n\n- `2015\u002F09` | `Dolphin1878` | [通过语义解析与推理自动求解数字文字题](https:\u002F\u002Faclanthology.org\u002FD15-1135\u002F)\n\\-\n[[论文](https:\u002F\u002Faclanthology.org\u002FD15-1135.pdf)]\n\n- `2015\u002F08` | `WikiTableQA` | [半结构化表格上的组合语义解析](https:\u002F\u002Farxiv.org\u002Fabs\u002F1508.00305)\n\\-\n\n- `2015` | `SingleEQ` | [将代数文字题解析为方程](https:\u002F\u002Faclanthology.org\u002FQ15-1042\u002F)\n\\-\n[[论文](https:\u002F\u002Faclanthology.org\u002FQ15-1042.pdf)]\n\n- `2015` | `DRAW` | DRAW：一套具有挑战性且多样化的代数文字题集\n\\-\n[[论文](https:\u002F\u002Fwww.microsoft.com\u002Fen-us\u002Fresearch\u002Fwp-content\u002Fuploads\u002F2016\u002F02\u002Ftech_rep.pdf)]\n\n- `2014\u002F10` | `Verb395` | [通过动词分类学习解决算术文字题](https:\u002F\u002Faclanthology.org\u002FD14-1058\u002F)\n\\-\n[[论文](https:\u002F\u002Faclanthology.org\u002FD14-1058.pdf)]\n\n- `2013\u002F10` | `WebQuestions` | [基于问题—答案对的Freebase语义解析](https:\u002F\u002Faclanthology.org\u002FD13-1160\u002F)\n\\-\n[[论文](https:\u002F\u002Faclanthology.org\u002FD13-1160.pdf)]\n\n- `2013\u002F08` | `Free917` | [通过模式匹配和词汇扩展进行大规模语义解析](https:\u002F\u002Faclanthology.org\u002FP13-1042\u002F)\n\\-\n[[论文](https:\u002F\u002Faclanthology.org\u002FP13-1042.pdf)]\n\n- `2002\u002F04` | `NMI` | [聚类集成——一种结合多个划分的知识重用框架](https:\u002F\u002Fdl.acm.org\u002Fdoi\u002F10.1162\u002F153244303321897735)\n\\-\n[[论文](https:\u002F\u002Fwww.jmlr.org\u002Fpapers\u002Fvolume3\u002Fstrehl02a\u002Fstrehl02a.pdf)]\n\n- `1990` | `ATIS` | [ATIS口语语言系统试点语料库](https:\u002F\u002Faclanthology.org\u002FH90-1021\u002F)\n\\-\n[[论文](https:\u002F\u002Faclanthology.org\u002FH90-1021.pdf)]\n\n---\n\n\u003C\u002Fdetails>\n\n\u003C!--  -->\n\n### 3.3 逻辑推理\n\n\u003Cdetails open>\n\u003Csummary>逻辑推理\u003C\u002Fsummary>\n\n[推理任务（返回顶部）](#3-推理任务)\n\n- [3.3 逻辑推理](#33-logical-reasoning)\n  - [3.3.1 命题逻辑](#331-propositional-logic)\n  - [3.3.2 谓词逻辑](#332-predicate-logic)\n  - [3.3.x 基准、数据集和指标](#33x-benchmarks-datasets-and-metrics)\n\n\u003Cbr>\n\n- `2024\u002F12` | `FLDx2` | [通过原则性的合成逻辑语料库增强大语言模型的推理能力](https:\u002F\u002Farxiv.org\u002Fabs\u002F2411.12498)\n\\-\n\n- `2023\u002F11` | `FLD` | 
[基于形式逻辑的合成语料库学习演绎推理](https:\u002F\u002Farxiv.org\u002Fabs\u002F2308.07336)\n\\-\n\n- `2023\u002F10` | `LogiGLUE` | [迈向 LogiGLUE：关于分析语言模型逻辑推理能力的简要综述与基准测试](https:\u002F\u002Farxiv.org\u002Fabs\u002F2310.00836)\n\\-\n\n- `2023\u002F05` | `LogicLLM` | [LogicLLM：探索用于大语言模型的自监督逻辑增强训练](https:\u002F\u002Farxiv.org\u002Fabs\u002F2305.13718)\n\\-\n\n- `2023\u002F05` | `Logic-LM` | [Logic-LM：利用符号求解器赋能大语言模型实现忠实的逻辑推理](https:\u002F\u002Farxiv.org\u002Fabs\u002F2305.12295)\n\\-\n\n- `2023\u002F03` | `LEAP` | [显式规划有助于语言模型进行逻辑推理](https:\u002F\u002Farxiv.org\u002Fabs\u002F2303.15714)\n\\-\n\n- `2023\u002F03` | [通用人工智能的火花：GPT-4 的早期实验](https:\u002F\u002Farxiv.org\u002Fabs\u002F2303.12712)\n\\-\n\n- `2022\u002F10` | `Entailer` | [Entailer：以忠实且真实的推理链回答问题](https:\u002F\u002Farxiv.org\u002Fabs\u002F2210.12217)\n\\-\n\n- `2022\u002F06` | `NeSyL` | [面向认知任务的弱监督神经符号学习](https:\u002F\u002Fojs.aaai.org\u002Findex.php\u002FAAAI\u002Farticle\u002Fview\u002F20533)\n\\-\n[[论文](https:\u002F\u002Fojs.aaai.org\u002Findex.php\u002FAAAI\u002Farticle\u002Fview\u002F20533\u002F20292)]\n\n- `2022\u002F05` | `NeuPSL` | [NeuPSL：神经概率软逻辑](https:\u002F\u002Farxiv.org\u002Fabs\u002F2205.14268)\n\\-\n\n- `2022\u002F05` | `NLProofS` | [通过验证者引导的搜索生成自然语言证明](https:\u002F\u002Farxiv.org\u002Fabs\u002F2205.12443)\n\\-\n\n- `2022\u002F05` | `Least-to-Most Prompting` | [由浅入深提示法使大语言模型具备复杂推理能力](https:\u002F\u002Farxiv.org\u002Fabs\u002F2205.10625)\n\\-\n\n- `2022\u002F05` | `SI` | [选择—推理：利用大语言模型实现可解释的逻辑推理](https:\u002F\u002Farxiv.org\u002Fabs\u002F2205.09712)\n\\-\n\n- `2022\u002F05` | `MERIt` | [MERIt：元路径引导的对比学习用于逻辑推理](https:\u002F\u002Faclanthology.org\u002F2022.findings-acl.276\u002F)\n\\-\n\n- `2022\u002F03` | [自我一致性提升语言模型的思维链推理能力](https:\u002F\u002Farxiv.org\u002Fabs\u002F2203.11171)\n\\-\n\n- `2021\u002F11` | `NSPS` | [神经符号程序搜索在自动驾驶决策模块设计中的应用](https:\u002F\u002Fproceedings.mlr.press\u002Fv155\u002Fsun21a.html)\n\\-\n[[论文](https:\u002F\u002Fproceedings.mlr.press\u002Fv155\u002Fsun21a\u002Fsun21a.pdf)]\n\n- `2021\u002F09` | `DeepProbLog` | [DeepProbLog 中的神经概率逻辑编程](https:\u002F\u002Fwww.sciencedirect.com\u002Fscience\u002Farticle\u002Fpii\u002FS0004370221000552)\n\\-\n[[论文](https:\u002F\u002Fwww.sciencedirect.com\u002Fscience\u002Farticle\u002Fpii\u002FS0004370221000552\u002Fpdfft?md5=1e6b82d50854f317478e487da9e75473&pid=1-s2.0-S0004370221000552-main.pdf)]\n\n- `2021\u002F08` | `GABL` | [基于基础知识库的溯因学习](https:\u002F\u002Fwww.ijcai.org\u002Fproceedings\u002F2021\u002F250)\n\\-\n[[论文](https:\u002F\u002Fwww.ijcai.org\u002Fproceedings\u002F2021\u002F0250.pdf)]\n\n- `2021\u002F05` | `LReasoner` | [基于逻辑的上下文扩展与数据增强用于文本的逻辑推理](https:\u002F\u002Farxiv.org\u002Fabs\u002F2105.03659)\n\\-\n\n- `2020\u002F02` | `RuleTakers` | [Transformer 作为语言上的软推理器](https:\u002F\u002Farxiv.org\u002Fabs\u002F2002.05867)\n\\-\n\n- `2019\u002F12` | `NMN-Drop` | [用于文本推理的神经模块网络](https:\u002F\u002Farxiv.org\u002Fabs\u002F1912.04971)\n\\-\n\n- `2019\u002F04` | `NS-CL` | [神经符号概念学习者：从自然监督中理解场景、词语和句子](https:\u002F\u002Farxiv.org\u002Fabs\u002F1904.12584)\n\\-\n\n- `2012` | 逻辑推理与学习\n\\-\n[[论文](https:\u002F\u002Flink.springer.com\u002Freferenceworkentry\u002F10.1007\u002F978-1-4419-1428-6_790#:~:text=Logical%20reasoning%20is%20a%20form,of%20science%20and%20artificial%20intelligence.)]\n\n#### 3.3.1 命题逻辑\n\n- `2022\u002F09` | 通过神经 Transformer 语言模型进行命题推理\n\\-\n[[论文](https:\u002F\u002Fwww.cs.cmu.edu\u002F~oscarr\u002Fpdf\u002Fpublications\u002F2022_nesy.pdf)]\n\n#### 3.3.2 谓词逻辑\n\n- `2021\u002F06` | `ILP` | 
[归纳逻辑编程三十周年](https:\u002F\u002Flink.springer.com\u002Farticle\u002F10.1007\u002Fs10994-021-06089-1)\n\\-\n[[论文](https:\u002F\u002Flink.springer.com\u002Fcontent\u002Fpdf\u002F10.1007\u002Fs10994-021-06089-1.pdf)]\n\n- `2011` | 统计关系学习\n\\-\n[[论文](https:\u002F\u002Flink.springer.com\u002Freferenceworkentry\u002F10.1007\u002F978-0-387-30164-8_786)]\n\n#### 3.3.x 基准、数据集和指标\n\n- `2022\u002F10` | `PrOntoQA` | [语言模型是贪婪的推理者：对思维链的系统性形式化分析](https:\u002F\u002Farxiv.org\u002Fabs\u002F2210.01240)\n\\-\n\n- `2022\u002F09` | `FOLIO` | [FOLIO：使用一阶逻辑进行自然语言推理](https:\u002F\u002Farxiv.org\u002Fabs\u002F2209.00840)\n\\-\n\n- `2022\u002F06` | `BIG-bench` | [超越模仿游戏：量化并外推语言模型的能力](https:\u002F\u002Farxiv.org\u002Fabs\u002F2206.04615)\n\\-\n[[论文](https:\u002F\u002Fopenreview.net\u002Fpdf?id=uyTL5Bvosj)]\n[[代码](https:\u002F\u002Fgithub.com\u002Fgoogle\u002FBIG-bench)]\n\n- `2021\u002F04` | `AR-LSAT` | [AR-LSAT：探究文本的分析推理能力](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2104.06598)\n\\-\n[[论文](https:\u002F\u002Faclanthology.org\u002F2022.findings-naacl.177\u002F)]\n[[代码](https:\u002F\u002Fgithub.com\u002Fzhongwanjun\u002FAR-LSAT)]\n\n- `2020\u002F12` | `ProofWriter` [ProofWriter：在自然语言上生成蕴含、证明和溯因陈述](https:\u002F\u002Farxiv.org\u002Fabs\u002F2012.13048)\n\\-\n\n---\n\n\u003C\u002Fdetails>\n\n\u003C!--  -->\n\n### 3.4 因果推理\n\n\u003Cdetails open>\n\u003Csummary>因果推理\u003C\u002Fsummary>\n\n[推理任务（返回顶部）](#3-reasoning-tasks)\n\n- [3.4 因果推理](#34-causal-reasoning)\n  - [3.4.1 反事实推理](#341-counterfactual-reasoning)\n  - [3.4.x 基准、数据集和指标](#34x-benchmarks-datasets-and-metrics)\n\n\u003Cbr>\n\n- `2023\u002F08` | [因果鹦鹉：大型语言模型可能谈论因果关系，但并不具备真正的因果能力](https:\u002F\u002Farxiv.org\u002Fabs\u002F2308.13067)\n\n- `2023\u002F07` | [利用语言模型作为不完美专家进行因果发现](https:\u002F\u002Farxiv.org\u002Fabs\u002F2307.02390)\n\\-\n\n- `2023\u002F06` | [从查询工具到因果架构师：利用大型语言模型从数据中进行高级因果发现](https:\u002F\u002Farxiv.org\u002Fabs\u002F2306.16902)\n\\-\n\n- `2023\u002F06` | `Corr2Cause` | [大型语言模型能否从相关性中推断出因果关系？](https:\u002F\u002Farxiv.org\u002Fabs\u002F2306.05836)\n\\-\n\n- `2023\u002F05` | `Code-LLMs` | [IF的魔力：探究代码类大型语言模型的因果推理能力](https:\u002F\u002Farxiv.org\u002Fabs\u002F2305.19213)\n\\-\n\n- `2023\u002F04` | [利用大型语言模型理解因果关系：可行性与机遇](https:\u002F\u002Farxiv.org\u002Fabs\u002F2304.05524)\n\\-\n\n- `2023\u002F04` | [因果推理与大型语言模型：开启因果关系研究的新前沿](https:\u002F\u002Farxiv.org\u002Fabs\u002F2305.00050)\n\\-\n\n- `2023\u002F03` | [大型语言模型能否构建因果图？](https:\u002F\u002Farxiv.org\u002Fabs\u002F2303.05279)\n\\-\n\n- `2023\u002F01` | [ChatGPT在神经性疼痛诊断背景下的因果发现性能](https:\u002F\u002Farxiv.org\u002Fabs\u002F2301.13819)\n\\-\n\n- `2022\u002F09` | [探查因果事实的相关性：大型语言模型与因果关系](https:\u002F\u002Fopenreview.net\u002Fforum?id=UPwzqPOs4-)\n\\-\n[[论文](https:\u002F\u002Fopenreview.net\u002Fpdf?id=UPwzqPOs4-)]\n\n- `2022\u002F07` | [大型语言模型能否区分原因与结果？](https:\u002F\u002Fopenreview.net\u002Fforum?id=ucHh-ytUkOH&referrer=%5Bthe%20profile%20of%20Mrinmaya%20Sachan%5D(%2Fprofile%3Fid%3D~Mrinmaya_Sachan3))\n\\-\n[[论文](https:\u002F\u002Fopenreview.net\u002Fpdf?id=ucHh-ytUkOH)]\n\n- `2021\u002F08` | [学习因果图的忠实表示](https:\u002F\u002Faclanthology.org\u002F2021.acl-long.69\u002F)\n\\-\n[[论文](https:\u002F\u002Faclanthology.org\u002F2021.acl-long.69.pdf)]\n\n- `2021\u002F05` | `InferBERT` | [InferBERT：一种基于Transformer的因果推断框架，用于增强药物警戒](https:\u002F\u002Fwww.frontiersin.org\u002Farticles\u002F10.3389\u002Ffrai.2021.659622\u002Ffull)\n\\-\n[[论文](https:\u002F\u002Fwww.frontiersin.org\u002Farticles\u002F10.3389\u002Ffrai.2021.659622\u002Fpdf?isPublishedV2=False)]\n\n- `2021\u002F02` | 
[迈向因果表征学习](https:\u002F\u002Farxiv.org\u002Fabs\u002F2102.11107)\n\\-\n\n- `2020\u002F05` | `CausaLM` | [CausaLM：通过反事实语言模型解释因果模型](https:\u002F\u002Farxiv.org\u002Fabs\u002F2005.13407)\n\\-\n\n- `2019\u002F06` | [用于评估因果发现算法的神经性疼痛诊断模拟器](https:\u002F\u002Farxiv.org\u002Fabs\u002F1906.01732)\n\\-\n\n- `2017` | [因果推断要素：基础与学习算法](https:\u002F\u002Fmitpress.mit.edu\u002F9780262037310\u002Felements-of-causal-inference\u002F)\n\\-\n[[书籍](https:\u002F\u002Flibrary.oapen.org\u002Fbitstream\u002Fid\u002F056a11be-ce3a-44b9-8987-a6c68fce8d9b\u002F11283.pdf)]\n\n- `2016` | 实际因果关系\n\\-\n[[书籍](https:\u002F\u002Fdirect.mit.edu\u002Fbooks\u002Foa-monograph\u002F3451\u002FActual-Causality)]\n\n- `2013` | 因果推理\n\\-\n[[论文](https:\u002F\u002Fpsycnet.apa.org\u002Frecord\u002F2012-26298-046)]\n\n#### 3.4.1 反事实推理\n\n- `2023\u002F07` | [推理还是背诵？通过反事实任务探索语言模型的能力与局限性](https:\u002F\u002Farxiv.org\u002Fabs\u002F2307.02477)\n\\-\n\n- `2023\u002F05` | [反事实推理：测试语言模型对假设情景的理解](https:\u002F\u002Farxiv.org\u002Fabs\u002F2305.16572)\n\\-\n\n- `2007` | 理性的想象：人们如何创造现实的替代方案\n\\-\n[[论文](https:\u002F\u002Fscholar.archive.org\u002Fwork\u002Fzjwdgk7r6vefxaole362qftqji\u002Faccess\u002Fwayback\u002Fhttp:\u002F\u002Fwww.tara.tcd.ie\u002Fbitstream\u002Fhandle\u002F2262\u002F39428\u002FPrecis%20of%20The%20Rational%20Imagination%20-%20How%20People%20Create%20Alternatives%20to%20Reality.pdf?sequence=1)]\n\n- `1986` | 规范理论：将现实与其替代方案进行比较\n\\-\n[[论文](https:\u002F\u002Fpsycnet.apa.org\u002Frecord\u002F1986-21899-001)]\n\n#### 3.4.x 基准、数据集和指标\n\n- `2021\u002F12` | `CRASS` | [CRASS：一个用于测试大型语言模型反事实推理能力的新数据集和基准](https:\u002F\u002Farxiv.org\u002Fabs\u002F2112.11941)\n\\-\n\n- `2021\u002F08` | `Arctic sea ice` | [北极海冰与大气相互作用中数据驱动的因果发现方法的基准测试](https:\u002F\u002Fwww.frontiersin.org\u002Farticles\u002F10.3389\u002Ffdata.2021.642182\u002Ffull)\n\\-\n[[论文](https:\u002F\u002Fwww.frontiersin.org\u002Farticles\u002F10.3389\u002Ffdata.2021.642182\u002Fpdf?isPublishedV2=False)]\n\n- `2014\u002F12` | `CauseEffectPairs` | [利用观测数据区分原因与结果：方法与基准](https:\u002F\u002Farxiv.org\u002Fabs\u002F1412.3773)\n\\-\n\n---\n\n\u003C\u002Fdetails>\n\n\u003C!--  -->\n\n### 3.5 视觉推理\n\n\u003Cdetails open>\n\u003Csummary>视觉推理\u003C\u002Fsummary>\n\n[推理任务（返回顶部）](#3-reasoning-tasks)\n\n- [3.5 视觉推理](#35-visual-reasoning)\n  - [3.5.1 3D 推理](#351-3d-reasoning)\n  - [3.5.x 基准、数据集和指标](#35x-benchmarks-datasets-and-metrics)\n\n\u003Cbr>\n\n- `2025\u002F02` | `VPT` | [在多模态大语言模型中引入视觉感知标记](https:\u002F\u002Farxiv.org\u002Fabs\u002F2502.17425) - [[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2502.17425)] - [[代码](https:\u002F\u002Fgithub.com\u002Fyu-rp\u002FVisualPerceptionToken)] - [[模型](https:\u002F\u002Fhuggingface.co\u002Fcollections\u002Frp-yu\u002Fvpt-models-67b6afdc8679a05a2876f07a)] - [[数据集](https:\u002F\u002Fhuggingface.co\u002Fdatasets\u002Frp-yu\u002FVPT_Datasets)]\n\n- `2022\u002F11` | `G-VUE` | [感知、定位、推理与行动：通用视觉表征的基准测试](https:\u002F\u002Farxiv.org\u002Fabs\u002F2211.15402)\n\\-\n\n- `2021\u002F03` | `VLGrammar` | [VLGrammar：视觉与语言的接地语法归纳](https:\u002F\u002Farxiv.org\u002Fabs\u002F2103.12975)\n\\-\n\n- `2020\u002F12` | [基于学习到的对象嵌入的注意力机制可实现复杂的视觉推理](https:\u002F\u002Farxiv.org\u002Fabs\u002F2012.08508)\n\\-\n\n#### 3.5.1 3D 推理\n\n- `2023\u002F08` | `PointLLM` | [PointLLM：赋能大语言模型理解点云](https:\u002F\u002Farxiv.org\u002Fabs\u002F2308.16911)\n\\-\n\n- `2023\u002F08` | `3D-VisTA` | [3D-VisTA：用于3D视觉与文本对齐的预训练Transformer](https:\u002F\u002Farxiv.org\u002Fabs\u002F2308.04352)\n\\-\n\n- `2023\u002F07` | `3D-LLM` | [3D-LLM：将3D世界注入大语言模型](https:\u002F\u002Farxiv.org\u002Fabs\u002F2307.12981)\n\\-\n\n- 
`2022\u002F10` | `SQA3D` | [SQA3D：3D场景中的情境问答](https:\u002F\u002Farxiv.org\u002Fabs\u002F2210.07474)\n\\-\n\n#### 3.5.x 基准、数据集和指标\n- `2025\u002F04` | `VisuLogic` | [VisuLogic：评估多模态大语言模型中视觉推理能力的基准测试](https:\u002F\u002Farxiv.org\u002Fabs\u002F2504.15279)\n\\-\n\n- `2021\u002F12` | `PTR` | [PTR：基于部件的概念、关系及物理推理的基准测试](https:\u002F\u002Farxiv.org\u002Fabs\u002F2112.05136)\n\\-\n\n- `2019\u002F05` | `OK-VQA` | [OK-VQA：需要外部知识的视觉问答基准测试](https:\u002F\u002Farxiv.org\u002Fabs\u002F1906.00067)\n\\-\n\n- `2016\u002F12` | `CLEVR` | [CLEVR：用于组合语言和基础视觉推理的诊断性数据集](https:\u002F\u002Farxiv.org\u002Fabs\u002F1612.06890)\n\\-\n\n---\n\n\u003C\u002Fdetails>\n\n\u003C!--  -->\n### 3.6 音频推理\n\n\u003Cdetails open>\n\u003Csummary>音频推理\u003C\u002Fsummary>\n\n[推理任务（返回顶部）](#3-reasoning-tasks)\n\n- [3.6 音频推理](#36-audio-reasoning)\n  - [3.6.1 语音](#361-speech)\n  - [3.6.x 基准、数据集和指标](#36x-benchmarks-datasets-and-metrics)\n\n\u003Cbr>\n\n- `2023\u002F11` | `M2UGen` | [M2UGen：利用大语言模型的力量进行多模态音乐理解和生成](https:\u002F\u002Farxiv.org\u002Fabs\u002F2311.11255)\n\\-\n[[论文](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2311.11255.pdf)]\n[[代码](https:\u002F\u002Fgithub.com\u002Fcrypto-code\u002FM2UGen)]\n\n- `2023\u002F08` | `MU-LLaMA` | [音乐理解LLaMA：通过问答和字幕生成推进文本到音乐的创作](https:\u002F\u002Farxiv.org\u002Fabs\u002F2308.11276)\n\\-\n[[论文](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2308.11276.pdf)]\n[[代码](https:\u002F\u002Fgithub.com\u002Fcrypto-code\u002FMU-LLaMA)]\n\n- `2022\u002F05` | [自监督语音表示学习：综述](https:\u002F\u002Farxiv.org\u002Fabs\u002F2205.10643)\n\\-\n\n#### 3.6.1 语音\n\n- `2022\u002F03` | `SUPERB-SG` | [SUPERB-SG：增强版语音处理通用性能基准，适用于语义和生成能力](https:\u002F\u002Farxiv.org\u002Fabs\u002F2203.06849)\n\\-\n\n- `2022\u002F02` | `Data2Vec` | [data2vec：一种用于语音、视觉和语言的自监督学习通用框架](https:\u002F\u002Farxiv.org\u002Fabs\u002F2202.03555)\n\\-\n\n- `2021\u002F10` | `WavLM` | [WavLM：面向全栈语音处理的大规模自监督预训练](https:\u002F\u002Farxiv.org\u002Fabs\u002F2110.13900)\n\\-\n\n- `2021\u002F06` | `HuBERT` | [HuBERT：通过掩码预测隐藏单元进行自监督语音表示学习](https:\u002F\u002Farxiv.org\u002Fabs\u002F2106.07447)\n\\-\n\n- `2021\u002F05` | `SUPERB` | [SUPERB：语音处理通用性能基准](https:\u002F\u002Farxiv.org\u002Fabs\u002F2105.01051)\n\\-\n\n- `2020\u002F10` | `Speech SIMCLR` | [Speech SIMCLR：结合对比和重建目标的自监督语音表示学习](https:\u002F\u002Farxiv.org\u002Fabs\u002F2010.13991)\n\\-\n\n- `2020\u002F06` | `Wav2Vec 2.0` | [wav2vec 2.0：语音表示自监督学习的框架](https:\u002F\u002Farxiv.org\u002Fabs\u002F2006.11477)\n\\-\n\n- `2020\u002F05` | `Conformer` | [Conformer：用于语音识别的卷积增强型Transformer](https:\u002F\u002Farxiv.org\u002Fabs\u002F2005.08100)\n\\-\n\n- `2019\u002F10` | `Mockingjay` | [Mockingjay：使用深度双向Transformer编码器进行无监督语音表示学习](https:\u002F\u002Farxiv.org\u002Fabs\u002F1910.12638)\n\\-\n\n- `2019\u002F04` | `APC` | [一种用于语音表示学习的无监督自回归模型](https:\u002F\u002Farxiv.org\u002Fabs\u002F1904.03240)\n\\-\n\n- `2018\u002F07` | `CPC` | [通过对比预测编码进行表示学习](https:\u002F\u002Farxiv.org\u002Fabs\u002F1807.03748)\n\\-\n\n- `2018\u002F04` | `Speech-Transformer` | Speech-Transformer：一种用于语音识别的无循环序列到序列模型\n\\-\n[[论文](https:\u002F\u002Fieeexplore.ieee.org\u002Fdocument\u002F8462506)]\n\n- `2017\u002F11` | `VQ-VAE` | [神经离散表示学习](https:\u002F\u002Farxiv.org\u002Fabs\u002F1711.00937)\n\\-\n\n- `2017\u002F08` | [通过师生学习进行大规模领域适应](https:\u002F\u002Farxiv.org\u002Fabs\u002F1708.05466)\n\\-\n\n#### 3.6.x 基准、数据集和指标\n\n- `2022\u002F03` | `SUPERB-SG` | [SUPERB-SG：增强版语音处理通用性能基准，适用于语义和生成能力](https:\u002F\u002Farxiv.org\u002Fabs\u002F2203.06849)\n\\-\n\n- `2021\u002F11` | `VoxPopuli` \u002F `XLS-R` | 
[XLS-R：大规模自监督跨语言语音表示学习](https:\u002F\u002Farxiv.org\u002Fabs\u002F2111.09296)\n\\-\n\n- `2021\u002F05` | `SUPERB` | [SUPERB：语音处理通用性能基准](https:\u002F\u002Farxiv.org\u002Fabs\u002F2105.01051)\n\\-\n\n- `2020\u002F12` | `Multilingual LibriSpeech` | [MLS：用于语音研究的大规模多语言数据集](https:\u002F\u002Farxiv.org\u002Fabs\u002F2012.03411)\n\\-\n\n- `2020\u002F05` | `Didi Dictation` \u002F `Didi Callcenter` | [关于基于Transformer的语音识别中无监督预训练的进一步研究](https:\u002F\u002Farxiv.org\u002Fabs\u002F2005.09862)\n\\-\n\n- `2019\u002F12` | `Libri-Light` | [Libri-Light：用于有限或无监督ASR的基准](https:\u002F\u002Farxiv.org\u002Fabs\u002F1912.07875)\n\\-\n\n- `2019\u002F12` | `Common Voice` | [Common Voice：一个大规模多语言语音语料库](https:\u002F\u002Farxiv.org\u002Fabs\u002F1912.06670)\n\\-\n\n---\n\n\u003C\u002Fdetails>\n\n\u003C!--  -->\n\n### 3.7 多模态推理\n\n\u003Cdetails open>\n\u003Csummary>多模态推理\u003C\u002Fsummary>\n\n[推理任务（返回顶部）](#3-reasoning-tasks)\n\n- [3.7 多模态推理](#37-multimodal-reasoning)\n  - [3.7.1 对齐](#371-alignment)\n  - [3.7.2 生成](#372-generation)\n  - [3.7.3 多模态理解](#373-multimodal-understanding)\n  - [3.7.x 基准、数据集和度量](#37x-benchmarks-datasets-and-metrics)\n\n\u003Cbr>\n\n- `2023\u002F12` | [GPT-4V的挑战者？Gemini在视觉专长中的早期探索]()\n\\-\n[[论文](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2312.12436.pdf)]\n[[项目](https:\u002F\u002Fgithub.com\u002FBradyFU\u002FAwesome-Multimodal-Large-Language-Models)]\n\n#### 3.7.1 对齐\n\n- `2023\u002F01` | `BLIP-2` | [BLIP-2：利用冻结图像编码器和大型语言模型进行语言-图像预训练的自举方法](https:\u002F\u002Farxiv.org\u002Fabs\u002F2301.12597)\n\\-\n[[论文](https:\u002F\u002Fproceedings.mlr.press\u002Fv202\u002Fli23q.html)]\n[[代码](https:\u002F\u002Fgithub.com\u002Fsalesforce\u002FLAVIS\u002Ftree\u002Fmain\u002Fprojects\u002Fblip2)]\n\n#### 3.7.2 生成\n\n- `2023\u002F10` | `DALL·E 3` | 通过更好的描述文字提升图像生成质量\n\\-\n[[论文](https:\u002F\u002Fcdn.openai.com\u002Fpapers\u002Fdall-e-3.pdf)]\n[[项目](https:\u002F\u002Fopenai.com\u002Fdall-e-3)]\n\n- `2023\u002F06` | `Kosmos-2` | [Kosmos-2：将多模态大型语言模型与现实世界联系起来](https:\u002F\u002Farxiv.org\u002Fabs\u002F2306.14824)\n\\-\n\n- `2023\u002F05` | `BiomedGPT` | [BiomedGPT：一种用于视觉、语言和多模态任务的统一且通用的生物医学生成式预训练Transformer](https:\u002F\u002Farxiv.org\u002Fabs\u002F2305.17100)\n\\-\n\n- `2023\u002F03` | `Visual ChatGPT` | [Visual ChatGPT：与视觉基础模型对话、绘图和编辑](https:\u002F\u002Farxiv.org\u002Fabs\u002F2303.04671)\n\\-\n\n- `2023\u002F02` | `Kosmos-1` | [语言并非一切：将感知与语言模型对齐](https:\u002F\u002Farxiv.org\u002Fabs\u002F2302.14045)\n\\-\n\n- `2022\u002F07` | `Midjourney`\n\\-\n[[项目](https:\u002F\u002Fwww.midjourney.com\u002Fhome)]\n\n- `2022\u002F04` | `Flamingo` | [Flamingo：用于少样本学习的视觉语言模型](https:\u002F\u002Farxiv.org\u002Fabs\u002F2204.14198)\n\\-\n\n- `2021\u002F12` | `MAGMA` | [MAGMA——通过基于适配器的微调增强生成模型的多模态能力](https:\u002F\u002Farxiv.org\u002Fabs\u002F2112.05253)\n\\-\n\n#### 3.7.3 多模态理解\n\n- `2023\u002F09` | `Q-Bench` | [Q-Bench：面向低级视觉的通用基础模型基准测试](https:\u002F\u002Farxiv.org\u002Fabs\u002F2309.14181)\n\\-\n[[论文](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2309.14181.pdf)]\n[[代码](https:\u002F\u002Fgithub.com\u002FQ-Future\u002FQ-Bench)]\n\n- `2023\u002F05` | `DetGPT` | [DetGPT：通过推理检测所需内容](https:\u002F\u002Farxiv.org\u002Fabs\u002F2305.14167)\n\\-\n\n- `2023\u002F03` | `Vicuna` | Vicuna：一款开源聊天机器人，以90%*的ChatGPT质量令人印象深刻\n\\-\n[[博客](https:\u002F\u002Flmsys.org\u002Fblog\u002F2023-03-30-vicuna\u002F)]\n[[代码](https:\u002F\u002Fgithub.com\u002Flm-sys\u002FFastChat)]\n\n- `2022\u002F12` | `DePlot` | [DePlot：通过图表到表格的转换实现一次性的视觉语言推理](https:\u002F\u002Farxiv.org\u002Fabs\u002F2212.10505)\n\\-\n\n- `2022\u002F12` | `MatCha` | 
[MatCha：通过数学推理和图表反渲染增强视觉语言预训练](https:\u002F\u002Farxiv.org\u002Fabs\u002F2212.09662)\n\\-\n\n#### 3.7.x 基准、数据集和度量\n\n- `2023\u002F06` | `LVLM-eHub` | [LVLM-eHub：大型视觉-语言模型的综合评估基准](https:\u002F\u002Farxiv.org\u002Fabs\u002F2306.09265)\n\\-\n\n- `2023\u002F06` | `LAMM` | [LAMM：语言辅助的多模态指令微调数据集、框架和基准](https:\u002F\u002Farxiv.org\u002Fabs\u002F2306.06687)\n\\-\n\n- `2023\u002F05` | `AttackVLM` | [关于评估大型视觉-语言模型对抗鲁棒性的研究](https:\u002F\u002Farxiv.org\u002Fabs\u002F2305.16934)\n\\-\n\n- `2023\u002F05` | `POPE` | [评估大型视觉-语言模型中的对象幻觉现象](https:\u002F\u002Farxiv.org\u002Fabs\u002F2305.10355)\n\\-\n\n- `2023\u002F05` | `MultimodalOCR` | [关于大型多模态模型中OCR隐藏奥秘的研究](https:\u002F\u002Farxiv.org\u002Fabs\u002F2305.07895)\n\\-\n\n- `2022\u002F10` | `ObjMLM` | [看似合理未必忠实：探测视觉-语言预训练中的对象幻觉现象](https:\u002F\u002Farxiv.org\u002Fabs\u002F2210.07688)\n\n- `2022\u002F06` | `RAVEN` \u002F `ARC` | [在概念抽象基准上评估理解能力的研究](https:\u002F\u002Farxiv.org\u002Fabs\u002F2206.14187)\n\\-\n\n- `2021\u002F06` | `LARC` | [向人类和机器传达自然程序的研究](https:\u002F\u002Farxiv.org\u002Fabs\u002F2106.07824)\n\\-\n\n- `2014\u002F11` | `CIDEr` \u002F `PASCAL-50S` \u002F `ABSTRACT-50S` | [CIDEr：基于共识的图像描述评估方法](https:\u002F\u002Farxiv.org\u002Fabs\u002F1411.5726)\n\\-\n\n---\n\n\u003C\u002Fdetails>\n\n\u003C!--  -->\n### 3.8 代理推理\n\n\u003Cdetails open>\n\u003Csummary>代理推理\u003C\u002Fsummary>\n\n[推理任务（返回顶部）](#3-reasoning-tasks)\n\n- [3.8 代理推理](#38-agent-reasoning)\n  - [3.8.1 内省式推理](#381-introspective-reasoning)\n  - [3.8.2 外省式推理](#382-extrospective-reasoning)\n  - [3.8.3 多智能体推理](#383-multi-agent-reasoning)\n  - [3.8.4 驾驶推理](#384-driving-reasoning)\n  - [3.8.x 基准、数据集和度量](#38x-benchmarks-datasets-and-metrics)\n\n\u003Cbr>\n\n- `2024\u002F01` | `AutoRT`\n| `Ahn et al.`\n![引用数](https:\u002F\u002Fimg.shields.io\u002Fbadge\u002Fdynamic\u002Fjson?url=https:\u002F\u002Fapi.semanticscholar.org\u002Fgraph\u002Fv1\u002Fpaper\u002FCorpusID:266906759?fields=citationCount&query=%24.citationCount&label=citations) \u003Cbr>\nAutoRT：用于大规模机器人代理编排的具身基础模型 \u003Cbr>\n[[arXiv](https:\u002F\u002Farxiv.org\u002Fabs\u002F2401.12963)]\n[[论文](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2401.12963.pdf)]\n[[项目](https:\u002F\u002Fauto-rt.github.io\u002F)]\n\n- `2023\u002F11` | `OpenFlamingo` | [视觉-语言基础模型作为有效的机器人模仿者](https:\u002F\u002Farxiv.org\u002Fabs\u002F2311.01378)\n\\-\n\n- `2023\u002F07` | `RT-2` | [RT-2：视觉-语言-动作模型将网络知识迁移到机器人控制中](https:\u002F\u002Farxiv.org\u002Fabs\u002F2307.15818)\n\\-\n\n- `2023\u002F05` | `RAP` | [用语言模型进行推理就是用世界模型进行规划](https:\u002F\u002Farxiv.org\u002Fabs\u002F2305.14992)\n\\-\n\n- `2023\u002F03` | `PaLM-E`\n| `Driess et al., ICML 2023`\n![引用数](https:\u002F\u002Fimg.shields.io\u002Fbadge\u002Fdynamic\u002Fjson?url=https:\u002F\u002Fapi.semanticscholar.org\u002Fgraph\u002Fv1\u002Fpaper\u002FCorpusID:257364842?fields=citationCount&query=%24.citationCount&label=citations) \u003Cbr>\nPaLM-E：一款具身多模态语言模型 \u003Cbr>\n[[arXiv](https:\u002F\u002Farxiv.org\u002Fabs\u002F2303.03378)]\n[[论文](https:\u002F\u002Ficml.cc\u002Fvirtual\u002F2023\u002Fposter\u002F23969)]\n[[项目](https:\u002F\u002Fpalm-e.github.io\u002F)]\n\n- `2022\u002F12` | `RT-1` | [RT-1：用于大规模真实世界控制的机器人Transformer](https:\u002F\u002Farxiv.org\u002Fabs\u002F2212.06817)\n\n- `2022\u002F10` | [通过潜在语言进行技能归纳与规划](https:\u002F\u002Farxiv.org\u002Fabs\u002F2110.01517)\n\\-\n\n- `2022\u002F05` | `Gato` | [一种通用型智能体](https:\u002F\u002Farxiv.org\u002Fabs\u002F2205.06175)\n\\-\n\n- `2022\u002F04` | `SMs` | [苏格拉底模型：用语言组合零样本多模态推理](https:\u002F\u002Farxiv.org\u002Fabs\u002F2204.00598)\n\\-\n\n- `2022\u002F02` | 
[用于交互式决策的预训练语言模型](https:\u002F\u002Farxiv.org\u002Fabs\u002F2202.01771)\n\\-\n\n- `2022\u002F01` | `语言-规划者` | [语言模型作为零样本规划者：为具身智能体提取可操作的知识](https:\u002F\u002Farxiv.org\u002Fabs\u002F2201.07207)\n\\-\n\n- `2021\u002F11` | [价值函数空间：以技能为中心的状态抽象，用于长期推理](https:\u002F\u002Farxiv.org\u002Fabs\u002F2111.03189)\n\\-\n\n- `2020\u002F09` | [无需视觉的视觉接地规划：语言模型从高层次指令中推断出详细计划](https:\u002F\u002Farxiv.org\u002Fabs\u002F2009.14259)\n\\-\n\n- `2016\u002F01` | `AlphaGo` | 通过深度神经网络和树搜索掌握围棋\n\\-\n[[论文](https:\u002F\u002Fwww.nature.com\u002Farticles\u002Fnature16961)]\n\n- `2014\u002F05` | 理性中的手势：具身视角\n\\-\n[[论文](https:\u002F\u002Fwww.taylorfrancis.com\u002Fchapters\u002Fedit\u002F10.4324\u002F9781315775845-19\u002Fgesture-reasoning-martha-alibali-rebecca-boncoddo-autumn-hostetter)]\n\n#### 3.8.1 内省式推理\n\n- `2022\u002F11` | `PAL` | [PAL：程序辅助语言模型](https:\u002F\u002Farxiv.org\u002Fabs\u002F2211.10435)\n\\-\n\n- `2022\u002F09` | `ProgPrompt` | [ProgPrompt：利用大型语言模型生成情境化的机器人任务计划](https:\u002F\u002Farxiv.org\u002Fabs\u002F2209.11302)\n\\-\n\n- `2022\u002F09` | `代码即策略` | [代码即策略：用于具身控制的语言模型程序](https:\u002F\u002Farxiv.org\u002Fabs\u002F2209.07753)\n\\-\n\n- `2022\u002F04` | `SayCan` | [像我能做的那样做，而不是像我说的那样做：将语言与机器人可用性相结合](https:\u002F\u002Farxiv.org\u002Fabs\u002F2204.01691)\n\\-\n\n- `2012` | 内省学习与推理\n\\-\n[[论文](https:\u002F\u002Flink.springer.com\u002Freferenceworkentry\u002F10.1007\u002F978-1-4419-1428-6_1802)]\n\n#### 3.8.2 外省式推理\n\n- `2023\u002F06` | `Statler` | [Statler：用于具身推理的状态保持语言模型](https:\u002F\u002Farxiv.org\u002Fabs\u002F2306.17840)\n\\-\n\n- `2023\u002F02` | `规划者-执行者-报告者` | [与语言模型协作进行具身推理](https:\u002F\u002Farxiv.org\u002Fabs\u002F2302.00763)\n\\-\n\n- `2023\u002F02` | `Toolformer` | [Toolformer：语言模型可以自我教授如何使用工具](https:\u002F\u002Farxiv.org\u002Fabs\u002F2302.04761)\n\\-\n\n- `2022\u002F12` | `LLM-Planner` | [LLM-Planner：利用大型语言模型为具身智能体进行少样本接地规划](https:\u002F\u002Farxiv.org\u002Fabs\u002F2212.04088)\n\\-\n\n- `2022\u002F10` | `ReAct` | [ReAct：在语言模型中协同推理与行动](https:\u002F\u002Farxiv.org\u002Fabs\u002F2210.03629)\n\\-\n\n- `2022\u002F10` | `Self-Ask` | [衡量并缩小语言模型中的组合性差距](https:\u002F\u002Farxiv.org\u002Fabs\u002F2210.03350)\n\\-\n\n- `2022\u002F07` | `内心独白` | [内心独白：通过语言模型规划实现具身推理](https:\u002F\u002Farxiv.org\u002Fabs\u002F2207.05608)\n\\-\n\n#### 3.8.3 多智能体推理\n\n- `2023\u002F07` | `联邦LLM` | [联邦大型语言模型：立场文件](https:\u002F\u002Farxiv.org\u002Fabs\u002F2307.08925)\n\\-\n\n- `2023\u002F07` | [基于自适应大型语言模型（LLM）的多智能体系统](https:\u002F\u002Farxiv.org\u002Fabs\u002F2307.06187)\n\\-\n\n- `2023\u002F07` | `Co-LLM-Agents` | [利用大型语言模型模块化构建合作型具身智能体](https:\u002F\u002Farxiv.org\u002Fabs\u002F2307.02485)\n\\-\n\n- `2023\u002F05` | [通过多智能体辩论提升语言模型的事实性和推理能力](https:\u002F\u002Farxiv.org\u002Fabs\u002F2305.14325)\n\\-\n\n- `2017\u002F02` | `FIoT` | FIoT：一种基于物联网的自适应、自组织应用的代理框架\n\\-\n[[论文](https:\u002F\u002Fwww.sciencedirect.com\u002Fscience\u002Farticle\u002Fpii\u002FS0020025516313664)]\n\n- `2004` | IBM 自动计算工具包实用指南\n\\-\n[[书籍](https:\u002F\u002Fbooks.google.com.hk\u002Fbooks\u002Fabout\u002FA_Practical_Guide_to_the_IBM_Autonomic_C.html?id=XHeoSgAACAAJ&redir_esc=y)]\n\n#### 3.8.4 驾驶推理\n\n- `2023\u002F12` | `DriveLM` | [DriveLM：基于图的视觉问答进行驾驶](https:\u002F\u002Farxiv.org\u002Fabs\u002F2312.14150)\n\\-\n[[论文](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2312.14150.pdf)]\n[[代码](https:\u002F\u002Fgithub.com\u002FOpenDriveLab\u002FDriveLM)]\n\n- `2023\u002F12` | `LiDAR-LLM` | [LiDAR-LLM：探索大型语言模型在3D 
LiDAR理解方面的潜力](https:\u002F\u002Farxiv.org\u002Fabs\u002F2312.14074)\n\\-\n[[论文](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2312.14074.pdf)]\n[[项目](https:\u002F\u002Fsites.google.com\u002Fview\u002Flidar-llm)]\n\n- `2023\u002F12` | `DriveMLM` | [DriveMLM：将多模态大型语言模型与自动驾驶的行为规划状态对齐](https:\u002F\u002Farxiv.org\u002Fabs\u002F2312.09245)\n\\-\n[[论文](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2312.09245.pdf)]\n[[代码](https:\u002F\u002Fgithub.com\u002FOpenGVLab\u002FDriveMLM)]\n\n- `2023\u002F12` | `LMDrive` | [LMDrive：利用大型语言模型实现闭环端到端驾驶](https:\u002F\u002Farxiv.org\u002Fabs\u002F2312.07488)\n\\-\n[[论文](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2312.07488.pdf)]\n[[代码](https:\u002F\u002Fgithub.com\u002Fopendilab\u002FLMDrive)]\n\n- `2023\u002F10` | [穿越概念僵局：解开自动驾驶中的可解释性瓶颈](https:\u002F\u002Farxiv.org\u002Fabs\u002F2310.16639)\n\\-\n\n- `2023\u002F10` | [视觉语言模型在自动驾驶和智能交通系统中的应用](https:\u002F\u002Farxiv.org\u002Fabs\u002F2310.14414)\n\\-\n\n- `2023\u002F10` | `DriveGPT4` | [DriveGPT4：通过大型语言模型实现可解释的端到端自动驾驶](https:\u002F\u002Farxiv.org\u002Fabs\u002F2310.01412)\n\\-\n\n- `2023\u002F09` | `MotionLM` | [MotionLM：将多智能体运动预测视为语言建模](https:\u002F\u002Farxiv.org\u002Fabs\u002F2309.16534)\n\\-\n\n- `2023\u002F06` | [端到端自动驾驶：挑战与前沿](https:\u002F\u002Farxiv.org\u002Fabs\u002F2306.16927)\n\\-\n\n- `2023\u002F04` | [基于图的驾驶场景拓扑推理](https:\u002F\u002Farxiv.org\u002Fabs\u002F2304.05277)\n\\-\n\n- `2022\u002F09` | [深入探讨鸟瞰感知的难点：综述、评估与解决方案](https:\u002F\u002Farxiv.org\u002Fabs\u002F2209.05324)\n\\-\n\n- `2021\u002F11` | 人工智能：科学研究的强大范式\n\\-\n[[论文](https:\u002F\u002Fwww.sciencedirect.com\u002Fscience\u002Farticle\u002Fpii\u002FS2666675821001041)]\n\n#### 3.8.x 基准、数据集和指标\n\n- `2023\u002F12` | `DriveLM` | [DriveLM：基于图的视觉问答进行驾驶](https:\u002F\u002Farxiv.org\u002Fabs\u002F2312.14150)\n\\-\n[[论文](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2312.14150.pdf)]\n[[代码](https:\u002F\u002Fgithub.com\u002FOpenDriveLab\u002FDriveLM)]\n\n- `2023\u002F09` | `NuPrompt` \u002F `PromptTrack` | [自动驾驶的语言提示](https:\u002F\u002Farxiv.org\u002Fabs\u002F2309.04379)\n\\-\n\n- `2023\u002F07` | `LCTGen` | [语言条件下的交通流生成](https:\u002F\u002Farxiv.org\u002Fabs\u002F2307.07947)\n\n- `2023\u002F05` | `NuScenes-QA` | [NuScenes-QA：自动驾驶场景的多模态视觉问答基准](https:\u002F\u002Farxiv.org\u002Fabs\u002F2305.14836)\n\\-\n\n- `2022\u002F06` | `BEHAVIOR-1K` | [BEHAVIOR-1K：包含1,000项日常活动和真实模拟的具身AI基准](https:\u002F\u002Fproceedings.mlr.press\u002Fv205\u002Fli23a.html)\n\\-\n\n- `2021\u002F08` | `iGibson` | [iGibson 2.0：以物体为中心的仿真，用于机器人学习日常家务任务](https:\u002F\u002Farxiv.org\u002Fabs\u002F2108.03272)\n\\-\n\n- `2021\u002F06` | `Habitat 2.0` | [Habitat 2.0：训练家庭助手重新布置其栖息地](https:\u002F\u002Farxiv.org\u002Fabs\u002F2106.14405)\n\\-\n\n- `2020\u002F04` | `RoboTHOR` | [RoboTHOR：一个开放的仿真到现实的具身AI平台](https:\u002F\u002Farxiv.org\u002Fabs\u002F2004.06799)\n\\-\n\n- `2019\u002F11` | `HAD` | [为自动驾驶车辆提供人车交互建议的接地](https:\u002F\u002Farxiv.org\u002Fabs\u002F1911.06978)\n\\-\n\n- `2019\u002F04` | `Habitat` | [Habitat：用于具身AI研究的平台](https:\u002F\u002Farxiv.org\u002Fabs\u002F1904.01201)\n\\-\n\n- `2018\u002F08` | `Gibson` | [Gibson环境：具身智能体的真实世界感知](https:\u002F\u002Farxiv.org\u002Fabs\u002F1808.10654)\n\\-\n\n- `2018\u002F06` | `VirtualHome` | [VirtualHome: 通过程序模拟家庭活动](https:\u002F\u002Farxiv.org\u002Fabs\u002F1806.07011)\n\\-\n\n---\n\n\u003C\u002Fdetails>\n\n\u003C!--  -->\n\n\n### 3.9 其他任务与应用\n\n\u003Cdetails open>\n\u003Csummary>其他任务与应用\u003C\u002Fsummary>\n\n[推理任务（返回顶部）](#3-reasoning-tasks)\n\n- [3.9 其他任务与应用](#39-other-tasks-and-applications)\n  - [3.9.1 心理理论 (ToM)](#391-theory-of-mind-tom)\n  - [3.9.2 
大型语言模型在天气预报中的应用](#392-llms-for-weather-prediction)\n  - [3.9.3 抽象推理](#393-abstract-reasoning)\n  - [3.9.4 可废止推理](#394-defeasible-reasoning)\n  - [3.9.5 医学推理](#395-medical-reasoning)\n  - [3.9.6 生物信息学推理](#396-bioinformatics-reasoning)\n  - [3.9.7 长链推理](#397-long-chain-reasoning)\n\n#### 3.9.1 心理理论 (ToM)\n\n- `2023\u002F02` | `ToM` | [大型语言模型中可能自发出现心理理论](https:\u002F\u002Farxiv.org\u002Fabs\u002F2302.02083)\n\\-\n\n#### 3.9.2 大型语言模型在天气预报中的应用\n\n- `2022\u002F09` | `MetNet-2` | 基于深度学习的十二小时降水预报\n\\-\n[[论文](https:\u002F\u002Fwww.nature.com\u002Farticles\u002Fs41467-022-32483-x)]\n\n- `2023\u002F07` | `Pangu-Weather` | 利用三维神经网络实现高精度的中期全球天气预报\n\\-\n[[论文](https:\u002F\u002Fwww.nature.com\u002Farticles\u002Fs41586-023-06185-3)]\n\n#### 3.9.3 抽象推理\n\n- `2023\u002F05` | [大型语言模型并非强大的抽象推理者](https:\u002F\u002Farxiv.org\u002Fabs\u002F2305.19555)\n\\-\n\n#### 3.9.4 可废止推理\n\n- `2023\u002F06` | `BoardgameQA` | [BoardgameQA：一个包含矛盾信息的自然语言推理数据集](https:\u002F\u002Farxiv.org\u002Fabs\u002F2306.07934)\n\\-\n\n- `2021\u002F10` | `CURIOUS` | [想一想！通过先建模问题场景来提升可废止推理能力](https:\u002F\u002Farxiv.org\u002Fabs\u002F2110.12349)\n\\-\n\n- `2020\u002F11` | `Defeasible NLI` \u002F `δ-NLI` | [像怀疑论者一样思考：自然语言中的可废止推理](https:\u002F\u002Faclanthology.org\u002F2020.findings-emnlp.418\u002F)\n\\-\n[[论文](https:\u002F\u002Faclanthology.org\u002F2020.findings-emnlp.418.pdf)]\n\n- `2020\u002F04` | `KACC` | [KACC：一个用于知识抽象、具体化和补全的多任务基准](https:\u002F\u002Farxiv.org\u002Fabs\u002F2004.13631)\n\\-\n\n- `2009\u002F01` | 可废止推理的递归语义\n\\-\n[[论文](https:\u002F\u002Flink.springer.com\u002Fchapter\u002F10.1007\u002F978-0-387-98197-0_9)]\n\n#### 3.9.5 医学推理\n\n- `2024\u002F01` | `CheXagent` \u002F `CheXinstruct` \u002F `CheXbench`\n| `Chen 等人`\n![引用数](https:\u002F\u002Fimg.shields.io\u002Fbadge\u002Fdynamic\u002Fjson?url=https:\u002F\u002Fapi.semanticscholar.org\u002Fgraph\u002Fv1\u002Fpaper\u002FCorpusID:267069358?fields=citationCount&query=%24.citationCount&label=citations)\n![星标](https:\u002F\u002Fimg.shields.io\u002Fgithub\u002Fstars\u002FStanford-AIMI\u002FCheXagent.svg?style=social&label=Star) \u003Cbr>\nCheXagent：迈向胸部X光片解读的基础模型 \u003Cbr>\n[[arXiv](https:\u002F\u002Farxiv.org\u002Fabs\u002F2401.12208)]\n[[论文](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2401.12208.pdf)]\n[[代码](https:\u002F\u002Fgithub.com\u002FStanford-AIMI\u002FCheXagent)]\n[[项目页面](https:\u002F\u002Fstanford-aimi.github.io\u002Fchexagent.html)]\n[[Hugging Face](https:\u002F\u002Fhuggingface.co\u002Fstanford-crfm\u002FBioMedLM)]\n\n- `2024\u002F01` | `EchoGPT`\n| `Chao 等人`\n![引用数](https:\u002F\u002Fimg.shields.io\u002Fbadge\u002Fdynamic\u002Fjson?url=https:\u002F\u002Fapi.semanticscholar.org\u002Fgraph\u002Fv1\u002Fpaper\u002FCorpusID:267042212?fields=citationCount&query=%24.citationCount&label=citations) \u003Cbr>\nEchoGPT：用于超声心动图报告摘要的大语言模型 \u003Cbr>\n[[medRxiv](https:\u002F\u002Fwww.medrxiv.org\u002Fcontent\u002F10.1101\u002F2024.01.18.24301503)]\n[[论文](https:\u002F\u002Fwww.medrxiv.org\u002Fcontent\u002F10.1101\u002F2024.01.18.24301503v1.full.pdf)]\n\n- `2023\u002F10` | `GPT4V-Medical-Report`\n| `Yan 等人`\n![引用数](https:\u002F\u002Fimg.shields.io\u002Fbadge\u002Fdynamic\u002Fjson?url=https:\u002F\u002Fapi.semanticscholar.org\u002Fgraph\u002Fv1\u002Fpaper\u002FCorpusID:264805701?fields=citationCount&query=%24.citationCount&label=citations)\n![星标](https:\u002F\u002Fimg.shields.io\u002Fgithub\u002Fstars\u002FZhilingYan\u002FGPT4V-Medical-Report.svg?style=social&label=Star) \u003Cbr>\n面向医疗应用的多模态ChatGPT：GPT-4V的实验研究 
\u003Cbr>\n[[arXiv](https:\u002F\u002Farxiv.org\u002Fabs\u002F2310.19061)]\n[[论文](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2310.19061.pdf)]\n[[代码](https:\u002F\u002Fgithub.com\u002FZhilingYan\u002FGPT4V-Medical-Report)]\n\n- `2023\u002F10` | `VisionFM`\n| `Qiu 等人`\n![引用数](https:\u002F\u002Fimg.shields.io\u002Fbadge\u002Fdynamic\u002Fjson?url=https:\u002F\u002Fapi.semanticscholar.org\u002Fgraph\u002Fv1\u002Fpaper\u002FCorpusID:263828921?fields=citationCount&query=%24.citationCount&label=citations) \u003Cbr>\nVisionFM：一种通用眼科人工智能的多模态多任务视觉基础模型 \u003Cbr>\n[[arXiv](https:\u002F\u002Farxiv.org\u002Fabs\u002F2310.04992)]\n[[论文](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2310.04992.pdf)]\n\n- `2023\u002F09`\n| `Yang 等人`\n![引用数](https:\u002F\u002Fimg.shields.io\u002Fbadge\u002Fdynamic\u002Fjson?url=https:\u002F\u002Fapi.semanticscholar.org\u002Fgraph\u002Fv1\u002Fpaper\u002FCorpusID:263310951?fields=citationCount&query=%24.citationCount&label=citations) \u003Cbr>\nLMMs 的曙光：使用 GPT-4V(ision) 的初步探索 \u003Cbr>\n[[arXiv](https:\u002F\u002Farxiv.org\u002Fabs\u002F2309.17421)]\n[[论文](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2309.17421.pdf)]\n\n- `2023\u002F09` | `RETFound`\n| `Zhou 等人，Nature`\n![引用数](https:\u002F\u002Fimg.shields.io\u002Fbadge\u002Fdynamic\u002Fjson?url=https:\u002F\u002Fapi.semanticscholar.org\u002Fgraph\u002Fv1\u002Fpaper\u002FCorpusID:264168236?fields=citationCount&query=%24.citationCount&label=citations)\n![星标](https:\u002F\u002Fimg.shields.io\u002Fgithub\u002Fstars\u002Frmaphoh\u002FRETFound_MAE.svg?style=social&label=Star) \u003Cbr>\n一种用于从视网膜图像中进行泛化疾病检测的基础模型 \u003Cbr>\n[[论文](https:\u002F\u002Fwww.nature.com\u002Farticles\u002Fs41586-023-06555-x)]\n[[代码](https:\u002F\u002Fgithub.com\u002Frmaphoh\u002FRETFound_MAE)]\n\n- `2023\u002F08` | `ELIXR`\n| `Xu 等人`\n![引用数](https:\u002F\u002Fimg.shields.io\u002Fbadge\u002Fdynamic\u002Fjson?url=https:\u002F\u002Fapi.semanticscholar.org\u002Fgraph\u002Fv1\u002Fpaper\u002FCorpusID:260378981?fields=citationCount&query=%24.citationCount&label=citations) \u003Cbr>\nELIXR：通过对齐大语言模型和放射影像编码器，迈向通用X射线人工智能系统 \u003Cbr>\n[[arXiv](https:\u002F\u002Farxiv.org\u002Fabs\u002F2308.01317)]\n[[论文](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2308.01317.pdf)]\n\n- `2023\u002F07` | `Med-Flamingo`\n| `Moor 等人`\n![引用数](https:\u002F\u002Fimg.shields.io\u002Fbadge\u002Fdynamic\u002Fjson?url=https:\u002F\u002Fapi.semanticscholar.org\u002Fgraph\u002Fv1\u002Fpaper\u002FCorpusID:260316059?fields=citationCount&query=%24.citationCount&label=citations)\n![星标](https:\u002F\u002Fimg.shields.io\u002Fgithub\u002Fstars\u002Fsnap-stanford\u002Fmed-flamingo.svg?style=social&label=Star) \u003Cbr>\nMed-Flamingo：一种多模态医学小样本学习模型 \u003Cbr>\n[[arXiv](https:\u002F\u002Farxiv.org\u002Fabs\u002F2307.15189)]\n[[论文](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2307.15189.pdf)]\n[[代码](https:\u002F\u002Fgithub.com\u002Fsnap-stanford\u002Fmed-flamingo)]\n\n- `2023\u002F07` | `Med-PaLM M`\n| `Tu 等人`\n![引用数](https:\u002F\u002Fimg.shields.io\u002Fbadge\u002Fdynamic\u002Fjson?url=https:\u002F\u002Fapi.semanticscholar.org\u002Fgraph\u002Fv1\u002Fpaper\u002FCorpusID:260164663?fields=citationCount&query=%24.citationCount&label=citations)\n![星标](https:\u002F\u002Fimg.shields.io\u002Fgithub\u002Fstars\u002Fkyegomez\u002FMed-PaLM.svg?style=social&label=Star) \u003Cbr>\n迈向通用生物医学人工智能 \u003Cbr>\n[[arXiv](https:\u002F\u002Farxiv.org\u002Fabs\u002F2307.14334)]\n[[论文](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2307.14334.pdf)]\n[[代码](https:\u002F\u002Fgithub.com\u002Fkyegomez\u002FMed-PaLM)]\n\n- `2023\u002F06` | `Endo-FM`\n| `Wang 
et al., MICCAI 2023`\n![citations](https:\u002F\u002Fimg.shields.io\u002Fbadge\u002Fdynamic\u002Fjson?url=https:\u002F\u002Fapi.semanticscholar.org\u002Fgraph\u002Fv1\u002Fpaper\u002FCorpusID:259287248?fields=citationCount&query=%24.citationCount&label=citations)\n![Star](https:\u002F\u002Fimg.shields.io\u002Fgithub\u002Fstars\u002Fmed-air\u002FEndo-FM.svg?style=social&label=Star) \u003Cbr>\n基于大规模自监督预训练的内窥镜视频分析基础模型 \u003Cbr>\n[[arXiv](https:\u002F\u002Farxiv.org\u002Fabs\u002F2306.16741)]\n[[paper](https:\u002F\u002Flink.springer.com\u002Fchapter\u002F10.1007\u002F978-3-031-43996-4_10)]\n[[code](https:\u002F\u002Fgithub.com\u002Fmed-air\u002FEndo-FM)]\n\n- `2023\u002F06` | `XrayGPT`\n| `Thawkar et al.`\n![citations](https:\u002F\u002Fimg.shields.io\u002Fbadge\u002Fdynamic\u002Fjson?url=https:\u002F\u002Fapi.semanticscholar.org\u002Fgraph\u002Fv1\u002Fpaper\u002FCorpusID:259145194?fields=citationCount&query=%24.citationCount&label=citations)\n![Star](https:\u002F\u002Fimg.shields.io\u002Fgithub\u002Fstars\u002Fmbzuai-oryx\u002FXrayGPT.svg?style=social&label=Star) \u003Cbr>\nXrayGPT：利用医学视觉-语言模型进行胸部X光片摘要生成 \u003Cbr>\n\\-\n[[arXiv](https:\u002F\u002Farxiv.org\u002Fabs\u002F2306.07971)]\n[[paper](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2306.07971.pdf)]\n[[code](https:\u002F\u002Fgithub.com\u002Fmbzuai-oryx\u002FXrayGPT)]\n\n- `2023\u002F06` | `LLaVA-Med`\n| `Li et al., NeurIPS 2023`\n![citations](https:\u002F\u002Fimg.shields.io\u002Fbadge\u002Fdynamic\u002Fjson?url=https:\u002F\u002Fapi.semanticscholar.org\u002Fgraph\u002Fv1\u002Fpaper\u002FCorpusID:258999820?fields=citationCount&query=%24.citationCount&label=citations)\n![Star](https:\u002F\u002Fimg.shields.io\u002Fgithub\u002Fstars\u002Fmicrosoft\u002FLLaVA-Med.svg?style=social&label=Star) \u003Cbr>\nLLaVA-Med：一天内训练一个用于生物医学的大规模语言-视觉助手 \u003Cbr>\n[[arXiv](https:\u002F\u002Farxiv.org\u002Fabs\u002F2306.00890)]\n[[paper](https:\u002F\u002Fneurips.cc\u002Fvirtual\u002F2023\u002Fposter\u002F73643)]\n[[code](https:\u002F\u002Fgithub.com\u002Fmicrosoft\u002FLLaVA-Med)]\n\n- `2023\u002F05` | `HuatuoGPT`\n| `Zhang et al., Findings of EMNLP 2023`\n![citations](https:\u002F\u002Fimg.shields.io\u002Fbadge\u002Fdynamic\u002Fjson?url=https:\u002F\u002Fapi.semanticscholar.org\u002Fgraph\u002Fv1\u002Fpaper\u002FCorpusID:258865566?fields=citationCount&query=%24.citationCount&label=citations)\n![Star](https:\u002F\u002Fimg.shields.io\u002Fgithub\u002Fstars\u002FFreedomIntelligence\u002FHuatuoGPT.svg?style=social&label=Star) \u003Cbr>\nHuatuoGPT：朝着驯服语言模型成为医生的目标迈进 \u003Cbr>\n[[arXiv](https:\u002F\u002Farxiv.org\u002Fabs\u002F2305.15075)]\n[[paper](https:\u002F\u002Faclanthology.org\u002F2023.findings-emnlp.725\u002F)]\n[[code](https:\u002F\u002Fgithub.com\u002FFreedomIntelligence\u002FHuatuoGPT)]\n\n- `2023\u002F05` | `Med-PaLM 2`\n| `Singhal et al.`\n![citations](https:\u002F\u002Fimg.shields.io\u002Fbadge\u002Fdynamic\u002Fjson?url=https:\u002F\u002Fapi.semanticscholar.org\u002Fgraph\u002Fv1\u002Fpaper\u002FCorpusID:258715226?fields=citationCount&query=%24.citationCount&label=citations) \u003Cbr>\n迈向使用大型语言模型实现专家级医学问答 \u003Cbr>\n[[arXiv](https:\u002F\u002Farxiv.org\u002Fabs\u002F2305.09617)]\n[[paper](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2305.09617.pdf)]\n\n- `2022\u002F12` | ` Med-PaLM` \u002F `MultiMedQA` \u002F `HealthSearchQA`\n| `Singhal et al., 
Nature`\n![citations](https:\u002F\u002Fimg.shields.io\u002Fbadge\u002Fdynamic\u002Fjson?url=https:\u002F\u002Fapi.semanticscholar.org\u002Fgraph\u002Fv1\u002Fpaper\u002FCorpusID:255124952?fields=citationCount&query=%24.citationCount&label=citations) \u003Cbr>\n大型语言模型编码临床知识 \u003Cbr>\n[[arXiv](https:\u002F\u002Farxiv.org\u002Fabs\u002F2212.13138)]\n[[paper](https:\u002F\u002Fwww.nature.com\u002Farticles\u002Fs41586-023-06291-2)]\n\n#### 3.9.6 生物信息学推理\n\n- `2023\u002F07` | `Prot2Text` | [Prot2Text：利用GNN和Transformer进行多模态蛋白质功能生成](https:\u002F\u002Farxiv.org\u002Fabs\u002F2307.14367)\n\\-\n\n- `2023\u002F07` | `Uni-RNA` | [Uni-RNA：通用预训练模型革新RNA研究](https:\u002F\u002Fwww.biorxiv.org\u002Fcontent\u002F10.1101\u002F2023.07.11.548588v1)\n\\-\n\n- `2023\u002F07` | `RFdiffusion` | 利用RFdiffusion从头设计蛋白质结构和功能\n\\-\n[[Paper](https:\u002F\u002Fwww.nature.com\u002Farticles\u002Fs41586-023-06415-8)]\n\n- `2023\u002F06` | `HyenaDNA` | [HyenaDNA：以单核苷酸分辨率进行长距离基因组序列建模](https:\u002F\u002Farxiv.org\u002Fabs\u002F2306.15794)\n\\-\n\n- `2023\u002F06` | `DrugGPT` | [DrugGPT：一种基于GPT的策略，用于设计针对特定蛋白质的潜在配体](https:\u002F\u002Fwww.biorxiv.org\u002Fcontent\u002F10.1101\u002F2023.06.29.543848v1)\n\\-\n\n- `2023\u002F04` | `GeneGPT` | [GeneGPT：通过领域工具增强大型语言模型，以更好地获取生物医学信息](https:\u002F\u002Farxiv.org\u002Fabs\u002F2304.09667)\n\\-\n\n- `2023\u002F04` | 制药公司正在定制ChatGPT：以下是具体做法\n\\-\n[[News](https:\u002F\u002Fwww.nature.com\u002Farticles\u002Fs41587-023-01788-7)]\n\n- `2023\u002F01` | `ProGen` | 大型语言模型可生成跨不同家族的功能性蛋白质序列\n\\-\n[[Paper](https:\u002F\u002Fwww.nature.com\u002Farticles\u002Fs41587-022-01618-2)]\n\n- `2022\u002F06` | `ProGen2` | [ProGen2：探索蛋白质语言模型的边界](https:\u002F\u002Farxiv.org\u002Fabs\u002F2206.13517)\n\\-\n\n- `2021\u002F07` | `AlphaFold` | 使用AlphaFold实现高精度的蛋白质结构预测\n\\-\n[[Paper](https:\u002F\u002Fwww.nature.com\u002Farticles\u002Fs41586-021-03819-2)]\n\n#### 3.9.7 长链推理\n\n- `2022\u002F12` | `Fine-tune-CoT` | [大型语言模型是推理教师](https:\u002F\u002Farxiv.org\u002Fabs\u002F2212.10071)\n\\-\n\n- `2021\u002F09` | `PlaTe` | [PlaTe：在程序性任务中利用Transformer进行视觉接地规划](https:\u002F\u002Farxiv.org\u002Fabs\u002F2109.04869)\n\\-\n\n---\n\n\u003C\u002Fdetails>\n\n\u003C\u002Fdetails>\n\n\n\n\n## 4 推理技术\n\n\u003Cdetails open>\n\u003Csummary>推理技术\u003C\u002Fsummary>\n\n[(返回顶部)](#table-of-contents)\n\n### 目录 - 4\n\n\u003Cdetails open>\n\u003Csummary>推理技术（目录）\u003C\u002Fsummary>\n\n- [4 推理技术](#4-reasoning-techniques)\n  - [4.1 预训练](#41-pre-training)\n    - [4.1.1 数据](#411-data)\n      - [a. 数据 - 文本](#a-data---text)\n      - [b. 数据 - 图像](#b-data---image)\n      - [c. 数据 - 多模态](#c-data---multimodality)\n    - [4.1.2 网络架构](#412-network-architecture)\n      - [a. 编码器-解码器](#a-encoder-decoder)\n      - [b. 仅解码器](#b-decoder-only)\n      - [c. CLIP 变体](#c-clip-variants)\n      - [d. 其他](#d-others)\n  - [4.2 微调](#42-fine-tuning)\n    - [4.2.1 数据](#421-data)\n    - [4.2.2 参数高效微调](#422-parameter-efficient-fine-tuning)\n      - [a. Adapter 微调](#a-adapter-tuning)\n      - [b. 低秩适应](#b-low-rank-adaptation)\n      - [c. 提示词微调](#c-prompt-tuning)\n      - [d. 部分参数微调](#d-partial-parameter-tuning)\n      - [e. 模态混合适应](#e-mixture-of-modality-adaption)\n  - [4.3 对齐训练](#43-alignment-training)\n    - [4.3.1 数据](#431-data)\n      - [a. 数据 - 人类](#a-data---human)\n      - [b. 数据 - 合成](#b-data---synthesis)\n    - [4.3.2 训练流程](#432-training-pipeline)\n      - [a. 在线人类偏好训练](#a-online-human-preference-training)\n      - [b. 
离线人类偏好训练](#b-offline-human-preference-training)\n  - [4.4 混合专家模型 (MoE)](#44-mixture-of-experts-moe)\n  - [4.5 上下文学习](#45-in-context-learning)\n    - [4.5.1 示例选择](#451-demonstration-example-selection)\n      - [a. 先验知识方法](#a-prior-knowledge-approach)\n      - [b. 检索方法](#b-retrieval-approach)\n    - [4.5.2 思维链](#452-chain-of-thought)\n      - [a. 零样本思维链](#a-zero-shot-cot)\n      - [b. 少样本思维链](#b-few-shot-cot)\n      - [c. 多路径聚合](#c-multiple-paths-aggregation)\n    - [4.5.3 多轮提示](#453-multi-round-prompting)\n      - [a. 学习型精炼器](#a-learned-refiners)\n      - [b. 提示型精炼器](#b-prompted-refiners)\n  - [4.6 自主智能体](#46-autonomous-agent)\n\n\u003C\u002Fdetails>\n\n\u003C!--  -->\n### 4.1 预训练\n\n\u003Cdetails open>\n\u003Csummary>预训练\u003C\u002Fsummary>\n\n[推理技术（返回顶部）](#4-reasoning-techniques)\n\n- [4.1 预训练](#41-pre-training)\n  - [4.1.1 数据](#411-data)\n    - [a. 数据 - 文本](#a-data---text)\n    - [b. 数据 - 图像](#b-data---image)\n    - [c. 数据 - 多模态](#c-data---multimodality)\n  - [4.1.2 网络架构](#412-network-architecture)\n    - [a. 编码器-解码器](#a-encoder-decoder)\n    - [b. 仅解码器](#b-decoder-only)\n    - [c. CLIP 变体](#c-clip-variants)\n    - [d. 其他](#d-others)\n\n#### 4.1.1 数据\n\n##### a. 数据 - 文本\n\n- `2023\u002F07` | `peS2o` | peS2o（在 S2ORC 上高效预训练）数据集\n\\-\n[[代码](https:\u002F\u002Fgithub.com\u002Fallenai\u002Fpes2o)]\n\n- `2023\u002F05` | `ROOTS` \u002F `BLOOM` | [BigScience ROOTS 语料库：一个 1.6TB 的多语言复合数据集](https:\u002F\u002Farxiv.org\u002Fabs\u002F2303.03915)\n\\-\n\n- `2023\u002F04` | `RedPajama` | RedPajama：用于训练大型语言模型的开放数据集\n\\-\n[[代码](https:\u002F\u002Fgithub.com\u002Ftogethercomputer\u002FRedPajama-Data)]\n\n- `2020\u002F12` | `The Pile` | [The Pile：用于语言建模的 800GB 多样化文本数据集](https:\u002F\u002Farxiv.org\u002Fabs\u002F2101.00027)\n\\-\n\n- `2020\u002F04` | `Reddit` | [构建开放域聊天机器人的配方](https:\u002F\u002Farxiv.org\u002Fabs\u002F2004.13637)\n\\-\n\n- `2020\u002F04` | `CLUE` | [CLUE：中文语言理解评估基准](https:\u002F\u002Farxiv.org\u002Fabs\u002F2004.05986)\n\\-\n\n- `2019\u002F10` | `C4` | [探索统一文本到文本变换器迁移学习的极限](https:\u002F\u002Farxiv.org\u002Fabs\u002F1910.10683)\n\\-\n\n- `2013\u002F10` | `Gutenberg` | [单词搭配网络的复杂性：初步结构分析](https:\u002F\u002Farxiv.org\u002Fabs\u002F1310.5111)\n\n##### b. 数据 - 图像\n\n- `2023\u002F06` | `I2E` \u002F `MOFI` | [MOFI：从带有噪声实体标注的图像中学习图像表示](https:\u002F\u002Farxiv.org\u002Fabs\u002F2306.07952)\n\\-\n\n- `2022\u002F01` | `SWAG` [重新审视弱监督视觉感知模型的预训练](https:\u002F\u002Farxiv.org\u002Fabs\u002F2201.08371)\n\\-\n\n- `2021\u002F04` | `ImageNet-21K` | [面向大众的 ImageNet-21K 预训练](https:\u002F\u002Farxiv.org\u002Fabs\u002F2104.10972)\n\\-\n\n- `2017\u002F07` | `JFT` | [重新审视深度学习时代数据的不合理有效性](https:\u002F\u002Farxiv.org\u002Fabs\u002F1707.02968)\n\\-\n\n- `2014\u002F09` | `ImageNet` | [ImageNet 大规模视觉识别挑战赛](https:\u002F\u002Farxiv.org\u002Fabs\u002F1409.0575)\n\\-\n\n##### c. 
数据 - 多模态\n\n- `2023\u002F09` | `Point-Bind` | [Point-Bind & Point-LLM：将点云与多模态对齐，用于 3D 理解、生成和指令遵循](https:\u002F\u002Farxiv.org\u002Fabs\u002F2309.00615)\n\\-\n\n- `2023\u002F05` | `ImageBind` | [ImageBind：一个嵌入空间绑定一切](https:\u002F\u002Farxiv.org\u002Fabs\u002F2305.05665)\n\\-\n\n- `2023\u002F04` | `DataComp` | [DataComp：寻找下一代多模态数据集](https:\u002F\u002Farxiv.org\u002Fabs\u002F2304.14108)\n\\-\n\n- `2022\u002F10` | `LAION-5B` | [LAION-5B：用于训练下一代图文模型的开放大规模数据集](https:\u002F\u002Farxiv.org\u002Fabs\u002F2210.08402)\n\\-\n\n- `2022\u002F08` | `Shutterstock` | [质量而非数量：关于数据集设计与 CLIP 鲁棒性的相互作用](https:\u002F\u002Farxiv.org\u002Fabs\u002F2208.05516)\n\\-\n\n- `2022\u002F08` | `COYO-700M` | COYO-700M：图文配对数据集\n\\-\n[[代码](https:\u002F\u002Fgithub.com\u002Fkakaobrain\u002Fcoyo-dataset)]\n\n- `2022\u002F04` | `M3W` | [Flamingo：用于少样本学习的视觉语言模型](https:\u002F\u002Farxiv.org\u002Fabs\u002F2204.14198)\n\\-\n\n- `2021\u002F11` | `RedCaps` | [RedCaps：由人民创建、为人民服务的网络精选图文数据](https:\u002F\u002Farxiv.org\u002Fabs\u002F2111.11431)\n\\-\n\n- `2021\u002F11` | `LAION-400M` | [LAION-400M：经过 CLIP 过滤的 4 亿图文配对的开放数据集](https:\u002F\u002Farxiv.org\u002Fabs\u002F2111.02114)\n\\-\n\n- `2021\u002F03` | `WIT` | [WIT：基于维基百科的多模态多语言机器学习图文数据集](https:\u002F\u002Farxiv.org\u002Fabs\u002F2103.01913)\n\\-\n\n- `2011\u002F12` | `Im2Text` \u002F `SBU` | [Im2Text：使用 100 万张带说明的照片描述图像](https:\u002F\u002Fpapers.nips.cc\u002Fpaper_files\u002Fpaper\u002F2011\u002Fhash\u002F5dd9db5e033da9c6fb5ba83c7a7ebea9-Abstract.html)\n\\-\n\n#### 4.1.2 网络架构\n\n- `2023\u002F04` | [仅解码器还是编码器-解码器？将语言模型解释为正则化的编码器-解码器](https:\u002F\u002Farxiv.org\u002Fabs\u002F2304.04052)\n\\-\n\n##### a. 编码器-解码器\n\n- `2019\u002F10` | `BART` | [BART：用于自然语言生成、翻译和理解的去噪序列到序列预训练](https:\u002F\u002Farxiv.org\u002Fabs\u002F1910.13461)\n\\-\n\n- `2019\u002F10` | `T5` | [探索统一文本到文本变换器迁移学习的极限](https:\u002F\u002Farxiv.org\u002Fabs\u002F1910.10683)\n\\-\n\n- `2018\u002F10` | `BERT` | [BERT: 用于语言理解的深度双向 Transformer 预训练](https:\u002F\u002Farxiv.org\u002Fabs\u002F1810.04805)\n\\-\n[[论文](https:\u002F\u002Faclanthology.org\u002FN19-1423.pdf)]\n[[代码](https:\u002F\u002Fgithub.com\u002Fgoogle-research\u002Fbert)]\n[[博客](https:\u002F\u002Fblog.research.google\u002F2018\u002F11\u002Fopen-sourcing-bert-state-of-art-pre.html)]\n\n- `2017\u002F06` | `Transformer` | [注意力就是一切](https:\u002F\u002Farxiv.org\u002Fabs\u002F1706.03762)\n\\-\n\n##### b. 
仅解码器\n\n- `2023\u002F07` | `Llama 2` | [Llama 2：开放的基础及微调聊天模型](https:\u002F\u002Farxiv.org\u002Fabs\u002F2307.09288)\n\\-\n[[论文](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2307.09288.pdf)]\n[[代码](https:\u002F\u002Fgithub.com\u002Ffacebookresearch\u002Fllama)]\n[[博客](https:\u002F\u002Fai.meta.com\u002Fllama\u002F)]\n\n- `2023\u002F02` | `LLaMA` | [LLaMA：开放且高效的通用语言模型](https:\u002F\u002Farxiv.org\u002Fabs\u002F2302.13971)\n\\-\n[[论文](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2302.13971.pdf)]\n[[代码](https:\u002F\u002Fgithub.com\u002Ffacebookresearch\u002Fllama)]\n[[博客](https:\u002F\u002Fai.meta.com\u002Fblog\u002Flarge-language-model-llama-meta-ai\u002F)]\n\n- `2022\u002F11` | `BLOOM` | [BLOOM：一个拥有 1760 亿参数的开源多语言语言模型](https:\u002F\u002Farxiv.org\u002Fabs\u002F2211.05100)\n\\-\n\n- `2022\u002F10` | `GLM` | [GLM-130B：一个开放的双语预训练模型](https:\u002F\u002Farxiv.org\u002Fabs\u002F2210.02414)\n\\-\n\n- `2022\u002F05` | `OPT` | [OPT：开放的预训练 Transformer 语言模型](https:\u002F\u002Farxiv.org\u002Fabs\u002F2205.01068)\n\\-\n\n- `2021\u002F12` | `Gopher` | [扩展语言模型：训练 Gopher 的方法、分析与见解](https:\u002F\u002Farxiv.org\u002Fabs\u002F2112.11446)\n\\-\n\n- `2021\u002F05` | `GPT-3` | [语言模型是少样本学习者](https:\u002F\u002Farxiv.org\u002Fabs\u002F2005.14165)\n\\-\n[[论文](https:\u002F\u002Fpapers.nips.cc\u002Fpaper\u002F2020\u002Ffile\u002F1457c0d6bfcb4967418bfb8ac142f64a-Paper.pdf)]\n[[代码](https:\u002F\u002Fgithub.com\u002Fopenai\u002Fgpt-3)]\n\n- `2019\u002F02` | `GPT-2` | 语言模型是无监督的多任务学习者\n\\-\n[[论文](https:\u002F\u002Fcdn.openai.com\u002Fbetter-language-models\u002Flanguage_models_are_unsupervised_multitask_learners.pdf)]\n\n- `2018\u002F06` | `GPT-1` | 通过生成式预训练提升语言理解能力\n\\-\n[[论文](https:\u002F\u002Fcdn.openai.com\u002Fresearch-covers\u002Flanguage-unsupervised\u002Flanguage_understanding_paper.pdf)]\n\n##### c. CLIP 变体\n\n- `2023\u002F05` | `LaCLIP` | [通过语言重写改进 CLIP 训练](https:\u002F\u002Farxiv.org\u002Fabs\u002F2305.20088)\n\\-\n\n- `2023\u002F04` | `DetCLIPv2` | [DetCLIPv2：基于词—区域对齐的可扩展开放词汇目标检测预训练](https:\u002F\u002Farxiv.org\u002Fabs\u002F2304.04514)\n\\-\n\n- `2022\u002F12` | `FLIP` | [通过掩码扩展语言—图像预训练](https:\u002F\u002Farxiv.org\u002Fabs\u002F2212.00794)\n\\-\n\n- `2022\u002F09` | `DetCLIP` | [DetCLIP：面向开放世界检测的字典增强型视觉—概念并行预训练](https:\u002F\u002Farxiv.org\u002Fabs\u002F2209.09407)\n\\-\n\n- `2022\u002F04` | `K-LITE` | [K-LITE：利用外部知识学习可迁移的视觉模型](https:\u002F\u002Farxiv.org\u002Fabs\u002F2204.09222)\n\\-\n\n- `2021\u002F11` | `FILIP` | [FILIP：细粒度交互式语言—图像预训练](https:\u002F\u002Farxiv.org\u002Fabs\u002F2111.07783)\n\\-\n\n- `2021\u002F02` | `CLIP` | [从自然语言监督中学习可迁移的视觉模型](https:\u002F\u002Farxiv.org\u002Fabs\u002F2103.00020)\n\\-\n[[论文](https:\u002F\u002Fproceedings.mlr.press\u002Fv139\u002Fradford21a\u002Fradford21a.pdf)]\n[[代码](https:\u002F\u002Fgithub.com\u002Fopenai\u002FCLIP)]\n[[博客](https:\u002F\u002Fopenai.com\u002Fresearch\u002Fclip)]\n\n##### d. 
其他\n\n- `2023\u002F09` | `StreamingLLM` | [具有注意力汇流的高效流式语言模型](https:\u002F\u002Farxiv.org\u002Fabs\u002F2309.17453)\n\\-\n\n- `2023\u002F07` | `RetNet` | [Retentive Network：大型语言模型的 Transformer 继任者](https:\u002F\u002Farxiv.org\u002Fabs\u002F2307.08621)\n\\-\n\n- `2023\u002F07` | | [LongNet：将 Transformer 扩展至 10 亿标记](https:\u002F\u002Farxiv.org\u002Fabs\u002F2307.02486)\n\\-\n\n- `2023\u002F05` | `RWKV` | [RWKV：为 Transformer 时代重新发明 RNN](https:\u002F\u002Farxiv.org\u002Fabs\u002F2305.13048)\n\\-\n\n- `2023\u002F02` | `Hyena` | [Hyena 层次结构：迈向更大的卷积语言模型](https:\u002F\u002Farxiv.org\u002Fabs\u002F2302.10866)\n\\-\n\n- `2022\u002F12` | `H3` | [饥饿的河马：迈向使用状态空间模型的语言建模](https:\u002F\u002Farxiv.org\u002Fabs\u002F2212.14052)\n\\-\n\n- `2022\u002F06` | `GSS` | [通过门控状态空间实现长距离语言建模](https:\u002F\u002Farxiv.org\u002Fabs\u002F2206.13947)\n\\-\n\n- `2022\u002F03` | `DSS` | [对角线状态空间与结构化状态空间同样有效](https:\u002F\u002Farxiv.org\u002Fabs\u002F2203.14343)\n\\-\n\n- `2021\u002F10` | `S4` | [利用结构化状态空间高效建模长序列](https:\u002F\u002Farxiv.org\u002Fabs\u002F2111.00396)\n\\-\n\n---\n\n\u003C\u002Fdetails>\n\n\u003C!--  -->\n\n### 4.2 微调\n\n\u003Cdetails open>\n\u003Csummary>微调\u003C\u002Fsummary>\n\n[推理技术（返回顶部）](#4-推理技术)\n\n- [4.2 微调](#42-微调)\n  - [4.2.1 数据](#421-数据)\n  - [4.2.2 参数高效微调](#422-参数高效微调)\n    - [a. Adapter微调](#a-adapter微调)\n    - [b. 低秩适应](#b-低秩适应)\n    - [c. 提示词微调](#c-提示词微调)\n    - [d. 部分参数微调](#d-部分参数微调)\n    - [e. 多模态混合适应](#e-多模态混合适应)\n\n#### 4.2.1 数据\n\n- `2023\u002F09` | `MetaMath` | [MetaMath: 自举式生成大型语言模型的数学问题](https:\u002F\u002Farxiv.org\u002Fabs\u002F2309.12284)\n\\-\n\n- `2023\u002F09` | `MAmmoTH` | [MAmmoTH: 通过混合指令微调构建数学通才模型](https:\u002F\u002Farxiv.org\u002Fabs\u002F2309.05653)\n\\-\n\n- `2023\u002F08` | `WizardMath` | [WizardMath: 基于强化进化指令提升大型语言模型的数学推理能力](https:\u002F\u002Farxiv.org\u002Fabs\u002F2308.09583)\n\\-\n\n- `2023\u002F08` | `RFT` | [大型语言模型学习数学推理的规模关系](https:\u002F\u002Farxiv.org\u002Fabs\u002F2308.01825)\n\\-\n\n- `2023\u002F05` | `PRM800K` \u002F `` | [让我们逐步验证](https:\u002F\u002Farxiv.org\u002Fabs\u002F2305.20050)\n\\-\n\n- `2023\u002F05` | `Distilling Step-by-Step` | [逐步提炼！用更少的训练数据和更小的模型规模超越更大的语言模型](https:\u002F\u002Farxiv.org\u002Fabs\u002F2305.02301)\n\\-\n\n- `2023\u002F01` | [将小型语言模型专门化用于多步推理](https:\u002F\u002Farxiv.org\u002Fabs\u002F2301.12726)\n\\-\n\n- `2022\u002F12` | `Fine-tune-CoT` | [大型语言模型是推理教师](https:\u002F\u002Farxiv.org\u002Fabs\u002F2212.10071)\n\\-\n\n- `2022\u002F12` | [教导小型语言模型进行推理](https:\u002F\u002Farxiv.org\u002Fabs\u002F2212.08410)\n\\-\n\n- `2022\u002F10` | [大型语言模型可以自我改进](https:\u002F\u002Farxiv.org\u002Fabs\u002F2210.11610)\n\\-\n\n- `2022\u002F10` | [大型语言模型的解释使小型推理者表现更好](https:\u002F\u002Farxiv.org\u002Fabs\u002F2210.06726)\n\\-\n\n#### 4.2.2 参数高效微调\n\n##### a. Adapter微调\n\n- `2023\u002F03` | `LLaMA-Adapter` | [LLaMA-Adapter: 使用零初始化注意力机制高效微调语言模型](https:\u002F\u002Farxiv.org\u002Fabs\u002F2303.16199)\n\\-\n\n- `2022\u002F05` | `AdaMix` | [AdaMix: 用于参数高效模型微调的混合适配器](https:\u002F\u002Farxiv.org\u002Fabs\u002F2205.12410)\n\\-\n\n- `2021\u002F10` | [迈向参数高效迁移学习的统一视角](https:\u002F\u002Farxiv.org\u002Fabs\u002F2110.04366)\n\\-\n\n- `2021\u002F06` | `Compacter` | [Compacter: 高效的低秩超复数适配器层](https:\u002F\u002Farxiv.org\u002Fabs\u002F2106.04647)\n\\-\n\n- `2020\u002F04` | `MAD-X` | [MAD-X: 基于适配器的多任务跨语言迁移框架](https:\u002F\u002Farxiv.org\u002Fabs\u002F2005.00052)\n\\-\n\n- `2019\u002F02` | `Adapter` | [NLP中的参数高效迁移学习](https:\u002F\u002Farxiv.org\u002Fabs\u002F1902.00751)\n\\-\n\n##### b. 
低秩适应\n\n- `2023\u002F09` | `LongLoRA` \u002F `LongAlpaca-12k`\n| `Chen et al., ICLR 2024`\n![引用次数](https:\u002F\u002Fimg.shields.io\u002Fbadge\u002Fdynamic\u002Fjson?url=https:\u002F\u002Fapi.semanticscholar.org\u002Fgraph\u002Fv1\u002Fpaper\u002FCorpusID:262084134?fields=citationCount&query=%24.citationCount&label=citations)\n![星标](https:\u002F\u002Fimg.shields.io\u002Fgithub\u002Fstars\u002Fdvlab-research\u002FLongLoRA.svg?style=social&label=Star) \u003Cbr>\nLongLoRA: 长上下文大型语言模型的高效微调 \u003Cbr>\n[[arXiv](https:\u002F\u002Farxiv.org\u002Fabs\u002F2309.12307)]\n[[论文](https:\u002F\u002Fopenreview.net\u002Fforum?id=6PmJoRfdaK)]\n[[代码](https:\u002F\u002Fgithub.com\u002Fdvlab-research\u002FLongLoRA)]\n\n- `2023\u002F05` | `QLoRA` | [QLoRA: 量化LLM的高效微调](https:\u002F\u002Farxiv.org\u002Fabs\u002F2305.14314)\n\\-\n\n- `2023\u002F03` | `AdaLoRA` | [参数高效微调中的自适应预算分配](https:\u002F\u002Farxiv.org\u002Fabs\u002F2303.10512)\n\\-\n\n- `2022\u002F12` | `KronA` | [KronA: 使用克罗内克适配器进行参数高效微调](https:\u002F\u002Farxiv.org\u002Fabs\u002F2212.10650)\n\\-\n\n- `2022\u002F10` | `DyLoRA` | [使用动态无搜索低秩适应对预训练模型进行参数高效微调](https:\u002F\u002Farxiv.org\u002Fabs\u002F2210.07558)\n\\-\n\n- `2021\u002F06` | `LoRA` | [LoRA: 大型语言模型的低秩适应](https:\u002F\u002Farxiv.org\u002Fabs\u002F2106.09685)\n\\-\n\n##### c. 提示词微调\n\n- `2021\u002F10` | `P-Tuning v2` | [P-Tuning v2: 提示词微调在不同规模和任务中可与微调相媲美](https:\u002F\u002Farxiv.org\u002Fabs\u002F2110.07602)\n\\-\n\n- `2021\u002F04` | `Prompt Tuning` | [规模的力量：参数高效提示词微调](https:\u002F\u002Farxiv.org\u002Fabs\u002F2104.08691)\n\\-\n\n- `2021\u002F04` | `OptiPrompt` | [事实探针是[MASK]：学习还是学会回忆](https:\u002F\u002Farxiv.org\u002Fabs\u002F2104.05240)\n\\-\n\n- `2021\u002F03` | `P-Tuning` | [GPT也懂得](https:\u002F\u002Farxiv.org\u002Fabs\u002F2103.10385)\n\\-\n\n- `2021\u002F01` | `Prefix-Tuning` | [前缀微调：优化连续提示以进行生成](https:\u002F\u002Farxiv.org\u002Fabs\u002F2101.00190)\n\\-\n\n##### d. 部分参数微调\n\n- `2023\u002F04` | `DiffFit` | [DiffFit: 通过简单的参数高效微调解锁大型扩散模型的迁移性](https:\u002F\u002Farxiv.org\u002Fabs\u002F2304.06648)\n\\-\n\n- `2022\u002F10` | `SSF` | [缩放与平移你的特征：高效模型微调的新基准](https:\u002F\u002Farxiv.org\u002Fabs\u002F2210.08823)\n\\-\n\n- `2021\u002F09` | `Child-Tuning` | [在大型语言模型中培养孩子：迈向有效且通用的微调](https:\u002F\u002Farxiv.org\u002Fabs\u002F2109.05687)\n\\-\n\n- `2021\u002F06` | `BitFit` | [BitFit: 适用于基于Transformer的掩码语言模型的简单参数高效微调](https:\u002F\u002Farxiv.org\u002Fabs\u002F2106.10199)\n\\-\n\n##### e. 多模态混合适应\n\n- `2023\u002F10` | `LLaVA-1.5` | [通过视觉指令微调改进基准](https:\u002F\u002Farxiv.org\u002Fabs\u002F2310.03744)\n\\-\n\n- `2023\u002F05` | `MMA` \u002F `LaVIN` | [廉价快速：大型语言模型的高效视觉-语言指令微调](https:\u002F\u002Farxiv.org\u002Fabs\u002F2305.15023)\n\\-\n\n- `2023\u002F04` | `LLaMA-Adapter V2` | [LLaMA-Adapter V2: 参数高效的视觉指令模型](https:\u002F\u002Farxiv.org\u002Fabs\u002F2304.15010)\n\\-\n\n- `2023\u002F04` | `LLaVA` | [视觉指令微调](https:\u002F\u002Farxiv.org\u002Fabs\u002F2304.08485)\n\\-\n\n- `2023\u002F02` | `RepAdapter` | [通过结构重参数化实现高效视觉适应](https:\u002F\u002Farxiv.org\u002Fabs\u002F2302.08106)\n\\-\n\n---\n\n\u003C\u002Fdetails>\n\n\u003C!--  -->\n\n### 4.3 对齐训练\n\n\u003Cdetails open>\n\u003Csummary>对齐训练\u003C\u002Fsummary>\n\n[推理技术（返回顶部）](#4-推理技术)\n\n- [4.3 对齐训练](#43-对齐-training)\n  - [4.3.1 数据](#431-数据)\n    - [a. 数据 - 人类](#a-数据---人类)\n    - [b. 数据 - 合成](#b-数据---合成)\n  - [4.3.2 训练流程](#432-训练流程)\n    - [a. 在线人类偏好训练](#a-在线人类偏好训练)\n    - [b. 离线人类偏好训练](#b-离线人类偏好训练)\n\n#### 4.3.1 数据\n\n##### a. 
数据 - 人类\n\n- `2023\u002F06` | `Dolly` | 免费Dolly：推出全球首个真正开放的指令微调大模型\n\\-\n[[代码](https:\u002F\u002Fgithub.com\u002Fdatabrickslabs\u002Fdolly)]\n\n- `2023\u002F04` | `LongForm` | [LongForm：通过语料库提取优化长文本生成的指令微调](https:\u002F\u002Farxiv.org\u002Fabs\u002F2304.08460)\n\\-\n\n- `2023\u002F04` | `COIG` | [中文开放指令通用模型：初步发布](https:\u002F\u002Farxiv.org\u002Fabs\u002F2304.07987)\n\\-\n\n- `2023\u002F04` | `OpenAssistant Conversations` | [OpenAssistant Conversations——民主化大型语言模型对齐](https:\u002F\u002Farxiv.org\u002Fabs\u002F2304.07327)\n\\-\n\n- `2023\u002F01` | `Flan 2022` | [The Flan Collection：设计用于有效指令微调的数据与方法](https:\u002F\u002Farxiv.org\u002Fabs\u002F2301.13688)\n\\-\n\n- `2022\u002F11` | `xP3` | [通过多任务微调实现跨语言泛化](https:\u002F\u002Farxiv.org\u002Fabs\u002F2211.01786)\n\\-\n\n- `2022\u002F04` | `Super-NaturalInstructions` | [Super-NaturalInstructions：通过1600多个NLP任务的声明式指令实现泛化](https:\u002F\u002Farxiv.org\u002Fabs\u002F2204.07705)\n\\-\n\n- `2021\u002F11` | `ExT5` | [ExT5：迈向迁移学习的极端多任务扩展](https:\u002F\u002Farxiv.org\u002Fabs\u002F2111.10952)\n\\-\n\n- `2021\u002F10` | `MetaICL` | [MetaICL：在上下文中学习如何学习](https:\u002F\u002Farxiv.org\u002Fabs\u002F2110.15943)\n\\-\n\n- `2021\u002F10` | `P3` | [多任务提示训练实现零样本任务泛化](https:\u002F\u002Farxiv.org\u002Fabs\u002F2110.08207)\n\\-\n\n- `2021\u002F04` | `CrossFit` | [CrossFit：NLP中跨任务泛化的少样本学习挑战](https:\u002F\u002Farxiv.org\u002Fabs\u002F2104.08835)\n\\-\n\n- `2021\u002F04` | `NATURAL INSTRUCTIONS` | [通过自然语言众包指令实现跨任务泛化](https:\u002F\u002Farxiv.org\u002Fabs\u002F2104.08773)\n\\-\n\n- `2020\u002F05` | `UnifiedQA` | [UnifiedQA：用单一问答系统跨越格式界限](https:\u002F\u002Farxiv.org\u002Fabs\u002F2005.00700)\n\\-\n\n##### b. 数据 - 合成\n\n- `2023\u002F08` | `指令反译` | [通过指令反译进行自我对齐](https:\u002F\u002Farxiv.org\u002Fabs\u002F2308.06259)\n\\-\n\n- `2023\u002F05` | `Dynosaur` | [Dynosaur：一种用于指令微调数据整理的动态增长范式](https:\u002F\u002Farxiv.org\u002Fabs\u002F2305.14327)\n\\-\n\n- `2023\u002F05` | `UltraChat` | [通过扩展高质量指令对话提升聊天语言模型性能](https:\u002F\u002Farxiv.org\u002Fabs\u002F2305.14233)\n\\-\n\n- `2023\u002F05` | `CoT Collection` | [The CoT Collection：通过思维链微调改善语言模型的零样本和少样本学习](https:\u002F\u002Farxiv.org\u002Fabs\u002F2305.14045)\n\\-\n\n- `2023\u002F05` | `CoEdIT` | [CoEdIT：通过特定任务指令微调进行文本编辑](https:\u002F\u002Farxiv.org\u002Fabs\u002F2305.09857)\n\\-\n\n- `2023\u002F04` | `LaMini-LM` | [LaMini-LM：从大规模指令中蒸馏出的多样化模型群](https:\u002F\u002Farxiv.org\u002Fabs\u002F2304.14402)\n\\-\n\n- `2023\u002F04` | `GPT-4-LLM` | [使用GPT-4进行指令微调](https:\u002F\u002Farxiv.org\u002Fabs\u002F2304.03277)\n\\-\n\n- `2023\u002F04` | `Koala` | Koala：一款用于学术研究的对话模型\n\\-\n[[博客](https:\u002F\u002Fbair.berkeley.edu\u002Fblog\u002F2023\u002F04\u002F03\u002Fkoala\u002F)]\n\n- `2023\u002F03` | `Alpaca` | Alpaca：一款强大且可复现的指令遵循模型\n\\-\n[[博客](https:\u002F\u002Fcrfm.stanford.edu\u002F2023\u002F03\u002F13\u002Falpaca.html)]\n\n- `2023\u002F03` | `GPT4All` | GPT4All：利用从GPT-3.5-Turbo蒸馏的大规模数据训练助理型聊天机器人\n\\-\n[[代码](https:\u002F\u002Fgithub.com\u002Fnomic-ai\u002Fgpt4all)]\n\n- `2022\u002F12` | `OPT-IML` \u002F `OPT-IML Bench` | [OPT-IML：从泛化角度扩展语言模型指令元学习](https:\u002F\u002Farxiv.org\u002Fabs\u002F2212.12017)\n\\-\n\n- `2022\u002F12` | `Self-Instruct` | [Self-Instruct：用自动生成的指令对齐语言模型](https:\u002F\u002Farxiv.org\u002Fabs\u002F2212.10560)\n\\-\n\n- `2022\u002F12` | `Unnatural Instructions` | [不自然指令：几乎无需人工即可微调语言模型](https:\u002F\u002Farxiv.org\u002Fabs\u002F2212.09689)\n\\-\n\n#### 4.3.2 训练流程\n\n##### a. 
在线人类偏好训练\n\n- `2023\u002F06` | `APA` | [利用优势诱导策略对齐微调语言模型](https:\u002F\u002Farxiv.org\u002Fabs\u002F2306.02231)\n\\-\n\n- `2023\u002F04` | `RAFT` | [RAFT：奖励排序微调用于生成式基础模型对齐](https:\u002F\u002Farxiv.org\u002Fabs\u002F2304.06767)\n\\-\n\n- `2022\u002F03` | `InstructGPT` \u002F `RLHF` | [通过人类反馈训练语言模型遵循指令](https:\u002F\u002Farxiv.org\u002Fabs\u002F2203.02155)\n\\-\n\n##### b. 离线人类偏好训练\n\n- `2023\u002F06` | `PRO` | [为人类对齐优化偏好排序](https:\u002F\u002Farxiv.org\u002Fabs\u002F2306.17492)\n\\-\n\n- `2023\u002F05` | `DPO` | [直接偏好优化：你的语言模型其实是一个奖励模型](https:\u002F\u002Farxiv.org\u002Fabs\u002F2305.18290)\n\\-\n\n- `2023\u002F04` | `RRHF` | [RRHF：无需泪水，仅凭响应排序就能让语言模型与人类反馈保持一致](https:\u002F\u002Farxiv.org\u002Fabs\u002F2304.05302)\n\\-\n\n- `2022\u002F09` | `SLiC` | [校准序列似然性可改善条件语言生成](https:\u002F\u002Farxiv.org\u002Fabs\u002F2210.00045)\n\\-\n\n---\n\n\u003C\u002Fdetails>\n\n\u003C!--  -->\n\n### 4.4 混合专家模型（MoE）\n\n\u003Cdetails open>\n\u003Csummary>混合专家模型\u003C\u002Fsummary>\n\n[推理技术（返回顶部）](#4-reasoning-techniques)\n\n- `2024\u002F01` | `MoE-LLaVA`\n| `Lin et al.`\n![citations](https:\u002F\u002Fimg.shields.io\u002Fbadge\u002Fdynamic\u002Fjson?url=https:\u002F\u002Fapi.semanticscholar.org\u002Fgraph\u002Fv1\u002Fpaper\u002FCorpusID:267311517?fields=citationCount&query=%24.citationCount&label=citations)\n![Star](https:\u002F\u002Fimg.shields.io\u002Fgithub\u002Fstars\u002FPKU-YuanGroup\u002FMoE-LLaVA.svg?style=social&label=Star) \u003Cbr>\nMoE-LLaVA：用于大型视觉-语言模型的混合专家模型 \u003Cbr>\n[[arXiv](https:\u002F\u002Farxiv.org\u002Fabs\u002F2401.15947)]\n[[paper](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2401.15947.pdf)]\n[[code](https:\u002F\u002Fgithub.com\u002FPKU-YuanGroup\u002FMoE-LLaVA)]\n\n- `2023\u002F06` | [通过多任务异构训练实现高效通用模块化视觉模型](https:\u002F\u002Farxiv.org\u002Fabs\u002F2306.17165)\n\\-\n\n- `2023\u002F03` | `MixedAE` | [用于自监督视觉表征学习的混合自动编码器](https:\u002F\u002Farxiv.org\u002Fabs\u002F2303.17152)\n\\-\n\n- `2022\u002F12` | `Mod-Squad` | [Mod-Squad：将混合专家设计为模块化的多任务学习者](https:\u002F\u002Farxiv.org\u002Fabs\u002F2212.08066)\n\\-\n\n- `2022\u002F04` | `MoEBERT` | [MoEBERT：通过重要性引导的适应从BERT到混合专家模型](https:\u002F\u002Farxiv.org\u002Fabs\u002F2204.07675)\n\\-\n\n- `2021\u002F12` | `GLaM` | [GLaM：利用混合专家模型高效扩展语言模型](https:\u002F\u002Farxiv.org\u002Fabs\u002F2112.06905)\n\\-\n\n- `2021\u002F07` | `WideNet` | [与其加深网络，不如拓宽网络](https:\u002F\u002Farxiv.org\u002Fabs\u002F2107.11817)\n\\-\n\n- `2021\u002F01` | `Switch Transformers` | [Switch Transformers：通过简单高效的稀疏性扩展至万亿参数模型](https:\u002F\u002Farxiv.org\u002Fabs\u002F2101.03961)\n\\-\n\n- `2020\u002F06` | `GShard` | [GShard：利用条件计算和自动分片扩展巨型模型](https:\u002F\u002Farxiv.org\u002Fabs\u002F2006.16668)\n\\-\n\n- `2017\u002F01` | `稀疏门控混合专家模型` | [超大规模神经网络：稀疏门控混合专家层](https:\u002F\u002Farxiv.org\u002Fabs\u002F1701.06538)\n\\-\n\n- `1991\u002F03` | 自适应局部专家混合模型\n\\-\n[[Paper](https:\u002F\u002Fieeexplore.ieee.org\u002Fdocument\u002F6797059)]\n\n---\n\n\u003C\u002Fdetails>\n\n\u003C!--  -->\n### 4.5 上下文学习\n\n\u003Cdetails open>\n\u003Csummary>上下文学习\u003C\u002Fsummary>\n\n[推理技术（返回顶部）](#4-reasoning-techniques)\n\n- [4.5 上下文学习](#45-in-context-learning)\n  - [4.5.1 示范样例选择](#451-demonstration-example-selection)\n    - [a. 先验知识方法](#a-prior-knowledge-approach)\n    - [b. 检索方法](#b-retrieval-approach)\n  - [4.5.2 思维链](#452-chain-of-thought)\n    - [a. 零样本思维链](#a-zero-shot-cot)\n    - [b. 少样本思维链](#b-few-shot-cot)\n    - [c. 多路径聚合](#c-multiple-paths-aggregation)\n  - [4.5.3 多轮提示](#453-multi-round-prompting)\n    - [a. 学习型精炼器](#a-learned-refiners)\n    - [b. 
提示型精炼器](#b-prompted-refiners)\n\n\u003Cbr>\n\n- `2022\u002F10` | `FLAN-T5` | [扩展指令微调的语言模型](https:\u002F\u002Farxiv.org\u002Fabs\u002F2210.11416)\n\\-\n\n- `2021\u002F05` | `GPT-3` | [语言模型是少样本学习者](https:\u002F\u002Farxiv.org\u002Fabs\u002F2005.14165)\n\\-\n[[Paper](https:\u002F\u002Fpapers.nips.cc\u002Fpaper\u002F2020\u002Ffile\u002F1457c0d6bfcb4967418bfb8ac142f64a-Paper.pdf)]\n[[Code](https:\u002F\u002Fgithub.com\u002Fopenai\u002Fgpt-3)]\n\n#### 4.5.1 示范样例选择\n\n##### a. 先验知识方法\n\n- `2022\u002F12` | [多样化的示范样例提升上下文组合泛化能力](https:\u002F\u002Farxiv.org\u002Fabs\u002F2212.06800)\n\\-\n\n- `2022\u002F11` | [互补解释促进有效的上下文学习](https:\u002F\u002Farxiv.org\u002Fabs\u002F2211.13892)\n\\-\n\n- `2022\u002F10` | `Auto-CoT` | [大型语言模型中的自动思维链提示](https:\u002F\u002Farxiv.org\u002Fabs\u002F2210.03493)\n\\-\n\n- `2022\u002F10` | `Complex CoT` | [基于复杂度的多步推理提示](https:\u002F\u002Farxiv.org\u002Fabs\u002F2210.00720)\n\\-\n\n- `2022\u002F10` | `EmpGPT-3` | [GPT-3能否生成共情对话？一种新颖的上下文样例选择方法及共情对话生成的自动评估指标](https:\u002F\u002Faclanthology.org\u002F2022.coling-1.56\u002F)\n\\-\n[[Paper](https:\u002F\u002Fgithub.com\u002Fpassing2961\u002FEmpGPT-3)]\n\n- `2022\u002F09` | [选择性标注使语言模型成为更好的少样本学习者](https:\u002F\u002Farxiv.org\u002Fabs\u002F2209.01975)\n\\-\n\n- `2021\u002F01` | [什么样的上下文样例对GPT-3有效？](https:\u002F\u002Farxiv.org\u002Fabs\u002F2101.06804)\n\\-\n\n##### b. 检索方法\n\n- `2023\u002F10` | `DQ-LoRe` | [DQ-LoRe：低秩近似重排的双重查询用于上下文学习](https:\u002F\u002Farxiv.org\u002Fabs\u002F2310.02954)\n\\-\n\n- `2023\u002F07` | `LLM-R` | [学习检索大型语言模型的上下文样例](https:\u002F\u002Farxiv.org\u002Fabs\u002F2307.07164)\n\\-\n\n- `2023\u002F05` | `Dr.ICL` | [Dr.ICL：演示检索式上下文学习](https:\u002F\u002Farxiv.org\u002Fabs\u002F2305.14128)\n\\-\n\n- `2023\u002F02` | `LENS` | [寻找上下文学习的支持样例](https:\u002F\u002Farxiv.org\u002Fabs\u002F2302.13539)\n\\-\n\n- `2023\u002F02` | `CEIL` | [用于上下文学习的组合示例](https:\u002F\u002Farxiv.org\u002Fabs\u002F2302.05698)\n\\-\n\n- `2021\u002F12` | [学习检索上下文学习的提示](https:\u002F\u002Farxiv.org\u002Fabs\u002F2112.08633)\n\\-\n\n#### 4.5.2 思维链\n\n##### a. 零样本思维链\n- `2023\u002F09` | `LoT` | [通过逻辑增强大型语言模型中的零样本思维链推理](https:\u002F\u002Farxiv.org\u002Fabs\u002F2309.13339)\n\\-\n[[Paper](https:\u002F\u002Farxiv.org\u002Fabs\u002F2309.13339)]\n[[Code](https:\u002F\u002Fgithub.com\u002Fxf-zhao\u002FLoT)]\n\n- `2023\u002F05` | `Plan-and-Solve` | [计划与解决提示：改善大型语言模型的零样本思维链推理](https:\u002F\u002Farxiv.org\u002Fabs\u002F2305.04091)\n\\-\n\n- `2022\u002F05` | `Zero-shot-CoT` | [大型语言模型是零样本推理者](https:\u002F\u002Farxiv.org\u002Fabs\u002F2205.11916)\n\\-\n[[Paper](https:\u002F\u002Fopenreview.net\u002Fpdf?id=e2TBb5y0yFf)]\n[[Code](https:\u002F\u002Fgithub.com\u002Fkojima-takeshi188\u002Fzero_shot_cot)]\n\n##### b. 
少样本思维链\n\n- `2023\u002F07` | `SoT` | [思维骨架：大型语言模型可以进行并行解码](https:\u002F\u002Farxiv.org\u002Fabs\u002F2307.15337)\n\\-\n\n- `2023\u002F05` | `代码提示` | [代码提示：一种用于大型语言模型复杂推理的神经符号方法](https:\u002F\u002Farxiv.org\u002Fabs\u002F2305.18507)\n\\-\n\n- `2023\u002F05` | `GoT` | [超越思维链，大型语言模型中的有效图思维链推理](https:\u002F\u002Farxiv.org\u002Fabs\u002F2305.16582)\n\\-\n\n- `2023\u002F05` | `ToT` | [思维之树：使用大型语言模型进行深思熟虑的问题解决](https:\u002F\u002Farxiv.org\u002Fabs\u002F2305.10601)\n\\-\n\n- `2023\u002F03` | `MathPrompter` | [MathPrompter：利用大型语言模型进行数学推理](https:\u002F\u002Farxiv.org\u002Fabs\u002F2303.05398)\n\\-\n\n- `2022\u002F11` | `PoT` | [思维程序提示：将计算与推理分离以处理数值推理任务](https:\u002F\u002Farxiv.org\u002Fabs\u002F2211.12588)\n\\-\n\n- `2022\u002F11` | `PAL` | [PAL：程序辅助语言模型](https:\u002F\u002Farxiv.org\u002Fabs\u002F2211.10435)\n\\-\n\n- `2022\u002F10` | `Auto-CoT` | [大型语言模型中的自动思维链提示](https:\u002F\u002Farxiv.org\u002Fabs\u002F2210.03493)\n\\-\n\n- `2022\u002F10` | `Complex CoT` | [基于复杂度的多步推理提示](https:\u002F\u002Farxiv.org\u002Fabs\u002F2210.00720)\n\\-\n\n- `2022\u002F05` | `由简入繁提示法` | [由简入繁提示法赋能大型语言模型进行复杂推理](https:\u002F\u002Farxiv.org\u002Fabs\u002F2205.10625)\n\\-\n\n- `2022\u002F01` | [思维链提示法激发大型语言模型的推理能力](https:\u002F\u002Farxiv.org\u002Fabs\u002F2201.11903)\n\\-\n\n##### c. 多路径聚合\n\n- `2023\u002F05` | `RAP` | [利用语言模型进行推理即是在用世界模型进行规划](https:\u002F\u002Farxiv.org\u002Fabs\u002F2305.14992)\n\\-\n\n- `2023\u002F05` | [基于大型语言模型的自动模型选择用于推理](https:\u002F\u002Farxiv.org\u002Fabs\u002F2305.14333)\n\\-\n\n- `2023\u002F05` | `AdaptiveConsistency` | [让我们逐步采样：面向高效推理与编码的自适应一致性方法](https:\u002F\u002Farxiv.org\u002Fabs\u002F2305.11860)\n\\-\n\n- `2023\u002F05` | `ToT` | [思维之树：利用大型语言模型进行审慎的问题解决](https:\u002F\u002Farxiv.org\u002Fabs\u002F2305.10601)\n\\-\n\n- `2023\u002F05` | `ToT` | [大型语言模型引导的思维之树](https:\u002F\u002Farxiv.org\u002Fabs\u002F2305.08291)\n\\-\n\n- `2023\u002F05` | [自我评估引导的束搜索用于推理](https:\u002F\u002Farxiv.org\u002Fabs\u002F2305.00633)\n\\-\n\n- `2022\u002F10` | `Complex CoT` | [基于复杂度的提示方法用于多步推理](https:\u002F\u002Farxiv.org\u002Fabs\u002F2210.00720)\n\\-\n\n- `2022\u002F06` | `DIVERSE` | [通过步骤感知验证器使大型语言模型成为更好的推理者](https:\u002F\u002Farxiv.org\u002Fabs\u002F2206.02336)\n\\-\n\n- `2022\u002F03` | [自我一致性提升语言模型中的思维链推理](https:\u002F\u002Farxiv.org\u002Fabs\u002F2203.11171)\n\\-\n\n#### 4.5.3 多轮提示法\n\n##### a. 学习型精炼器\n\n- `2023\u002F02` | `LLM-Augmenter` | [核对事实并再试一次：借助外部知识和自动化反馈改进大型语言模型](https:\u002F\u002Farxiv.org\u002Fabs\u002F2302.12813)\n\\-\n\n- `2022\u002F10` | `Self-Correction` | [通过学习自我修正来生成序列](https:\u002F\u002Farxiv.org\u002Fabs\u002F2211.00053)\n\\-\n\n- `2022\u002F08` | `PEER` | [PEER：一种协作式语言模型](https:\u002F\u002Farxiv.org\u002Fabs\u002F2208.11663)\n\\-\n\n- `2022\u002F04` | `R3` | [阅读、修订、重复：人机协作迭代文本修订系统演示](https:\u002F\u002Farxiv.org\u002Fabs\u002F2204.03685)\n\\-\n\n- `2021\u002F10` | `CURIOUS` | [好好想想！通过先建模问题场景来改进可废止推理](https:\u002F\u002Farxiv.org\u002Fabs\u002F2110.12349)\n\\-\n\n- `2020\u002F05` | `DrRepair` | [基于图的自监督程序修复，利用诊断反馈](https:\u002F\u002Farxiv.org\u002Fabs\u002F2005.10636)\n\\-\n\n##### b. 
提示型精炼器\n\n- `2023\u002F06` | `InterCode` | [InterCode：标准化并基准化带有执行反馈的交互式编程](https:\u002F\u002Farxiv.org\u002Fabs\u002F2306.14898)\n\\-\n\n- `2023\u002F06` | [自我修复是代码生成的万能药吗？](https:\u002F\u002Farxiv.org\u002Fabs\u002F2306.09896)\n\\-\n\n- `2023\u002F05` | [通过多智能体辩论提升语言模型的事实性和推理能力](https:\u002F\u002Farxiv.org\u002Fabs\u002F2305.14325)\n\\-\n\n- `2023\u002F05` | `CRITIC` | [CRITIC：大型语言模型可通过工具交互式批评实现自我修正](https:\u002F\u002Farxiv.org\u002Fabs\u002F2305.11738)\n\\-\n\n- `2023\u002F05` | `GPT-Bargaining` | [通过自我博弈及从AI反馈中进行上下文学习来改进语言模型谈判](https:\u002F\u002Farxiv.org\u002Fabs\u002F2305.10142)\n\\-\n\n- `2023\u002F05` | `Self-Edit` | [Self-Edit：面向代码生成的故障感知代码编辑器](https:\u002F\u002Farxiv.org\u002Fabs\u002F2305.04087)\n\\-\n\n- `2023\u002F04` | `PHP` | [渐进式提示法提升大型语言模型的推理能力](https:\u002F\u002Farxiv.org\u002Fabs\u002F2304.09797)\n\\-\n\n- `2023\u002F04` | `Self-collaboration` | [通过ChatGPT实现自我协作式代码生成](https:\u002F\u002Farxiv.org\u002Fabs\u002F2304.07590)\n\\-\n\n- `2023\u002F04` | `Self-Debugging` | [教导大型语言模型进行自我调试](https:\u002F\u002Farxiv.org\u002Fabs\u002F2304.05128)\n\\-\n\n- `2023\u002F04` | `REFINER` | [REFINER：针对中间表示的推理反馈](https:\u002F\u002Farxiv.org\u002Fabs\u002F2304.01904)\n\\-\n\n- `2023\u002F03` | `Self-Refine` | [Self-Refine：基于自我反馈的迭代精炼](https:\u002F\u002Farxiv.org\u002Fabs\u002F2303.17651)\n\\-\n\n---\n\n\u003C\u002Fdetails>\n\n\u003C!--  -->\n\n\n### 4.6 自主代理\n\n\u003Cdetails open>\n\u003Csummary>自主代理\u003C\u002Fsummary>\n\n[推理技术（返回顶部）](#4-reasoning-techniques)\n\n- `2023\u002F10` | `规划标记` | [利用规划标记引导语言模型推理](https:\u002F\u002Faps.arxiv.org\u002Fabs\u002F2310.05707)\n\\-\n\n- `2023\u002F09` | `AutoAgents` | [AutoAgents：一个用于自动生成代理的框架](https:\u002F\u002Farxiv.org\u002Fabs\u002F2309.17288)\n\\-\n\n- `2023\u002F06` | `AssistGPT` | [AssistGPT：一款能够规划、执行、检查和学习的通用多模态助手](https:\u002F\u002Farxiv.org\u002Fabs\u002F2306.08640)\n\\-\n\n- `2023\u002F05` | `SwiftSage` | [SwiftSage：一种具有快慢思维的生成式代理，适用于复杂的交互任务](https:\u002F\u002Farxiv.org\u002Fabs\u002F2305.17390)\n\\-\n\n- `2023\u002F05` | `MultiTool-CoT` | [MultiTool-CoT：GPT-3 可以在思维链提示下使用多种外部工具](https:\u002F\u002Farxiv.org\u002Fabs\u002F2305.16896)\n\\-\n\n- `2023\u002F05` | `Voyager` | [Voyager：一个基于大型语言模型的开放式具身代理](https:\u002F\u002Farxiv.org\u002Fabs\u002F2305.16291)\n\\-\n\n- `2023\u002F05` | `ChatCoT` | [ChatCoT：基于聊天的大规模语言模型上的工具增强型思维链推理](https:\u002F\u002Farxiv.org\u002Fabs\u002F2305.14323)\n\\-\n\n- `2023\u002F05` | `CREATOR` | [CREATOR：为解耦大型语言模型的抽象与具体推理而创建工具](https:\u002F\u002Farxiv.org\u002Fabs\u002F2305.14318)\n\\-\n\n- `2023\u002F05` | `TRICE` | [通过执行反馈使语言模型成为更好的工具学习者](https:\u002F\u002Farxiv.org\u002Fabs\u002F2305.13068)\n\\-\n\n- `2023\u002F05` | `ToolkenGPT` | [ToolkenGPT：通过工具嵌入将大量工具增强到冻结的语言模型中](https:\u002F\u002Farxiv.org\u002Fabs\u002F2305.11554)\n\\-\n\n- `2023\u002F04` | `Chameleon` | [Chameleon：利用大型语言模型实现即插即用的组合式推理](https:\u002F\u002Farxiv.org\u002Fabs\u002F2304.09842)\n\\-\n\n- `2023\u002F04` | `OpenAGI` | [OpenAGI：当LLM遇到领域专家时](https:\u002F\u002Farxiv.org\u002Fabs\u002F2304.04370)\n\\-\n\n- `2023\u002F03` | `CAMEL` | [CAMEL：用于探索大型语言模型社会“心智”的交流型代理](https:\u002F\u002Farxiv.org\u002Fabs\u002F2303.17760)\n\\-\n\n- `2023\u002F03` | `HuggingGPT` | [HuggingGPT：利用ChatGPT及其在Hugging Face中的伙伴解决AI任务](https:\u002F\u002Farxiv.org\u002Fabs\u002F2303.17580)\n\\-\n\n- `2023\u002F03` | `Reflexion` | [Reflexion：具备言语强化学习的语言代理](https:\u002F\u002Farxiv.org\u002Fabs\u002F2303.11366)\n\\-\n\n- `2023\u002F03` | `ART` | [ART：面向大型语言模型的自动多步推理与工具使用](https:\u002F\u002Farxiv.org\u002Fabs\u002F2303.09014)\n\\-\n\n- `2023\u002F03` | `Auto-GPT` | 
Auto-GPT：一项自主GPT-4实验\n\\-\n[[代码](https:\u002F\u002Fgithub.com\u002Fantony0596\u002Fauto-gpt)]\n\n- `2023\u002F02` | `Toolformer` | [Toolformer：语言模型可以自我教授如何使用工具](https:\u002F\u002Farxiv.org\u002Fabs\u002F2302.04761)\n\\-\n\n- `2022\u002F11` | `VISPROG` | [视觉编程：无需训练即可进行组合式视觉推理](https:\u002F\u002Farxiv.org\u002Fabs\u002F2211.11559)\n\\-\n\n- `2022\u002F10` | `ReAct` | [ReAct：在语言模型中协同推理与行动](https:\u002F\u002Farxiv.org\u002Fabs\u002F2210.03629)\n\\-\n\n---\n\n\u003C\u002Fdetails>\n\n\u003C\u002Fdetails>","# Awesome-Reasoning-Foundation-Models 快速上手指南\n\n`Awesome-Reasoning-Foundation-Models` 并非一个可直接安装运行的单一软件包，而是一个**精选资源列表仓库**，旨在整理和分类用于推理任务的大型基础模型（Foundation Models）、相关论文、数据集及技术方法。本指南将指导开发者如何利用该仓库查找资源，并快速启动列表中推荐的典型推理模型（以 Llama 2\u002FQwen 为例）。\n\n## 环境准备\n\n在开始探索和使用列表中的模型前，请确保您的开发环境满足以下基本要求：\n\n*   **操作系统**: Linux (推荐 Ubuntu 20.04+) 或 macOS。Windows 用户建议使用 WSL2。\n*   **Python 版本**: Python 3.8 或更高版本。\n*   **硬件要求**:\n    *   **GPU**: 推荐 NVIDIA GPU (显存 ≥ 16GB 以运行 7B 参数模型，更大模型需更多显存或使用量化版本)。\n    *   **CUDA**: 已安装与 PyTorch 版本匹配的 CUDA 驱动。\n*   **前置依赖**:\n    *   `git`: 用于克隆仓库。\n    *   `pip` 或 `conda`: 用于管理 Python 环境。\n    *   `PyTorch`: 深度学习框架基础。\n\n## 安装步骤\n\n由于本仓库是资源列表，主要“安装”步骤是获取该列表并配置运行具体模型的环境。\n\n### 1. 克隆资源仓库\n首先，将包含所有推理模型资源和论文链接的仓库克隆到本地，以便查阅目录和最新进展。\n\n```bash\ngit clone https:\u002F\u002Fgithub.com\u002Fagiresearch\u002FAwesome-Reasoning-Foundation-Models.git\ncd Awesome-Reasoning-Foundation-Models\n```\n\n### 2. 选择并部署具体模型\n根据仓库中 **[2. Foundation Models](#2-foundation-models)** 章节（如 `2.1 Language Foundation Models`），选择您需要的模型。以下以部署 **Llama 2** 或 **Qwen** 为例，展示通用的环境配置流程。\n\n**创建虚拟环境：**\n```bash\nconda create -n reasoning-env python=3.10 -y\nconda activate reasoning-env\n```\n\n**安装核心依赖（以 Hugging Face Transformers 为例）：**\n国内开发者推荐使用镜像源加速安装：\n```bash\npip install torch torchvision torchaudio --index-url https:\u002F\u002Fdownload.pytorch.org\u002Fwhl\u002Fcu118\npip install transformers accelerate sentencepiece protobuf -i https:\u002F\u002Fpypi.tuna.tsinghua.edu.cn\u002Fsimple\n```\n\n**获取模型权重：**\n*   **Llama 2**: 需在 Meta 官网申请授权后，通过 Hugging Face CLI 下载。\n    ```bash\n    # 需先登录 huggingface-cli login\n    git lfs install\n    git clone https:\u002F\u002Fhuggingface.co\u002Fmeta-llama\u002FLlama-2-7b-chat-hf\n    ```\n*   **Qwen**: 可直接从 ModelScope (魔搭) 或 Hugging Face 下载。\n    ```bash\n    # 使用 ModelScope 加速 (推荐国内用户)\n    pip install modelscope -i https:\u002F\u002Fpypi.tuna.tsinghua.edu.cn\u002Fsimple\n    ```\n\n## 基本使用\n\n本仓库的核心价值在于提供模型入口。以下展示如何使用 Python 加载仓库中推荐的模型进行简单的**逻辑推理**测试。\n\n### 示例：使用 Qwen 模型进行数学推理\n\n```python\nfrom transformers import AutoModelForCausalLM, AutoTokenizer\n# 如果使用国内源，可指定 model_scope 或直接使用本地路径\nmodel_name = \"Qwen\u002FQwen-7B-Chat\" \n\n# 加载分词器和模型\ntokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)\nmodel = AutoModelForCausalLM.from_pretrained(\n    model_name, \n    device_map=\"auto\", \n    trust_remote_code=True\n).eval()\n\n# 构建推理提示 (Prompt)\nquery = \"小明有 5 个苹果，吃了 2 个，又买了 3 个，现在有几个？请逐步推理。\"\nhistory = []\n\n# 生成回答\nresponse, history = model.chat(tokenizer, query, history=history)\n\nprint(f\"用户: {query}\")\nprint(f\"助手: {response}\")\n```\n\n### 示例：查阅特定推理任务资源\n若您想研究 **多模态推理 (Multimodal Reasoning)**，可直接在克隆的仓库中查看对应章节或访问其整理的 GitHub 列表：\n\n1.  打开本地 `README.md` 文件。\n2.  跳转至 **[3.7 Multimodal Reasoning](#37-multimodal-reasoning)** 章节。\n3.  
点击该章节下列出的论文链接（如 arXiv）或代码库链接，获取针对视觉 + 语言推理的专用模型和数据集。\n\n> **提示**：对于仓库中列出的特定技术（如 `In-Context Learning` 或 `Chain-of-Thought`），请参考 **[4 Reasoning Techniques](#4-reasoning-techniques)** 章节对应的论文实现具体的 Prompt 工程策略，无需额外安装库，只需调整输入给模型的文本格式即可。","某高校人工智能实验室的研究团队正致力于开发一款能解决复杂数学应用题的教育大模型，急需筛选最适合的推理架构与基准测试方案。\n\n### 没有 Awesome-Reasoning-Foundation-Models 时\n- **文献检索如大海捞针**：研究人员需在 arXiv 上手动搜索\"reasoning\"、\"math\"、\"CoT\"等关键词，面对海量论文难以快速识别哪些是真正针对基础模型推理能力的最新成果。\n- **技术路线选择盲目**：缺乏系统分类，团队难以厘清“预训练”、“微调”与“上下文学习（In-context Learning）”在不同推理任务（如逻辑推理 vs 因果推理）中的具体适用性，导致实验方向频繁试错。\n- **基准测试标准混乱**：找不到权威且统一的评测榜单，不同论文使用的数据集各异，导致团队无法客观评估自家模型在数学或常识推理上的真实水平，复现对比极其耗时。\n\n### 使用 Awesome-Reasoning-Foundation-Models 后\n- **资源获取一站式完成**：直接查阅该仓库整理的精选列表，迅速定位到最新的语言、视觉及多模态推理模型论文，将文献调研时间从数周缩短至几天。\n- **技术决策有的放矢**：利用其清晰的分类体系（如数学推理、代理推理），团队快速锁定了适合教育场景的“混合专家（MoE）”与“对齐训练”技术组合，大幅减少了无效实验。\n- **评估体系科学规范**：参考仓库中汇总的权威基准测试（Benchmarks），建立了标准化的评估流程，不仅能准确量化模型提升效果，还能直接与业界最先进水平进行公平对标。\n\nAwesome-Reasoning-Foundation-Models 通过系统化梳理前沿论文与评测标准，将研究团队从繁琐的信息筛选中解放出来，使其能专注于核心算法的创新与落地。","https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Freasoning-survey_Awesome-Reasoning-Foundation-Models_fa97ddc3.jpg","reasoning-survey",null,"https:\u002F\u002Foss.gittoolsai.com\u002Favatars\u002Freasoning-survey_af4b0577.png","https:\u002F\u002Fgithub.com\u002Freasoning-survey",654,61,"2026-03-21T14:40:25","MIT",1,"","未说明",{"notes":84,"python":82,"dependencies":85},"该仓库是一个综述列表（Awesome List），主要整理了关于推理基础模型（Reasoning Foundation Models）的论文、代码库和项目链接，本身不是一个可直接运行的软件工具或框架，因此 README 中未包含具体的操作系统、硬件配置、Python 版本或依赖库等运行环境需求。用户需根据列表中具体引用的模型（如 Llama, Qwen, Mistral 等）前往其各自的官方仓库查询相应的部署要求。",[],[35,14,87,13],"其他",[89,90,91,92,93,94,95],"llm","llm-reasoning","reasoning","reasoning-language-models","foundation-models","multimodal","reasoning-agent","2026-03-27T02:49:30.150509","2026-04-08T01:10:06.738777",[],[100],{"id":101,"version":102,"summary_zh":73,"released_at":103},136938,"v1.0.0","2023-12-08T08:18:36"]