[{"data":1,"prerenderedAt":-1},["ShallowReactive",2],{"similar-LightChen233--Awesome-Long-Chain-of-Thought-Reasoning":3,"tool-LightChen233--Awesome-Long-Chain-of-Thought-Reasoning":61},[4,18,26,36,44,52],{"id":5,"name":6,"github_repo":7,"description_zh":8,"stars":9,"difficulty_score":10,"last_commit_at":11,"category_tags":12,"status":17},4358,"openclaw","openclaw\u002Fopenclaw","OpenClaw 是一款专为个人打造的本地化 AI 助手，旨在让你在自己的设备上拥有完全可控的智能伙伴。它打破了传统 AI 助手局限于特定网页或应用的束缚，能够直接接入你日常使用的各类通讯渠道，包括微信、WhatsApp、Telegram、Discord、iMessage 等数十种平台。无论你在哪个聊天软件中发送消息，OpenClaw 都能即时响应，甚至支持在 macOS、iOS 和 Android 设备上进行语音交互，并提供实时的画布渲染功能供你操控。\n\n这款工具主要解决了用户对数据隐私、响应速度以及“始终在线”体验的需求。通过将 AI 部署在本地，用户无需依赖云端服务即可享受快速、私密的智能辅助，真正实现了“你的数据，你做主”。其独特的技术亮点在于强大的网关架构，将控制平面与核心助手分离，确保跨平台通信的流畅性与扩展性。\n\nOpenClaw 非常适合希望构建个性化工作流的技术爱好者、开发者，以及注重隐私保护且不愿被单一生态绑定的普通用户。只要具备基础的终端操作能力（支持 macOS、Linux 及 Windows WSL2），即可通过简单的命令行引导完成部署。如果你渴望拥有一个懂你",349277,3,"2026-04-06T06:32:30",[13,14,15,16],"Agent","开发框架","图像","数据工具","ready",{"id":19,"name":20,"github_repo":21,"description_zh":22,"stars":23,"difficulty_score":10,"last_commit_at":24,"category_tags":25,"status":17},3808,"stable-diffusion-webui","AUTOMATIC1111\u002Fstable-diffusion-webui","stable-diffusion-webui 是一个基于 Gradio 构建的网页版操作界面，旨在让用户能够轻松地在本地运行和使用强大的 Stable Diffusion 图像生成模型。它解决了原始模型依赖命令行、操作门槛高且功能分散的痛点，将复杂的 AI 绘图流程整合进一个直观易用的图形化平台。\n\n无论是希望快速上手的普通创作者、需要精细控制画面细节的设计师，还是想要深入探索模型潜力的开发者与研究人员，都能从中获益。其核心亮点在于极高的功能丰富度：不仅支持文生图、图生图、局部重绘（Inpainting）和外绘（Outpainting）等基础模式，还独创了注意力机制调整、提示词矩阵、负向提示词以及“高清修复”等高级功能。此外，它内置了 GFPGAN 和 CodeFormer 等人脸修复工具，支持多种神经网络放大算法，并允许用户通过插件系统无限扩展能力。即使是显存有限的设备，stable-diffusion-webui 也提供了相应的优化选项，让高质量的 AI 艺术创作变得触手可及。",162132,"2026-04-05T11:01:52",[14,15,13],{"id":27,"name":28,"github_repo":29,"description_zh":30,"stars":31,"difficulty_score":32,"last_commit_at":33,"category_tags":34,"status":17},1381,"everything-claude-code","affaan-m\u002Feverything-claude-code","everything-claude-code 是一套专为 AI 编程助手（如 Claude Code、Codex、Cursor 等）打造的高性能优化系统。它不仅仅是一组配置文件，而是一个经过长期实战打磨的完整框架，旨在解决 AI 代理在实际开发中面临的效率低下、记忆丢失、安全隐患及缺乏持续学习能力等核心痛点。\n\n通过引入技能模块化、直觉增强、记忆持久化机制以及内置的安全扫描功能，everything-claude-code 能显著提升 AI 在复杂任务中的表现，帮助开发者构建更稳定、更智能的生产级 AI 代理。其独特的“研究优先”开发理念和针对 Token 消耗的优化策略，使得模型响应更快、成本更低，同时有效防御潜在的攻击向量。\n\n这套工具特别适合软件开发者、AI 研究人员以及希望深度定制 AI 工作流的技术团队使用。无论您是在构建大型代码库，还是需要 AI 协助进行安全审计与自动化测试，everything-claude-code 都能提供强大的底层支持。作为一个曾荣获 Anthropic 黑客大奖的开源项目，它融合了多语言支持与丰富的实战钩子（hooks），让 AI 真正成长为懂上",142651,2,"2026-04-06T23:34:12",[14,13,35],"语言模型",{"id":37,"name":38,"github_repo":39,"description_zh":40,"stars":41,"difficulty_score":32,"last_commit_at":42,"category_tags":43,"status":17},2271,"ComfyUI","Comfy-Org\u002FComfyUI","ComfyUI 是一款功能强大且高度模块化的视觉 AI 引擎，专为设计和执行复杂的 Stable Diffusion 图像生成流程而打造。它摒弃了传统的代码编写模式，采用直观的节点式流程图界面，让用户通过连接不同的功能模块即可构建个性化的生成管线。\n\n这一设计巧妙解决了高级 AI 绘图工作流配置复杂、灵活性不足的痛点。用户无需具备编程背景，也能自由组合模型、调整参数并实时预览效果，轻松实现从基础文生图到多步骤高清修复等各类复杂任务。ComfyUI 拥有极佳的兼容性，不仅支持 Windows、macOS 和 Linux 全平台，还广泛适配 NVIDIA、AMD、Intel 及苹果 Silicon 等多种硬件架构，并率先支持 SDXL、Flux、SD3 等前沿模型。\n\n无论是希望深入探索算法潜力的研究人员和开发者，还是追求极致创作自由度的设计师与资深 AI 绘画爱好者，ComfyUI 都能提供强大的支持。其独特的模块化架构允许社区不断扩展新功能，使其成为当前最灵活、生态最丰富的开源扩散模型工具之一，帮助用户将创意高效转化为现实。",107888,"2026-04-06T11:32:50",[14,15,13],{"id":45,"name":46,"github_repo":47,"description_zh":48,"stars":49,"difficulty_score":10,"last_commit_at":50,"category_tags":51,"status":17},4487,"LLMs-from-scratch","rasbt\u002FLLMs-from-scratch","LLMs-from-scratch 是一个基于 PyTorch 的开源教育项目，旨在引导用户从零开始一步步构建一个类似 ChatGPT 的大型语言模型（LLM）。它不仅是同名技术著作的官方代码库，更提供了一套完整的实践方案，涵盖模型开发、预训练及微调的全过程。\n\n该项目主要解决了大模型领域“黑盒化”的学习痛点。许多开发者虽能调用现成模型，却难以深入理解其内部架构与训练机制。通过亲手编写每一行核心代码，用户能够透彻掌握 
# <img src="https://oss.gittoolsai.com/images/LightChen233_Awesome-Long-Chain-of-Thought-Reasoning_readme_8b2e6abbc4fb.jpg" alt="SVG Image" width="40px"> Awesome-Long-Chain-of-Thought-Reasoning

[![Awesome](https://awesome.re/badge.svg)](https://awesome.re)
[![arXiv](https://img.shields.io/badge/arXiv-Long_Chain_of_Thought-b31b1b.svg)](https://arxiv.org/pdf/2503.09567)
[![Paper](https://img.shields.io/badge/Paper-908-green.svg)](https://github.com/LightChen233/Awesome-Long-Chain-of-Thought-Reasoning)
[![Last Commit](https://img.shields.io/github/last-commit/LightChen233/Awesome-Long-Chain-of-Thought-Reasoning)](https://github.com/LightChen233/Awesome-Long-Chain-of-Thought-Reasoning)
[![Contribution Welcome](https://img.shields.io/badge/Contributions-welcome-blue)]()

\[[English Tutorial](README.md)\] | \[[Chinese Tutorial](README-zh.md)\] | \[[Arxiv](https://arxiv.org/pdf/2503.09567)\]

![image](https://oss.gittoolsai.com/images/LightChen233_Awesome-Long-Chain-of-Thought-Reasoning_readme_221787a541a9.png)

<!-- omit in toc -->
# 🔥 News
- **2025.07**: 🎉🎉🎉 We have updated the number of reviewed papers to over 1000. Additionally, we have added bilingual support and made the repository friendlier for Long-CoT beginners.
- **2025.04**: 🎉🎉🎉 We have updated the number of reviewed papers to over 900. Additionally, we have enhanced the presentation with a more engaging teaser figure.
- **2025.03**: 🎉🎉🎉 We have published a survey paper titled "[Towards Reasoning Era: A Survey of Long Chain-of-Thought for Reasoning Large Language Models](https://arxiv.org/pdf/2503.09567)". Please feel free to cite it or open pull requests for your awesome studies.

<!-- omit in toc -->
# 🌟 Introduction

Welcome to the repository associated with our survey paper, "Towards Reasoning Era: A Survey of Long Chain-of-Thought for Reasoning Large Language Models". This repository contains **resources and updates** related to our ongoing Long CoT research. For a detailed introduction, please refer to [our survey paper](https://arxiv.org/pdf/2503.09567).

Recent advancements in reasoning with large language models (RLLMs), such as OpenAI-O1 and DeepSeek-R1, have demonstrated their impressive capabilities in complex domains like mathematics and coding. A central factor in their success lies in the application of long chain-of-thought (Long CoT) characteristics, which enhance reasoning abilities and enable the solution of intricate problems.

![image](https://oss.gittoolsai.com/images/LightChen233_Awesome-Long-Chain-of-Thought-Reasoning_readme_718db790ab3e.png)

However, despite these developments, a comprehensive survey on Long CoT is still lacking, limiting our understanding of its distinctions from traditional short chain-of-thought (Short CoT) and complicating ongoing debates on issues like "overthinking" and "test-time scaling." This survey seeks to fill this gap by offering a unified perspective on Long CoT.
1. We first distinguish Long CoT from Short CoT and introduce a novel taxonomy to categorize current reasoning paradigms.
2. Next, we explore the key characteristics of Long CoT: deep reasoning, extensive exploration, and feasible reflection, which enable models to handle more complex tasks and produce more efficient, coherent outcomes compared to the shallower Short CoT.
3. We then investigate key phenomena such as the emergence of Long CoT, overthinking, and test-time scaling, offering insights into how these processes manifest in practice.
4. Finally, we identify significant research gaps and highlight promising future directions, including the integration of multi-modal reasoning, efficiency improvements, and enhanced knowledge frameworks.

By providing a structured overview, this survey aims to inspire future research and further the development of logical reasoning in artificial intelligence.

![image](https://oss.gittoolsai.com/images/LightChen233_Awesome-Long-Chain-of-Thought-Reasoning_readme_2dfcfad442d1.jpg)

<!-- omit in toc -->
# 🕹️ Content
## 0. How to Learn & About Us
We aim to help newcomers quickly establish domain knowledge. Our design concept is therefore to briefly introduce the main technologies involved in reasoning large models and Long CoT, so that readers understand which problems each technology addresses and have a clear starting point when they wish to delve deeper into the field.

We are a team of beginners in reasoning large models, and we hope that our own learning experiences can assist future learners and accelerate the popularization and application of reasoning large models. We welcome more friends to join our project, and we are also open to friendship and academic collaboration. For any inquiries, please feel free to contact us via email at [charleschen2333@gmail.com](mailto:charleschen2333@gmail.com).

**Daily Knowledge Resources**
- **Social Media:**
  - Recommended WeChat Public Accounts: JIQIZHIXIN, Paper Weekly, MLNLP...
  - Recommended Twitter Accounts: [AK](https://x.com/_akhaliq), [elvis](https://x.com/omarsar0), [Philipp Schmid](https://x.com/_philschmid), ...
- **Cutting-edge Courses:** [CS336](https://stanford-cs336.github.io/spring2025/)
- **Community Sharing:** [MLNLP](https://space.bilibili.com/168887299), [JIQIZHIXIN](https://space.bilibili.com/73414544), [BAAI](https://hub.baai.ac.cn/), [NICE Academic](https://space.bilibili.com/507524288)

## 1. Classical Reasoning Model
- [OpenAI-o1 / o3 / o4](https://platform.openai.com/docs/models/#o3): The earliest reasoning large language models to explore Long CoT; OpenAI's first-tier models.
- [Gemini](https://github.com/google-gemini): First-tier reasoning large language models developed by Google.
- [Deepseek-r1](https://github.com/deepseek-ai/DeepSeek-R1): The first open-source reasoning large language model with Long CoT.
- [QwQ](https://qwenlm.github.io/zh/blog/qwq-32b-preview/): The first open-source large-scale reasoning large language model with Long CoT.
- [Qwen3](https://github.com/QwenLM/Qwen3): The most commonly used open-source reasoning large language models for Long CoT, developed by Alibaba.
- [Seed-Thinking-v1.5](https://github.com/ByteDance-Seed/Seed-Thinking-v1.5/blob/main/seed-thinking-v1.5.pdf): ByteDance's open-source reasoning model for Long CoT.
- [Kimi-k1.5](https://github.com/MoonshotAI/Kimi-k1.5): The earliest multimodal reasoning model for Long CoT, developed by Moonshot.
- [MiniMax-m1](https://github.com/MiniMax-AI/MiniMax-M1): The open-source reasoning model for Long CoT developed by MiniMax.
## 2. Introduction to Long-CoT Capabilities
In this chapter, we provide the most representative technologies for each capability, along with the latest developments. A detailed list of papers can be found in the [complete list](pages/paper.md).

![image](https://oss.gittoolsai.com/images/LightChen233_Awesome-Long-Chain-of-Thought-Reasoning_readme_0ef89bb498f9.jpg)

### 2.1 Deep Reasoning

The core of deep reasoning ability lies in the need for sufficient logical depth to manage a large number of reasoning nodes. Without this capability, the performance of reasoning large language models (RLLMs) significantly degrades. Current methods for enhancing deep reasoning can be categorized into two main approaches: Deep Reasoning Format and Deep Reasoning Learning.

<img src="https://oss.gittoolsai.com/images/LightChen233_Awesome-Long-Chain-of-Thought-Reasoning_readme_6516b5a46db9.png" style="width: 580pt">

#### 2.1.1 Deep Reasoning Format
Since reasoning models heavily depend on the format of reasoning, they tend to achieve the deepest reasoning paths in the forms they excel at. As a result, some works have begun exploring better reasoning formats for deeper reasoning.

<img src="https://oss.gittoolsai.com/images/LightChen233_Awesome-Long-Chain-of-Thought-Reasoning_readme_f6f35d5f1877.jpg" style="width: 580pt">

**Natural Language Deep Reasoning**

- **Core Idea:** Aims to express deep reasoning through natural language formats.
- **Representative Works:**
  - [Natural Program](https://proceedings.neurips.cc/paper_files/paper/2023/file/72393bd47a35f5b3bee4c609e7bba733-Paper-Conference.pdf): ensures more structured and rigorous logical analysis.
  - [Code I/O](https://arxiv.org/abs/2502.07316): restructures code-based reasoning patterns into natural language forms, further unleashing the reasoning potential of RLLMs.

---

**Structured Language Deep Reasoning**

- **Core Idea:** Aims to enhance deep reasoning through programmatic or symbolic language formats. Current research primarily focuses on using code to improve mathematical reasoning capabilities.
- **Representative Works:**
  - [Program-of-Thought](https://openreview.net/forum?id=YfZ4ZPt8zd): enables models to think using code language, thereby enhancing their reasoning capabilities (see the sketch after this list).
  - [DeepSeek-Prover](https://arxiv.org/abs/2405.14333): converts natural language questions into formal statements, filters out low-quality statements, and generates proofs to create synthetic data, enhancing LLMs' theorem-proving ability.
  - [RBF](https://proceedings.neurips.cc/paper_files/paper/2024/hash/62ab1c2cb4b03e717005479efb211841-Abstract-Conference.html): demonstrates why structured language is more effective than natural language in scenarios that require strong planning.
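To make the Program-of-Thought idea concrete, here is a minimal sketch of reasoning-as-code: the model emits a short program instead of prose, and executing the program yields the answer. `model_generate` is a hypothetical stand-in for an LLM call, and its returned program is canned so the sketch runs on its own.

```python
# Program-of-Thought-style sketch: the model "thinks" in executable code,
# and the program's result is taken as the final answer.

def model_generate(prompt: str) -> str:
    # Stand-in: a real system would call an LLM here. We return the kind of
    # program a PoT-prompted model is expected to produce.
    return (
        "interest = 1000 * (1 + 0.05) ** 3 - 1000\n"
        "answer = round(interest, 2)\n"
    )

def solve_with_pot(question: str) -> float:
    program = model_generate(f"Write Python that computes the answer.\nQ: {question}")
    namespace: dict = {}
    exec(program, namespace)       # execute the model's reasoning-as-code
    return namespace["answer"]     # convention: the result is bound to `answer`

print(solve_with_pot("Compound interest on $1000 at 5% for 3 years?"))  # ~157.63
```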
---

**Latent Space Deep Reasoning**

- **Core Idea:** Enhances LLM reasoning ability through continuous latent space operations.
- **Representative Works:**
  1. **Token-driven:** Early studies introduced implicit "planning tokens" or "thinking tokens" to guide the reasoning process in latent space.
     - [Coconut (Chain of Continuous Thought)](https://arxiv.org/abs/2412.06769): further expands this method by maintaining multiple parallel reasoning paths, enhancing complexity while ensuring efficiency.
     - [Heima](https://arxiv.org/abs/2501.19201): performs efficient reasoning through latent hidden spaces, innovatively compressing the entire Long CoT process into a single token, resulting in significant computational resource savings.
  2. **Vector-driven:** Inserts an additional vector to guide the reasoning process in latent space.
     - [LTMs](https://arxiv.org/abs/2502.01567): innovatively abstracts each layer of the LLM into "thinking blocks" and introduces the concept of a "thinking vector" for each layer. Through iterative deep computation in latent space, the model dynamically scales the computational load during testing.
  3. **Manager-driven:** Proposes a continuous manager mechanism to manage the latent space states.
     - [Recurrent Block](https://arxiv.org/abs/2502.05171): uses iterative control over trained "recurrent blocks" as recursive "thinking blocks" to integrate deeper model layers during reasoning, enhancing performance without the need for specialized training data (a toy sketch follows this list).
     - [Inner Thinking Transformer (ITT)](https://arxiv.org/abs/2502.13842): leverages raw Transformer layers as recursive "thinking blocks," using adaptive token routing to select key tokens and residual thinking connections to control reasoning depth, thereby achieving efficient processing of key tokens.
- **Relevant Repositories:**
  - [Awesome-Latent-CoT](https://github.com/EIT-NLP/Awesome-Latent-CoT): provides an overview of various thought-chain representations in latent space, capturing complex non-linguistic thoughts that cannot be expressed by language alone.
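The recurrent-block idea can be illustrated with a toy PyTorch module that re-applies one shared layer a variable number of times, trading inference compute for latent reasoning depth without adding parameters. Everything here (shapes, layer choice, step counts) is illustrative rather than taken from the cited papers.

```python
# Toy "recurrent thinking block": the same weights are applied k times, so
# reasoning depth can be scaled at inference time with no extra parameters.
import torch
import torch.nn as nn

class RecurrentThinker(nn.Module):
    def __init__(self, d_model: int = 64, n_heads: int = 4):
        super().__init__()
        self.block = nn.TransformerEncoderLayer(
            d_model, n_heads, dim_feedforward=4 * d_model, batch_first=True
        )

    def forward(self, h: torch.Tensor, n_steps: int) -> torch.Tensor:
        # Re-apply the shared block n_steps times: more "thinking" per token.
        for _ in range(n_steps):
            h = self.block(h)
        return h

x = torch.randn(2, 10, 64)        # (batch, seq, hidden) latent states
thinker = RecurrentThinker()
shallow = thinker(x, n_steps=1)   # quick answer
deep = thinker(x, n_steps=8)      # same parameters, more latent compute
```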
---

#### 2.1.2 Deep Reasoning Learning
The deficiency of deep reasoning abilities in RLLMs can significantly reduce model performance. As a result, the academic focus has shifted towards enhancing reasoning capabilities through training. Supervised fine-tuning (SFT), as a memory process, can stabilize model output, while reinforcement learning (RL) facilitates generalization and self-learning.

---

**Deep Reasoning Imitation**

- **Core Idea:** By imitating advanced reasoning systems, deep reasoning in RLLMs can be effectively achieved, enabling models to learn complex reasoning patterns and generalize across tasks.
- **Representative Works:**
  1. **Imitation from Human**
     - [GSM8K/GPT-Verifier](https://arxiv.org/abs/2110.14168): Introduces early imitative learning based on human-annotated deep reasoning samples.
     - [ALT](https://proceedings.neurips.cc/paper_files/paper/2024/file/8678da90126aa58326b2fc0254b33a8c-Paper-Conference.pdf): Enhances deep reasoning in RLLMs by generating a large-scale dataset of human-annotated logical templates.
  2. **Imitation from Advanced RLLMs**
     - [AceMath](https://arxiv.org/abs/2412.15084): Uses few-shot prompting to distill Long CoT samples from advanced LLMs, improving performance through multi-stage quality-guided SFT.
     - [DART-Math](https://proceedings.neurips.cc/paper_files/paper/2024/file/0ef1afa0daa888d695dcd5e9513bafa3-Paper-Conference.pdf): Effectively distills difficulty-dependent deep reasoning samples through rejection sampling in the synthesis stage.
     - [OpenThoughts](https://arxiv.org/abs/2506.04178) / [OpenCodeReasoning](https://arxiv.org/abs/2504.01943) / [NaturalThoughts](https://arxiv.org/abs/2507.01921): Extend this paradigm to mathematics, code, and general scenarios.
  3. **Imitation from Scaling-augmented RLLMs**
     - [Bansal et al.](https://openreview.net/forum?id=HuYSURUxs2): Find that expanding the sampling scale and length improves data quality.
     - [Qwen-Math](https://arxiv.org/abs/2409.12122) / [PromptCoT](https://arxiv.org/abs/2503.02324): Further combine large-scale sampling with reward-model sample selection to generate Olympiad-level-difficulty deep reasoning samples.
     - [FastMCTS](https://arxiv.org/abs/2502.11476): Utilizes Monte Carlo Tree Search (MCTS) to identify optimal deep reasoning paths.
- **Latest Developments:**
     - [Journey P2](https://arxiv.org/abs/2411.16489): Knowledge distilled from advanced RLLM APIs such as o1 and R1 significantly boosts small LLMs' performance, with supervised fine-tuning methods surpassing teacher models in complex mathematical reasoning tasks.
     - [s1](https://arxiv.org/abs/2501.19393) / [LIMO](https://arxiv.org/abs/2502.03387): A small number of high-quality samples is sufficient to activate deep reasoning abilities in base LLMs.

---

**Deep Reasoning Self-Learning**

- **Core Idea:** Although simple imitation yields excellent performance, current models still heavily rely on human annotations or outputs from advanced models during imitation and distillation. To break through this limitation, research focuses on self-learning techniques to achieve more advanced reasoning capabilities.
- **Representative Works:**
  1. **Self-Learning from Direct Sampling**
     - [STaR](https://arxiv.org/abs/2203.14465): Uses in-context learning (ICL) to sample deep reasoning results and treats the correctness of the final answer as an implicit reward for self-learning (see the sketch at the end of this subsection).
     - [Reinforced Self-Training (ReST)](https://arxiv.org/abs/2308.08998): Proposes the "Grow-Improve" paradigm, in which self-generated reasoning traces are scored with a reward and the policy is then improved with offline reinforcement learning.
     - [ReST$^{EM}$](https://arxiv.org/abs/2312.06585): Generates rewards and iteratively optimizes LLMs to achieve peak performance on validation sets, significantly improving robustness.
     - [TOPS](https://arxiv.org/abs/2502.18080): Finds that self-learning with deep reasoning samples at an appropriate reasoning depth is the most efficient.
  2. **Self-Learning from Tree Search**
     - [PGTS](https://arxiv.org/abs/2502.06813): Uses policy-guided tree search, combining reinforcement learning with structured tree exploration.
     - [ReST-MCTS*](https://arxiv.org/abs/2406.03816): Optimizes MCTS behavior through progressive trajectory extraction and curriculum preference learning, significantly improving LLMs' reasoning ability.
- **Latest Developments:** Introduce an adaptive error-correction mechanism by training a verifier or using entropy to filter and optimize the reward process, thus enhancing the quality of self-learning.
   - [UnCert-CoT](https://arxiv.org/abs/2503.15341): Dynamically schedules thought chains based on entropy-aware uncertainty, activating multi-path reasoning only in high-entropy situations, significantly improving code-generation accuracy and efficiency.
   - [Wang et al.](https://arxiv.org/abs/2506.01939): Analyzes the impact of verifiable-reward reinforcement learning on large language models' reasoning capabilities from the token-entropy perspective, where high-entropy "branching" tokens dominate adjustments to multi-path reasoning strategies; policy-gradient optimization is applied exclusively to these high-entropy tokens.
   - [CoT-Valve](https://arxiv.org/abs/2502.09601): Dynamically reduces reasoning-path length based on task difficulty, thus reducing computational overhead.
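The self-learning recipe shared by STaR and its successors can be summarized in a few lines: sample rationales, keep only the ones whose final answer is correct, fine-tune on the survivors, and repeat. Below is a runnable skeleton of that loop; `generate` and `fine_tune` are hypothetical stubs standing in for real model calls.

```python
# Minimal STaR-style self-learning loop (a sketch, not the paper's code).
import random

def generate(model, question: str) -> tuple[str, str]:
    # Stand-in for sampling a rationale + final answer from the model.
    rationale = f"reasoning about {question!r}... "
    answer = random.choice(["42", "43"])
    return rationale, answer

def fine_tune(model, data: list[tuple[str, str]]):
    return model  # stand-in for an SFT step on the kept traces

def star_iteration(model, dataset: list[tuple[str, str]], k: int = 4):
    kept = []
    for question, gold in dataset:
        for _ in range(k):                      # k samples per question
            rationale, answer = generate(model, question)
            if answer == gold:                  # correctness as implicit reward
                kept.append((question, rationale + answer))
                break
    return fine_tune(model, kept)

model = object()
data = [("What is 6 * 7?", "42")]
for _ in range(3):                              # a few Grow -> Improve rounds
    model = star_iteration(model, data)
```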
---

### 2.2 Feasible Reflection

#### 2.2.1 Feedback

Feedback mechanisms provide multi-granularity evaluation signals for Long CoT, ranging from Overall Feedback, which evaluates the final outcome, to Process Feedback, which supervises individual steps of the reasoning process, and Hybrid Feedback, which combines both types. These mechanisms not only support reward modeling and path optimization but also lay the foundation for subsequent self-correction, serving as a crucial bridge to move RLLMs from static generation to dynamic evaluation.

<img src="https://oss.gittoolsai.com/images/LightChen233_Awesome-Long-Chain-of-Thought-Reasoning_readme_82dc61bbe310.png" style="width: 580pt">

---

**Overall Feedback**

- **Core Idea:** Overall feedback evaluates the complete reasoning process and the final result from a global perspective, commonly used to guide large language models in improving reasoning quality during reinforcement learning or self-optimization. Feedback forms include numerical rewards, rule checks, and natural language evaluations.
- **Representative Works:**
  1. **Outcome Reward Models (ORM):** Provide numeric reward signals to optimize output quality, suitable for tasks where accuracy is hard to assess directly.
     - [Gen-Verifier](https://arxiv.org/abs/2110.14168): introduces the first generative verification framework based on reasoning accuracy;
     - [Critic-RM](https://arxiv.org/abs/2411.16646): combines natural language criticism and reward prediction, significantly optimizing feedback quality;
     - [Self-Rewarding LMs (SRLMs)](https://arxiv.org/abs/2502.08922): introduce consistency mechanisms, achieving self-supervised rewards without human annotation.
  2. **Rule Extraction:** Uses in-task rules to verify and correct answers, enhancing the stability of feedback.
     - [STaR](https://arxiv.org/abs/2203.14465) / [ReST](https://arxiv.org/abs/2308.08998): show that rule-based feedback on final answers outperforms ORM in mathematical tasks;
     - [OpenCodeInterpreter](https://arxiv.org/abs/2402.14658) / [AceCoder](https://arxiv.org/abs/2502.01718): generate program-level feedback using automated test cases in coding tasks.
  3. **RLLMs Feedback (LLM-as-a-Judge):** The model self-critiques and evaluates in natural language, enhancing reflection and error-correction capabilities.
     - [EvalPlanner](https://arxiv.org/abs/2501.18099): distinguishes feedback between planning and reasoning;
     - [RoT](https://arxiv.org/abs/2410.12323): combines reverse reasoning and reflection to assist models in discovering knowledge gaps;
     - [AutoRace](https://arxiv.org/abs/2404.05221): provides task-specific evaluation criteria to improve feedback relevance.
- **Relevant Repositories:**
  - [RewardBench](https://github.com/allenai/reward-bench): for systematic evaluation of ORM methods.

**Process Feedback**

- **Core Idea:** Process feedback evaluates each step in the reasoning chain progressively, often in conjunction with reinforcement learning or tree search, guiding the model to fine-tune without relying on human annotations (a toy contrast with outcome rewards is sketched below). The feedback sources primarily include process reward models and language models driven by natural language.
- **Representative Works:**
  1. **Process Feedback from Process Reward Models (PRMs):** Use automatically constructed or minimally annotated data to train stepwise reward functions; the mainstream approach in Long CoT.
     - [PRM800K](https://arxiv.org/abs/2305.20050): Pioneers using human-annotated stepwise supervision to enhance reward stability;
     - [Math-Shepherd](https://arxiv.org/abs/2312.08935): Automatically generates stepwise feedback using tree search to enhance PRM generalization;
     - [Full-Step-DPO](https://arxiv.org/abs/2502.14356): Rewards the entire reasoning chain, encouraging holistic optimization;
     - [AdaptiveStep](https://arxiv.org/abs/2502.13943): Dynamically segments reasoning steps based on confidence, enabling token-level fine-grained feedback.
  2. **Process Feedback from RLLMs:** Leverages the model's own generation of natural language feedback to simulate reward signals, improving the flexibility and scalability of process supervision.
     - [ReAct](https://arxiv.org/abs/2210.03629) / [Reflexion](https://arxiv.org/abs/2303.11366): Generate language feedback after each action, enhancing decision-making rationality;
     - [Step-DPO](https://arxiv.org/abs/2405.18629): Introduces a self-validation mechanism to construct positive-negative contrastive samples, optimizing the training process;
     - [CACE](https://arxiv.org/abs/1907.07165): Proposes causal impact metrics between reasoning steps to make the entire chain more interpretable;
     - [ORPS](https://arxiv.org/html/2412.15118v1): Automatically optimizes reasoning strategies with program-execution feedback, reducing reliance on humans.
- **Relevant Repositories:**
  - [ProcessBench](https://github.com/QwenLM/ProcessBench): Evaluates stepwise reasoning and reward-model performance;
  - [PRMBench](https://github.com/ssmisya/PRMBench): Focuses on comparative analysis of PRM methods in mathematical tasks.
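To make the outcome-vs-process distinction concrete, here is a toy sketch: an outcome reward looks only at the final answer, while a process reward scores every step and aggregates them (min is a common, conservative aggregation choice). The scoring functions are trivial stand-ins for learned reward models.

```python
# Toy ORM vs. PRM contrast (illustrative only; real reward models are learned).

def check_answer(answer: str, gold: str) -> float:
    return 1.0 if answer.strip() == gold else 0.0

def score_step(step: str) -> float:
    # Stand-in for a learned process reward model; here, a trivial heuristic.
    return 0.0 if "unsupported leap" in step else 0.9

def outcome_reward(chain: list[str], gold: str) -> float:
    return check_answer(chain[-1], gold)       # only the end result counts

def process_reward(chain: list[str]) -> float:
    # min penalizes a single bad step in an otherwise correct-looking chain.
    return min(score_step(s) for s in chain)

chain = ["x + 2 = 5", "so x = 3", "3"]
print(outcome_reward(chain, gold="3"), process_reward(chain))
```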
**Hybrid Feedback**

- **Core Idea:** Hybrid feedback mechanisms combine the strengths of both overall and process feedback, assessing final outputs while focusing on intermediate reasoning steps. This unified multi-granularity evaluation system enhances the overall reasoning quality and error-correction capabilities of language models.
- **Representative Works:**
  - [Consensus Filtering](https://arxiv.org/abs/2501.07301): Combines Monte Carlo estimation with LLM-as-Judge to integrate overall and stepwise feedback, enhancing reasoning consistency and accuracy;
  - [Step-KTO](https://arxiv.org/abs/2501.10799): Merges PRM and ORM binary feedback mechanisms, emphasizing reflection-driven error correction and guiding the model to form more coherent Long CoT structures.

#### 2.2.2 Refinement

The Refinement mechanism focuses on self-correction capabilities based on feedback information, serving as a key step in achieving closed-loop optimization in Long CoT. Prompt-based Refinement elicits spontaneous reflection; SFT-based Refinement facilitates imitation learning; and RL-based Refinement strengthens self-correction strategies. As a result, the model gradually develops the ability of "self-diagnosis and self-updating," making the reasoning chain more robust and controllable.

<img src="https://oss.gittoolsai.com/images/LightChen233_Awesome-Long-Chain-of-Thought-Reasoning_readme_37b27a4cf133.png" style="width: 580pt">

---

**Prompt-based Refinement**

- **Core Idea:** By guiding the model to generate initial responses via prompts, then allowing self-feedback and multi-round corrections in subsequent rounds, this method improves reasoning accuracy, reduces hallucinations, and supports stronger automated reflection capabilities (a minimal loop is sketched below).
- **Representative Works:**
  - [ReAct](https://github.com/ysymyth/ReAct) / [Reflexion](https://github.com/noahshinn/reflexion): Typical implementations of the multi-round reflection and self-correction mechanism;
  - [Self-Backtracking](https://arxiv.org/abs/2502.04404) / [Refiner](https://aclanthology.org/2024.eacl-long.67/) / [BackMath](https://aclanthology.org/2025.coling-industry.40/): Support the model in autonomously backtracking and revising during the reasoning process, streamlining decision paths;
  - [MCTSr](https://arxiv.org/abs/2406.07394) / [ReST-MCTS](https://arxiv.org/abs/2406.03816): Combine tree search with confidence updates to enable multi-round dynamic reflection;
  - [LLM2](https://arxiv.org/abs/2412.20372) / [ReARTeR](https://arxiv.org/abs/2501.07861): Promote the automatic evolution and stable convergence of refinement strategies in Long CoT tasks.
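A minimal draft-critique-revise loop in the Reflexion style, with canned model outputs so it runs standalone; `llm` is a hypothetical stand-in for a chat-model call, and a real implementation would also bound cost and guard against the judge approving bad drafts.

```python
# Prompt-based refinement sketch: draft, self-critique, revise, stop on "OK".

def llm(prompt: str) -> str:
    # Stand-in for a model call; canned outputs keep the sketch runnable.
    if prompt.startswith("Critique"):
        return "OK" if "even + even" in prompt else "Missing the definition of even."
    if prompt.startswith("Revise"):
        return "Let a=2m, b=2n; then a+b=2(m+n), so even + even is even."
    return "a+b is even because both are even."

def refine(question: str, max_rounds: int = 3) -> str:
    draft = llm(f"Answer step by step: {question}")
    for _ in range(max_rounds):
        critique = llm(f"Critique this answer for errors:\n{draft}")
        if critique.strip() == "OK":          # the model judges its own draft
            break
        draft = llm(f"Revise using the critique.\nAnswer: {draft}\nCritique: {critique}")
    return draft

print(refine("Prove that the sum of two even numbers is even."))
```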
**SFT-based Refinement**

- **Core Idea:** By utilizing high-quality reflection data for supervised fine-tuning, the model imitates the self-correction behaviors of more advanced models, enhancing its step-by-step error-correction and reflective capabilities. This is suitable for transferring capabilities to small models and for fine-grained training.
- **Representative Works:**
  - [rStar](https://arxiv.org/abs/2408.06195): Enhances small models' self-improvement capabilities through self-play methods;
  - [Math-Minos](https://arxiv.org/abs/2405.14024): Trains the model using step-by-step rationale labels for fine-grained reasoning;
  - [Journey Learning](https://arxiv.org/abs/2411.16489): Combines MCTS backtracking to generate supervision signals;
  - [MM-Verify](https://arxiv.org/abs/2502.13383): Extends the refinement mechanism to multimodal image-text reasoning.

**RL-based Refinement**

- **Core Idea:** Through reinforcement learning mechanisms, the model is guided to self-reflect and correct during testing or reasoning, emphasizing self-refinement capabilities under reward guidance and thus reducing dependence on manual supervision.
- **Representative Works:**
  - [SCoRe](https://arxiv.org/abs/2409.12917): Enhances the model's self-refinement ability during testing through self-generated correction trajectories and regularization;
  - [DeepSeek-R1](https://arxiv.org/abs/2501.12948): Uses result-level reinforcement learning to activate the model's natural feedback and "aha" moments for corrections;
  - [S$^2$R](https://arxiv.org/abs/2502.12853): Combines process-level reinforcement learning to achieve dynamic refinement in reasoning;
  - [ReVISE](https://arxiv.org/abs/2502.14565): Introduces an internal verifier to decide when to trigger RL-guided reflective behaviors.

### 2.3 Extensive Exploration
Extensive exploration enables reasoning large language models (RLLMs) to explore multiple reasoning paths more deeply and comprehensively when dealing with complex problems, thereby improving problem-solving accuracy and robustness. From the perspective of exploration types, extensive exploration techniques can be divided into three categories: Exploration Scaling, Internal Exploration, and External Exploration.

#### 2.3.1 Exploration Scaling
Exploration Scaling aims to enhance the model's ability to solve more complex problems by increasing the number or length of reasoning paths. This approach is typically suitable when the reasoning task is complex and a single reasoning path may not effectively lead to the correct answer.

---

**Sequential Scaling**

- **Core Idea:** By extending the reasoning chain of a single path, the model gradually deepens its thinking, thereby improving its understanding and handling of complex problems. This is especially applicable to Long CoT tasks that require multi-step reasoning to draw conclusions, such as mathematical proofs, logical deductions, and multi-step planning.
- **Representative Works:**
  - [OpenAI-o1](https://arxiv.org/abs/2412.16720) / [Deepseek-R1](https://arxiv.org/abs/2501.12948): Extend the reasoning chain to provide detailed multi-step reasoning processes, effectively improving the ability to solve complex problems in mathematics, coding, and other areas.
  - [ITT (Inner Thinking Transformer)](https://arxiv.org/abs/2502.13842): redefines layer computation in the Transformer as "thinking steps," dynamically allocating computation resources to deeper reasoning on key tokens without increasing the total number of model parameters.

---

**Parallel Scaling**

- **Core Idea:** By generating multiple reasoning paths in parallel and combining, voting on, or verifying their results, the model can effectively avoid a single path getting trapped in local optima or errors, thus improving robustness and accuracy in situations with high ambiguity, multiple possible solutions, or unclear outcomes.
- **Representative Works:**
  - [Self-Consistency](https://arxiv.org/abs/2203.11171): Proposes generating multiple reasoning paths and selecting the most frequent answer from the results, effectively improving the stability and accuracy of the final answer (see the sketch below).
  - [ECM (Electronic Circuit Model)](https://arxiv.org/abs/2502.03325): Borrowing the concepts of parallel and series circuits from electronics, combines reasoning paths in parallel or in series, considering various possibilities and improving decision quality.
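Self-consistency is simple enough to sketch in full: sample several reasoning paths at nonzero temperature and majority-vote on the extracted final answers. `sample_answer` is a hypothetical stand-in for one CoT rollout; the canned answer distribution is purely illustrative.

```python
# Self-consistency sketch: majority vote over sampled reasoning paths.
import random
from collections import Counter

def sample_answer(question: str) -> str:
    # Stand-in: a real rollout would sample a CoT and extract its final answer.
    return random.choices(["17", "19"], weights=[0.7, 0.3])[0]

def self_consistency(question: str, n_paths: int = 16) -> str:
    answers = [sample_answer(question) for _ in range(n_paths)]
    return Counter(answers).most_common(1)[0][0]   # most frequent answer wins

print(self_consistency("(some math word problem)"))
```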
---

#### 2.3.2 Internal Exploration
Internal Exploration primarily refers to reasoning large language models (RLLMs) actively exploring and optimizing reasoning paths through their internal mechanisms (usually reinforcement learning strategies and reward mechanisms), allowing for more efficient and deeper solutions to complex reasoning problems. This method enables the model to autonomously adjust its reasoning strategy, reducing reliance on external guiding data.

---

**RL Strategies**
- **Core Idea:** Leverage reinforcement learning (RL) algorithms to guide models in actively learning and exploring diverse reasoning paths. This approach overcomes the limitations of overly uniform patterns in reasoning processes, enhancing model performance in tasks that involve high uncertainty or heavily depend on autonomous decision-making.
- **Representative Works:**
  - [PPO (Proximal Policy Optimization)](https://arxiv.org/abs/1707.06347): A classic RL algorithm that efficiently optimizes a model's internal decision-making mechanism through a policy-gradient-based approach, suitable for path exploration and optimization in complex environments.
  - [DivPO (Diverse Preference Optimization)](https://arxiv.org/abs/2501.18101): Encourages models to explore a greater variety of reasoning paths to maintain decision diversity, preventing convergence to local optima.
  - [GRPO (Group Relative Policy Optimization)](https://arxiv.org/pdf/2402.03300): Designs a group-relative reward mechanism that enables models to explore more effectively within complex logical reasoning spaces (a sketch of the group-relative advantage follows).
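A sketch of GRPO's core trick, under the assumptions stated in the comments: rewards for a group of responses sampled from the same prompt are standardized within the group, and the standardized scores serve as advantages, removing the need for a learned value network. This shows only the advantage computation, not the full clipped policy-gradient update.

```python
# Group-relative advantages in the style of GRPO (core idea only).
import statistics

def group_relative_advantages(rewards: list[float]) -> list[float]:
    mean = statistics.mean(rewards)
    std = statistics.pstdev(rewards) or 1.0   # guard against zero-variance groups
    return [(r - mean) / std for r in rewards]

# Four sampled answers to one prompt, scored by a verifier (1 = correct).
rewards = [1.0, 0.0, 0.0, 1.0]
print(group_relative_advantages(rewards))     # correct answers get positive advantage
```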
---

**Reward Strategies**

- **Core Idea:** Directly guide models to explore and optimize effective reasoning paths through carefully designed reward functions, which is particularly useful in scenarios with explicit optimization goals or specific reasoning bottlenecks to address.
- **Representative Works:**
  - [Deepseek-R1](https://arxiv.org/abs/2501.12948): Proposes a specially designed reward function to incentivize models to optimize intermediate reasoning steps, aiding the model in building a high-quality internal reasoning process.
  - [ReST-MCTS*](https://arxiv.org/abs/2406.03816): Combines Monte Carlo Tree Search (MCTS) with reward strategies, guiding the tree-search algorithm through process rewards for more accurate exploration of effective reasoning paths, improving overall reasoning quality.

---

#### 2.3.3 External Exploration
External exploration refers to using external tools, human knowledge, or other models to guide the model in exploring diverse reasoning paths more effectively, improving its ability to solve complex problems. This approach is often used in scenarios where fine-grained guidance or external knowledge is essential for effective problem-solving. External exploration can be subdivided into two types: Human-driven Exploration and Model-driven Exploration.

---

**Human-driven Exploration**
- **Core Idea:** Utilizes human intuition, experience, or feedback to guide the model in selecting and adjusting reasoning paths, especially when the model's autonomous exploration ability is limited or the reasoning task is complex and requires decomposition into multiple sub-tasks.
- **Representative Works:**
  - [Least-to-Most](https://arxiv.org/abs/2205.10625): Breaks down complex problems into simpler subproblems, solving each and using prior answers as inputs for subsequent steps, ultimately synthesizing a solution for the overall problem. This method was proposed to address the "difficulty generalization" bottleneck in traditional Chain-of-Thought approaches.
  - [ToT (Tree-of-Thought)](https://arxiv.org/abs/2305.10601): Expands the traditional "left-to-right" token-generation reasoning process into a "tree structure exploration," where each node represents a thought unit. This supports multi-path attempts, backtracking, forward reasoning, and self-evaluation within the reasoning process.

---

**Model-driven Exploration**

- **Core Idea:** Uses auxiliary models or algorithms to automatically guide the current model's reasoning process, reducing the need for human intervention and allowing for the efficient search and optimization of numerous complex reasoning paths, thus improving automation and overall efficiency.
- **Representative Works:**
  - [PPO-MCTS](https://arxiv.org/abs/2309.15028): Integrates MCTS (Monte Carlo Tree Search) with PPO-based training to enhance reasoning. The key is to retain the value network obtained during PPO training and use it during the reasoning phase to guide MCTS in selecting more desirable output sequences, thereby improving the quality and consistency of the generated text.
  - [MindStar](https://arxiv.org/abs/2405.16265): Reformulates complex reasoning problems (particularly mathematical ones) as search problems, where structured searches are performed over different reasoning paths to select the optimal one.
  - [rStar-Math](https://arxiv.org/abs/2501.04519): Develops a strong mathematical reasoning system through MCTS + small-model reward mechanisms + self-evolution processes, enabling small models to outperform o1-preview in mathematical capabilities.

---

## 3. Key Phenomena and Related Principles

<img src="https://oss.gittoolsai.com/images/LightChen233_Awesome-Long-Chain-of-Thought-Reasoning_readme_899dad1ca48d.jpg" style="width: 580pt">

### 3.1 Reasoning Emergence Phenomenon

Long CoT abilities naturally emerge after training: the model can generate multi-step, coherent reasoning processes by internalizing logical structures and contextual examples from pretraining data, even in the absence of direct supervision. Related studies have described this phenomenon as follows:

- [Wang et al.](https://aclanthology.org/2023.acl-long.153/) found that a small number of high-quality contextual examples can effectively guide reasoning large language models (RLLMs) to generate clear, logically consistent reasoning chains, indicating that the model has internalized basic reasoning patterns during pretraining.
- [Madaan et al.](https://aclanthology.org/2023.findings-emnlp.0/) demonstrated that even without specific problem entities, the model can still generate reasonable reasoning chains by retaining only the logical structure information, showcasing its inductive and transfer abilities regarding structural information.
- [Stechly et al.](https://openreview.net/forum?id=kPBEAZU5Nm) pointed out that by adjusting decoding strategies or constructing specialized prompts, latent CoT abilities within the model can be explicitly activated, resulting in multi-step reasoning in complex tasks.
- [Guo et al.](https://arxiv.org/abs/2501.12948) showed that rule-based RL strategies can directly induce models to form coherent reasoning chains during pretraining, significantly improving performance in multi-step tasks.
### 3.2 Reasoning Boundary Phenomenon

Large language models exhibit clear performance boundaries in Long CoT: when the depth or complexity of reasoning exceeds a certain threshold, model performance significantly degrades, sometimes even resulting in logical collapse. This phenomenon suggests that current models have a "reasoning boundary": an upper limit on the reasoning complexity that their parameter space and computational resources can support. Existing research has systematically explored this phenomenon through both theoretical modeling and empirical analysis:

- [Chen et al.](https://openreview.net/forum?id=pC44UMwy2v) formally introduced the "reasoning boundary" concept, experimentally quantifying the performance critical points of models under different task complexities and showing that accuracy declines sharply when the reasoning task exceeds the model's capacity.
- [Bi et al.](https://ojs.aaai.org/index.php/AAAI/article/view/29721) observed that performance deteriorates drastically when the model attempts to mimic overly complex CoT examples in code-generation tasks, indicating that beyond a certain complexity, Long CoT examples become counterproductive.
- [Feng et al.](https://arxiv.org/abs/2305.15408) proposed a mathematical model showing that models with fixed parameter sizes cannot perform numerical calculations beyond a certain complexity, revealing a hard limit on accuracy.
- [Zhou et al.](https://arxiv.org/abs/2502.05252) constructed the GSM-Infinite dataset and demonstrated through experiments that the upper limits of reasoning abilities vary significantly across tasks, further emphasizing that reasoning boundaries are related to task structure.

### 3.3 Overthinking Phenomenon

In Long CoT, extending the reasoning chain does not always lead to performance improvement. Studies have shown that once the reasoning length exceeds the model's capacity, accuracy decreases, a phenomenon known as "overthinking," which reflects the non-linear marginal benefits of reasoning and error accumulation in the process.

- [Chen et al.](https://openreview.net/forum?id=pC44UMwy2v) found that when the number of reasoning steps exceeds the model's boundary, reasoning accuracy significantly drops, indicating that there is an optimal depth range for reasoning.
- [Wolf et al.](https://arxiv.org/abs/2409.18028) emphasized that the fundamental reason for performance degradation is the amplification of errors in intermediate reasoning steps, which affects the final judgment.
- [Xie et al.](https://arxiv.org/html/2502.14768v1) experimentally showed that reasoning length does not have a monotonic relationship with accuracy, challenging the intuition that "longer CoT leads to better reasoning."
- [Wu et al.](https://arxiv.org/abs/2502.07266) established a mathematical model defining the "optimal reasoning length" interval under different models and task conditions, suggesting that performance reverses once the length exceeds the optimal range.
- [Chen et al.](https://arxiv.org/abs/2502.03325) introduced the "reasoning-chain Ohm's law," analogizing the non-linear relationship between reasoning length and performance to resistance against information flow in the model.
### 3.4 Inference Test-Time Scaling Phenomenon

The inference test-time scaling phenomenon refers to improving reasoning performance by extending computation at inference time (e.g., longer reasoning chains or more samples). This phenomenon reveals the model's potential for "dynamic amplification," but it also comes with a trade-off between exploration depth and computational cost.

- [Brown et al.](https://arxiv.org/abs/2407.21787) observed that by repeating inference attempts over multiple rounds, the correct answer can be found within a certain number of trials even if the initial attempt fails, introducing the "language monkeys" phenomenon (see the pass@k sketch below).
- [o1](https://arxiv.org/abs/2412.16720) demonstrated that simply increasing the reasoning-chain length improves accuracy, particularly in complex mathematical tasks.
- [Jin et al.](https://aclanthology.org/volumes/2024.findings-acl/) pointed out that while increasing the reasoning-chain length initially leads to performance improvement, performance deteriorates beyond a certain threshold, resulting in a typical nonlinear growth curve.
- [Wu et al.](https://arxiv.org/abs/2408.00724) found a logarithmic relationship between the number of inference samples and the lower bound of error, suggesting an asymptotic relationship between computational complexity (FLOPs) and inference performance.
- [Chen et al.](https://arxiv.org/abs/2502.03325) established a theoretical upper bound on parallel inference, indicating that no matter how much the sample size is increased, the model's verification performance cannot exceed its internal reasoning ceiling.
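A small worked example for repeated-sampling scaling: the unbiased pass@k estimator (introduced for code evaluation in the Codex paper by Chen et al., 2021, and standard in repeated-sampling analyses such as the one above). Given n samples of which c are correct, pass@k = 1 - C(n-c, k) / C(n, k); the numbers below are purely illustrative.

```python
# Unbiased pass@k estimator: probability that at least one of k samples
# drawn (without replacement) from n is correct, given c correct samples.
from math import comb

def pass_at_k(n: int, c: int, k: int) -> float:
    if n - c < k:
        return 1.0                    # every size-k subset contains a hit
    return 1.0 - comb(n - c, k) / comb(n, k)

# With a 10% per-sample success rate (c = 10 of n = 100), scaling k helps a lot:
for k in (1, 10, 100):
    print(k, round(pass_at_k(100, 10, k), 3))   # 0.1, ~0.67, 1.0
```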
### 3.5 PRM and ORM Selection Phenomenon

In reinforcement learning optimization, Long CoT tasks involve supervision of the model's generation process. Researchers distinguish between two main strategies: the Process Reward Model (PRM), which focuses on the reasoning process itself, and the Outcome Reward Model (ORM), which only concerns whether the final output is correct. The two strategies differ significantly in generalization ability, learning stability, and supervision cost.

- [Lampinen et al.](https://aclanthology.org/2022.findings-emnlp.38) validated the causal relationship between intermediate steps and final answers in qualitative experiments, providing theoretical support for the rationale behind process supervision.
- [Jia et al.](https://arxiv.org/abs/2502.10581) theoretically proved that under sufficiently diverse data, ORM is not harder to optimize than PRM, with the two differing only by polynomial factors in sample complexity.
- [Guo et al.](https://arxiv.org/abs/2501.12948) showed that rule-based PRM reinforcement learning significantly improves the model's Long CoT capabilities in complex tasks but also faces the risk of reward hacking.
- [Tan](https://aclanthology.org/2023.blackboxnlp-1.12) emphasized the importance of reward distribution over intermediate reasoning steps for complex reasoning paths, which ORM cannot provide.
- [Jiang et al.](https://arxiv.org/abs/2501.03124) pointed out that PRM is more costly in terms of data collection, as it requires labeling each reasoning step, limiting its large-scale application.

### 3.6 Aha Moment Phenomenon

The Aha Moment refers to the sudden integration of information during the reasoning process, leading to a key turning point in judgment, resembling human reflection and self-correction. This phenomenon highlights the model's dynamic cognitive adjustment abilities, but its occurrence depends on the collaboration between external stimuli and internal mechanisms.

- [Guo et al.](https://arxiv.org/abs/2501.12948) first triggered Aha Moment behavior under unsupervised conditions through rule-based rewards, with models reflecting on intermediate reasoning and self-correcting.
- [Xie et al.](https://arxiv.org/abs/2502.14768) further demonstrated through experiments that this behavior can be replicated across multiple models, verifying that it is not an accidental event but rather an inducible strategy.
- [Zhou et al.](https://arxiv.org/abs/2503.05132) extended the Aha Moment phenomenon to multimodal tasks, showing that it is not specific to text-based tasks but reflects the model's broader cognitive abilities.
- [Liu et al.](https://oatllm.notion.site/oat-zero) pointed out that in certain reinforcement learning frameworks (e.g., R1-Zero), Aha behavior may not genuinely exist, and that longer generations are more likely a result of reward optimization than of actual reflection.
- [Yang et al.](https://arxiv.org/abs/2504.02956) found that Aha behavior often involves human-like language enhancement and dynamic uncertainty regulation, with the model more inclined to use expressions like "I think" under high-pressure tasks, reflecting its coping mechanisms for task stress.


## 4. Algorithms

### 4.1 Supervised Fine-Tuning (SFT)

In advancing large models toward powerful Long CoT reasoning abilities, Supervised Fine-Tuning (SFT) plays a crucial role, bridging pre-training with more advanced alignment methods such as Reinforcement Learning from Human Feedback (RLHF). The core goal of SFT is to teach models how to follow instructions and initially master the ability to generate structured, step-by-step reasoning chains, thus laying the foundation for more complex reasoning tasks.

- **In the context of deep reasoning, SFT is especially critical.**
Although the lack of sufficient reasoning depth in RLLMs significantly reduces performance, SFT stabilizes the model's output format through a memorization process, allowing it to learn reasoning from human-labeled or distilled data. In contrast to reinforcement learning (RL), which focuses more on generalization and self-learning, SFT plays a vital role in deep reasoning imitation. It allows RLLMs to learn complex reasoning patterns by mimicking high-quality reasoning examples generated by humans, advanced RLLMs, or enhanced RLLMs, and to generalize them to new tasks. SFT not only significantly improves the model's reasoning performance but, in some cases, enables even a small number of high-quality samples to activate the underlying LLM's deep reasoning capabilities, allowing it to predict events outside the model's knowledge base. This makes SFT one of the key technologies for enhancing reasoning levels and generalization abilities in RLLMs.
- **For feasible reflection, SFT primarily focuses on optimization-based imitation (Refinement Imitation).**
In reflection-based LLM reasoning, SFT is a key mechanism for enabling self-optimization and error correction in the model. Through SFT, the model can directly learn the error-correction processes of advanced LLMs, significantly enhancing its reflective abilities, such as performing self-play reasoning, iterative feedback error correction, and even justifying and reflecting on the reasoning process through incremental natural language feedback. Additionally, SFT can integrate visual and textual reasoning in multimodal scenarios, improving the model's critical thinking and self-correction abilities. SFT enhances the reasoning accuracy of LLMs through iterative feedback and self-correction strategies, which is especially beneficial for smaller models.


#### 4.1.1 Core Technology

SFT consists of two core concepts: **Instruction Tuning** and **Parameter-Efficient Fine-Tuning (PEFT)**.

**Instruction Tuning**
-   **Core Idea:** By fine-tuning the model on a large number of instructions covering various tasks, the model's zero-shot generalization ability on unseen tasks can be significantly enhanced. This enables the model to learn the skill of "following instructions."
-   **Representative Works:**
    -   [Finetuned Language Models Are Zero-Shot Learners (FLAN)](https://arxiv.org/abs/2109.01652): Google's pioneering work demonstrating that multi-task instruction fine-tuning unlocks zero-shot capabilities in LLMs for unseen tasks.
    -   [Instruction Tuning for Large Language Models: A Survey](https://arxiv.org/abs/2308.10792): A comprehensive survey systematically introducing methods, datasets, challenges, and future directions in instruction tuning.

**Parameter-Efficient Fine-Tuning (PEFT)**

- **Core Idea:** Given the high cost of full fine-tuning for LLMs, PEFT methods have emerged. These methods achieve near-full fine-tuning effects by updating only a small subset of the model's parameters, greatly reducing hardware requirements (a minimal LoRA setup is sketched after this list).
-   **Representative Works:**
    -   [LoRA: Low-Rank Adaptation of Large Language Models](https://arxiv.org/abs/2106.09685): A revolutionary technique that efficiently fine-tunes models by injecting low-rank adaptation matrices, currently one of the most widely used PEFT methods.
    -   [QLoRA: Efficient Finetuning of Quantized LLMs](https://arxiv.org/abs/2305.14314): A further optimization of LoRA, combining 4-bit quantization, double quantization, and paged optimizers, making it possible to fine-tune massive models on a single consumer-grade GPU.
    -   [Adapter Tuning](https://arxiv.org/abs/1902.00751): Inserts small neural network modules (adapters) between layers of the Transformer, updating only the parameters of these adapters during training.
    -   [Prompt Tuning](https://arxiv.org/abs/2104.08691) / [P-Tuning](https://aclanthology.org/2022.acl-short.8/): Instead of modifying the model's weights, learns one or more trainable virtual tokens (soft prompts) at the input end, guiding the model to perform downstream tasks more effectively.
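A minimal LoRA setup with the Hugging Face `peft` library; the base-model id and `target_modules` are illustrative (they depend on the architecture), and the hyperparameters are common defaults rather than recommendations from this survey.

```python
# Minimal LoRA fine-tuning setup with `peft` (a sketch, not a full recipe).
from transformers import AutoModelForCausalLM
from peft import LoraConfig, get_peft_model

model = AutoModelForCausalLM.from_pretrained("Qwen/Qwen2.5-0.5B")  # placeholder id

lora_config = LoraConfig(
    r=8,                                   # rank of the low-rank update matrices
    lora_alpha=16,                         # scaling factor for the update
    lora_dropout=0.05,
    target_modules=["q_proj", "v_proj"],   # attention projections to adapt
    task_type="CAUSAL_LM",
)

model = get_peft_model(model, lora_config)
model.print_trainable_parameters()         # typically well under 1% of weights
```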
\n**Technical Comparison**\n\n| Technology Type | Core Idea | Advantages | Disadvantages |\n| :--- | :--- | :--- | :--- |\n| **Full Fine-tuning** | Update all model weights. | Highest performance ceiling; can fully adapt to new data. | Extremely high training cost (memory, time), prone to catastrophic forgetting, requires storing the entire model. |\n| **Parameter-Efficient Fine-tuning (PEFT)** | Freeze most of the original parameters; update only a small set of additional parameters or specific subsets. | Very low training cost, fast, resistant to forgetting, small fine-tuning artifacts (adapters), easy to deploy. | Performance may be slightly inferior to full fine-tuning; adaptation to extremely complex tasks might be limited. |\n\n#### 4.1.2 Learning Resources\n\n| Resource Name | Speaker\u002FAuthor | Features | Link |\n| :--- | :--- | :--- | :--- |\n| Let's build GPT: from scratch | Andrej Karpathy | A hands-on guide to building GPT from scratch, for deeply understanding Transformer fundamentals and the training process; a prerequisite for understanding SFT. | [YouTube](https:\u002F\u002Fwww.youtube.com\u002Fplaylist?list=PLAqhIrjkxbuWI23v9cThsA9GvCAUhRvKZ) |\n| Hugging Face SFT Course | Hugging Face | Official SFT series tutorial, using the Hugging Face TRL codebase for hands-on SFT practice. | [Course Link](https:\u002F\u002Fhuggingface.co\u002Flearn\u002Fllm-course\u002Fchapter11\u002F1) |\n| Hugging Face SFT Trainer Doc | Hugging Face | Advanced documentation for the Hugging Face SFTTrainer. | [Documentation Link](https:\u002F\u002Fhuggingface.co\u002Fdocs\u002Ftrl\u002Fsft_trainer) |\n| Hugging Face PEFT Course | Hugging Face | Official PEFT series tutorial, explaining the theory and code practice of efficient fine-tuning techniques like LoRA. | [Course Link](https:\u002F\u002Fhuggingface.co\u002Fdocs\u002Fpeft\u002Findex) |
\n| LLMs-from-scratch | Sebastian Raschka | Tutorial code for the official book \"Build a Large Language Model (From Scratch).\" | [Course Link](https:\u002F\u002Fgithub.com\u002Frasbt\u002FLLMs-from-scratch) |\n\n#### 4.1.3 Development Frameworks\n\n| Framework | Features | Main Use Case | Resource Link |\n| :--- | :--- | :--- | :--- |\n| **Hugging Face TRL** | Official Hugging Face library integrating training methods such as SFT, RLHF, and DPO, connecting seamlessly with the ecosystem (`transformers`, `peft`, `accelerate`). | Provides the standardized SFT trainer `SFTTrainer`, simplifying the training process. | [GitHub](https:\u002F\u002Fgithub.com\u002Fhuggingface\u002Ftrl) |\n| **LLaMA-Factory** | One-stop LLM fine-tuning platform with a Web UI, enabling users with no coding experience to easily perform SFT, PEFT, and model evaluation. | Highly user-friendly, supports a wide range of models and datasets, suitable for beginners and quick validation. | [GitHub](https:\u002F\u002Fgithub.com\u002Fhiyouga\u002FLLaMA-Factory) |\n\n#### 4.1.4 Best Practices and Common Pitfalls\n\n1.  **Data Quality is Far More Important than Quantity:**\n    -   **Core Principle**: 1,000 high-quality, diverse data points beat 10,000 low-quality, homogeneous ones; low-quality data teaches the model incorrect patterns.\n    -   **Format Consistency**: Ensure that all training data follows a unified dialogue template (e.g., [ChatML](https:\u002F\u002Fhuggingface.co\u002Fdocs\u002Ftransformers\u002Fmain\u002Fen\u002Fchat_templating)), which is crucial for training the model to recognize roles and dialogue boundaries.\n2.  **Choose the Right Fine-Tuning Strategy**:\n    -   For most applications with limited resources, **QLoRA** should be the first choice, as it strikes the best balance between efficiency and effectiveness.\n    -   If optimal performance is the goal and resources are sufficient, **full fine-tuning** may be considered, taking care to avoid overfitting.\n3.  **Tuning Key Hyperparameters**:\n    -   **Learning Rate**: SFT typically uses smaller learning rates than pre-training, usually in the range `1e-5` to `5e-5`.\n    -   **Epochs**: 1 to 3 epochs are usually sufficient. Too many epochs on a small dataset lead to overfitting and cause the model to \"forget\" the general knowledge learned during pre-training.\n    -   **Batch Size**: Within memory limits, moderately increasing the batch size helps stabilize training.\n4.  **Evaluation and Iteration**:\n    -   **Comprehensive Evaluation**: Do not rely solely on the loss; combine **objective evaluation** benchmarks (such as MMLU) with subjective **human evaluation** for a more thorough assessment of model performance.\n    -   **Iterative Optimization**: SFT is an ongoing iterative process; based on evaluation results, continuously clean the data, adjust hyperparameters, and optimize the model.\n
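\nThe hyperparameters above translate almost directly into a TRL configuration. The following minimal `SFTTrainer` run is a sketch under stated assumptions: the dataset and model names are illustrative placeholders, and `SFTConfig` field names can vary slightly across TRL releases.\n\n```python\nfrom datasets import load_dataset\nfrom trl import SFTConfig, SFTTrainer\n\n# Illustrative instruction dataset in chat format (assumed name)\ndataset = load_dataset(\"trl-lib\u002FCapybara\", split=\"train\")\n\nconfig = SFTConfig(\n    output_dir=\"sft-out\",\n    learning_rate=2e-5,             # inside the 1e-5 to 5e-5 range above\n    num_train_epochs=2,             # 1-3 epochs to limit overfitting\n    per_device_train_batch_size=4,  # raise within memory limits\n)\n\ntrainer = SFTTrainer(\n    model=\"Qwen\u002FQwen2.5-0.5B\",  # illustrative base model\n    args=config,\n    train_dataset=dataset,\n)\ntrainer.train()\n```\n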
\n#### 4.1.5 Relevant Paper Repositories\n\n- [LLM4NLP](https:\u002F\u002Fgithub.com\u002FLightChen233\u002FAwesome-LLM-for-NLP)\n\n### 4.2 Reinforcement Learning\n\n#### 4.2.1 Reinforcement Learning Fundamentals\n\n- [West Lake University's \"Mathematical Principles of Reinforcement Learning\"](https:\u002F\u002Fwww.bilibili.com\u002Fvideo\u002FBV1sd4y167NS\u002F)\n  - Features: Starts from MDPs and the Bellman equation and builds up to a derivation of the policy gradient theorem.\n  - Prerequisite Knowledge: Linear algebra, probability theory.\n  - Focus: The mathematical essence of value iteration and policy optimization.\n- [Book-Mathematical-Foundation-of-Reinforcement-Learning](https:\u002F\u002Fgithub.com\u002FMathFoundationRL\u002FBook-Mathematical-Foundation-of-Reinforcement-Learning) (beginner-friendly)\n\n#### 4.2.2 Core Algorithms of Reinforcement Learning\n\n**Authoritative Courses**\n\n| Course | Lecturer | Features | Resources |\n| --- | --- | --- | --- |\n| Foundations of Deep RL | Pieter Abbeel | 6 concise lectures (Q-learning → PPO) | [YouTube](https:\u002F\u002Fyoutube.com\u002Fplaylist?list=PLkFD6_40KJIwhWpGazJ9VSj9CFMkb79A) |\n| UC Berkeley CS285 | Sergey Levine | Covers SAC, inverse reinforcement learning, and other advanced topics | [Course Website](http:\u002F\u002Frail.eecs.berkeley.edu\u002Fdeeprlcourse\u002F) |\n| Reinforcement Learning by Hung-yi Lee | Hung-yi Lee | In Chinese, with practical exercises using EasyRL | [Bilibili](https:\u002F\u002Fwww.bilibili.com\u002Fvideo\u002FBV1UE411G78S) |\n| Reinforcement Learning: An Overview | Kevin Murphy | A continuously updated survey of deep reinforcement learning algorithms | [Arxiv](https:\u002F\u002Farxiv.org\u002Fabs\u002F2412.05265) |\n\n**Essential Basic Algorithms**\n\n- **Basic Reinforcement Learning Algorithms**\n\n  - **DQN**: The starting point of deep reinforcement learning\n  - **PPO**: A key policy-optimization method, widely used in industrial applications\n  - **SAC**: Adds an exploration-entropy term; robust for continuous action spaces\n  - **TD3**: Improves off-policy actor-critic learning with twin critics and delayed policy updates\n\n- **Model-Based Reinforcement Learning Algorithms**\n\n  - **[dreamer](https:\u002F\u002Fgithub.com\u002Fgoogle-research\u002Fdreamer)**: A model-based reinforcement learning algorithm that learns behaviors from a learned world model\n  - **[tdmpc2](https:\u002F\u002Fgithub.com\u002Fnicklashansen\u002Ftdmpc2)**: A significant advance in model-based reinforcement learning\n\n- **Offline Reinforcement Learning Algorithms**\n\n  - **[CQL](https:\u002F\u002Fgithub.com\u002Faviralkumar2907\u002FCQL)**: Introduces conservative constraints; foundational work in offline reinforcement learning\n
  - **[decision-transformer](https:\u002F\u002Fgithub.com\u002Fkzl\u002Fdecision-transformer)**: Introduces autoregressive sequence models into offline reinforcement learning\n\n- **Large-Scale Model Reinforcement Learning Algorithms**\n\n  - **PPO**: Applying classic PPO to large language models\n  - **DPO**: Direct preference optimization without an explicit reward model; an offline algorithm for large models\n  - **GRPO**: Group Relative Policy Optimization, the core algorithm behind DeepSeek-R1 (see the sketch after this list)\n\n**Cutting-Edge Algorithms for Large-Scale Model Reinforcement Learning**\n\n- **[DAPO](https:\u002F\u002Fgithub.com\u002FBytedTsinghua-SIA\u002FDAPO)**: Four improvements on GRPO\n- **[LUFFY](https:\u002F\u002Fgithub.com\u002FElliottYan\u002FLUFFY)**: An off-policy variant of GRPO that introduces high-quality external trajectories\n- **[Absolute-Zero-Reasoner](https:\u002F\u002Fgithub.com\u002FLeapLabTHU\u002FAbsolute-Zero-Reasoner)**: A large-scale reinforcement learning algorithm requiring no annotations\n- **[One-Shot-RLVR](https:\u002F\u002Fgithub.com\u002Fypwang61\u002FOne-Shot-RLVR)**: One-shot optimization for large-model reasoning\n- **[SPIRAL](https:\u002F\u002Fgithub.com\u002Fspiral-rl\u002Fspiral)**: Reinforcement learning in self-play game environments, successfully enhancing mathematical reasoning abilities\n- **[High-Entropy Minority Tokens Drive Effective RLVR](https:\u002F\u002Fshenzhi-wang.github.io\u002Fhigh-entropy-minority-tokens-rlvr\u002F)**: Reinforcement learning driven by the minority (~20%) of high-entropy tokens\n- **[Spurious\_Rewards](https:\u002F\u002Fgithub.com\u002Fruixin31\u002FSpurious_Rewards)**: Even random rewards can improve LLM reasoning abilities\n- **[SwS](https:\u002F\u002Fgithub.com\u002FMasterVito\u002FSwS)**: Reasoning reinforcement learning driven by self-perceived weaknesses\n
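\nThe idea that distinguishes GRPO from PPO is easy to show in isolation: instead of a learned value critic, each prompt's G sampled completions are scored and standardized against their own group statistics. Below is a minimal sketch of that advantage computation, following the formulation popularized by DeepSeek-R1; the clipped policy-ratio objective and KL penalty that consume these advantages are omitted.\n\n```python\nimport torch\n\ndef grpo_advantages(rewards: torch.Tensor, eps: float = 1e-4) -> torch.Tensor:\n    \"\"\"Group-relative advantages for a [num_prompts, group_size] reward matrix.\"\"\"\n    mean = rewards.mean(dim=-1, keepdim=True)\n    std = rewards.std(dim=-1, keepdim=True)\n    return (rewards - mean) \u002F (std + eps)\n\n# Two prompts, four sampled completions each, scored 0\u002F1 by a verifier\nrewards = torch.tensor([[1.0, 0.0, 0.0, 1.0],\n                        [0.0, 0.0, 1.0, 0.0]])\nprint(grpo_advantages(rewards))  # correct completions get positive advantage\n```\n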
\n#### 4.2.3 Development Frameworks for Reinforcement Learning\n\n**Basic Reinforcement Learning Frameworks**\n\n- **[stable-baselines3](https:\u002F\u002Fgithub.com\u002FDLR-RM\u002Fstable-baselines3)** (quick experimentation, with well-established and stable baselines)\n- **[legged\_gym](https:\u002F\u002Fgithub.com\u002Fleggedrobotics\u002Flegged_gym)** (quadruped robot control)\n\n**Large-Scale Model Reinforcement Learning Frameworks**\n\n- **[verl](https:\u002F\u002Fgithub.com\u002Fvolcengine\u002Fverl)**: A high-performance, user-friendly open-source reinforcement learning training library based on Ray, vLLM, ZeRO-3, and HuggingFace Transformers, featuring efficient resource utilization, scalability, and production readiness. (Complex structure, highly reusable, excellent performance)\n- **[OpenRLHF](https:\u002F\u002Fgithub.com\u002FOpenLLMAI\u002FOpenRLHF)**: An open-source RLHF framework released by teams such as NVIDIA, based on Ray, vLLM, ZeRO-3, and HuggingFace Transformers. It supports algorithms like PPO, GRPO, and REINFORCE++, and provides dynamic sampling and asynchronous agent mechanisms to accelerate training.\n- **[AReaL](https:\u002F\u002Fgithub.com\u002FinclusionAI\u002FAReaL)**: An asynchronous reinforcement learning framework\n- **[ROLL](https:\u002F\u002Fgithub.com\u002Falibaba\u002FROLL)**: Supports training large models with 600+ billion parameters\n- **[Hugging Face TRL](https:\u002F\u002Fgithub.com\u002Fhuggingface\u002Ftrl)**: An RLHF full-stack library maintained by Hugging Face, integrating SFT, GRPO, DPO, reward modeling, and other modules. It supports multiple model architectures and distributed scaling, making it one of the most active RLHF tools in the community. (User-friendly, quick start, active community)\n- **[RL4LMs](https:\u002F\u002Fgithub.com\u002Fallenai\u002FRL4LMs)**: An open-source RLHF library for language models, providing end-to-end tools for reward-model construction and policy-network training, helping researchers quickly build custom RLHF pipelines.\n\nAdditionally, there are some interesting extension repositories:\n\n- **[Sachin19\u002Ftrlp](https:\u002F\u002Fgithub.com\u002FSachin19\u002Ftrlp)**: An end-to-end RLHF library based on the TRL stack, supporting not only language models but also Stable Diffusion models. It includes steps like SFT, reward modeling, and PPO, with example code for experimentation.\n- **[OpenRLHF-M](https:\u002F\u002Fgithub.com\u002FOpenRLHF\u002FOpenRLHF-M)**: An extension of OpenRLHF optimized for multimodal models, leveraging DeepSpeed and HuggingFace Transformers for higher throughput and richer training scenarios.\n- **[HumanSignal-RLHF](https:\u002F\u002Fgithub.com\u002FHumanSignal\u002FRLHF)**: An archived resource repository gathering links and tutorials on RLHF data collection, system construction, and best practices; suitable for beginners to quickly understand the full RLHF pipeline.\n- **[MichaelEinhorn\u002Ftrl-textworld](https:\u002F\u002Fgithub.com\u002FMichaelEinhorn\u002Ftrl-textworld)**: A derivative of TRL focused on RLHF experiments in the TextWorld environment, demonstrating how to train models like GPT2 with PPO to generate text that satisfies specific feedback requirements.\n\n#### 4.2.4 Test Environments\n\n**Classical RL Tests**\n\n- **OpenAI Gym**: Classic Control\n\n| Environment ID | Task Description | Features |\n| --- | --- | --- |\n| `CartPole-v1` | Balance an inverted pendulum | 4-dimensional state\u002Fdiscrete actions; terminates if the pole tilts > 12° or steps ≥ 500 |\n| `MountainCar-v0` | Swing the car to the hilltop | 2-dimensional state\u002Fdiscrete actions; requires building potential energy by swinging |\n| `Pendulum-v1` | Keep the pendulum upright | 3-dimensional state\u002Fcontinuous actions; no physical termination condition |\n| `Acrobot-v1` | Swing the double-link to touch the target line | 6-dimensional state\u002Fdiscrete actions; terminates when the target line is touched |\n\n- **Atari 2600**: Games\n\n| Environment ID | Game Type | Challenges |\n| --- | --- | --- |\n| `Pong-v5` | Ping Pong | 210×160 RGB input; requires image preprocessing |\n| `Breakout-v5` | Breakout | Dense rewards; suitable for DQN training |\n| `SpaceInvaders-v5` | Space Invaders | Multiple enemies attacking in coordination; complex reward structure |\n\n- **Box2D**: Physics Simulation\n\n| Environment ID | Physics System | Core Challenges |\n| --- | --- | --- |\n| `LunarLander-v2` | Lunar Lander | 8-dimensional state\u002Fdiscrete actions; fuel control and precise landing |
\n| `BipedalWalker-v3` | Bipedal Walker | 24-dimensional state\u002Fcontinuous actions; balancing on complex terrain |\n| `CarRacing-v2` | Car Racing | 96×96 RGB input; vision combined with continuous control |\n\n- **MuJoCo**: Robotic Control\n\n| Environment ID | Robot Model | Task Type |\n| --- | --- | --- |\n| `HalfCheetah-v4` | Cheetah Robot | High-speed running control (17-dimensional state) |\n| `Ant-v4` | Ant Robot | Complex terrain navigation (111-dimensional state) |\n| `Humanoid-v4` | Humanoid Robot | Bipedal balanced walking (376-dimensional state) |\n\n- **Other Special Environments**\n\n| Category | Example Environment | Application Area |\n| --- | --- | --- |\n| Text Game | `TextFlappyBird-v0` | RL on character-based interfaces |\n| Multi-agent | `PistonBall-v6` | Multi-agent cooperation\u002Fcompetition |\n| 3D Navigation | `AntMaze-v4` | Complex maze path planning |\n\n**Extended Resources**:\n\n- **Safe RL**: `Safety-Gymnasium` (tasks with constraints)\n- **Autonomous Driving**: `CARLA`\u002F`AirSim` (high-fidelity simulation)\n- **Multi-agent**: `PettingZoo` (compatible with the Gymnasium API)\n\n> 💡 The full environment list can be found at:\n> [Gymnasium Documentation](https:\u002F\u002Fgymnasium.farama.org\u002F) | [OpenAI Gym Wiki](https:\u002F\u002Fgithub.com\u002Fopenai\u002Fgym\u002Fwiki\u002FTable-of-environments)\n
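\nAs a concrete starting point for the classical environments above, here is a minimal train-and-rollout loop for `CartPole-v1` using stable-baselines3 on the Gymnasium API (a sketch with default hyperparameters; SB3 ≥ 2.0 is assumed for Gymnasium compatibility):\n\n```python\nimport gymnasium as gym\nfrom stable_baselines3 import PPO\n\n# Train a PPO agent on the inverted-pendulum task from the table above\nenv = gym.make(\"CartPole-v1\")\nmodel = PPO(\"MlpPolicy\", env, verbose=0)\nmodel.learn(total_timesteps=20_000)\n\n# Roll out the learned policy for one episode\nobs, info = env.reset()\ndone = False\nwhile not done:\n    action, _ = model.predict(obs, deterministic=True)\n    obs, reward, terminated, truncated, info = env.step(action)\n    done = terminated or truncated\nenv.close()\n```\n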
\n**Large Model RL Tests**\n\n| Environment | Purpose |\n| --- | --- |\n| Math-500 | Mathematical reasoning |\n| AIME2024\u002F2025 | Mathematical competition |\n| AMC | Mathematical competition |\n| GPQA | PhD-level biology, physics, and chemistry reasoning |\n\n### 4.3 Agent\n\nThe ability of LLM agents to solve complex problems fundamentally relies on their reasoning and planning capabilities, and the core mechanism behind these capabilities is Long CoT, which breaks complex tasks down into smaller, logical steps. The characteristics of Long CoT, particularly deep reasoning, extensive exploration, and feasible reflection, are not merely additional features but the foundation for realizing these abilities. If an agent cannot \"think longer\" and engage in a \"thinking-critique-improvement\" cycle, its ability to make independent decisions and adapt in unfamiliar scenarios will be severely limited, causing it to fall back on predefined pipelines or iterative interaction with humans. Models such as o1 and DeepSeek-R1 have made breakthroughs in using Long CoT to solve complex tasks, directly demonstrating this causal relationship: greater reasoning depth directly improves agent capabilities (autonomy in complex tasks). Therefore, the future development of AI agents will be closely linked to breakthroughs in Long CoT.\n\n**AI Agent Online Courses and Resources**\n\n- [Andrew Ng's \"How to Build, Evaluate, and Iterate LLM Agents\"](https:\u002F\u002Fwww.bilibili.com\u002Fvideo\u002FBV1Ew4m1R7ju\u002F?vd_source=a39056a294c1d415f3413ef933024e2b): A seminar by LlamaIndex and TruEra team experts (March 2024) explaining how to build LLM agents with tool frameworks like LlamaIndex, and how to evaluate agent performance and detect hallucinations and biases with observability tools like TruLens. The video provides Chinese and English subtitles and suits learning agent development and evaluation for production environments.\n- [Coursera AI Agent Developer Specialization (Vanderbilt University)](https:\u002F\u002Fwww.coursera.org\u002Fspecializations\u002Fai-agents): A series of 6 courses for beginners with Python experience, focusing on building and deploying intelligent AI agents using Python, tools, memory, and reasoning. Topics include creating custom GPTs, applying prompt engineering, designing reliable AI systems, and implementing multi-agent collaboration systems.\n- [Hugging Face Agent Course](https:\u002F\u002Fhuggingface.co\u002Flearn\u002Fagents-course\u002Funit0\u002Fintroduction): A free online course introducing agents.\n\n**Open Source Frameworks for Building LLM AI Agents**\n\n- [LangChain](https:\u002F\u002Fgithub.com\u002Flangchain-ai\u002Flangchain): The most widely used framework for LLM agent development, offering a modular, extensible architecture, unified LLM interfaces, pre-built agent toolkits (for CSV, JSON, SQL), Python and Pandas integration, and vector storage capabilities. It supports ReAct-style agents and provides a memory module to maintain context.\n- [CrewAI](https:\u002F\u002Fgithub.com\u002FcrewAIInc\u002FcrewAI): An open-source framework for orchestrating role-playing AI agents, emphasizing multi-agent collaboration through defined roles and shared goals. It is independent, streamlined, and deeply customizable, supporting \"Crews\" (teams) and \"Flows\" (event-driven workflows).\n- [Dify](https:\u002F\u002Fgithub.com\u002Flanggenius\u002Fdify): An open-source framework for LLM applications with a visual prompt-orchestration interface, long-context integration, API-based development, multi-model support, and RAG pipelines.\n- [OpenAI Agent Demo](https:\u002F\u002Fgithub.com\u002Fopenai\u002Fopenai-cs-agents-demo): OpenAI's official demo for setting up agent-based customer service (a visual platform, no additional code required).\n\nFor more frameworks, refer to [Awesome LLM Agent Frameworks](https:\u002F\u002Fgithub.com\u002Fkaushikb11\u002Fawesome-llm-agents\u002Fblob\u002Fmain\u002FREADME.md). The core loop all of these frameworks wrap is sketched below.\n
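\nA minimal, framework-free sketch of the ReAct-style think-act-observe loop these frameworks implement (everything here is illustrative: `llm` is any function returning the model's next JSON-formatted step, and the calculator is a toy tool):\n\n```python\nimport json\n\ndef calculator(expression: str) -> str:\n    # Toy tool for demonstration only; never eval untrusted input in practice\n    return str(eval(expression))\n\nTOOLS = {\"calculator\": calculator}\n\ndef run_agent(task: str, llm, max_steps: int = 5) -> str:\n    \"\"\"llm(transcript) is assumed to return JSON such as\n    {\"thought\": ..., \"tool\": ..., \"input\": ...} or {\"answer\": ...}.\"\"\"\n    transcript = f\"Task: {task}\\n\"\n    for _ in range(max_steps):\n        step = json.loads(llm(transcript))\n        if \"answer\" in step:  # the model decides it is done\n            return step[\"answer\"]\n        # Otherwise act with the chosen tool and feed the observation back\n        observation = TOOLS[step[\"tool\"]](step[\"input\"])\n        transcript += (f\"Thought: {step['thought']}\\n\"\n                       f\"Action: {step['tool']}({step['input']})\\n\"\n                       f\"Observation: {observation}\\n\")\n    return \"No answer within the step budget.\"\n```\n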
\n**End-to-End RL Learning for Complex Agent Trajectories**\n\n- [Agent-R1](https:\u002F\u002Fgithub.com\u002F0russwest0\u002FAgent-R1): An open-source framework aimed at accelerating research at the intersection of RL and agents. It uses end-to-end reinforcement learning to train agents in specific environments, allowing developers to define domain-specific tools and reward functions without complex process engineering. It supports multi-round tool calls and multi-tool coordination.\n- [RAGEN](https:\u002F\u002Fgithub.com\u002FRAGEN-AI\u002FRAGEN): A framework for training LLM reasoning agents with RL in interactive, stochastic, multi-round environments. It introduces the StarPO (State-Think-Act-Reward Policy Optimization) framework, which features staggered rollout and update phases for trajectory-level optimization.\n\n**RL-Enhanced Tool Use and Search Capabilities**\n\n- [ReCall](https:\u002F\u002Fgithub.com\u002FAgent-RL\u002FReCall): A novel framework that trains LLMs to reason about tool invocation with RL, without supervised data on tool-use trajectories or reasoning steps. It is designed to let LLMs use and combine any user-defined tools in an agentic manner.\n- [OpenManus-RL](https:\u002F\u002Fgithub.com\u002FOpenManus\u002FOpenManus-RL): An extension of the OpenManus framework focused on enhancing AI agents via RL techniques like GRPO, enabling training across multiple environments and performance tuning for specific tasks.\n- [R1-Searcher](https:\u002F\u002Fgithub.com\u002FRUCAIBox\u002FR1-Searcher), [Search-R1](https:\u002F\u002Fgithub.com\u002FPeterGriffinJin\u002FSearch-R1): Research exploring the use of RL to enhance the search capabilities of LLMs.\n\n**Awesome Blogs**\n\n- [Neptune.ai blog](https:\u002F\u002Fneptune.ai\u002Fblog\u002Fbuilding-llm-agents-with-autogen): A detailed step-by-step guide, \"How to Build LLM Agents with AutoGen,\" covering components, RAG pipelines, planning, tools, and memory integration.\n- [n8n.io blog](https:\u002F\u002Fblog.n8n.io\u002Fllm-agents\u002F): Insights into the capabilities of LLM agents (such as strategic planning, memory, and tool integration), with a practical tutorial on building agents.\n- [NVIDIA Developer Blog](https:\u002F\u002Fdeveloper.nvidia.com\u002Fblog\u002Fan-easy-introduction-to-llm-reasoning-ai-agents-and-test-time-scaling\u002F): An introductory article on LLM reasoning and AI agents.\n- [Botpress blog](https:\u002F\u002Fbotpress.com\u002Fblog\u002Fchain-of-thought): Explains chain-of-thought prompting and discusses various AI agent frameworks.\n- [SuperAnnotate blog](https:\u002F\u002Fwww.superannotate.com\u002Fblog\u002Fllm-agents): A comprehensive overview of LLM agents, their capabilities, and their future.\n- [Smythos blog](https:\u002F\u002Fsmythos.com\u002Fdevelopers\u002Fagent-development\u002Fllm-agents\u002F): Discusses how LLM agents are revolutionizing task automation and AI integration.\n- [Unite.ai](https:\u002F\u002Fwww.unite.ai\u002Freinforcement-learning-meets-chain-of-thought-transforming-llms-into-autonomous-reasoning-agents\u002F): A detailed discussion of how reinforcement learning combined with chain-of-thought turns LLMs into autonomous reasoning agents.\n- [Holistic AI blog](https:\u002F\u002Fwww.holisticai.com\u002Fblog\u002Fllm-agents-use-cases-risks): Delves into the architecture of LLM agents, including multimodal enhancement, tool usage, and memory.\n- [ProjectPro](https:\u002F\u002Fwww.projectpro.io\u002Farticle\u002Fagentic-ai-design-patterns\u002F1126) and [Lightrains blog](https:\u002F\u002Flightrains.com\u002Fblogs\u002Fai-agent-design-patterns-cxo\u002F): Discuss AI agent design patterns, including reflection, tool-use, and planning patterns.\n\n**Awesome GitHub Repositories**\n\n- [Awesome-LLM-Agents](https:\u002F\u002Fgithub.com\u002Fkaushikb11\u002Fawesome-llm-agents\u002Fblob\u002Fmain\u002FREADME.md): A curated list of LLM agent frameworks and a valuable starting point for exploring the ecosystem.\n
- [Awesome-LLM-Agents-Scientific-Discovery](https:\u002F\u002Fgithub.com\u002Fzhoujieli\u002FAwesome-LLM-Agents-Scientific-Discovery): A curated list of papers on LLM-driven AI agents in biomedical research and broader scientific discovery.\n- [Awesome-Agent-RL](https:\u002F\u002Fgithub.com\u002F0russwest0\u002FAwesome-Agent-RL): A specialized collection of papers and resources on unleashing the potential of AI agents through reinforcement learning.\n- [Awesome-LLM-APPs](https:\u002F\u002Fgithub.com\u002FShubhamsaboo\u002Fawesome-llm-apps): A curated collection of LLM applications built with RAG, AI agents, multi-agent teams, MCP, voice agents, and more.\n\n## 5. Datasets\n\n### 5.1 Benchmarks\n\n#### 5.1.1 Evaluation Frameworks\n\n- **LLM Evaluation Frameworks:**\n\n  - [OpenCompass](https:\u002F\u002Fgithub.com\u002Fopen-compass\u002Fopencompass) is a comprehensive evaluation platform for large language models (LLMs) that supports assessment of a wide range of open and closed models across over 100 datasets. It covers dimensions such as language understanding, reasoning, and code generation, and supports multiple evaluation modes, including zero-shot, few-shot, and Chain-of-Thought (CoT), as well as distributed evaluation.\n  - [DeepEval](https:\u002F\u002Fgithub.com\u002Fconfident-ai\u002Fdeepeval) is an easy-to-use, open-source LLM evaluation framework for evaluating and testing large-language-model systems. It helps developers efficiently assess generated content against key metrics such as relevance, factual consistency, bias, and toxicity, with a usage style similar to the Python unit-testing framework Pytest (see the sketch after this list).\n\n- **MLLM Evaluation Frameworks:**\n\n  - [VLMEvalKit](https:\u002F\u002Fgithub.com\u002Fopen-compass\u002Fvlmevalkit) is an open-source toolkit from OpenCompass designed for evaluating large vision-language models. It supports one-click evaluation of over 220 vision-language models on more than 80 benchmarks, covering tasks such as image question answering, image-text matching, and visual reasoning, and reports results based on both exact matching and LLM-based answer extraction.\n  - [EvalScope](https:\u002F\u002Fgithub.com\u002Fmodelscope\u002Fevalscope) is a model evaluation framework from the ModelScope community, supporting performance benchmarking for many model types, including large language models, multimodal language models, embedding models, and AIGC models.\n\n- **CoT Evaluation Frameworks:**\n\n  - [ROSCOE](https:\u002F\u002Fgithub.com\u002Ffacebookresearch\u002FParlAI\u002Ftree\u002Fmain\u002Fprojects\u002Froscoe) provides a set of automated metrics for evaluating the reasoning quality of models without requiring reference answers.\n  - [ReCEval](https:\u002F\u002Fgithub.com\u002Farchiki\u002FReCEval) is a reasoning-chain evaluation framework proposed by Archiki Prasad and colleagues. It offers a fine-grained analysis of the multi-step reasoning processes generated by large language models along two dimensions: \"correctness\" and \"informativeness.\"\n
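\nDeepEval's Pytest-like style is easy to picture with one test. The following is a sketch based on the pattern in DeepEval's documentation; the metric, threshold, and strings are illustrative, and a configured LLM backend is assumed for metric scoring:\n\n```python\nfrom deepeval import assert_test\nfrom deepeval.metrics import AnswerRelevancyMetric\nfrom deepeval.test_case import LLMTestCase\n\ndef test_answer_relevancy():\n    # One test case = one (input, model output) pair to be judged\n    test_case = LLMTestCase(\n        input=\"What if these shoes don't fit?\",\n        actual_output=\"We offer a 30-day full refund at no extra cost.\",\n    )\n    # Fails the test if the judged relevancy score falls below 0.7\n    assert_test(test_case, [AnswerRelevancyMetric(threshold=0.7)])\n```\n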
\n#### 5.1.2 Outcome Benchmarks\n\nThis section evaluates the final performance of Long CoT reasoning from a holistic perspective, asking whether the reasoning chain is ultimately sound and the answer accurate.\n\n- **Complex Mathematics**\n\n| Name | Number of Problems | Release Date | Authors | Description | Relevant Links |\n| :---: | :---: | :---: | :---: | :---: | :---: |\n| GSM8K | \~8,500 | 2021 | OpenAI | A dataset of grade-school math word problems released by OpenAI, each with detailed solution steps. The problems cover basic arithmetic, word problems, etc., and require multi-step reasoning to solve. | 🤗[dataset](https:\u002F\u002Fhuggingface.co\u002Fdatasets\u002Fopenai\u002Fgsm8k) |\n| MATH | 12,500 | 2021 | Hendrycks et al. (UC Berkeley) | A dataset of challenging competition math problems, each accompanied by a complete step-by-step solution. It includes algebra, geometry, probability, and more, designed to evaluate models' mathematical reasoning abilities. | 🌐[repository](https:\u002F\u002Fgithub.com\u002Fhendrycks\u002Fmath) |\n| AIME 2024 | 30 | 2024 | AI-MO Project Group | American Invitational Mathematics Examination 2024, a high-level high-school math competition dataset including all questions from AIME I and II of 2024. The problems focus on integer solutions and combinatorial reasoning. 
                                                                                   |                             🤗[dataset](https:\u002F\u002Fhuggingface.co\u002Fdatasets\u002FAI-MO\u002Faimo-validation-aime)                             |\n|   AIME 2025   |         30         |     2025     |                                   OpenCompass                                   |                                                                                                                     A collection of problems from AIME 2025 I & II. The difficulty is similar to AIME 2024, assessing high school students' complex math problem-solving abilities.                                                                                                                    |                                🤗[dataset](https:\u002F\u002Fhuggingface.co\u002Fdatasets\u002Fopencompass\u002FAIME2025)                                |\n|    AMC 2023   |         83         |     2024     |                               AI-MO Project Group                               |                                                                                               American Mathematics Competitions 2023, a validation set consisting of 83 problems from the AMC12 competition. It includes questions from the 2022-2023 AMC12 covering topics such as algebra and geometry.                                                                                              |                              🤗[dataset](https:\u002F\u002Fhuggingface.co\u002Fdatasets\u002FAI-MO\u002Faimo-validation-amc)                             |\n|   USAMO 2025  |          6         |     2025     |                          Balunović et al. (ETH Zurich)                          |                                                                                             A dataset of problems from the USA Mathematical Olympiad 2025. These are final exam questions from the USAMO, typically difficult proof-based problems that test deep mathematical reasoning and proof skills.                                                                                             |                   🌐[website](https:\u002F\u002Fmatharena.ai\u002F) \u003Cbr> 🌐[repository](https:\u002F\u002Fgithub.com\u002Feth-sri\u002Fmatharena)                  |\n| OlympiadBench |        8,476       |     2024     |                     He Chaohui et al. (Tsinghua University)                     |                                                    A bilingual multimodal scientific problem dataset at the Olympiad level. It includes 8,476 problems from competitions in subjects like mathematics and physics, each with expert step-by-step solutions, used to comprehensively evaluate the model's cross-disciplinary deep reasoning ability.                                                    | 🤗[dataset](https:\u002F\u002Fhuggingface.co\u002Fdatasets\u002FHothan\u002FOlympiadBench) \u003Cbr> 🌐[repository](https:\u002F\u002Fgithub.com\u002FOpenBMB\u002FOlympiadBench) |\n|  OlympicArena |       11,163       |     2024     | Huang Zhen et al. (Shanghai Jiao Tong University & Shanghai Research Institute) |                                    Also known as OlympiadArena, this comprehensive benchmark covers 62 types of “Olympiad” challenges across 7 categories such as mathematics, physics, chemistry, and biology. It contains 11,163 Olympiad-level problems, categorized by subject and problem type, designed to promote general artificial intelligence reasoning.                                    
|   🤗[dataset](https:\u002F\u002Fhuggingface.co\u002Fdatasets\u002FGAIR\u002FOlympicArena) \u003Cbr> 🌐[repository](https:\u002F\u002Fgair-nlp.github.io\u002FOlympicArena)   |\n|  Putnam-AXIOM |      236 + 52      |     2024     |                       Gulati et al. (Stanford University)                       |                                                                      A dataset from the Putnam Mathematics Competition, including 236 problems from the Putnam competition and 52 cross-problems from Putnam AIME. Each problem comes with detailed solution steps and is used to assess models' mathematical reasoning abilities.                                                                     |                                      📄[paper](https:\u002F\u002Fopenreview.net\u002Fforum?id=t1mAXb4Cop)                                      |\n|  FrontierMath |          -         |     2024     |                             Glazer et al. (Epoch AI)                            | A collection of frontier mathematical problems collaboratively created by dozens of mathematicians. It covers major branches of modern mathematics, from number theory and real analysis to algebraic geometry. The problems require hours or even days to solve manually. Hundreds of original high-difficulty problems are included, all of which have not been published to avoid training leakage. |                                           📄[paper](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2411.04872)                                           |\n|   ThinkBench  |        2,912       |     2025     |      Huang Shulin et al. (University of Science and Technology of Shanghai)     |                                                   A dynamic challenge set designed to evaluate the robust reasoning abilities of large language models (LLMs). It contains 2,912 reasoning tasks generated by applying out-of-distribution perturbations to existing problems, aiming to test the model's reasoning accuracy in unfamiliar contexts.                                                   |                                           📄[paper](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2502.16268)                                           |\n|  MATH-Perturb |      279 \\* 2      |     2025     |                    Huang Kaixuan et al. (Princeton & Google)                    |                    A perturbation set for the most difficult problems in the MATH dataset. It selects 279 of the hardest Level 5 problems from MATH and generates 279 variants for each through \"simple perturbations\" and \"difficult perturbations.\" Model performance on these perturbed problems significantly declines, reflecting its real mathematical generalization ability.                   
| 📄[paper](https:\u002F\u002Fopenreview.net\u002Fforum?id=IkmD3fKBPQ) |\n
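\nMost of these math benchmarks load with a single `datasets` call, and GSM8K's reference answers end with a final \"#### <number>\" line, which makes exact-match scoring straightforward. A minimal sketch (model-output parsing is left out; only the gold-answer extraction is shown):\n\n```python\nfrom datasets import load_dataset\n\n# GSM8K's \"main\" config; each example has \"question\" and \"answer\" fields\ngsm8k = load_dataset(\"openai\u002Fgsm8k\", \"main\", split=\"test\")\n\ndef gold_answer(example: dict) -> str:\n    # The step-by-step solution ends with a line like \"#### 72\"\n    return example[\"answer\"].split(\"####\")[-1].strip()\n\nsample = gsm8k[0]\nprint(sample[\"question\"])\nprint(\"gold:\", gold_answer(sample))\n```\n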
\n- **Complex Coding**\n\n| Name | Number of Problems | Release Date | Authors | Description | Relevant Links |\n| :---: | :---: | :---: | :---: | :---: | :---: |\n| SWE-bench | 2,294 | 2024 | Chen Tianle et al. (Princeton NLP) | Software Engineering Bench, a dataset extracted from real issue-patch pairs in GitHub software projects. It collects 2,294 issues and their corresponding Pull Request fixes from 12 popular Python libraries, and is used to evaluate models' ability to automatically resolve real code bugs. | 🤗[dataset](https:\u002F\u002Fhuggingface.co\u002Fdatasets\u002FSWE-bench\u002FSWE-bench) \u003Cbr> 🌐[repository](https:\u002F\u002Fgithub.com\u002FSWE-bench\u002FSWE-bench) |\n| CodeContests | \~10,000 | 2022 | Li et al. (DeepMind) | A competitive programming dataset proposed by DeepMind for training AlphaCode. It aggregates a vast number of problems and test cases from platforms such as Codeforces and AtCoder, containing around 10,000 multilingual programming problems for code-generation training and evaluation. | 🤗[dataset](https:\u002F\u002Fhuggingface.co\u002Fdatasets\u002Fdeepmind\u002Fcode_contests) |\n| LiveCodeBench | \~400 (continuously growing) | 2024 | Jain et al. (UC Berkeley & MIT) | A \"live\" benchmark for code that continuously collects the latest public problems from LeetCode, AtCoder, and Codeforces, totaling around 400 high-quality programming problems. Beyond code generation, it also evaluates code debugging, self-repair, and unit-test generation. | 🤗[dataset](https:\u002F\u002Fhuggingface.co\u002Flivecodebench) \u003Cbr> 🌐[repository](https:\u002F\u002Fgithub.com\u002FLiveCodeBench\u002FLiveCodeBench) |\n| MHPP | 210 | 2025 | Dai Jianbo et al. | Mostly Hard Python Problems, a human-designed collection of difficult Python programming tasks: 210 problems across seven challenge categories, each requiring multi-step reasoning or complex algorithms to solve. It probes the limits of LLMs' efficiency and accuracy in code reasoning. | 📄[paper](https:\u002F\u002Fopenreview.net\u002Fforum?id=TVFVx8TUbN) |\n| ProBench | - | 2025 | Yang Lei et al. (Shanghai University of Science and Technology) | A benchmark designed specifically for competitive programming. It collects contest problems from the Codeforces, Luogu, and Nowcoder platforms from the second half of 2024, with unified difficulty and algorithm tags. The dataset contains several hundred problems, filling a gap in advanced code-reasoning evaluation. | 🤗[dataset](https:\u002F\u002Fhuggingface.co\u002Fdatasets\u002Fyl-9\u002Fprobench) \u003Cbr> 🌐[repository](https:\u002F\u002Fgithub.com\u002FYL-9\u002Fprobench) |\n| HumanEval Pro | 164 | 2024 | Yu Zhaojian et al. (Microsoft AI Research) | An enhanced version of the OpenAI HumanEval dataset. Each of the original 164 programming problems gains an additional \"sub-question\": the model must first solve a simpler sub-problem, then use the result to solve the harder one. Compared to the original HumanEval, the Pro version reduces model accuracy by about 20%. | 🤗[dataset](https:\u002F\u002Fhuggingface.co\u002Fdatasets\u002FCodeEval-Pro\u002Fhumaneval-pro) \u003Cbr> 🌐[repository](https:\u002F\u002Fgithub.com\u002FCodeEval-Pro\u002FCodeEval-Pro) |\n| MBPP Pro | 378 | 2024 | Yu Zhaojian et al. (Microsoft AI Research) | An advanced version of the Google MBPP dataset. It selects 378 problems from the MBPP test set and constructs additional questions in the style of HumanEval Pro, making the problems more layered and comprehensive, for a stricter evaluation of multi-step reasoning in basic programming tasks. | 🤗[dataset](https:\u002F\u002Fhuggingface.co\u002Fdatasets\u002FCodeEval-Pro\u002Fmbpp-pro) \u003Cbr> 🌐[repository](https:\u002F\u002Fgithub.com\u002FCodeEval-Pro\u002FCodeEval-Pro) |\n| EquiBench | 2,400 | 2025 | Wei Anjiang et al. (Stanford & NYU) | A code semantic-understanding benchmark that evaluates LLMs' grasp of program execution semantics through equivalence-checking tasks. It provides 2,400 pairs of functionally equivalent\u002Finequivalent programs in four programming languages; models must determine whether two programs produce identical outputs, testing deep understanding of code logic. | 🤗[dataset](https:\u002F\u002Fhuggingface.co\u002Fdatasets\u002Fanjiangwei\u002FEquiBench-Datasets) \u003Cbr> 🌐[repository](https:\u002F\u002Fgithub.com\u002FAnjiang-Wei\u002FEquiBench) |
\n\n- **Commonsense Puzzles**\n\n| Name | Number of Problems | Release Date | Authors | Description | Relevant Links |\n| :---: | :---: | :---: | :---: | :---: | :---: |\n| LiveBench | Dynamic Update | 2025 | White et al. (NYU & Meta AI) | A continuously updated comprehensive evaluation framework for LLMs. New tasks are added monthly to keep the test set uncontaminated by model training data. Tasks cover mathematics, logic, programming, and common-sense QA, with automated scoring against verifiable reference answers for unbiased, objective evaluation. | 🤗[dataset](https:\u002F\u002Fhuggingface.co\u002Fcollections\u002Flivebench\u002Flivebench-67eaef9bb68b45b17a197a98) \u003Cbr> 🌐[repository](https:\u002F\u002Fgithub.com\u002Flivebench\u002Flivebench) \u003Cbr> 🌐[website](https:\u002F\u002Flivebench.ai\u002F) |\n| BIG-Bench Hard (BBH) | 23 Tasks (Over 2,000 Questions) | 2023 | Suzgun et al. (Google Research) | A collection of the 23 most challenging tasks selected from the large-scale BIG-Bench benchmark. On these tasks, models like GPT-3 perform far below the human average; they cover boolean expression evaluation, causal reasoning, date understanding, and complex common-sense\u002Flogic problems. It is commonly used as a benchmark for chain-of-thought (CoT) enhancement experiments. | 🤗[dataset](https:\u002F\u002Fhuggingface.co\u002Fdatasets\u002Fmaveriq\u002Fbigbenchhard) \u003Cbr> 🌐[repository](https:\u002F\u002Fgithub.com\u002Fsuzgunmirac\u002FBIG-Bench-Hard) |
\n| ZebraLogic | - | 2024 | Lin et al. (HKUST) | A logic-reasoning dataset inspired by \"zebra puzzles.\" It contains complex deductive reasoning problems, often involving non-monotonic reasoning scenarios, generated by models and manually verified. It tests a model's consistency in reasoning from purely logical cues. | 🤗[dataset](https:\u002F\u002Fhuggingface.co\u002Fdatasets\u002FWildEval\u002FZebraLogic) \u003Cbr> 🌐[repository](https:\u002F\u002Fgithub.com\u002FWildEval\u002FZeroEval) \u003Cbr> 🌐[website](https:\u002F\u002Fhuggingface.co\u002Fspaces\u002FWildEval\u002FZebraLogic) |\n| ARC | 10,377 | 2018 | Clark et al. (AI2) | AI2 Reasoning Challenge, a multiple-choice dataset of natural common-sense and science questions sourced from US K-12 science exams, split into easy and challenge sections (7,787 training questions and 2,590 challenge questions). Even GPT-4 does not fully solve the ARC challenge set, making it a common benchmark for general common-sense intelligence. | 🤗[dataset](https:\u002F\u002Fhuggingface.co\u002Fdatasets\u002Fallenai\u002Fai2_arc) |\n| JustLogic | 4,900 | 2024 | Michael Chen et al. (USYD) | A purely deductive logic-reasoning benchmark: 4,900 propositional-logic problems automatically generated by a synthetic algorithm. They rely on no common-sense knowledge, testing only formal logical deduction. Each task provides a set of premises and a candidate conclusion, and the model must judge the conclusion true, false, or uncertain. | 🌐[repository](https:\u002F\u002Fgithub.com\u002Fmichaelchen-lab\u002FJustLogic) |\n| QuestBench | \~600 | 2025 | Li et al. (DeepMind) | An information-seeking reasoning evaluation released by DeepMind. It contains four types of \"incomplete problems\" (logic, planning, mathematics (GSM), and formula problems), each missing one key condition. The model must identify the most critical clarifying question to ask and then use that information to answer the original question. Around 600 such common-sense\u002Freasoning problems evaluate the ability of LLMs to identify and ask for critical information. 
|                                                                   🌐[repository](https:\u002F\u002Fgithub.com\u002Fgoogle-deepmind\u002Fquestbench)                                                                  |\n\n\n- **Scientific Reasoning**\n\n| Name | Number of Problems | Release Date | Authors | Description | Relevant Links |\n|:-------:|:-----------:|:--------:|:------------------------:|:------------------------------------------------------------------------------------------:|:---------:|\n| GPQA Diamond               | 198                 | 2024         | Rein et al. (NYU)                   | A highly difficult subset of Graduate-level Physics\u002FBiology\u002FChemistry Q\\&A. The GPQA dataset filters out 198 questions that are answered correctly by experts but incorrectly by laypersons. These \"diamond-level\" problems are almost at the graduate level and require models to possess cross-disciplinary deep reasoning abilities.                                                                      | 🤗[dataset](https:\u002F\u002Fhuggingface.co\u002Fdatasets\u002FIdavidrein\u002Fgpqa) \u003Cbr> 🌐[repository](https:\u002F\u002Fgithub.com\u002Fidavidrein\u002Fgpqa)                                       |\n| MMLU-Pro                   | \\~12,000            | 2024         | Wang Yubo et al.                    | An enhanced version of the original MMLU benchmark. It includes 12,000 high-quality academic exam questions from 14 major fields (with the number of answer options expanded from 4 to 10), focusing on comprehensive knowledge and complex reasoning. Compared to the original MMLU, the Pro version significantly increases the difficulty, with the model's accuracy dropping by an average of about 20%. | 🤗[dataset](https:\u002F\u002Fhuggingface.co\u002Fdatasets\u002FTIGER-Lab\u002FMMLU-Pro) \u003Cbr> 🌐[repository](https:\u002F\u002Fgithub.com\u002FTIGER-AI-Lab\u002FMMLU-Pro)                              |\n| SuperGPQA                  | 26,529              | 2025         | Doubao (Seed) Team                  | A large-scale graduate-level knowledge reasoning benchmark. Covering 285 academic disciplines, it contains 26,529 high-difficulty professional exam questions. Over 42% of the questions require mathematical calculations or formal reasoning, aiming to test the model's reasoning limits in long-tail disciplines.                                                                                        | 🤗[dataset](https:\u002F\u002Fhuggingface.co\u002Fdatasets\u002Fm-a-p\u002FSuperGPQA) \u003Cbr> 🌐[repository](https:\u002F\u002Fgithub.com\u002FSuperGPQA\u002FSuperGPQA)                                   |\n| Humanity’s Last Exam (HLE) | 2,500               | 2025         | CAIS & Scale AI                     | \"Humanity's Last Exam,\" designed as the final closed-book test of human knowledge. It includes 2,500 multiple-choice or short-answer questions across dozens of fields such as mathematics, natural sciences, and humanities. Created collaboratively by global experts, it exceeds the difficulty of all previous benchmarks and is considered the most difficult comprehensive exam AI currently faces.    | 🤗[dataset](https:\u002F\u002Fhuggingface.co\u002Fdatasets\u002Fcais\u002Fhle) \u003Cbr> 🌐[repository](https:\u002F\u002Fgithub.com\u002Fcenterforaisafety\u002Fhle) \u003Cbr> 🌐[website](https:\u002F\u002Flastexam.ai\u002F) |\n| TPBench                    | -                   | 2024         | Daniel J.H. Chung et al. 
(DeepMind) | A Theoretical Physics Benchmark designed to assess models' ability to solve advanced theoretical physics problems. Proposed by Chung et al., this benchmark collects a set of theoretical physics problems requiring advanced knowledge and complex derivations, testing the model's limits in reasoning about physical laws and equation derivations.                                                       | 🤗[dataset](https:\u002F\u002Fhuggingface.co\u002Fdatasets\u002FZhiqiGao\u002FTPBench) \u003Cbr> 🌐[website](https:\u002F\u002Ftpbench.org\u002F)                                                       |\n\n\n- **Medical Reasoning**\n\n| Name | Number of Problems | Release Date | Authors | Description | Relevant Links |\n|:-------:|:-----------:|:--------:|:------------------------:|:------------------------------------------------------------------------------------------:|:---------:|\n|          MedQA          |       12,723       |     2020     |    Jin et al. (Tsinghua University)    | A medical exam question-answer dataset. Collected from the United States Medical Licensing Examination (USMLE) multiple-choice questions, covering subjects such as anatomy, physiology, pathology, etc. Includes English (12,723 questions) and simplified\u002Ftraditional Chinese versions (approximately 50,000 questions in total). Used to evaluate models' ability to apply medical knowledge and diagnostic reasoning. |                    🌐[Google Drive](https:\u002F\u002Fdrive.google.com\u002Ffile\u002Fd\u002F1ImYUSLk9JbgHXOemfvyiDiirluZHPeQw\u002Fview) \u003Cbr> 🌐[Repository](https:\u002F\u002Fgithub.com\u002Fjind11\u002FMedQA)                   |\n| JAMA Clinical Challenge |        1,524       |     2024     | Chen et al. (Johns Hopkins University) |             The Clinical Challenge Case Set from the Journal of the American Medical Association (JAMA). Compiles 1,524 challenging clinical cases published by the journal, each with detailed case descriptions, questions, four options, and professional explanations. Focuses on assessing the model’s diagnostic decision-making ability and interpretability in real-world, complex clinical scenarios.            |                                                      🌐[Website](https:\u002F\u002Fjamanetwork.com\u002Fcollections\u002F44038\u002Fclinical-challenge)                                                     |\n|        Medbullets       |         308        |     2024     | Chen et al. (Johns Hopkins University) |    A simulated clinical Q\\&A dataset. Composed of 308 multiple-choice questions in the USMLE Step 2\u002F3 style, collected from the Twitter medical Q\\&A account. Each question includes a case scenario, five options, and detailed explanations. While based on common clinical scenarios, the questions remain challenging and are used to evaluate model performance in clinical decision-making and interpretability.    |                                                           🌐[Website](https:\u002F\u002Fgithub.com\u002FHanjieChen\u002FChallengeClinicalQA)                                                           |\n|        MedXpertQA       |        4,460       |     2024     |            Tsinghua C3I Team           |                                    A comprehensive benchmark for “expert-level” medical reasoning. Consists of 4,460 high-difficulty clinical knowledge Q\\&A covering 17 specialties and 11 body systems. 
Available in both pure-text (case + Q\\&A) and multimodal (including medical images) formats, used to evaluate models’ joint reasoning ability over medical texts and images.                                    | 🤗[Dataset](https:\u002F\u002Fhuggingface.co\u002Fdatasets\u002FTsinghuaC3I\u002FMedXpertQA) \u003Cbr> 🌐[Repository](https:\u002F\u002Fgithub.com\u002FTsinghuaC3I\u002FMedXpertQA) \u003Cbr> 🌐[Website](https:\u002F\u002Fmedxpertqa.github.io\u002F) |\n\n#### 5.1.3 Capability Benchmarks\nThese benchmarks take a local perspective on the Long CoT reasoning process, examining the model's individual abilities at a finer granularity: whether each step of the reasoning is correct and logical, whether the model can identify and correct its own errors, and whether it can complete complex tasks step by step.\n\n- **Deep Reasoning**\n\n| Name | Number of Problems | Release Date | Authors | Description | Relevant Links |\n|:-------:|:-----------:|:--------:|:------------------------:|:------------------------------------------------------------------------------------------:|:---------:|\n| ZebraLogic |       \\~1,000      |     2024     |     Bill Yuchen Lin et al.     |                               ZebraLogic is an AI benchmark focusing on logical reasoning, containing complex mathematical and linguistic reasoning problems used to assess advanced reasoning abilities of models. Its problem design is similar to the \"Zebra Puzzle,\" challenging models to perform logical reasoning and problem-solving under constraints.                              | 🤗[dataset](https:\u002F\u002Fhuggingface.co\u002Fspaces\u002Fallenai\u002FZebraLogic) \u003Cbr> 🌐[repository](https:\u002F\u002Fgithub.com\u002FWildEval\u002FZeroEval) \u003Cbr> 🌐[website](https:\u002F\u002Fhuggingface.co\u002Fblog\u002Fyuchenlin\u002Fzebra-logic) |\n|   BigGSM   |         610        |     2025     | Qiguang Chen et al. (HIT-SCIR) |                                  A mathematical reasoning benchmark designed to evaluate the performance of large language models on multi-step mathematical problems. It extends the classic GSM8K dataset and includes more challenging mathematical application problems that require models to perform more complex logical reasoning and computations.                                  |                          🤗[dataset](https:\u002F\u002Fhuggingface.co\u002Fdatasets\u002FLightChen2333\u002FBigGSM) \u003Cbr> 🌐[repository](https:\u002F\u002Fgithub.com\u002FLightChen233\u002Freasoning-boundary)                          |\n| GSM-Ranges |        30.1k       |     2025     |   Safal Shrestha et al. (NYU)  | GSM-Ranges is a dataset generator built upon the GSM8K benchmark. It systematically modifies numerical values in mathematical word problems to assess the robustness of large language models across a wide range of numerical scales. By introducing numerical perturbations, GSM-Ranges evaluates the ability of LLMs to reason mathematically with numbers beyond the distribution range. 
|                              🤗[dataset](https:\u002F\u002Fhuggingface.co\u002Fdatasets\u002Fguactastesgood\u002FGSM-Ranges) \u003Cbr> 🌐[repository](https:\u002F\u002Fgithub.com\u002Fminwukim\u002FGSM-Ranges)                             |\n\n\n- **Exploration Benchmarks**\n\n| Name | Number of Problems | Release Date | Authors | Description | Relevant Links |\n|:-------:|:-----------:|:--------:|:------------------------:|:------------------------------------------------------------------------------------------:|:---------:|\n|  Sys2Bench  |          -         |     2025     |         Shubham Parashar et al.        | Sys2Bench is designed to systematically test large language models across various reasoning and planning tasks. The benchmark covers five major types of reasoning: algorithmic reasoning, planning, arithmetic reasoning, logical reasoning, and common-sense reasoning, consisting of 11 sub-tasks ranging from NP-hard problems (such as Rubik's Cube and Bin Packing) to multi-step math problems (such as GSM8K). Sys2Bench places special emphasis on intermediate steps in the reasoning process, highlighting the quality and efficiency of the reasoning path. Additionally, the project introduces AutoHD (Automated Heuristics Discovery) methods, allowing models to autonomously generate heuristic functions during the reasoning process to improve complex task planning capabilities. | 🤗[dataset](https:\u002F\u002Fhuggingface.co\u002Fdatasets\u002Fdive-lab\u002FSys2Bench) \u003Cbr> 🌐[repository](https:\u002F\u002Fgithub.com\u002Fdivelab\u002Fsys2bench) |\n| BanditBench |          -         |     2025     | Allen Nie et al. (Stanford University) |            BanditBench is designed to evaluate the exploration and decision-making abilities of large language models in multi-armed bandit (MAB) and contextual bandit (CB) environments. The benchmark simulates LLMs as agents, relying solely on contextual information for multi-round interactions without updating parameters, to measure their performance in uncertain environments. BanditBench provides various task scenarios, including movie recommendation tasks based on the MovieLens dataset, covering different action numbers and reward distribution types (e.g., Gaussian and Bernoulli distributions). Additionally, researchers have introduced algorithm-guided reasoning support and algorithm distillation methods to enhance the exploration efficiency of LLMs.           |                           🌐[repository](https:\u002F\u002Fgithub.com\u002Fallenanie\u002FEVOLvE?tab=readme-ov-file)                          |\n\n- **Reflection Benchmarks**\n\n| Name | Number of Problems | Release Date | Authors | Description | Relevant Links |\n|:-------:|:-----------:|:--------:|:------------------------:|:------------------------------------------------------------------------------------------:|:---------:|\n|  RewardBench |          2,958         |       2024       |                Nathan Lambert et al. (AI2)               |                                                                                                                                RewardBench is the first systematic reward model evaluation benchmark, jointly released by AI2 and the University of Washington, designed to analyze and compare the performance of reward models under different training methods across alignment quality, reasoning ability, safety, and instruction following, providing a unified evaluation framework.                                                                               
                                                |       🤗[dataset](https:\u002F\u002Fhuggingface.co\u002Fdatasets\u002Fallenai\u002Freward-bench) \u003Cbr> 🌐[repository](https:\u002F\u002Fgithub.com\u002Fallenai\u002Freward-bench) \u003Cbr> 🌐[website](https:\u002F\u002Fhuggingface.co\u002Fspaces\u002Fallenai\u002Freward-bench)       |\n| ProcessBench |          3,400         |       2024       |              Zheng Chujie et al. (Qwen Team)             | ProcessBench is a mathematical reasoning process evaluation benchmark proposed by Alibaba’s Qwen Team, consisting of 3,400 Olympiad-level problems with step-by-step solutions, where each step is manually annotated for errors. The benchmark requires models to identify the earliest error step in the reasoning process, focusing on process supervision rather than solely on the final answer. Evaluation results show that general language models (e.g., QwQ-32B-Preview) outperform specially trained process reward models (PRMs) in step-by-step critique tasks, approaching the performance level of GPT-4o. |                                            🤗[dataset](https:\u002F\u002Fhuggingface.co\u002Fdatasets\u002FQwen\u002FProcessBench) \u003Cbr> 🌐[repository](https:\u002F\u002Fgithub.com\u002FQwenLM\u002FProcessBench)                                           |\n|   PRMBench   |          6,216         |       2025       | Mingyang Song et al. (Fudan University, Shanghai AI Lab) |                                                                PRMBench aims to fill the gap in existing benchmarks that primarily focus on step correctness and lack systematic evaluation of PRMs, offering a unified framework for evaluation across multiple dimensions including conciseness, robustness, and sensitivity. Each sample in the benchmark includes a question, a reasoning process with errors, annotations of erroneous steps, and the causes of the errors, aiming to evaluate the fine-grained error detection capabilities of PRMs.                                                                |                   🤗[dataset](https:\u002F\u002Fhuggingface.co\u002Fdatasets\u002Fhitsmy\u002FPRMBench_Preview) \u003Cbr> 🌐[repository](https:\u002F\u002Fgithub.com\u002Fssmisya\u002FPRMBench) \u003Cbr> 🌐[website](https:\u002F\u002Fprmbench.github.io\u002F)                   |\n|  CriticBench |         \\~3,800        |       2024       |           Lan Tian et al. (Tsinghua University)          |                        CriticBench, proposed by Tsinghua University and other institutions, is a comprehensive benchmark for evaluating the critique and correction abilities of large language models. It covers five major reasoning areas: mathematics, commonsense, symbolism, programming, and algorithms, integrating 15 datasets to assess 17 LLMs in the stages of generation, critique, and correction. The study finds that models trained specifically for critique perform better in the Generate-Critique-Correct (GQC) task, and that larger models show higher critique consistency.                       
|               🤗[dataset](https:\u002F\u002Fhuggingface.co\u002Fdatasets\u002Fllm-agents\u002FCriticBench) \u003Cbr> 🌐[repository](https:\u002F\u002Fgithub.com\u002FCriticBench\u002FCriticBench) \u003Cbr> 🌐[website](https:\u002F\u002Fcriticbench.github.io\u002F)              |\n|  DeltaBench  |          1,236         |       2025       |                      OpenStellarTeam                     |                                                                              DeltaBench, released by the OpenStellar Team, is a benchmark designed to assess large language models' error detection capabilities in Long CoT (Chain of Thought) reasoning tasks. It includes 1,236 samples across areas such as mathematics, programming, physical-chemical-biological (PCB) reasoning, and general reasoning. Each sample is annotated with detailed manual labels identifying erroneous steps, strategy shifts, and reflection efficiency.                                                                              |    🤗[dataset](https:\u002F\u002Fhuggingface.co\u002Fdatasets\u002FOpenStellarTeam\u002FDeltaBench) \u003Cbr> 🌐[repository](https:\u002F\u002Fgithub.com\u002FOpenStellarTeam\u002FDeltaBench) \u003Cbr> 🌐[website](https:\u002F\u002Fopenstellarteam.github.io\u002FDeltaBench\u002F)   |\n|  ErrorRadar  |          2,500         |       2024       |               Yan Yibo et al. (Squirrel AI)              |                                                                 ErrorRadar is a multimodal mathematical reasoning error detection benchmark designed to evaluate multimodal large language models' ability to identify and classify errors in student problem-solving processes. The benchmark contains 2,500 K-12 mathematics problems from real educational scenarios, incorporating both textual and image information, and annotating erroneous steps and error types. Evaluation tasks include error step localization and error type classification.                                                                | 🤗[dataset](https:\u002F\u002Fhuggingface.co\u002Fdatasets\u002FErrorRadar\u002FErrorRadar) \u003Cbr> 🌐[repository](https:\u002F\u002Fanonymous.4open.science\u002Fr\u002FError-Radar\u002Freadme.md) \u003Cbr> 🌐[website](https:\u002F\u002Fanonymous.4open.science\u002Fr\u002FError-Radar) |\n|     MEDEC    |          3,848         |       2024       |            Ben Abacha Asma et al. (Microsoft)            |                                                                                                                              MEDEC is the first public benchmark for medical error detection and correction, jointly released by Microsoft and the University of Washington. It contains 3,848 clinical texts, covering five types of errors, including diagnosis, treatment, and medication, providing a crucial tool for improving the accuracy and safety of medical document generation.                                                                                                                              |                                                                                 🌐[repository](https:\u002F\u002Fgithub.com\u002Fabachaa\u002FMEDEC)                                                                                |\n\n#### 5.1.4 Advanced Benchmarks\nBenchmarks designed specifically to evaluate large language models' capabilities in complex reasoning, cross-domain knowledge integration, and multimodal understanding. 
As basic evaluations are gradually saturated by top-tier models, researchers have started developing more challenging benchmarks to more accurately measure models' performance on real-world complex tasks.\n\n\n\n- **Agentic & Embodied Reasoning**\n\n| Name | Number of Problems | Release Date | Authors | Description | Relevant Links |\n|:-------:|:-----------:|:--------:|:------------------------:|:------------------------------------------------------------------------------------------:|:---------:|\n|  ToolComp  |                  485                 |       2025       |          Vaskar Nath et al. (Scale AI)          |                                               ToolComp is designed to assess large language models' reasoning and process supervision capabilities in complex multi-step tool usage tasks. The benchmark consists of 485 manually edited and verified prompts, involving the use of 11 different tools, and 1,731 step-by-step supervision labels, offering a comprehensive assessment of models' performance in multi-tool reasoning tasks.                                               |   🌐[website](https:\u002F\u002Fscale.com\u002Fresearch\u002Ftoolcomp-a-multi-tool-reasoning-and-process-supervision-benchmark)   |\n|   OSWorld  |                  369                 |       2025       |   Xie Tianbao et al. (University of Hong Kong)  |                                        OSWorld is a multimodal agent evaluation benchmark jointly released by the University of Hong Kong, Salesforce Research, and other institutions, aiming to test AI's ability to complete open-ended tasks in real computer environments. The benchmark consists of 369 tasks across file operations, web browsing, office software usage, and other scenarios, supporting Ubuntu, Windows, and macOS systems.                                       |       🌐[repository](https:\u002F\u002Fgithub.com\u002Fxlang-ai\u002FOSWorld) \u003Cbr> 🌐[website](https:\u002F\u002Fos-world.github.io\u002F)       |\n|   WebShop  | 12,087 Instructions \u002F 1.18M Products |       2022       |     Yao Shunyu et al. (Princeton University)    | WebShop simulates an e-commerce website environment and is designed to evaluate large language models' abilities in real web interactions. The benchmark includes 1.18 million real products and 12,087 user instructions, requiring agents to browse webpages, search, filter, and complete purchase tasks based on natural language instructions. WebShop focuses on evaluating models' performance in understanding complex instructions, handling web noise, and exploring strategies. |   🌐[repository](https:\u002F\u002Fgithub.com\u002Fprinceton-nlp\u002FWebShop) \u003Cbr> 🌐[website](https:\u002F\u002Fwebshop-pnlp.github.io\u002F)  |\n|  WebArena  |                  812                 |       2024       | Zhou Shuyan et al. (Carnegie Mellon University) |                                              WebArena is a high-fidelity web environment released by Carnegie Mellon University, designed to evaluate large language models' agent capabilities in real web tasks. The benchmark consists of 812 tasks covering e-commerce, social forums, content management, and collaborative development, requiring models to complete multi-step web interactions through natural language instructions.                                              
|        🌐[repository](https:\u002F\u002Fgithub.com\u002Fweb-arena-x\u002Fwebarena) \u003Cbr> 🌐[website](https:\u002F\u002Fwebarena.dev\u002F)        |\n|  WebGames  |                  50+                 |       2025       |      Thomas George et al. (Convergence AI)      |                                                                                         WebGames is a web browsing agent benchmark, covering basic browsing operations, complex input handling, cognitive tasks, and workflow automation. WebGames provides a lightweight, verifiable test environment supporting rapid iteration and evaluation, suitable for developing more powerful web agents.                                                                                        | 🌐[repository](https:\u002F\u002Fgithub.com\u002Fconvergence-ai\u002Fwebgames) \u003Cbr> 🌐[website](https:\u002F\u002Fwebgames.convergence.ai\u002F) |\n| Text2World |                  103                 |       2025       |   Mengkang Hu et al. (University of Hong Kong)  |                                  Text2World is a benchmark proposed by the University of Hong Kong and other institutions, aiming to evaluate large language models' ability to generate symbolic world models from natural language. The benchmark is based on the Planning Domain Definition Language (PDDL) and covers hundreds of diverse domains, employing a multi-criteria, execution-based evaluation method to provide a more robust assessment.                                  |   🌐[repository](https:\u002F\u002Fgithub.com\u002FAaron617\u002Ftext2world) \u003Cbr> 🌐[website](https:\u002F\u002Ftext-to-world.github.io\u002F)   |\n\n- **Multimodal Reasoning**\n    - **Complex Mathematics:**\n\n    | Name | Number of Problems | Release Date | Authors | Description | Relevant Links |\n    |:-------:|:-----------:|:--------:|:------------------------:|:------------------------------------------------------------------------------------------:|:---------:|\n    |  MathVista |        6,141       |     2023     |               Pan Lu et al. (UCLA)               |                                                                                                                                                                              MathVista is a multimodal mathematical reasoning evaluation benchmark jointly released by UCLA, the University of Washington, and Microsoft Research. It is designed to systematically assess the mathematical reasoning capabilities of large language models and multimodal models within a visual context.                                                                                                                                                                              |    🤗[dataset](https:\u002F\u002Fhuggingface.co\u002Fdatasets\u002FAI4Math\u002FMathVista) \u003Cbr> 🌐[repository](https:\u002F\u002Fgithub.com\u002Flupantech\u002FMathVista) \u003Cbr> 🌐[website](https:\u002F\u002Fmathvista.github.io\u002F)    |\n    | MathVision |        3,040       |     2024     | Ke Wang et al. (Chinese University of Hong Kong) |                                                                                                         MathVision (MATH-V) is a multimodal mathematical reasoning evaluation benchmark released by the Chinese University of Hong Kong, among others. It aims to systematically evaluate the mathematical reasoning abilities of large vision-language models within visual contexts. 
The benchmark includes 3,040 problems across 16 mathematical disciplines, divided into five difficulty levels, with problems sourced from real mathematics competitions.                                                                                                         | 🤗[dataset](https:\u002F\u002Fhuggingface.co\u002Fdatasets\u002FMathLLMs\u002FMathVision) \u003Cbr> 🌐[repository](https:\u002F\u002Fgithub.com\u002Fmathllm\u002FMATH-V) \u003Cbr> 🌐[website](https:\u002F\u002Fmathllm.github.io\u002Fmathvision\u002F) |\n    |  MathVerse |      \\~15,000      |     2024     | Renrui Zhang et al. (Chinese University of Hong Kong) | MathVerse is a multimodal mathematical reasoning evaluation benchmark jointly released by MMLab at the Chinese University of Hong Kong and the Shanghai AI Lab. It is designed to comprehensively assess multimodal large language models' ability to understand mathematical diagrams. The benchmark includes 2,612 problems spanning areas such as plane geometry, solid geometry, and functions, annotated by experts. It generates six versions of multimodal information, totaling approximately 15,000 test samples. MathVerse introduces a Chain-of-Thought (CoT) evaluation strategy, leveraging GPT-4V for fine-grained analysis of model reasoning processes. |                          🤗[dataset](https:\u002F\u002Fhuggingface.co\u002Fdatasets\u002FAI4Math\u002FMathVerse) \u003Cbr> 🌐[repository](https:\u002F\u002Fgithub.com\u002FZrrSkywalker\u002FMathVerse) \u003Cbr> 🌐[website](https:\u002F\u002Fmathverse-cuhk.github.io\u002F)                         |\n\n    - **Complex Code:**\n\n    | Name | Number of Problems | Release Date | Authors | Description | Relevant Links |\n    |:-------:|:-----------:|:--------:|:------------------------:|:------------------------------------------------------------------------------------------:|:---------:|\n    | HumanEval-V |         253        |     2024     | Fengji Zhang et al. (City University of Hong Kong) |                                                                        HumanEval-V is a multimodal code generation evaluation benchmark proposed by the City University of Hong Kong, aiming to test the capabilities of large multimodal models in complex diagram understanding and code generation tasks. This benchmark includes 253 Python programming tasks, each accompanied by key diagrams and function signatures, requiring the model to generate executable code based on visual information.                                                                        | 🤗[dataset](https:\u002F\u002Fhuggingface.co\u002Fdatasets\u002FHumanEval-V\u002FHumanEval-V-Benchmark) \u003Cbr> 🌐[repository](https:\u002F\u002Fgithub.com\u002FHumanEval-V\u002FHumanEval-V-Benchmark) \u003Cbr> 🌐[website](https:\u002F\u002Fhumaneval-v.github.io\u002F) |\n    | Code-Vision |       1,000+       |     2025     |       Hanbin Wang et al. (Peking University)       |                                  Code-Vision is a multimodal code generation evaluation benchmark jointly released by Peking University, Northeastern University, and the University of Hong Kong. It aims to test the ability of multimodal large language models to understand flowcharts and generate corresponding code. This benchmark fills the gap in existing benchmarks, which mainly focus on textual reasoning and lack a systematic evaluation of code generation in visual contexts, providing a unified evaluation framework.                                 
|                                     🌐[repository](https:\u002F\u002Fgithub.com\u002Fwanghanbinpanda\u002FCodeVision) \u003Cbr> 🌐[website](https:\u002F\u002Fpingshengren0901.github.io\u002Fcodevision.io\u002F)                                     |\n    |  ChartMimic |        4,800       |     2024     |       Cheng Yang et al. (Tsinghua University)      | ChartMimic is a multimodal code generation evaluation benchmark jointly released by Tsinghua University, Tencent AI Lab, and other institutions. It aims to evaluate the cross-modal reasoning abilities of large multimodal models in chart understanding and code generation, addressing the gap in existing benchmarks that focus mainly on textual reasoning and lack systematic evaluation of chart understanding and code generation. It includes two task types: Direct Mimic and Customized Mimic, with data sourced from scientific papers across multiple fields. |              🤗[dataset](https:\u002F\u002Fhuggingface.co\u002Fdatasets\u002FChartMimic\u002FChartMimic) \u003Cbr> 🌐[repository](https:\u002F\u002Fgithub.com\u002FChartMimic\u002FChartMimic) \u003Cbr> 🌐[website](https:\u002F\u002Fchartmimic.github.io\u002F)             |\n\n    - **Complex Science:**\n\n    | Name | Number of Problems | Release Date | Authors | Description | Relevant Links |\n    |:-------:|:-----------:|:--------:|:------------------------:|:------------------------------------------------------------------------------------------:|:---------:|\n    | ScienceQA |       21,208       |     2022     |        Pan Lu et al. (UCLA)        |                                                                        ScienceQA is a multimodal multiple-choice dataset consisting of 21,208 problems across natural sciences, language sciences, and social sciences, designed for K-12 grade levels. The dataset provides context with images and text, explanations, and detailed answers, supporting Chain-of-Thought (CoT) reasoning, aiming to assess and enhance the multi-step reasoning abilities and interpretability of AI models.                                                                       |         🤗[dataset](https:\u002F\u002Fhuggingface.co\u002Fdatasets\u002FTheMrguiller\u002FScienceQA) \u003Cbr> 🌐[repository](https:\u002F\u002Fgithub.com\u002Flupantech\u002FScienceQA) \u003Cbr> 🌐[website](https:\u002F\u002Fscienceqa.github.io\u002F)        |\n    |   M3CoT   |       11,459       |     2024     | Qiguang Chen et al. (HIT-SCIR Lab) | M3CoT is a multimodal, multi-domain, multi-step reasoning dataset built upon ScienceQA, designed to assess the capabilities of AI models in complex reasoning tasks. Compared to ScienceQA, M3CoT-Science has an average reasoning step increase from 2.5 to 10.9, and the average text length grows from 48 to 294, significantly increasing task complexity. The dataset spans science, common sense, and mathematics, emphasizing cross-reasoning between image and text information, challenging the reasoning capabilities of existing multimodal large models. | 🤗[dataset](https:\u002F\u002Fhuggingface.co\u002Fdatasets\u002FLightChen2333\u002FM3CoT) \u003Cbr> 🌐[repository](https:\u002F\u002Fgithub.com\u002FLightChen233\u002FM3CoT) \u003Cbr> 🌐[website](https:\u002F\u002Flightchen233.github.io\u002Fm3cot.github.io\u002F) |\n    | MolPuzzle |         234        |     2024     |          Kehan Guo et al.          |                       MolPuzzle is a multimodal, multi-step reasoning dataset designed to evaluate large language models in molecular structure analysis tasks. 
The dataset involves various spectrometric data types, including infrared spectroscopy (IR), mass spectrometry (MS), and nuclear magnetic resonance (1H-NMR and 13C-NMR), as well as molecular formula information. Tasks are divided into three stages: molecular understanding, spectral analysis, and molecular construction, simulating real chemical reasoning processes.                       |   🤗[dataset](https:\u002F\u002Fhuggingface.co\u002Fdatasets\u002Fkguo2\u002FMolPuzzle_data) \u003Cbr> 🌐[repository](https:\u002F\u002Fgithub.com\u002FKehanGuo2\u002FMolPuzzle) \u003Cbr> 🌐[website](https:\u002F\u002Fkehanguo2.github.io\u002FMolpuzzle.io\u002F)   |\n\n    - **Commonsense Puzzle:**\n\n    | Name | Number of Problems | Release Date | Authors | Description | Relevant Links |\n    |:-------:|:-----------:|:--------:|:------------------------:|:------------------------------------------------------------------------------------------:|:---------:|\n    |   PuzzleVQA  |        2,000       |     2024     |          Yew Ken Chia et al.         | PuzzleVQA is a multimodal reasoning dataset consisting of 2,000 abstract graphic puzzles, designed to evaluate the visual perception, induction, and deduction abilities of large multimodal models in basic concepts such as color, numbers, shapes, and sizes. Experiments show that even advanced models like GPT-4V achieve an average accuracy of only 46.4% on single-concept puzzles, significantly lower than human performance, exposing limitations in abstract pattern recognition and multi-step reasoning. | 🤗[dataset](https:\u002F\u002Fhuggingface.co\u002Fdatasets\u002Fdeclare-lab\u002FPuzzleVQA) \u003Cbr> 🌐[repository](https:\u002F\u002Fgithub.com\u002Fdeclare-lab\u002FLLM-PuzzleTest\u002Ftree\u002Fmaster\u002FPuzzleVQA) \u003Cbr> 🌐[website](https:\u002F\u002Fpuzzlevqa.github.io\u002F) |\n    | LEGO-Puzzles |        1,100       |     2025     | Kexian Tang et al. (Shanghai AI Lab) |                                                                                                  LEGO-Puzzles aims to evaluate the capability of large multimodal language models in multi-step spatial reasoning tasks. The dataset contains 1,100 visual question answering (VQA) tasks based on LEGO bricks, covering 11 task types, including spatial understanding, single-step and multi-step sequence reasoning.                                                                                                 |      🤗[dataset](https:\u002F\u002Fhuggingface.co\u002Fdatasets\u002FKexianTang\u002FLEGO-Puzzles) \u003Cbr> 🌐[repository](https:\u002F\u002Fgithub.com\u002FTangkexian\u002FLEGO-Puzzles) \u003Cbr> 🌐[website](https:\u002F\u002Ftangkexian.github.io\u002FLEGO-Puzzles\u002F)     |\n    |     CVQA     |       10,374       |     2024     |     David Romero et al. (MBZUAI)     |                                                                                                         CVQA is a multimodal visual question answering dataset designed to assess models' abilities to integrate multiple visual cues for combined reasoning. The dataset includes three task types requiring models to extract and synthesize key information from multiple images to answer complex questions.                                                                                                        
|                                                    🤗[dataset](https:\u002F\u002Fhuggingface.co\u002Fdatasets\u002Fafaji\u002Fcvqa) \u003Cbr> 🌐[website](https:\u002F\u002Fcvqa-benchmark.org\u002F)                                                   |\n\n- **AI4Research:**\n\n| Name | Number of Problems | Release Date | Authors | Description | Relevant Links |\n|:-------:|:-----------:|:--------:|:------------------------:|:------------------------------------------------------------------------------------------:|:---------:|\n|    SciWorld   | 30 tasks \u002F 6,000+ instances |     2022     |                           Ruoyao Wang et al.                           |                               SciWorld (ScienceWorld) is an interactive text-based environment that tests whether agents can carry out elementary-level science experiments, such as melting ice, building circuits, or testing conductivity. Agents must plan multi-step procedures, act in the environment, and interpret the resulting observations, challenging models' abilities in long-horizon planning, causal reasoning, and grounded scientific understanding. It consists of 30 tasks, each with multiple variants, totaling over 6,000 instances.                               |           🌐[repository](https:\u002F\u002Fgithub.com\u002Fallenai\u002FScienceWorld) \u003Cbr> 🌐[website](https:\u002F\u002Fsciworld.apps.allenai.org\u002F)           |\n|     HardML    |             100             |     2025     |                           Tidor-Vlad Pricope                           | HardML is a benchmark dataset designed specifically to evaluate AI's knowledge and reasoning abilities in the fields of data science and machine learning. Created by independent machine learning engineer Tidor-Vlad Pricope, it contains 100 carefully crafted multiple-choice questions covering topics such as natural language processing, computer vision, statistical modeling, and classical machine learning algorithms. These questions are so challenging that even seasoned machine learning engineers struggle to answer them all correctly. To avoid data contamination, most of the questions are original, reflecting recent advancements in machine learning over the past two years. Current state-of-the-art AI models have an error rate of about 30% on HardML, which is three times higher than on MMLU-ML, demonstrating HardML's effectiveness in distinguishing model capabilities. Additionally, the author has released the slightly easier EasyML dataset, designed for models with fewer parameters. 
|                                            📄[paper](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2501.15627)                                           |\n|   MLE-BENCH   |              75             |     2024     |                                 OpenAI                                 |                                                                                                                                                            MLE-bench is a benchmark dataset released by OpenAI, designed to evaluate AI agents' practical capabilities in machine learning engineering (MLE) tasks. The benchmark selects 75 diverse competition tasks from Kaggle, covering fields such as natural language processing, computer vision, signal processing, and more, testing models' engineering skills in data preprocessing, model training, and experimental execution. In the evaluation, OpenAI's o1-preview model, combined with the AIDE framework, achieved Kaggle bronze-level performance on 16.9% of tasks. The research also explores the impact of resource scaling on performance and issues related to pre-training data contamination.                                                                                                                                                           |            🌐[repository](https:\u002F\u002Fgithub.com\u002Fopenai\u002Fmle-bench\u002F) \u003Cbr> 🌐[website](https:\u002F\u002Fopenai.com\u002Findex\u002Fmle-bench\u002F)            |\n| SolutionBench |            1,053            |     2025     | Zhuoqun Li et al. (Institute of Software, Chinese Academy of Sciences) |                                                                                                                                                                                                                                                           SolutionBench is a benchmark dataset designed to evaluate the capabilities of AI systems in complex engineering solution design. It aims to fill the gap in current retrieval-augmented generation (RAG) methods in handling multi-constraint engineering problems, characterized by real data sources and structured data. Additionally, the authors introduced a system named SolutionRAG, which, by combining tree search and dual-point thinking mechanisms, achieved leading performance on SolutionBench.                                                                                                                                                                                                                                                          | 🤗[dataset](https:\u002F\u002Fhuggingface.co\u002Fdatasets\u002Flzq2021\u002FSolutionBench) \u003Cbr> 🌐[repository](https:\u002F\u002Fgithub.com\u002Ficip-cas\u002FDeepSolution) |\n\n### 5.2 Training Datasets\n\nTo build and enhance models with strong Long CoT capabilities, numerous open-source training datasets have emerged. These datasets provide foundational supervision signals for various domains such as mathematics, science, medicine, programming, and general reasoning. 
Based on their construction methods, we classify the datasets into four major categories: Manual Annotation, Direct Distillation, Search-based Distillation, and Validated Distillation.\n\nIn this section, we systematically list representative datasets under each category, covering key information such as their sources, modalities, applicable domains, and data scale, providing researchers and developers seeking suitable training resources with a comprehensive guide and convenient reference.\n\n#### 5.2.1 Manual Annotation\n\nThese datasets are created through manual annotation or rule-based construction, typically offering high-quality samples with interpretable reasoning paths. While smaller in scale, they are critical for guiding the alignment and evaluation of initial models.\n\n- [R1-OneVision](https:\u002F\u002Fhuggingface.co\u002Fdatasets\u002FFancy-MLLM\u002FR1-Onevision) combines high-quality data from LLaVA-OneVision with datasets from specific domains. It bridges the gap between visual and textual understanding, providing rich, context-aware reasoning tasks across natural scenes, science, mathematics, OCR-based content, and complex graphs.\n- [M3CoT](https:\u002F\u002Fgithub.com\u002FLightChen233\u002FM3CoT) lays the foundational work for multi-domain, multi-step, multi-modal chain-of-thought research.\n- [Big-Math-RL-Verified](https:\u002F\u002Fhuggingface.co\u002Fdatasets\u002FSynthLabsAI\u002FBig-Math-RL-Verified) is designed for RL training with large language models (LLMs) such as [PPO](https:\u002F\u002Farxiv.org\u002Fabs\u002F1707.06347), [GRPO](https:\u002F\u002Farxiv.org\u002Fabs\u002F2402.03300), etc.\n- [GSM8K](https:\u002F\u002Fgithub.com\u002Fopenai\u002Fgrade-school-math) is a high-quality, linguistically diverse dataset of elementary school math word problems.\n\n| Name                 | Category             | Source | Modality      | Quantity |\n| :------------------- | :------------------- | :----- | :------------ | :------- |\n| R1-OneVision         | Mathematics, Science | Rule   | Vision + Lang | 119K     |\n| M3CoT                | Mathematics, Science | Human  | Vision + Lang | 11K      |\n| Big-Math-RL-Verified | Mathematics          | Human  | Lang          | 251K     |\n| GSM8K                | Mathematics          | Human  | Lang          | 8K       |\n\n#### 5.2.2 Direct Distillation\n\nThe method utilizes large language models to generate training data through prompt-based or chain-of-thought reasoning. These datasets can be scaled up to millions of examples, covering a wide range of domains.\n\n- [NaturalReasoning](https:\u002F\u002Farxiv.org\u002Fabs\u002F2502.13124) validates through knowledge distillation experiments that NaturalReasoning can effectively extract and transfer reasoning capabilities from powerful teacher models. It is equally effective for unsupervised self-training using external reward models or self-rewarding.\n- [NuminaMath-CoT](https:\u002F\u002Fhuggingface.co\u002Fdatasets\u002FAI-MO\u002FNuminaMath-CoT) employs a chain-of-thought (CoT) format for solving each problem. The dataset covers Chinese high school math exercises, US and international Mathematical Olympiad problems, etc., collected from online exam PDFs and math forums. 
Processing steps include: (a) OCR recognition of the original PDFs; (b) segmentation into problem-solution pairs; (c) translation into English; (d) rearranging to generate chain-of-thought reasoning formats; and (e) final answer formatting.\n- [NuminaMath-TIR](https:\u002F\u002Fhuggingface.co\u002Fdatasets\u002FAI-MO\u002FNuminaMath-TIR) focuses on problems that produce numerical outputs selected from the NuminaMath-CoT dataset. A pipeline was constructed using GPT-4 to generate reasoning paths similar to TORA, execute code, and produce results until the final solution is completed, filtering out solutions where the final answer differs from the reference answer.\n- [DART-Math-uniform](https:\u002F\u002Fhuggingface.co\u002Fdatasets\u002Fhkust-nlp\u002Fdart-math-uniform) constructs datasets through the application of DARS-Uniform (difficulty-aware rejection sampling; a sketch follows below).\n- [DART-Math-hard](https:\u002F\u002Fhuggingface.co\u002Fdatasets\u002Fhkust-nlp\u002Fdart-math-hard) is a set of mathematical question-answer pairs constructed by applying DARS-Prop2Diff to query sets from the MATH and GSM8K training datasets. It achieves SOTA results on many challenging mathematical reasoning benchmarks, introducing a deliberate preference for hard queries, in contrast to traditional rejection sampling.\n- [DART-Math-pool-math](https:\u002F\u002Fhuggingface.co\u002Fdatasets\u002Fhkust-nlp\u002Fdart-math-pool-math) is a data pool synthesized from query sets of the MATH training dataset, including all samples with correct answers and additional metadata generated during the process.\n- [DART-Math-pool-gsm8k](https:\u002F\u002Fhuggingface.co\u002Fdatasets\u002Fhkust-nlp\u002Fdart-math-pool-gsm8k) is a data pool synthesized from query sets of the GSM8K training dataset, including all samples with correct answers and additional metadata. 
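\n\nThe DARS (Difficulty-Aware Rejection Sampling) idea behind these pools can be sketched in a few lines. The sketch below is illustrative only: `sample_solution` and `check_answer` are hypothetical placeholder hooks for the synthesizer model and the answer verifier, and only the uniform variant is shown, in which every query is sampled until it has the same number of verified solutions, so harder queries automatically consume more attempts.

```python
def dars_uniform(queries, sample_solution, check_answer,
                 target_correct=4, max_trials=256):
    """Toy sketch of DARS-Uniform (difficulty-aware rejection sampling).

    Vanilla rejection sampling draws a fixed number of samples per query,
    so easy queries dominate the kept data; here every query is instead
    sampled until it has `target_correct` verified solutions (or the
    trial budget runs out), so hard queries receive more attempts.
    """
    pool = []
    for query in queries:
        kept, trials = [], 0
        while len(kept) < target_correct and trials < max_trials:
            trials += 1
            solution = sample_solution(query)   # one chain-of-thought rollout
            if check_answer(query, solution):   # compare with the reference answer
                kept.append(solution)
        # harder queries carry a larger `trials` count into the pool metadata
        pool.extend({"query": query, "solution": s, "trials": trials} for s in kept)
    return pool
```

The Prop2Diff variant instead allocates the number of kept solutions in proportion to measured difficulty, which is what gives DART-Math-hard its deliberate bias toward hard queries.\n\n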
The DART-Math-\\- datasets themselves are extracted from these DART-Math-pool-\\- data pools.\n- [OpenO1-SFT](https:\u002F\u002Fhuggingface.co\u002Fdatasets\u002FO1-OPEN\u002FOpenO1-SFT) is a dataset for fine-tuning language models on chain-of-thought activation data via SFT.\n- [OpenO1-SFT-Pro](https:\u002F\u002Fhuggingface.co\u002Fdatasets\u002FO1-OPEN\u002FOpenO1-SFT-Pro) is a larger companion to OpenO1-SFT for the same chain-of-thought SFT purpose.\n- [OpenO1-SFT-Ultra](https:\u002F\u002Fhuggingface.co\u002Fdatasets\u002FO1-OPEN\u002FOpenO1-SFT-Ultra) is synthesized from existing open-source datasets using the openo1-qwen-sft model.\n- [Medical-o1](https:\u002F\u002Fhuggingface.co\u002Fdatasets\u002FFreedomIntelligence\u002Fmedical-o1-reasoning-SFT) is an SFT medical reasoning dataset constructed from medically verifiable questions and an LLM verifier.\n- [AoPS-Instruct](https:\u002F\u002Fhuggingface.co\u002Fdatasets\u002FDeepStudentLlama\u002FAoPS-Instruct) is a large-scale, high-quality question-answer pair dataset for advanced mathematical reasoning, created and maintained using a scalable approach.\n- [Orca-Math](https:\u002F\u002Farxiv.org\u002Fabs\u002F2402.14830) is a high-quality synthetic dataset of 200,000 math problems created in a multi-agent setup where agents collaborate to generate the data.\n- [MATH-plus](https:\u002F\u002Fhuggingface.co\u002Fdatasets\u002FTIGER-Lab\u002FWebInstructSub) collects 10 million naturally occurring instruction examples from pre-training web corpora.\n- [UltraInteract-SFT](https:\u002F\u002Fhuggingface.co\u002Fcollections\u002Fopenbmb\u002Feurus-660bc40bec5376b3adc9d1c5) is designed for complex reasoning tasks and helps explore preference learning in reasoning tasks. It is applicable for supervised fine-tuning and preference learning. Each instruction includes a preference tree consisting of: (1) reasoning chains with multiple planning strategies and consistent formats; (2) multi-round interaction trajectories with the environment and comments; and (3) paired data for facilitating preference learning.\n- [MathCodeInstruct](https:\u002F\u002Fhuggingface.co\u002Fdatasets\u002FMathLLMs\u002FMathCodeInstruct) is a novel high-quality dataset containing mathematical problems and their code-based solutions.\n- MathCodeInstruct-Plus ([Paper¹](https:\u002F\u002Farxiv.org\u002Fabs\u002F2310.03731), [Paper²](https:\u002F\u002Farxiv.org\u002Fabs\u002F2308.07921)) is a novel high-quality dataset containing mathematical problems and their code-based solutions.\n- [OpenMathInstruct-1](https:\u002F\u002Fhuggingface.co\u002Fdatasets\u002Fnvidia\u002FOpenMathInstruct-1) is a math instruction-tuning dataset, generating 1.8 million problem-solution pairs using the Mixtral-8x7B model (the generic direct-distillation recipe is sketched below). 
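\n\nMost entries in this subsection share the same direct-distillation recipe: prompt a strong teacher model for a step-by-step trace, then keep the trace only if its final answer matches the reference. A minimal sketch, assuming a hypothetical `ask_teacher` hook standing in for whichever teacher model is used:

```python
import re

def ask_teacher(prompt: str) -> str:
    raise NotImplementedError("call the teacher model here")  # placeholder hook

TEMPLATE = ("Solve the problem step by step, then give the final answer "
            "as \\boxed{{answer}}.\n\nProblem: {problem}")

def distill_example(problem: str, reference: str):
    """One direct-distillation step: elicit a CoT trace from the teacher and
    keep it only if the boxed final answer matches the reference answer."""
    trace = ask_teacher(TEMPLATE.format(problem=problem))
    boxed = re.search(r"\\boxed\{(.+?)\}", trace)
    if boxed and boxed.group(1).strip() == reference.strip():
        return {"prompt": problem, "response": trace}  # SFT-ready pair
    return None  # discard unverifiable or wrong traces
```

Individual pipelines differ mainly in the teacher, the prompt format, and how strictly the answer check is applied.\n\n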
OpenMathInstruct-1's problems are sourced from the GSM8K and MATH training subsets; its solutions are synthesized by the Mixtral model using text reasoning and Python-interpreter-executed code blocks.\n- [OpenMathInstruct-2](https:\u002F\u002Fgithub.com\u002FNVIDIA\u002FNeMo-Skills) is a large-scale math reasoning dataset for training large language models (LLMs).\n- [AceMath-Instruct](https:\u002F\u002Fhuggingface.co\u002Fcollections\u002Fnvidia\u002Facemath-678917d12f09885479d549fe) is a training dataset for cutting-edge mathematical reasoning models in AceMath.\n- [QwQ-LongCoT](https:\u002F\u002Fhuggingface.co\u002Fdatasets\u002FPowerInfer\u002FQWQ-LONGCOT-500K) integrates prompts from multiple high-quality sources to create diverse and comprehensive training data.\n- [SCP-116K](https:\u002F\u002Fhuggingface.co\u002Fdatasets\u002FEricLu\u002FSCP-116K) is a high-quality set of scientific question-answer pairs automatically extracted from web-scraped documents. Each question is accompanied by a matching solution extracted from the source material, along with responses and reasoning processes generated by advanced language models.\n- [R1-Distill-SFT](https:\u002F\u002Fhuggingface.co\u002Fdatasets\u002FServiceNow-AI\u002FR1-Distill-SFT) is distilled from DeepSeek-R1-32b on prompts drawn from Numina-math and Tulu, with one sampled response per prompt.\n- [Sky-T1-Data](https:\u002F\u002Fhuggingface.co\u002Fdatasets\u002FNovaSky-AI\u002FSky-T1_data_17k) contains 5k coding samples from APPS and TACO, as well as 10k math samples from the AIME, MATH, and Olympiads subsets of the NuminaMATH dataset. It also retains 1k science and puzzle samples from STILL-2.\n- [Bespoke-Stratos-17k](https:\u002F\u002Fhuggingface.co\u002Fdatasets\u002Fbespokelabs\u002FBespoke-Stratos-17k) is a reasoning dataset that includes questions, reasoning traces, and answers. 
It replicates and improves upon the Berkeley Sky-T1 data pipeline using SFT-distilled data from DeepSeek-R1.\n- [s1K](https:\u002F\u002Fhuggingface.co\u002Fdatasets\u002Fsimplescaling\u002Fs1K) contains 1,000 diverse, high-quality, and difficult problems with reasoning paths and solutions distilled from Gemini Thinking.\n- MedThoughts-8K\n- [SYNTHETIC-1](https:\u002F\u002Fwww.primeintellect.ai\u002Fblog\u002Fsynthetic-1-release) is the largest open reasoning dataset generated by Deepseek-R1, covering reasoning trajectories for tasks in mathematics, programming, science, etc., with correctness verified by task-specific validators.\n- [Medical-R1-Distill-Data](https:\u002F\u002Fhuggingface.co\u002Fdatasets\u002FFreedomIntelligence\u002FMedical-R1-Distill-Data) is an SFT dataset distilled from Deepseek-R1 (Full Power Version), based on HuatuoGPT-o1’s medically verifiable questions.\n- [Medical-R1-Distill-Data-Chinese](https:\u002F\u002Fhuggingface.co\u002Fdatasets\u002FFreedomIntelligence\u002FMedical-R1-Distill-Data-Chinese) is the Chinese-language SFT counterpart, distilled from Deepseek-R1 (Full Power Version), based on HuatuoGPT-o1’s medically verifiable questions.\n- [RLVR-GSM-MATH](https:\u002F\u002Fgithub.com\u002Fallenai\u002Fopen-instruct) is used to train the Tulu3 model.\n- [LIMO](https:\u002F\u002Fhuggingface.co\u002Fdatasets\u002FGAIR\u002FLIMO) follows the \"Less is More for Reasoning\" principle, showing that a small set of 817 carefully curated samples can elicit strong reasoning.\n- [OpenThoughts-114k](https:\u002F\u002Fhuggingface.co\u002Fdatasets\u002Fopen-thoughts\u002FOpenThoughts-114k) is an open synthetic reasoning dataset containing 114k high-quality examples covering math, science, code, and puzzles.\n- [Magpie-Reasoning-V2](https:\u002F\u002Fgithub.com\u002Fmagpie-align\u002Fmagpie) generates high-quality alignment data by prompting aligned LLMs with pre-query templates.\n- [Dolphin-R1](https:\u002F\u002Fhuggingface.co\u002Fdatasets\u002Fcognitivecomputations\u002Fdolphin-r1) is an 800k sample dataset similar in composition to the datasets used for training the DeepSeek-R1 Distill model.\n\n\n| Name                            | Category                           | Source                            | Modality | Quantity |\n| :------------------------------ | :--------------------------------- | :-------------------------------- | :------- | :------- |\n| NaturalReasoning                | Science, General                   | Llama3.3-70B                      | Lang     | 1M       |\n| NuminaMath-CoT                  | Mathematics                        | GPT-4o                            | Lang     | 860K     |\n| NuminaMath-TIR                  | Mathematics                        | GPT-4o                            | Lang     | 73K      |\n| DART-Math-uniform               | Mathematics                        | DeepSeekMath-7B-RL                | Lang     | 591K     |\n| DART-Math-hard                  | Mathematics                        | DeepSeekMath-7B-RL                | Lang     | 585K     |\n| DART-Math-pool-math             | Mathematics                        | DeepSeekMath-7B-RL                | Lang     | 1.6M     |\n| DART-Math-pool-gsm8k            | Mathematics                        | DeepSeekMath-7B-RL                | Lang     | 2.7M     |\n| OpenO1-SFT                      | Mathematics, Science, General      | -                                 | Lang     | 78K      |\n| OpenO1-SFT-Pro                  | Mathematics, Science, General      | -                                 | Lang     | 126K     |\n| OpenO1-SFT-Ultra                | Mathematics, Science, General 
     | -                                 | Lang     | 28M      |\n| Medical-o1                      | Medicine                           | DeepSeek R1                       | Lang     | 50K      |\n| AoPS-Instruct                   | Mathematics                        | Qwen2.5-72B                       | Lang     | 647K     |\n| Orca-Math                       | Mathematics                        | GPT-4                             | Lang     | 200K     |\n| MATH-plus                       | Mathematics                        | GPT-4                             | Lang     | 894K     |\n| UltraInteract-SFT               | Mathematics, Code, Logic           | GPT-4 CoT + PoT                   | Lang     | 289K     |\n| MathCodeInstruct                | Mathematics                        | GPT-4 + Codellama PoT             | Lang     | 79K      |\n| MathCodeInstruct-Plus           | Mathematics                        | -                                 | Lang     | 88K      |\n| OpenMathInstruct-1              | Mathematics                        | Mixtral-8x7B PoT                  | Lang     | 5M       |\n| OpenMathInstruct-2              | Mathematics                        | Llama3.1-405B                     | Lang     | 14M      |\n| AceMath-Instruct                | Mathematics, General               | Qwen2.5-Math-72B + GPT-4o-mini    | Lang     | 5M       |\n| QwQ-LongCoT                     | General                            | QwQ                               | Lang     | 286K     |\n| SCP-116K                        | Science                            | QwQ + O1-mini                     | Lang     | 117K     |\n| R1-Distill-SFT                  | Mathematics                        | DeepSeek-R1-32B                   | Lang     | 172K     |\n| Sky-T1-Data                     | Mathematics, Code, Science, Puzzle | QwQ                               | Lang     | 17K      |\n| Bespoke-Stratos-17k             | Mathematics, Code, Science, Puzzle | DeepSeek R1                       | Lang     | 17K      |\n| s1K                             | Mathematics                        | DeepSeek R1                       | Lang     | 1K       |\n| MedThoughts-8K                  | Medicine                           | DeepSeek R1                       | Lang     | 8K       |\n| SYNTHETIC-1                     | Mathematics, Code, Science         | DeepSeek R1                       | Lang     | 894K     |\n| Medical-R1-Distill-Data         | Medicine                           | DeepSeek R1                       | Lang     | 22K      |\n| Medical-R1-Distill-Data-Chinese | -                                  | -                                 | Lang     | 17K      |\n| RLVR-GSM-MATH                   | Mathematics                        | -                                 | Lang     | 30K      |\n| LIMO                            | Mathematics                        | Human + DeepSeek R1 + Qwen2.5-32B | Lang     | 817      |\n| OpenThoughts-114k               | Mathematics, Code, Science, Puzzle | -                                 | Lang     | 114K     |\n| Magpie-Reasoning-V2             | Mathematics, Code                  | DeepSeek-R1 + Llama-70B           | Lang     | 250K     |\n| Dolphin-R1                      | Mathematics, Science               | DeepSeek R1 + Gemini2 + Dolphin   | Lang     | 814K     |\n\n#### 5.2.3 Search-based Distillation\n\nThe dataset based on search is constructed through an automated search algorithm, which explores the reasoning tree to generate the optimal 
reasoning trajectory. Although limited in scale, these datasets typically provide high-quality, deep reasoning samples.\n\n- [STILL-1](https:\u002F\u002Farxiv.org\u002Fabs\u002F2411.11694) enhances the reasoning capabilities of large language models (LLMs) through a reward-guided tree search algorithm.\n\n| Name    | Category                           | Source                       | Modality | Quantity |\n| :------ | :--------------------------------- | :--------------------------- | :------- | :------- |\n| STILL-1 | Mathematics, Code, Science, Puzzle | LLaMA-3.1-8B-Instruct + MCTS | Lang     | 5K       |\n\n#### 5.2.4 Validated Distillation\nThese datasets are validated through rule-based filtering, test case verification, or LLM-based checking to ensure quality, striking a balance between scalability and reliability.\n- [KodCode-V1](https:\u002F\u002Fhuggingface.co\u002Fdatasets\u002FKodCode\u002FKodCode-V1) provides verifiable solutions and tests for coding tasks; specifically designed for supervised fine-tuning (SFT) and reinforcement learning (RL) optimization; covering various domains (from algorithms to domain-specific software knowledge) and difficulty levels (from basic coding exercises to interview and competitive programming challenges).\n- [KodCode-V1-SFT-R1](https:\u002F\u002Fhuggingface.co\u002Fdatasets\u002FKodCode\u002FKodCode-V1-SFT-R1) is the DeepSeek R1-distilled SFT variant of KodCode, pairing the same verifiable coding tasks and tests with R1-style reasoning responses.\n- [OpenR1-Math](https:\u002F\u002Fhuggingface.co\u002Fdatasets\u002Fopen-r1\u002FOpenR1-Math-220k) is a large-scale mathematics reasoning dataset, generated by DeepSeek R1 for NuminaMath version 1.5 problems, with two to four reasoning trajectories per question.\n- [Chinese-DeepSeek-R1-Distill-Data](https:\u002F\u002Fhuggingface.co\u002Fdatasets\u002FCongliu\u002FChinese-DeepSeek-R1-Distill-data-110k) is a Chinese open-source distilled dataset from DeepSeek-R1, containing not only math data but also a significant amount of general-type data.\n- [AM-DeepSeek-R1-Distilled](https:\u002F\u002Fhuggingface.co\u002Fdatasets\u002Fa-m-team\u002FAM-DeepSeek-R1-Distilled-1.4M) includes problems from numerous open-source datasets, which have been semantically deduplicated and cleaned to eliminate test set contamination; the per-domain validation routing it uses is sketched below. 
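\n\nA minimal sketch of that per-domain routing; `answer_equal`, `run_tests`, and `reward_model` are hypothetical placeholder hooks for the actual checkers:

```python
def validate(sample, answer_equal, run_tests, reward_model):
    """Route one distilled sample to a domain-appropriate check (sketch only;
    `answer_equal`, `run_tests`, and `reward_model` are placeholder hooks)."""
    domain = sample["domain"]
    if domain == "math":
        # answer checking: the extracted final answer must match the reference
        return answer_equal(sample["predicted_answer"], sample["reference_answer"])
    if domain == "code":
        # test-case validation: the generated program must pass every unit test
        return all(run_tests(sample["program"], case) for case in sample["test_cases"])
    # everything else falls back to a reward-model score threshold
    return reward_model(sample["prompt"], sample["response"]) >= 0.5

def filter_pool(pool, **hooks):
    """Keep only the samples that pass their domain-specific check."""
    return [sample for sample in pool if validate(sample, **hooks)]
```
\n\n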
AM-DeepSeek-R1-Distilled's answers are extracted from reasoning models (primarily DeepSeek-R1) and undergo rigorous validation: mathematical problems are verified through answer checking, coding problems through test case validation, and other tasks through reward model evaluation.\n\n\n| Name                             | Category                      | Source                               | Modality | Quantity |\n| :------------------------------- | :---------------------------- | :----------------------------------- | :------- | :------- |\n| KodCode-V1                       | Code                          | GPT-4 + Test case validation         | Lang     | 447K     |\n| KodCode-V1-SFT-R1                | Code                          | DeepSeek R1 + Test case validation   | Lang     | 443K     |\n| OpenR1-Math                      | Mathematics                   | DeepSeek R1 + Rule & LLM Validation  | Lang     | 225K     |\n| Chinese-DeepSeek-R1-Distill-Data | Mathematics, Science, General | DeepSeek R1 + Rule & LLM Validation  | Lang     | 110K     |\n| AM-DeepSeek-R1-Distilled         | Mathematics, Code, General    | Reward Model + Rule & LLM Validation | Lang     | 1.4M     |\n\n\n## 7. Paper Lists \\& Awesome Resources\n- [Awesome-Long-Chain-of-Thought-Reasoning](pages\u002Fpaper.md) (Our Official Paper List, 1000+ papers)\n\n- [Awesome-System2-Reasoning-LLM](https:\u002F\u002Fgithub.com\u002Fzzli2022\u002FAwesome-System2-Reasoning-LLM)\n\n\u003Cimg src=\"https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002FLightChen233_Awesome-Long-Chain-of-Thought-Reasoning_readme_7dd87192ac55.jpg\" style=\"width: 580pt\">\n\n\n# 🎁 Citation\nIf you find this work useful, you are welcome to cite us.\n```bib\n@misc{chen2025reasoning,\n      title={Towards Reasoning Era: A Survey of Long Chain-of-Thought for Reasoning Large Language Models}, \n      author={Qiguang Chen and Libo Qin and Jinhao Liu and Dengyun Peng and Jiannan Guan and Peng Wang and Mengkang Hu and Yuhang Zhou and Te Gao and Wanxiang Che},\n      year={2025},\n      eprint={2503.09567},\n      archivePrefix={arXiv},\n      primaryClass={cs.AI},\n      url={https:\u002F\u002Farxiv.org\u002Fabs\u002F2503.09567}, \n}\n```\n\n\n# Contribution\nFor any interesting news about Long CoT, you can @[Qiguang_Chen](https:\u002F\u002Ftwitter.com\u002FQiguangChen) on Twitter or email me at [charleschen2333@gmail.com](mailto:charleschen2333@gmail.com), and we will keep it updated in our GitHub repo.\n\nHope everyone enjoys the Long CoT era :)\n\n\u003C!-- omit in toc -->\n# ⭐ Star History\n\n\u003Ca href=\"https:\u002F\u002Fstar-history.com\u002F#LightChen233\u002FAwesome-Long-Chain-of-Thought-Reasoning&Date\">\n \u003Cpicture>\n   \u003Csource media=\"(prefers-color-scheme: dark)\" srcset=\"https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002FLightChen233_Awesome-Long-Chain-of-Thought-Reasoning_readme_cee7767d680e.png&theme=dark\" \u002F>\n   \u003Csource media=\"(prefers-color-scheme: light)\" srcset=\"https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002FLightChen233_Awesome-Long-Chain-of-Thought-Reasoning_readme_cee7767d680e.png\" \u002F>\n   \u003Cimg alt=\"Star History Chart\" src=\"https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002FLightChen233_Awesome-Long-Chain-of-Thought-Reasoning_readme_cee7767d680e.png\" \u002F>\n \u003C\u002Fpicture>\n\u003C\u002Fa>\n\n","# \u003Cimg src=\"https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002FLightChen233_Awesome-Long-Chain-of-Thought-Reasoning_readme_8b2e6abbc4fb.jpg\" alt=\"SVG Image\" width=\"40px\"> 
令人惊叹的长链式思维推理\n\n[![Awesome](https:\u002F\u002Fawesome.re\u002Fbadge.svg)](https:\u002F\u002Fawesome.re)\n[![arXiv](https:\u002F\u002Fimg.shields.io\u002Fbadge\u002FarXiv-Long_Chain_of_Thought-b31b1b.svg)](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2503.09567) \n[![Paper](https:\u002F\u002Fimg.shields.io\u002Fbadge\u002FPaper-908-green.svg)](https:\u002F\u002Fgithub.com\u002F\u002FLightChen233\u002FAwesome-Long-Chain-of-Thought-Reasoning)\n[![Last Commit](https:\u002F\u002Fimg.shields.io\u002Fgithub\u002Flast-commit\u002FLightChen233\u002FAwesome-Long-Chain-of-Thought-Reasoning)](https:\u002F\u002Fgithub.com\u002FLightChen233\u002FAwesome-Long-Chain-of-Thought-Reasoning)\n[![Contribution Welcome](https:\u002F\u002Fimg.shields.io\u002Fbadge\u002FContributions-welcome-blue)]()\n\n\\[[English Tutorial](README.md)\\] | \\[[中文教程](README-zh.md)\\] | \\[[Arxiv](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2503.09567)\\]\n\n\n![image](https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002FLightChen233_Awesome-Long-Chain-of-Thought-Reasoning_readme_221787a541a9.png)\n\n\n\u003C!-- omit in toc -->\n# 🔥 新闻\n- **2025.07**: 🎉🎉🎉 我们已将审阅论文数量更新至超过1000篇。此外，我们还增加了双语支持，并使我们的仓库对Long-CoT初学者更加友好。\n- **2025.04**: 🎉🎉🎉 我们已将审阅论文数量更新至超过900篇。同时，我们通过更具吸引力的预告图提升了展示效果。\n- **2025.03**: 🎉🎉🎉 我们发表了一篇题为“迈向推理时代：大型语言模型长链式思维推理综述”的调查论文（[arXiv链接](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2503.09567)）。欢迎引用或针对您出色的研究所提交拉取请求。\n\n\u003C!-- omit in toc -->\n# 🌟 简介\n\n欢迎来到与我们的综述论文《迈向推理时代：大型语言模型长链式思维推理综述》相关的仓库。本仓库包含与我们正在进行的Long CoT研究相关的**资源和更新**。如需详细简介，请参阅[我们的综述论文](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2503.09567)。\n\n近期，在大型语言模型推理（RLLM）方面的进展，例如OpenAI-O1和DeepSeek-R1，已经展示了它们在数学和编程等复杂领域中的强大能力。其成功的核心因素在于应用了长链式思维（Long CoT）特性，这不仅增强了推理能力，还使得解决复杂问题成为可能。\n\n![image](https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002FLightChen233_Awesome-Long-Chain-of-Thought-Reasoning_readme_718db790ab3e.png)\n\n然而，尽管取得了这些进展，目前仍缺乏关于Long CoT的全面综述，这限制了我们对其与传统短链式思维（Short CoT）之间区别的理解，并使关于“过度思考”和“测试时缩放”等问题的讨论变得复杂。本综述旨在填补这一空白，提供一个关于Long CoT的统一视角。\n1. 首先，我们将Long CoT与Short CoT区分开来，并提出一种新的分类法来归类当前的推理范式。\n2. 接着，我们探讨了Long CoT的关键特征：深度推理、广泛探索和可行的反思，这些特征使模型能够处理更复杂的任务，并产生比浅层的Short CoT更为高效、连贯的结果。\n3. 然后，我们研究了具有这些特征的Long CoT的出现等关键现象，包括过度思考和测试时缩放，从而揭示这些过程在实践中的表现方式。\n4. 最后，我们识别出显著的研究空白，并指出了未来有前景的方向，包括多模态推理的整合、效率提升以及知识框架的增强。\n\n通过提供一个结构化的概述，本综述旨在激发未来的研究，并进一步推动人工智能领域的逻辑推理发展。\n\n![image](https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002FLightChen233_Awesome-Long-Chain-of-Thought-Reasoning_readme_2dfcfad442d1.jpg)\n\n\u003C!-- omit in toc -->\n\n\n# 🕹️ 内容\n## 0. 如何学习 & 关于我们\n我们的目标是帮助新手快速建立领域知识，因此我们的设计理念如下：简要介绍涉及大型模型推理和Long CoT的主要技术，让大家了解不同技术可以解决哪些问题，以便在未来深入该领域时，能够有一个清晰的起点。\n\n我们是一支从事大型模型推理的初学者团队，希望通过自身的学习经验，为未来的学习者提供一些帮助，加速大型模型推理的普及和应用。我们欢迎更多朋友加入我们的项目，同时也乐于开展友谊和学术合作。如有任何疑问，请随时通过电子邮件 [charleschen2333@gmail.com](mailto:charleschen2333@gmail.com) 联系我们。\n\n**日常知识资源**\n- **社交媒体：**\n  - 推荐微信公众号：极智资讯、Paper Weekly、MLNLP...\n  - 推荐Twitter账号：[AK](https:\u002F\u002Fx.com\u002F_akhaliq)、[elvis](https:\u002F\u002Fx.com\u002Fomarsar0)、[Philipp Schmid](https:\u002F\u002Fx.com\u002F_philschmid)、...\n- **前沿课程：** [CS336](https:\u002F\u002Fstanford-cs336.github.io\u002Fspring2025\u002F)\n- **社区分享：** [MLNLP](https:\u002F\u002Fspace.bilibili.com\u002F168887299)、[极智资讯](https:\u002F\u002Fspace.bilibili.com\u002F73414544)、[BAAI](https:\u002F\u002Fhub.baai.ac.cn\u002F)、[NICE学术](https:\u002F\u002Fspace.bilibili.com\u002F507524288)\n\n## 1. 
经典推理模型\n- [OpenAI-o1 \u002F o3 \u002F o4](https:\u002F\u002Fplatform.openai.com\u002Fdocs\u002Fmodels\u002F#o3)：最早探索Long CoT的推理大型语言模型，由OpenAI的一线模型开发。\n- [Gemini](https:\u002F\u002Fgithub.com\u002Fgoogle-gemini)：由Google开发的一线推理大型语言模型。\n- [Deepseek-r1](https:\u002F\u002Fgithub.com\u002Fdeepseek-ai\u002FDeepSeek-R1)：首个具备Long CoT的开源推理大型语言模型。\n- [QwQ](https:\u002F\u002Fqwenlm.github.io\u002Fzh\u002Fblog\u002Fqwq-32b-preview\u002F)：首个具备Long CoT的开源大规模推理大型语言模型。\n- [Qwen3](https:\u002F\u002Fgithub.com\u002FQwenLM\u002FQwen3)：阿里巴巴开发的最常用的开源Long CoT推理大型语言模型。\n- [Seed-Thinking-v1.5](https:\u002F\u002Fgithub.com\u002FByteDance-Seed\u002FSeed-Thinking-v1.5\u002Fblob\u002Fmain\u002Fseed-thinking-v1.5.pdf)：字节跳动的开源Long CoT推理模型。\n- [Kimi-k1.5](https:\u002F\u002Fgithub.com\u002FMoonshotAI\u002FKimi-k1.5)：由Moonshot开发的最早的多模态Long CoT推理模型。\n- [MiniMax-m1](https:\u002F\u002Fgithub.com\u002FMiniMax-AI\u002FMiniMax-M1)：由MiniMax开发的开源Long CoT推理模型。\n\n\n## 2. Long-CoT能力介绍\n在这一章中，我们将为每种能力提供最具代表性的技术，并附上最新进展。详细的论文列表可在[完整列表](pages\u002Fpaper.md)中找到。\n\n\n![image](https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002FLightChen233_Awesome-Long-Chain-of-Thought-Reasoning_readme_0ef89bb498f9.jpg)\n\n\n### 2.1 深度推理\n\n深度推理能力的核心在于需要足够的逻辑深度来管理大量的推理节点。如果没有这种能力，大型语言模型推理（RLLM）的表现会显著下降。目前增强深度推理的方法主要分为两种：深度推理格式和深度推理学习。\n\n\u003Cimg src=\"https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002FLightChen233_Awesome-Long-Chain-of-Thought-Reasoning_readme_6516b5a46db9.png\" style=\"width: 580pt\">\n\n#### 2.1.1 深度推理格式\n由于推理模型高度依赖于推理格式，它们往往在其擅长的形式中实现最深入的推理路径。因此，一些研究开始探索更优的推理格式，以促进深度推理的发展。\n\n\n\n\u003Cimg src=\"https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002FLightChen233_Awesome-Long-Chain-of-Thought-Reasoning_readme_f6f35d5f1877.jpg\" style=\"width: 580pt\">\n\n**自然语言深度推理**\n\n- **核心思想：** 旨在通过自然语言形式表达深度推理。\n- **代表性工作：**\n  - [Natural Program](https:\u002F\u002Fproceedings.neurips.cc\u002Fpaper_files\u002Fpaper\u002F2023\u002Ffile\u002F72393bd47a35f5b3bee4c609e7bba733-Paper-Conference.pdf)：确保更加结构化和严谨的逻辑分析。\n  - [Code I\u002FO](https:\u002F\u002Farxiv.org\u002Fabs\u002F2502.07316)：将基于代码的推理模式重构为自然语言形式，进一步释放RLLM的推理潜力。\n\n---\n\n**结构化语言深度推理**\n\n- **核心思想：** 旨在通过程序化或符号化语言格式提升深度推理能力。当前研究主要集中在利用代码来提高数学推理能力。\n- **代表性工作：**\n  - [Program-of-Thought](https:\u002F\u002Fopenreview.net\u002Fforum?id=YfZ4ZPt8zd)：使模型能够使用代码语言进行思考，从而增强其推理能力。\n  - [DeepSeek-Prover](https:\u002F\u002Farxiv.org\u002Fabs\u002F2405.14333)：将自然语言问题转化为形式化陈述，过滤低质量陈述并生成证明以创建合成数据，从而提升LLM的定理证明能力。\n  - [RBF](https:\u002F\u002Fproceedings.neurips.cc\u002Fpaper_files\u002Fpaper\u002F2024\u002Fhash\u002F62ab1c2cb4b03e717005479efb211841-Abstract-Conference.html)：展示了在需要强规划能力的情境下，结构化语言比自然语言更为有效。\n\n\n\n---\n\n**潜在空间深度推理**\n\n- **核心思想：** 通过连续的潜在空间操作来提升LLM的推理能力。\n- **代表性工作：**\n  1. **Token驱动：** 早期研究引入了隐式的“规划token”或“思考token”，用于引导潜在空间中的推理过程。\n     - [Coconut (Chain of Continuous Thought)](https:\u002F\u002Farxiv.org\u002Fabs\u002F2412.06769)：进一步扩展了这一方法，维持多条并行推理路径，提升复杂性的同时保证效率。\n     - [Heima](https:\u002F\u002Farxiv.org\u002Fabs\u002F2501.19201)：通过潜在隐藏空间进行高效推理，创新性地将整个Long CoT过程压缩成单个token，从而显著节省计算资源。\n  2. **Vector驱动：** 插入额外的向量来指导潜在空间中的推理过程。\n     - [LTMs](https:\u002F\u002Farxiv.org\u002Fabs\u002F2502.01567)：创新性地将LLM的每一层抽象为“思考块”，并为每层引入“思考向量”的概念。通过在潜在空间中进行迭代式深度计算，模型能够在测试时动态调整计算负载。\n  3. 
**Manager驱动：** 提出一种持续的管理机制来控制潜在空间状态。\n     - [Recurrent Block](https:\u002F\u002Farxiv.org\u002Fabs\u002F2502.05171)：利用经过训练的“递归块”作为递归“思考块”，在推理过程中整合更深的模型层次，无需专门的训练数据即可提升性能。\n     - [Implicit Thought Transformer (ITT)](https:\u002F\u002Farxiv.org\u002Fabs\u002F2502.13842)：将原始Transformer层用作递归“思考块”，通过自适应的token路由选择关键token，并借助残差式思维连接来控制推理深度，从而实现对关键token的有效处理。\n- **相关仓库：**\n  - [Awesome-Latent-CoT](https:\u002F\u002Fgithub.com\u002FEIT-NLP\u002FAwesome-Latent-CoT)：提供了潜在空间中各种思维链表示的概述，捕捉仅靠语言无法表达的复杂非语言性思维。\n\n---\n\n#### 2.1.2 深度推理学习\nRLLM缺乏深度推理能力会显著降低模型性能。因此，学术界的研究重点已转向通过训练来提升推理能力。监督微调（SFT）作为一种记忆过程，可以稳定模型输出；而强化学习（RL）则有助于模型的泛化和自主学习。\n\n---\n\n**深度推理模仿**\n\n- **核心思想：** 通过模仿先进的推理系统，可以有效实现RLLM的深度推理，使模型能够学习复杂的推理模式并在不同任务间进行泛化。\n- **代表性工作：**\n  1. **来自人类的模仿**\n     - [GSM8K\u002FGPT-Verifier](https:\u002F\u002Farxiv.org\u002Fabs\u002F2110.14168)：引入了基于人工标注的深度推理样本的早期模仿学习。\n     - [ALT](https:\u002F\u002Fproceedings.neurips.cc\u002Fpaper_files\u002Fpaper\u002F2024\u002Ffile\u002F8678da90126aa58326b2fc0254b33a8c-Paper-Conference.pdf)：通过生成大规模的人工标注逻辑模板数据集，提升RLLM的深度推理能力。\n  2. **来自先进RLLM的模仿**\n     - [AceMath](https:\u002F\u002Farxiv.org\u002Fabs\u002F2412.15084)：采用少样本提示从先进LLM中提炼Long CoT样本，通过多阶段质量导向的SFT提升性能。\n     - [DART-Math](https:\u002F\u002Fproceedings.neurips.cc\u002Fpaper_files\u002Fpaper\u002F2024\u002Ffile\u002F0ef1afa0daa888d695dcd5e9513bafa3-Paper-Conference.pdf)：在合成阶段通过拒绝采样有效地提炼出与难度相关的深度推理样本。\n     - [OpenThoughts](https:\u002F\u002Farxiv.org\u002Fabs\u002F2506.04178) \u002F [OpenCodeReasoning](https:\u002F\u002Farxiv.org\u002Fabs\u002F2504.01943) \u002F [NaturalThoughts](https:\u002F\u002Farxiv.org\u002Fabs\u002F2507.01921)：将这一范式扩展到数学、代码及通用场景。\n  3. **来自规模增强型RLLM的模仿**\n     - [Bansal et al.](https:\u002F\u002Fopenreview.net\u002Fforum?id=HuYSURUxs2)：发现扩大采样规模和长度能够提升数据质量。\n     - [Qwen-Math](https:\u002F\u002Farxiv.org\u002Fabs\u002F2409.12122) \u002F [PromptCoT](https:\u002F\u002Farxiv.org\u002Fabs\u002F2503.02324)：进一步结合大规模采样与奖励模型样本选择，生成奥运级别难度的深度推理样本。\n     - [FastMCTS](https:\u002F\u002Farxiv.org\u002Fabs\u002F2502.11476)：利用蒙特卡洛树搜索（MCTS）识别最优的深度推理路径。\n- **最新进展：**\n     - [Journey P2](https:\u002F\u002Farxiv.org\u002Fabs\u002F2411.16489)：从o1、R1等先进RLLM API中提炼的知识显著提升了小型LLM的表现，在复杂数学推理任务中，监督微调方法甚至超越了教师模型。\n     - [s1](https:\u002F\u002Farxiv.org\u002Fabs\u002F2501.19393) \u002F [LIMO](https:\u002F\u002Farxiv.org\u002Fabs\u002F2502.03387)：少量高质量样本足以激活基础LLM的深度推理能力。\n\n---\n\n**深度推理自我学习**\n\n- **核心思想**：尽管简单的模仿就能取得优异的性能，但当前模型在模仿和蒸馏过程中仍然严重依赖人工标注或先进模型的输出。为了突破这一局限，研究重点转向自学习技术，以实现更高级的推理能力。\n- **代表性工作**：\n  1. **直接采样进行自学习**\n     - [STaR](https:\u002F\u002Farxiv.org\u002Fabs\u002F2203.14465)：利用上下文学习（ICL）采样深度推理结果，并将最终答案的正确性作为自学习的隐式奖励。\n     - [Reinforced Self-Training (ReST)](https:\u002F\u002Farxiv.org\u002Fabs\u002F2308.08998)：提出“成长—改进”范式，对自生成的推理过程进行奖励，并结合离线强化学习进一步提升。\n     - [ReST$^{EM}$](https:\u002F\u002Farxiv.org\u002Fabs\u002F2312.06585)：通过生成奖励并迭代优化大语言模型，在验证集上达到峰值性能，显著提升了鲁棒性。\n     - [TOPS](https:\u002F\u002Farxiv.org\u002Fabs\u002F2502.18080)：发现以适当的推理深度进行深度推理样本的自学习效率最高。\n  2. 
**基于树搜索的自学习**\n     - [PGTS](https:\u002F\u002Farxiv.org\u002Fabs\u002F2502.06813)：采用策略引导的树搜索，将强化学习与结构化的树型探索相结合。\n     - [ReST-MCTS*](https:\u002F\u002Farxiv.org\u002Fabs\u002F2406.03816)：通过渐进式轨迹提取和课程偏好学习优化MCTS行为，显著提升了大语言模型的推理能力。\n- **最新进展**：引入错误纠正自适应机制，通过训练验证器或利用熵来过滤和优化奖励过程，从而提升自学习的质量。\n   - [UnCert-CoT](https:\u002F\u002Farxiv.org\u002Fabs\u002F2503.15341)：基于熵感知的不确定性动态调度思维链，在高熵情况下才激活多路径推理，显著提高了代码生成的准确性和效率。\n   - [Wang et al.](https:\u002F\u002Farxiv.org\u002Fabs\u002F2506.01939)：从token熵的角度分析了可验证奖励强化学习对大语言模型推理能力的影响，其中高熵的“分支”token主导了多路径推理策略的调整，而策略梯度优化仅应用于这些高熵token。\n   - [CoT-Valve](https:\u002F\u002Farxiv.org\u002Fabs\u002F2502.09601)：根据任务难度动态调整以缩短推理路径长度，从而降低计算开销。\n\n\n\n---\n\n\n\n### 2.2 可行的反思\n\n#### 2.2.1 反馈\n\n反馈机制为Long CoT提供了多粒度的评估信号，从评估最终结果的总体反馈，到监督推理过程各步骤的过程反馈，再到两者的混合反馈。这些机制不仅支持奖励建模和路径优化，还为后续的自我修正奠定了基础，是推动RLLMs从静态生成走向动态评估的关键桥梁。\n\n\u003Cimg src=\"https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002FLightChen233_Awesome-Long-Chain-of-Thought-Reasoning_readme_82dc61bbe310.png\" style=\"width: 580pt\">\n\n---\n\n**总体反馈**\n\n- **核心思想**：总体反馈从全局视角评估完整的推理过程及最终结果，常用于在强化学习或自优化过程中指导大语言模型提升推理质量。反馈形式包括数值奖励、规则检查以及自然语言评价等。\n- **代表性工作**：\n  1. **结果奖励模型（ORM）**：提供数值奖励信号以优化输出质量，适用于难以直接评估准确性的任务。\n     - [Gen-Verifier](https:\u002F\u002Farxiv.org\u002Fabs\u002F2110.14168)：首次引入基于推理准确性的验证框架；\n     - [Critic-RM](https:\u002F\u002Farxiv.org\u002Fabs\u002F2411.16646)：结合自然语言批评与奖励预测，显著提升了反馈质量；\n     - [Self-Rewarding LMs (SRLMs)](https:\u002F\u002Farxiv.org\u002Fabs\u002F2502.08922)：引入一致性机制，在无需人工标注的情况下实现自监督奖励。\n  2. **规则提取**：利用任务内的规则验证和纠正答案，增强反馈的稳定性。\n     - [STaR](https:\u002F\u002Farxiv.org\u002Fabs\u002F2203.14465) \u002F [ReST](https:\u002F\u002Farxiv.org\u002Fabs\u002F2308.08998)：表明基于最终答案的规则反馈在数学任务中优于ORM；\n     - [OpenCodeInterpreter](https:\u002F\u002Farxiv.org\u002Fabs\u002F2402.14658) \u002F [AceCoder](https:\u002F\u002Farxiv.org\u002Fabs\u002F2502.01718)：在编码任务中使用自动化测试用例生成程序级反馈。\n  3. **RLLMs反馈（LLM-as-a-Judge）**：\n      模型以自然语言进行自我批评和评估，增强了反思与纠错能力。\n     - [EvalPlanner](https:\u002F\u002Farxiv.org\u002Fabs\u002F2501.18099)：区分规划与推理两类反馈；\n     - [RoT](https:\u002F\u002Farxiv.org\u002Fabs\u002F2410.12323)：结合逆向推理与反思，帮助模型发现知识漏洞；\n     - [AutoRace](https:\u002F\u002Farxiv.org\u002Fabs\u002F2404.05221)：提供针对特定任务的评估标准，以提高反馈的相关性。\n- **相关资源库**：\n  - [RewardBench](https:\u002F\u002Fgithub.com\u002Fallenai\u002Freward-bench)用于系统评估ORM方法。\n\n**过程反馈**\n\n- **核心思想：**\n   过程反馈逐步评估推理链中的每一步，通常与强化学习或树搜索结合使用，引导模型在无需人工标注的情况下进行微调。反馈来源主要包括过程奖励模型和由自然语言驱动的语言模型。\n- **代表性工作：**\n  1. **来自过程奖励模型（PRM）的过程反馈：**\n      使用自动构建或少量标注的数据训练分步奖励函数，这是Long CoT中的主流方法。\n     - [PRM800K](https:\u002F\u002Farxiv.org\u002Fabs\u002F2305.20050)：率先采用人工标注的分步监督来提升奖励稳定性；\n     - [Math-Shepherd](https:\u002F\u002Farxiv.org\u002Fabs\u002F2312.08935)：利用树搜索自动生成分步反馈，以增强PRM的泛化能力；\n     - [Full-Step-DPO](https:\u002F\u002Farxiv.org\u002Fabs\u002F2502.14356)：对整个推理链进行奖励，鼓励整体优化；\n     - [AdaptiveStep](https:\u002F\u002Farxiv.org\u002Fabs\u002F2502.13943)：根据置信度动态划分推理步骤，实现逐token级别的细粒度反馈。\n  2. 
**来自RLLM的过程反馈：** 利用模型自身生成的自然语言反馈来模拟奖励信号，从而提高过程监督的灵活性和可扩展性。\n     - [React](https:\u002F\u002Farxiv.org\u002Fabs\u002F2210.03629) \u002F [Reflexion](https:\u002F\u002Farxiv.org\u002Fabs\u002F2303.11366)：在每一步动作后生成语言反馈，提升决策合理性；\n     - [Step-DPO](https:\u002F\u002Farxiv.org\u002Fabs\u002F2405.18629)：引入自我验证机制构建正负对比样本，优化训练过程；\n     - [CACE](https:\u002F\u002Farxiv.org\u002Fabs\u002F1907.07165)：提出推理步骤之间的因果影响度量，使整个链条更具可解释性；\n     - [ORPS](https:\u002F\u002Farxiv.org\u002Fhtml\u002F2412.15118v1)：通过程序执行反馈自动优化推理策略，减少对人工的依赖。\n- **相关仓库：**\n  - [ProcessBench](https:\u002F\u002Fgithub.com\u002FQwenLM\u002FProcessBench)：评估分步推理及奖励模型性能；\n  - [PRMBench](https:\u002F\u002Fgithub.com\u002Fssmisya\u002FPRMBench)：专注于数学任务中PRM方法的比较分析。\n\n**混合反馈**\n\n- **核心思想：** 混合反馈机制结合了整体反馈和过程反馈的优势，在关注中间推理步骤的同时也评估最终输出。这种统一的多粒度评估体系能够提升语言模型的整体推理质量和纠错能力。\n- **代表性工作：**\n  - [Consensus Filtering](https:\u002F\u002Farxiv.org\u002Fabs\u002F2501.07301)：将蒙特卡洛估计与LLM-as-Judge相结合，整合整体和分步反馈，提升推理的一致性和准确性；\n  - [Step-KTO](https:\u002F\u002Farxiv.org\u002Fabs\u002F2501.10799)：融合PRM和ORM的二元反馈机制，强调反思驱动的错误修正，引导模型形成更连贯的Long CoT结构。\n\n#### 2.2.2 精炼\n\n精炼机制侧重于基于反馈信息的自我纠错能力，是实现Long CoT闭环优化的关键步骤。通过基于提示的精炼实现自发性反思；基于SFT的精炼促进模仿学习；而基于RL的精炼则强化自我纠错策略。由此，模型逐渐具备“自我诊断—自我更新”的能力，使推理链更加 robust 和可控。\n\n\n\u003Cimg src=\"https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002FLightChen233_Awesome-Long-Chain-of-Thought-Reasoning_readme_37b27a4cf133.png\" style=\"width: 580pt\">\n\n---\n\n**基于提示的精炼**\n\n- **核心思想：** 通过提示引导模型生成初始响应，并在后续轮次中允许其进行自我反馈和多轮修正，以此提升推理准确度、减少幻觉现象，并支持更强的自动化反思能力。\n- **代表性工作：**\n  - [ReAct](https:\u002F\u002Fgithub.com\u002Fysymyth\u002FReAct) \u002F [Reflexion](https:\u002F\u002Fgithub.com\u002Fnoahshinn\u002Freflexion)：典型的多轮反思与自我修正机制实现；\n  - [Self-Backtracking](https:\u002F\u002Farxiv.org\u002Fabs\u002F2502.04404) \u002F [Refiner](https:\u002F\u002Faclanthology.org\u002F2024.eacl-long.67\u002F) \u002F [BackMath](https:\u002F\u002Faclanthology.org\u002F2025.coling-industry.40\u002F)：支持模型在推理过程中自主回溯并修改，简化决策路径；\n  - [MCTSr](https:\u002F\u002Farxiv.org\u002Fabs\u002F2406.07394) \u002F [ReST-MCTS](https:\u002F\u002Farxiv.org\u002Fabs\u002F2406.03816)：将树搜索与置信度更新相结合，实现多轮动态反思；\n  - [LLM2](https:\u002F\u002Farxiv.org\u002Fabs\u002F2412.20372)\u002F [ReARTeR](https:\u002F\u002Farxiv.org\u002Fabs\u002F2501.07861)：促进Long CoT任务中精炼策略的自动演化与稳定收敛。\n  \n\n**基于SFT的精炼**\n\n- **核心思想：** 通过高质量的反思数据进行监督式微调，使模型模仿更先进模型的自我纠正行为，从而提升其逐步纠错和反思能力。这种方法适用于小模型的能力迁移和细粒度训练。\n- **代表性工作：**\n  - [rStar](https:\u002F\u002Farxiv.org\u002Fabs\u002F2408.06195)：通过自我博弈方法提升小模型的自我改进能力；\n  - [Math-Minos](https:\u002F\u002Farxiv.org\u002Fabs\u002F2405.14024)：利用分步解题逻辑标签对模型进行训练，实现细粒度推理；\n  - [Journey Learning](https:\u002F\u002Farxiv.org\u002Fabs\u002F2411.16489)：结合MCTS回溯生成监督信号；\n  - [MM-Verify](https:\u002F\u002Farxiv.org\u002Fabs\u002F2502.13383)：将精炼机制扩展至多模态图文推理领域。\n  \n**基于RL的精炼**\n\n- **核心思想：** 通过强化学习机制，引导模型在测试或推理过程中进行自我反思和修正，强调在奖励指导下的自我精炼能力，从而减少对人工监督的依赖。\n- **代表性工作：**\n  - [SCoRe](https:\u002F\u002Farxiv.org\u002Fabs\u002F2409.12917)：通过自动生成的修正轨迹和正则化项，提升模型在测试阶段的自我精炼能力；\n  - [DeepSeek-R1](https:\u002F\u002Farxiv.org\u002Fabs\u002F2501.12948)：利用结果层面的强化学习激活模型的自然反馈和“顿悟”时刻来进行修正；\n  - [S$^2$R](https:\u002F\u002Farxiv.org\u002Fabs\u002F2502.12853)：结合过程层面的强化学习，实现推理过程中的动态精炼；\n  - [ReVISE](https:\u002F\u002Farxiv.org\u002Fabs\u002F2502.14565)：引入内部验证器来决定何时触发RL引导的反思行为。\n\n### 2.3 广泛探索\n广泛探索使大型推理模型（RLLMs）在处理复杂问题时，能够更深入、更全面地探索多种推理路径，从而提高解题的准确性和鲁棒性。从探索类型的视角来看，广泛探索技术可分为三类：探索规模扩展、内部探索和外部探索。\n\n#### 2.3.1 探索规模扩展\n探索规模扩展旨在通过增加推理路径的数量或长度来提升模型解决更复杂问题的能力。这种方法通常适用于推理任务较为复杂，单一推理路径难以有效得出正确答案的情况。\n\n---\n\n**顺序式规模扩展**\n\n- 
**核心思想：** 通过延长单条路径的推理链条，逐步加深模型的思考深度，从而提升其对复杂问题的理解与处理能力。这尤其适用于需要多步推理才能得出结论的长链式思维（Long CoT）任务，例如数学证明、逻辑推导以及多步规划等。\n- **代表性工作：**\n  - [OpenAI-o1](https:\u002F\u002Farxiv.org\u002Fabs\u002F2412.16720) \u002F [Deepseek-R1](https:\u002F\u002Farxiv.org\u002Fabs\u002F2501.12948)：\n    延伸推理链条以提供详细的多步推理过程，有效提升了在数学、编程等领域解决复杂问题的能力。\n  - [ITT（内部思考变换器）](https:\u002F\u002Farxiv.org\u002Fabs\u002F2502.13842)：将Transformer中的层计算重新定义为“思考步骤”，在不增加模型总参数量的前提下，动态为关键token上的深层推理分配计算资源。\n\n---\n\n**并行式规模扩展**\n\n- **核心思想：** 通过并行生成多条推理路径，并对结果进行组合、投票或验证，模型可以有效避免单一路径陷入局部最优或错误的问题，从而在高度歧义、存在多种可能解或结果不明确的情况下，提升系统的鲁棒性和准确性。\n- **代表性工作：**\n  - [Self-Consistency](https:\u002F\u002Farxiv.org\u002Fabs\u002F2203.11171)：提出生成多条推理路径，并从结果中选择出现频率最高的答案，从而有效提高最终答案的稳定性和准确性（本节末尾附有一个最小化的多数投票示意代码）。\n  - [ECM（电子电路模型）](https:\u002F\u002Farxiv.org\u002Fabs\u002F2502.03325)：借鉴电子学中的并联和串联电路概念，将推理路径以并联或串联方式结合，综合考虑各种可能性，提升决策质量。\n\n---\n\n#### 2.3.2 内部探索\n内部探索主要指大型推理模型（RLLMs）通过其内部机制（通常是强化学习策略和奖励机制）主动探索和优化推理路径，从而更高效、更深入地解决复杂的推理问题。该方法使模型能够自主调整推理策略，减少对外部引导数据的依赖。\n\n---\n\n**RL策略**\n- **核心思想：** 利用强化学习（RL）算法引导模型主动学习和探索多样化的推理路径。这种方法克服了推理过程中模式过于单一的局限性，提升了模型在高不确定性任务或高度依赖自主决策的任务中的表现。\n- **代表性工作：**\n  - [PPO（近端策略优化）](https:\u002F\u002Farxiv.org\u002Fabs\u002F1707.06347)：一种经典的RL算法，通过基于策略梯度的方法高效优化模型的内部决策机制，适用于复杂环境下的路径探索与优化。\n  - [DivPO（多样化偏好优化）](https:\u002F\u002Farxiv.org\u002Fabs\u002F2501.18101)：鼓励模型探索更多样化的推理路径，以保持决策多样性，防止收敛到局部最优解。\n  - [GRPO（群体相对策略优化）](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2402.03300)：对同一问题采样一组输出，以组内相对奖励构造基线来估计优势，无需额外的价值网络，使模型能够在复杂的逻辑推理空间中更有效地进行探索。\n\n---\n\n**奖励策略**\n\n- **核心思想：** 通过精心设计的奖励函数直接引导模型探索和优化有效的推理路径，这在存在明确优化目标或需要解决特定推理瓶颈的场景中尤为有用。\n- **代表性工作：**\n  - [Deepseek-R1](https:\u002F\u002Farxiv.org\u002Fabs\u002F2501.12948)：提出专门设计的奖励函数，激励模型优化中间推理步骤，帮助模型构建高质量的内部推理流程。\n  - [ReST-MCTS*](https:\u002F\u002Farxiv.org\u002Fabs\u002F2406.03816)：将蒙特卡洛树搜索（MCTS）与奖励策略相结合，通过过程奖励引导树搜索算法更准确地探索有效推理路径，从而提升整体推理质量。\n\n---\n\n#### 2.3.3 外部探索\n外部探索是指借助外部工具、人类知识或其他模型的帮助，引导模型更有效地探索多样化的推理路径，进而提升其解决复杂问题的能力。这种方法常用于需要精细指导或外部知识才能有效解决问题的场景。外部探索又可细分为两类：人类驱动的探索和模型驱动的探索。\n\n---\n\n**人类驱动的探索**\n- **核心思想：** 利用人类的直觉、经验或反馈来指导模型选择和调整推理路径，特别是在模型的自主探索能力有限，或者推理任务复杂且需要分解为多个子任务的情况下。\n- **代表性工作：**\n  - [Least-to-Most](https:\u002F\u002Farxiv.org\u002Fabs\u002F2205.10625)：将复杂问题分解为若干简单子问题，逐个求解并将前一步的答案作为后续步骤的输入，最终整合出整体解决方案。该方法旨在解决传统“思维链”方法中的“泛化困难”瓶颈。\n  - [ToT（思维之树）](https:\u002F\u002Farxiv.org\u002Fabs\u002F2305.10601)：将传统的“从左到右”token生成式推理过程扩展为“树状结构探索”，其中每个节点代表一个思维单元，支持推理过程中的多路径尝试、回溯、正向推理及自我评估。\n\n---\n\n**模型驱动的探索**\n\n- **核心思想：** 利用辅助模型或算法自动引导当前模型的推理过程，减少对人工干预的需求，从而高效地搜索和优化大量复杂的推理路径，提升自动化水平和整体效率。\n- **代表性工作：**\n  - [PPO-MCTS](https:\u002F\u002Farxiv.org\u002Fabs\u002F2309.15028)：将蒙特卡洛树搜索（MCTS）与基于PPO的训练相结合，以增强推理能力。其关键在于保留PPO训练过程中获得的价值网络，并在推理阶段利用该网络指导MCTS选择更优的输出序列，进而提高生成文本的质量和一致性。\n  - [MindStar](https:\u002F\u002Farxiv.org\u002Fabs\u002F2405.16265)：将复杂的推理问题（尤其是数学问题）重新表述为搜索问题，在不同的推理路径上进行结构化搜索，以选出最优解。\n  - [rStar-Math](https:\u002F\u002Farxiv.org\u002Fabs\u002F2501.04519)：通过MCTS、小模型奖励机制以及自我进化过程，构建强大的数学推理系统，使小型模型在数学能力上超越o1-preview。\n\n---\n\n\n\n
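作为对“并行式规模扩展”的补充，下面给出 Self-Consistency 多数投票的一个最小示意（`generate` 为占位的采样函数，并非任何框架的真实 API，仅用于说明思路）：\n\n```python\nfrom collections import Counter\n\ndef self_consistency(question, generate, n_paths=16):\n    # generate(question) 为占位函数：采样一条推理路径，返回 (思维链, 最终答案)\n    answers = []\n    for _ in range(n_paths):\n        chain, answer = generate(question)  # 此处只用最终答案，思维链可另行留存\n        answers.append(answer)\n    # 多数投票：组合多条并行路径，选出现频率最高的答案\n    winner, votes = Counter(answers).most_common(1)[0]\n    return winner, votes  # votes 为该答案获得的票数，可作粗略置信度\n```\n\n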
## 3. 关键现象及相关原理\n\n\u003Cimg src=\"https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002FLightChen233_Awesome-Long-Chain-of-Thought-Reasoning_readme_899dad1ca48d.jpg\" style=\"width: 580pt\">\n\n### 3.1 推理涌现现象\n\n经过训练后，长链式思维（Long CoT）能力会自然涌现，表现为模型能够通过内化预训练数据中的逻辑结构和上下文示例，生成多步且连贯的推理过程，即使在缺乏直接监督的情况下亦然。相关研究对此现象有如下描述：\n\n- [Wang等](https:\u002F\u002Faclanthology.org\u002F2023.acl-long.153\u002F) 发现，少量高质量的上下文示例即可有效引导推理大语言模型（RLLMs）生成清晰、逻辑一致的推理链条，这表明模型在预训练阶段已内化了基本的推理模式。\n- [Madaan等](https:\u002F\u002Faclanthology.org\u002F2023.findings-emnlp.0\u002F) 证明，即便不提供具体的问题实体，模型仅凭保留的逻辑结构信息仍能生成合理的推理链条，展现了其对结构信息的归纳与迁移能力。\n- [Stechly等](https:\u002F\u002Fopenreview.net\u002Fforum?id=kPBEAZU5Nm) 指出，通过调整解码策略或设计特定提示词，可以显式激活模型内部潜在的CoT能力，从而实现复杂任务中的多步推理。\n- [Guo等](https:\u002F\u002Farxiv.org\u002Fabs\u002F2501.12948) 表明，基于规则的强化学习策略可在训练过程中直接促使模型形成连贯的推理链条，显著提升其在多步任务中的表现。\n\n### 3.2 推理边界现象\n\n大型语言模型在长链式思维（Long CoT）任务中表现出明显的性能边界：当推理的深度或复杂度超过某一阈值时，模型性能会显著下降，有时甚至出现逻辑崩溃。这一现象表明，当前模型存在“推理边界”，即由其参数规模和计算资源所能支持的推理复杂度上限。现有研究从理论建模和实验分析两个方面系统探讨了这一现象：\n\n- [Chen等](https:\u002F\u002Fopenreview.net\u002Fforum?id=pC44UMwy2v) 正式提出了“推理边界”的概念，通过实验量化了不同任务复杂度下模型的性能临界点，并指出当推理任务超出模型承载能力时，准确率会急剧下降。\n- [Bi等](https:\u002F\u002Fojs.aaai.org\u002Findex.php\u002FAAAI\u002Farticle\u002Fview\u002F29721) 观察到，在代码生成任务中，当模型试图模仿过于复杂的CoT示例时，性能会大幅恶化，这表明超过一定复杂度后，Long CoT示例反而会适得其反。\n- [Feng等](https:\u002F\u002Farxiv.org\u002Fabs\u002F2305.15408) 提出了一种数学模型，说明固定参数规模的模型无法完成超过特定复杂度的数值计算，揭示了准确率上的硬性限制。\n- [Zhou等](https:\u002F\u002Farxiv.org\u002Fabs\u002F2502.05252) 构建了GSM-Infinite数据集，并通过实验证明，不同任务的推理能力上限存在显著差异，进一步强调了推理边界与任务结构之间的关系。\n\n### 3.3 过度思考现象\n\n在长链式思维（Long CoT）中，延长推理链条并不总能带来性能提升。研究表明，一旦推理长度超过模型的承载能力，准确率便会下降，这种现象被称为“过度思考”，反映了推理的边际效益呈非线性变化以及推理过程中误差的累积效应。\n\n- [Chen等](https:\u002F\u002Fopenreview.net\u002Fforum?id=pC44UMwy2v) 
发现，当推理步骤数超过模型的边界时，推理准确率会显著降低，这表明推理存在一个最佳深度范围。\n- [Wolf等](https:\u002F\u002Farxiv.org\u002Fabs\u002F2409.18028) 强调，性能下降的根本原因在于中间推理步骤中的误差被不断放大，从而影响最终判断。\n- [Xie等](https:\u002F\u002Farxiv.org\u002Fhtml\u002F2502.14768v1) 的实验表明，推理长度与准确率之间并不存在单调关系，这挑战了“CoT越长推理越好”的直觉。\n- [Wu等](https:\u002F\u002Farxiv.org\u002Fabs\u002F2502.07266) 建立了一个数学模型，用于界定不同模型和任务条件下“最佳推理长度”的区间，指出一旦长度超过最优范围，性能就会出现反转。\n- [Chen等](https:\u002F\u002Farxiv.org\u002Fabs\u002F2502.03325) 提出了“推理链欧姆定律”，将推理长度与性能之间的非线性关系类比为模型中信息流的阻力。\n\n### 3.4 推理时缩放现象\n\n推理时缩放现象是指在推理过程中通过延长计算过程（例如推理链长度或样本数量）来提升推理性能的现象。这一现象揭示了模型的“动态放大”潜力，但也伴随着探索深度与计算成本之间的权衡。\n\n- [Brown等人](https:\u002F\u002Farxiv.org\u002Fabs\u002F2407.21787)观察到，即使初始尝试失败，通过多次重复推理，也能够在一定次数内找到正确答案，从而提出了“大语言猴子”（Large Language Monkeys）现象。\n- [o1](https:\u002F\u002Farxiv.org\u002Fabs\u002F2412.16720)证明，单纯增加推理链长度即可提高准确率，尤其是在复杂的数学任务中。\n- [Jin等人](https:\u002F\u002Faclanthology.org\u002Fvolumes\u002F2024.findings-acl\u002F)指出，虽然增加推理链长度起初会带来性能提升，但超过某一阈值后，性能反而会下降，呈现出典型的非线性增长曲线。\n- [Wu等人](https:\u002F\u002Farxiv.org\u002Fabs\u002F2408.00724)发现，推理样本数量与误差下界之间存在对数关系，这表明计算复杂度（FLOPs）与推理性能之间存在渐近关系。\n- [Chen等人](https:\u002F\u002Farxiv.org\u002Fabs\u002F2502.03325)建立了并行推理的理论上限，指出无论样本量如何增加，模型的验证性能都无法超越其内部推理能力的上限。\n\n### 3.5 PRM与ORM选择现象\n\n在强化学习优化中，长链式思维任务需要对模型生成过程进行监督。研究者们区分了两种主要策略：过程奖励模型（PRM），它关注推理过程本身；以及结果奖励模型（ORM），它只关心最终输出是否正确。这两种策略在泛化能力、学习稳定性及监督成本方面存在显著差异。\n\n- [Lampinen等人](https:\u002F\u002Faclanthology.org\u002F2022.findings-emnlp.38)在定性实验中验证了中间步骤与最终答案之间的因果关系，为过程监督的合理性提供了理论支持。\n- [Jia等人](https:\u002F\u002Farxiv.org\u002Fabs\u002F2502.10581)从理论上证明，在数据足够多样化的情况下，ORM的优化难度并不比PRM更高，两者在样本复杂度上仅相差多项式因子。\n- [Guo等人](https:\u002F\u002Farxiv.org\u002Fabs\u002F2501.12948)表明，基于规则的PRM强化学习能够显著提升模型在复杂任务中的长链式思维能力，但同时也面临奖励黑客的风险。\n- [Tan](https:\u002F\u002Faclanthology.org\u002F2023.blackboxnlp-1.12\u002F)强调了对于复杂推理路径而言，中间推理步骤的奖励分布至关重要，而ORM无法提供这一点。\n- [Jiang等人](https:\u002F\u002Farxiv.org\u002Fabs\u002F2501.03124)指出，PRM在数据收集方面成本更高，因为它需要对每个推理步骤进行标注，这限制了其大规模应用。\n\n### 3.6 顿悟时刻（Aha Moment）现象\n\n顿悟时刻是指在推理过程中信息突然整合，从而导致判断出现关键转折点的现象，类似于人类的反思与自我修正。这一现象凸显了模型的动态认知调整能力，但其发生依赖于外部刺激与内部机制的协同作用。\n\n- [Guo等人](https:\u002F\u002Farxiv.org\u002Fabs\u002F2501.12948)首次通过基于规则的奖励，在无监督条件下触发了顿悟时刻行为，使模型能够反思中间推理过程并进行自我修正。\n- [Xie等人](https:\u002F\u002Farxiv.org\u002Fabs\u002F2502.14768)进一步通过实验证明，这种行为可以在多个模型中复现，证实它并非偶然事件，而是一种可诱导的策略。\n- [Zhou等人](https:\u002F\u002Farxiv.org\u002Fabs\u002F2503.05132)将顿悟时刻现象扩展到了多模态任务中，表明这一现象并不局限于文本任务，而是反映了模型更广泛的认知能力。\n- [Liu等人](https:\u002F\u002Foatllm.notion.site\u002Foat-zero)指出，在某些强化学习框架中（如R1-Zero），顿悟行为可能并不存在，所谓的生成过程延长更可能是奖励优化的结果，而非真正的反思。\n- [Yang等人](https:\u002F\u002Farxiv.org\u002Fabs\u002F2504.02956)发现，顿悟行为往往伴随着类似人类的语言增强和动态不确定性调节，模型在高压任务下更倾向于使用“我认为”之类的表达，反映出其应对任务压力的机制。\n\n\n\n\n## 4. 
算法\n\n### 4.1 监督微调（SFT）\n\n在推动大型模型具备强大的长链式思维推理能力的过程中，监督微调（SFT）起到了至关重要的作用，它架起了预训练与更高级的对齐方法（如基于人类反馈的强化学习，RLHF）之间的桥梁。SFT的核心目标是教会模型如何遵循指令，并初步掌握生成结构化、分步推理链的能力，从而为更复杂的推理任务奠定基础。\n\n\n\n- **在深度推理的背景下，SFT尤为重要。**\n尽管RLLM缺乏足够的推理深度会显著降低性能，但SFT通过记忆化过程稳定了模型的输出格式，使其能够从人工标注或蒸馏得到的数据中学习推理模式。与更注重泛化和自学习的强化学习（RL）不同，SFT在深度推理模仿中发挥着关键作用。它允许RLLM通过模仿由人类、先进RLLM或增强型RLLM生成的高质量推理示例，学习复杂的推理模式，并将其推广到新任务中。SFT不仅显著提升了模型的推理性能，而且在某些情况下，只需少量高质量样本就能激活底层LLM的深度推理能力，使其能够预测超出模型知识范围之外的事件。这使得SFT成为提升RLLM推理水平和泛化能力的关键技术之一。\n\n- **对于可行的反思，SFT 主要专注于基于优化的模仿（Refinement Imitation）。**\n在基于反思的 LLM 推理中，SFT 是使模型实现自我优化和错误修正的关键机制。通过 SFT，模型可以直接学习高级 LLM 的纠错过程，从而显著提升其反思能力，例如进行自我博弈推理、迭代反馈纠错，甚至通过逐步的自然语言反馈来论证和反思推理过程。此外，SFT 还可以在多模态场景中整合视觉与文本推理，提高模型的批判性思维和自我修正能力。SFT 通过迭代反馈和自我修正策略提升了 LLM 的推理准确性，这对于小型模型尤其有益。\n\n\n#### 4.1.1 核心技术\n\nSFT 包含两个核心概念：**指令微调** 和 **参数高效微调，PEFT**。\n\n**指令微调**\n-   **核心思想:** 通过对模型进行大量覆盖各类任务的指令微调，可以显著提升模型对未见任务的零样本泛化能力。这使得模型能够学习“遵循指令”的技能。\n-   **代表性工作:**\n    -   [微调后的语言模型是零样本学习者 (FLAN)](https:\u002F\u002Farxiv.org\u002Fabs\u002F2109.01652): 谷歌的开创性工作，证明了多任务指令微调能够解锁 LLM 在未见任务上的零样本能力。\n    -   [大型语言模型的指令微调：综述](https:\u002F\u002Farxiv.org\u002Fabs\u002F2308.10792): 一篇全面的综述，系统介绍了指令微调的方法、数据集、挑战及未来方向。\n\n\n\n**参数高效微调(PEFT)**\n\n- **核心思想:** 鉴于对 LLM 进行全量微调（Full Fine-tuning）的成本极高，PEFT 方法应运而生。这些方法只需更新模型中的一小部分参数，即可达到接近全量微调的效果，从而大大降低硬件需求。\n-   **代表性工作:**\n    -   [LoRA：大型语言模型的低秩适应](https:\u002F\u002Farxiv.org\u002Fabs\u002F2106.09685): 一种革命性的 LoRA 技术，通过注入低秩适应矩阵来高效地微调模型，目前是最广泛应用的 PEFT 方法之一。\n    -   [QLoRA：量化 LLM 的高效微调](https:\u002F\u002Farxiv.org\u002Fabs\u002F2305.14314): 对 LoRA 的进一步优化，结合 4 位量化、双权重量化和页面优化器，使得在单个消费级 GPU 上就能微调超大规模模型。\n    -   [Adapter 微调](https:\u002F\u002Farxiv.org\u002Fabs\u002F1902.00751): 在 Transformer 的各层之间插入小型神经网络模块（适配器），训练时仅更新这些适配器的参数。\n    -   [Prompt 微调](https:\u002F\u002Farxiv.org\u002Fabs\u002F2104.08691) \u002F [P-Tuning](https:\u002F\u002Faclanthology.org\u002F2022.acl-short.8\u002F): 不修改模型权重，而是在输入端学习一个或多个可训练的虚拟标记（软提示），以更有效地引导模型完成下游任务。\n\n**技术对比**\n\n| 技术类型                            | 核心思想                                                                                                     |优 点                                                                                                    | 缺 点                                                                                                             |\n| :----------------------------------------- | :------------------------------------------------------------------------------------------------------------ | :------------------------------------------------------------------------------------------------------------ | :------------------------------------------------------------------------------------------------------------------------ |\n| **全量微调**                       | 更新所有模型权重。                                                                                     | 性能上限最高，可完全适应新数据。                                                     | 训练成本极高（内存、时间），易发生灾难性遗忘，需存储整个模型。         |\n| **参数高效微调 (PEFT)** | 冻结大部分原始参数，仅更新少量附加参数或特定子集。 | 训练成本极低，速度快，抗遗忘能力强，微调产物体积小（适配器），易于部署。 | 性能可能略逊于全量微调，且对极其复杂任务的适应性可能有限。 |\n\n#### 4.1.2 学习资源\n\n| 资源名称                 | 演讲者\u002F作者    | 特点                                                                                                                                                          | 链接                                                                                |\n| :---------------------------- | :---------------- | 
:--- | :--- |\n| 让我们从零开始构建 GPT | Andrej Karpathy | 一份动手指南，教你从零开始构建 GPT，深入理解 Transformer 的基础原理和训练流程；是理解 SFT 的先决条件。 | [YouTube](https:\u002F\u002Fwww.youtube.com\u002Fplaylist?list=PLAqhIrjkxbuWI23v9cThsA9GvCAUhRvKZ) |\n| Hugging Face SFT 课程 | Hugging Face | 官方 SFT 系列教程，使用 Hugging Face TRL 代码库进行 SFT 代码实践。 | [课程链接](https:\u002F\u002Fhuggingface.co\u002Flearn\u002Fllm-course\u002Fchapter11\u002F1) |\n| Hugging Face SFT Trainer 文档 | Hugging Face | Hugging Face SFTTrainer 的进阶文档。 | [文档链接](https:\u002F\u002Fhuggingface.co\u002Fdocs\u002Ftrl\u002Fsft_trainer) |\n| Hugging Face PEFT 课程 | Hugging Face | 官方 PEFT 系列教程，讲解了包括 LoRA 在内的各种高效微调技术的理论与代码实践。 | [课程链接](https:\u002F\u002Fhuggingface.co\u002Fdocs\u002Fpeft\u002Findex) |\n| LLMs-from-scratch | Sebastian Raschka | 官方书籍《从零开始构建大型语言模型》的教程代码。 | [课程链接](https:\u002F\u002Fgithub.com\u002Frasbt\u002FLLMs-from-scratch) |\n\n#### 4.1.3 开发框架\n\n| 框架 | 特性 | 主要用例 | 资源链接 |\n| :--- | :--- | :--- | :--- |\n| **Hugging Face TRL** | 官方 Hugging Face 库，集成了多种训练方法，如 SFT、RLHF、DPO 等，并可与生态系统（`transformers`、`peft`、`accelerate`）无缝对接。 | 提供标准化的 SFT 训练器 `SFTTrainer`，简化了训练流程。 | [GitHub](https:\u002F\u002Fgithub.com\u002Fhuggingface\u002Ftrl) |\n| **LLaMA-Factory** | 一站式 LLM 微调平台，配备 Web UI，使无编程经验的用户也能轻松进行 SFT、PEFT 和模型评估。 | 用户友好度高，支持大规模模型和数据集，适合初学者及快速验证。 | [GitHub](https:\u002F\u002Fgithub.com\u002Fhiyouga\u002FLLaMA-Factory) |\n\n\n\n#### 4.1.4 最佳实践与常见误区\n\n1.  **数据质量远比数量重要：**\n    -   **核心原则**：与其使用10,000条低质量、同质化的数据，不如使用1,000条高质量、多样化的数据。低质量的数据可能会让模型学到错误的模式。\n    -   **格式一致性**：确保所有训练数据遵循统一的对话模板（例如[ChatML](https:\u002F\u002Fhuggingface.co\u002Fdocs\u002Ftransformers\u002Fmain\u002Fen\u002Fchat_templating)），这对于训练模型识别角色和对话边界至关重要。\n2.  **选择合适的微调策略**：\n    -   对于大多数资源有限的应用场景，应优先选择**QLoRA**，因为它在效率和效果之间取得了最佳平衡。\n    -   如果追求最优性能且资源充足，则可以考虑**全量微调**，但需注意避免过拟合的风险。\n3.  **关键超参数的调整**：\n    -   **学习率**：SFT 的学习率通常比预训练时更小，一般在 `1e-5` 到 `5e-5` 之间。\n    -   **轮次**：通常1到3轮就足够了。轮次过多可能导致在小数据集上过拟合，从而使模型“忘记”预训练阶段学到的通用知识。\n    -   **批量大小**：在内存允许范围内，适当增加批量大小有助于稳定训练过程。\n4.  **评估与迭代**：\n    -   **全面评估**：不要仅依赖损失函数。应结合**客观评估**基准测试（如 MMLU）以及主观的**人工评估**，以更全面地衡量模型性能。\n    -   **迭代优化**：SFT 是一个持续的迭代过程。根据评估结果，不断清理数据、调整超参数并优化模型。
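\n\n下面给出一段结合上述实践要点的最小示意代码（假设使用 TRL 的 `SFTTrainer` 搭配 PEFT 的 `LoraConfig`；其中数据文件与模型路径均为占位符，参数名以所安装版本的官方文档为准）：\n\n```python\nfrom datasets import load_dataset\nfrom peft import LoraConfig\nfrom trl import SFTConfig, SFTTrainer\n\n# 占位数据集：应替换为遵循统一对话模板（如 ChatML）的高质量 SFT 数据\ndataset = load_dataset('json', data_files='sft_data.jsonl', split='train')\n\n# LoRA 配置：冻结原始权重，仅训练少量低秩适配参数（对应上文的 PEFT 路线）\npeft_config = LoraConfig(r=16, lora_alpha=32, lora_dropout=0.05, task_type='CAUSAL_LM')\n\n# 超参数遵循上文建议：学习率取 1e-5 至 5e-5 之间，训练 1 至 3 轮\nargs = SFTConfig(\n    output_dir='sft-output',\n    learning_rate=2e-5,\n    num_train_epochs=2,\n    per_device_train_batch_size=4,\n)\n\ntrainer = SFTTrainer(\n    model='path-to-base-model',  # 占位：本地模型路径或 Hub 上的模型名\n    args=args,\n    train_dataset=dataset,\n    peft_config=peft_config,\n)\ntrainer.train()\n```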
\n\n#### 4.1.5 相关论文仓库\n- [LLM4NLP](https:\u002F\u002Fgithub.com\u002FLightChen233\u002FAwesome-LLM-for-NLP)\n\n\n\n### 4.2 强化学习\n\n#### 4.2.1 数学基础\n\n- [西湖大学《强化学习的数学原理》](https:\u002F\u002Fwww.bilibili.com\u002Fvideo\u002FBV1sd4y167NS\u002F)\n  - 特点：从 MDP 和贝尔曼方程出发，利用策略梯度定理推导而来。\n  - 先修知识：线性代数、概率论。\n  - 重点：价值迭代和策略优化的数学本质。\n- [Book-Mathematical-Foundation-of-Reinforcement-Learning](https:\u002F\u002Fgithub.com\u002FMathFoundationRL\u002FBook-Mathematical-Foundation-of-Reinforcement-Learning)（适合初学者）\n\n#### 4.2.2 强化学习的核心算法\n\n**权威课程**\n\n| 课程 | 讲师 | 特点 | 资源 |\n| --- | --- | --- | --- |\n| Foundations of Deep RL | Pieter Abbeel | 6节简明课程（Q-learning → PPO） | [YouTube](https:\u002F\u002Fyoutube.com\u002Fplaylist?list=PLkFD6_40KJIwhWpGazJ9VSj9CFMkb79A) |\n| UC Berkeley CS285 | Sergey Levine | 包括 SAC\u002F逆向强化学习等高级主题 | [课程官网](http:\u002F\u002Frail.eecs.berkeley.edu\u002Fdeeprlcourse\u002F) |\n| Hung-yi Lee 的强化学习课程 | Hung-yi Lee | 中文讲解 + EasyRL 实践练习 | [Bilibili](https:\u002F\u002Fwww.bilibili.com\u002Fvideo\u002FBV1UE411G78S) |\n| Reinforcement Learning: An Overview | Kevin Murphy | 持续更新的深度强化学习算法相关资源 | [Arxiv](https:\u002F\u002Farxiv.org\u002Fabs\u002F2412.05265) |\n\n**基础必学算法**\n\n- **基础强化学习算法**\n\n  - **DQN**：深度强化学习的开端\n  - **PPO**：策略优化中的关键方法，广泛应用于工业领域\n  - **SAC**：引入探索熵，对连续动作空间具有较强的鲁棒性\n  - **TD3**：通过双延迟网络改进了离策略强化学习\n\n- **基于模型的强化学习算法**\n\n  - **[dreamer](https:\u002F\u002Fgithub.com\u002Fgoogle-research\u002Fdreamer)**：一种基于模型的强化学习算法\n  - **[tdmpc2](https:\u002F\u002Fgithub.com\u002Fnicklashansen\u002Ftdmpc2)**：基于模型的强化学习算法的重大进展\n\n- **离线强化学习算法**\n\n  - **[CQL](https:\u002F\u002Fgithub.com\u002Faviralkumar2907\u002FCQL)**：引入保守约束，是离线强化学习的基础性工作\n  - **[decision-transformer](https:\u002F\u002Fgithub.com\u002Fkzl\u002Fdecision-transformer)**：将自回归模型引入离线强化学习\n\n- **大规模模型强化学习算法**\n\n  - **PPO**：将经典的 PPO 方法应用于大型语言模型\n  - **DPO**：无需奖励模型的偏好优化，是一种适用于大型模型的离线强化学习算法\n  - **GRPO**：群体相对策略优化，是 DeepSeek-R1 的核心算法（见本节末尾的示意代码）\n\n**大规模模型强化学习的前沿算法**\n\n- **[DAPO](https:\u002F\u002Fgithub.com\u002FBytedTsinghua-SIA\u002FDAPO)**：对GRPO的四项改进\n- **[LUFFY](https:\u002F\u002Fgithub.com\u002FElliottYan\u002FLUFFY)**：GRPO的离策略版本，引入高质量的外部轨迹\n- **[Absolute-Zero-Reasoner](https:\u002F\u002Fgithub.com\u002FLeapLabTHU\u002FAbsolute-Zero-Reasoner)**：一种无需标注的大规模强化学习算法\n- **[One-Shot-RLVR](https:\u002F\u002Fgithub.com\u002Fypwang61\u002FOne-Shot-RLVR)**：针对大模型推理的一次性优化方法\n- **[SPIRAL](https:\u002F\u002Fgithub.com\u002Fspiral-rl\u002Fspiral)**：在自我对弈游戏环境中进行强化学习，成功提升了数学推理能力\n- **[高熵少数令牌驱动有效RLVR](https:\u002F\u002Fshenzhi-wang.github.io\u002Fhigh-entropy-minority-tokens-rlvr\u002F)**：仅由约20%的高熵少数令牌驱动的强化学习\n- **[Spurious\\_Rewards](https:\u002F\u002Fgithub.com\u002Fruixin31\u002FSpurious_Rewards)**：随机奖励同样可以提升LLM的推理能力\n- **[SwS](https:\u002F\u002Fgithub.com\u002FMasterVito\u002FSwS)**：由自我感知的弱点驱动的推理强化学习
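\n\n下面用一段最小示意代码说明 GRPO 的核心思想——对同一提示采样一组回答，用组内奖励的均值与标准差归一化出相对优势，从而省去独立的价值网络（仅为概念草图，省略了策略梯度与 KL 约束等细节，具体实现以各框架为准）：\n\n```python\nimport statistics\n\ndef group_relative_advantages(rewards):\n    # rewards：同一提示下一组采样回答的标量奖励（例如规则校验的对错得分）\n    mean = statistics.mean(rewards)\n    std = statistics.pstdev(rewards) or 1.0  # 组内奖励全部相同时避免除零\n    # 每个回答的优势 = (自身奖励 - 组内均值) 除以 组内标准差\n    return [(r - mean) / std for r in rewards]\n\n# 用法示意：8 条采样中有 3 条答对（奖励为 1），答对的回答获得正优势\nadvantages = group_relative_advantages([1, 0, 0, 1, 0, 1, 0, 0])\n```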
\n\n#### 4.2.3 强化学习开发框架\n\n**基础强化学习框架**\n\n- **[stable-baselines3](https:\u002F\u002Fgithub.com\u002FDLR-RM\u002Fstable-baselines3)**（快速实验，拥有成熟稳定的基线）\n- **[legged\\_gym](https:\u002F\u002Fgithub.com\u002Fleggedrobotics\u002Flegged_gym)**（四足机器人控制）\n\n**大规模模型强化学习框架**\n\n- **[verl](https:\u002F\u002Fgithub.com\u002Fvolcengine\u002Fverl)**：基于Ray、vLLM、ZeRO-3和HuggingFace Transformers的高性能、易用开源强化学习训练库，具备高效资源利用、可扩展性和生产就绪等特性。（结构复杂，高度可复用，性能优异）\n- **[OpenRLHF](https:\u002F\u002Fgithub.com\u002FOpenLLMAI\u002FOpenRLHF)**：由NVIDIA等团队发布的开源RLHF框架，基于Ray、vLLM、ZeRO-3和HuggingFace Transformers。支持PPO、GRPO、REINFORCE++等算法，并提供动态采样和异步智能体机制以加速训练。\n- **[AReaL](https:\u002F\u002Fgithub.com\u002FinclusionAI\u002FAReaL)**：异步强化学习框架\n- **[ROLL](https:\u002F\u002Fgithub.com\u002Falibaba\u002FROLL)**：支持训练参数量达6000亿以上的大型模型\n- **[Hugging Face TRL](https:\u002F\u002Fgithub.com\u002Fhuggingface\u002Ftrl)**：由Hugging Face维护的RLHF全栈库，集成了SFT、GRPO、DPO、奖励建模等多个模块。支持多种模型架构和分布式扩展，是社区中最活跃的RLHF工具之一。（用户友好，上手快，社区活跃）\n- **[RL4LMs](https:\u002F\u002Fgithub.com\u002Fallenai\u002FRL4LMs)**：面向语言模型的开源RLHF库，提供奖励模型构建和策略网络训练的端到端工具，帮助研究人员快速搭建自定义RLHF流水线。\n\n此外，还有一些有趣的扩展仓库：\n\n- **[Sachin19\u002Ftrlp](https:\u002F\u002Fgithub.com\u002FSachin19\u002Ftrlp)**：基于TRL堆栈的端到端RLHF库，不仅支持语言模型，还扩展至Stable Diffusion模型。包含SFT、奖励建模、PPO等步骤，并提供用于实验的示例代码。\n- **[OpenRLHF-M](https:\u002F\u002Fgithub.com\u002FOpenRLHF\u002FOpenRLHF-M)**：OpenRLHF的扩展版本，专为多模态模型优化。利用DeepSpeed和HuggingFace Transformers实现更高的吞吐量和更丰富的训练场景。\n- **[HumanSignal-RLHF](https:\u002F\u002Fgithub.com\u002FHumanSignal\u002FRLHF)**：一个已归档的资源库，汇集了关于RLHF数据收集、系统构建及最佳实践的链接和教程，适合初学者快速了解完整的RLHF流程。\n- **[MichaelEinhorn\u002Ftrl-textworld](https:\u002F\u002Fgithub.com\u002FMichaelEinhorn\u002Ftrl-textworld)**：TRL的一个衍生版本，专注于在TextWorld环境中开展RLHF实验，展示了如何使用PPO训练GPT2等模型，生成符合特定反馈要求的文本。\n\n#### 4.2.4 测试环境\n\n**经典强化学习测试**\n\n- **OpenAI Gym**: 经典控制任务\n\n| 环境ID | 任务描述 | 特征 |\n| --- | --- | --- |\n| `CartPole-v1` | 平衡倒立摆 | 4维状态\u002F离散动作，当摆杆倾斜超过12°或步数达到500时终止 |\n| `MountainCar-v0` | 将小车荡到山顶 | 2维状态\u002F离散动作，需要利用势能完成摆动 |\n| `Pendulum-v1` | 控制单摆保持竖直 | 3维状态\u002F连续动作，无物理终止条件 |\n| `Acrobot-v1` | 摆动双连杆触碰目标线 | 6维状态\u002F离散动作，当触碰到目标线时终止 |\n\n- **Atari 2600**: 游戏环境\n\n| 环境ID | 游戏类型 | 挑战 |\n| --- | --- | --- |\n| `Pong-v5` | 乒乓球 | 210×160 RGB输入，需进行图像预处理 |\n| `Breakout-v5` | 打砖块 | 奖励密集，适合DQN训练 |\n| `SpaceInvaders-v5` | 太空侵略者 | 多个敌人协同攻击，奖励机制复杂 |\n\n- **Box2D**: 物理仿真环境\n\n| 环境ID | 物理系统 | 核心挑战 |\n| --- | --- | --- |\n| `LunarLander-v2` | 月球着陆器 | 8维状态\u002F离散动作，涉及燃料控制与精准着陆 |\n| `BipedalWalker-v3` | 双足行走机器人 | 24维状态\u002F连续动作，需在复杂地形上保持平衡 |\n| `CarRacing-v2` | 赛车竞速 | 96×96 
RGB输入，结合视觉感知与连续控制                          |\n\n- **MuJoCo**: 机器人控制环境\n\n| 环境ID       | 机器人模型    | 任务类型                                          |\n| ------------ | ------------- | -------------------------------------------------- |\n| `HalfCheetah-v4` | 猎豹机器人  | 高速奔跑控制（17维状态）                          |\n| `Ant-v4`       | 蚂蚁机器人  | 复杂地形导航（111维状态）                         |\n| `Humanoid-v4`  | 人形机器人  | 双足平衡行走（376维状态）                         |\n\n- **其他特殊环境**\n\n| 类别      | 示例环境           | 应用领域                    |\n| --------- | ------------------ | --------------------------- |\n| 文本游戏  | `TextFlappyBird-v0` | 基于字符界面的强化学习      |\n| 多智能体  | `PistonBall-v6`    | 多智能体合作\u002F竞争           |\n| 3D导航    | `AntMaze-v4`       | 复杂迷宫路径规划            |\n\n**扩展资源**：\n\n- **安全强化学习**: `Safety-Gymnasium`（带约束的任务）\n- **自动驾驶**: `CARLA`\u002F`AirSim`（高保真仿真）\n- **多智能体**: `PettingZoo`（兼容Gymnasium API）\n\n> 💡 完整环境列表可参见：\n> [Gymnasium文档](https:\u002F\u002Fgymnasium.farama.org\u002F) | [OpenAI Gym Wiki](https:\u002F\u002Fgithub.com\u002Fopenai\u002Fgym\u002Fwiki\u002FTable-of-environments)\n\n**大模型强化学习测试**\n\n| 环境         | 目的                                      |\n| ------------ | ---------------------------------------- |\n| Math-500      | 数学推理                                 |\n| AIME2024\u002F2025 | 数学竞赛                                 |\n| AMC           | 数学竞赛                                 |\n| GPQA          | 博士级别的生物物理与化学推理           |\n\n### 4.3 代理\n\nLLM代理解决复杂问题的能力，从根本上依赖于其推理与规划能力。这一能力的核心机制是长链思维（Long CoT），它将复杂任务分解为更小、逻辑清晰的步骤。长链思维的特点——深度推理、广泛探索以及可行性反思——并非附加功能，而是实现这些能力的基础。如果代理无法“更深入地思考”并进入“思考—批判—改进”的循环，其独立决策和在陌生场景中适应的能力将受到严重限制，最终退化为“预设流程”或“与人类反复交互”。例如o1和DeepSeek-R1等模型通过长链思维成功解决了复杂任务，直接证明了这种因果关系：推理深度的提升会直接带来代理能力的增强（即在复杂任务中的自主性）。因此，未来AI代理的发展将与长链思维的突破紧密相关。\n\n**AI代理在线课程与资源**\n\n- [吴恩达的《如何构建、评估和迭代LLM代理》](https:\u002F\u002Fwww.bilibili.com\u002Fvideo\u002FBV1Ew4m1R7ju\u002F?vd_source=a39056a294c1d415f3413ef933024e2b)：由LlamaIndex和TruEra团队专家主讲的研讨会（2024年3月），讲解如何使用LlamaIndex等工具框架构建LLM代理，并借助TruLens等可观测性工具评估代理性能、检测幻觉和偏差。视频提供中英字幕，适合学习生产环境中代理的开发与评估方法。\n- [Coursera AI代理开发者专项课程（范德堡大学）](https:\u002F\u002Fwww.coursera.org\u002Fspecializations\u002Fai-agents)：面向具备Python基础的初学者的六门系列课程，重点教授如何使用Python、工具、记忆和推理来构建与部署智能AI代理。课程内容包括创建自定义GPT、应用提示工程、设计可靠的人工智能系统以及实现多智能体协作系统。\n- [Hugging Face代理课程](https:\u002F\u002Fhuggingface.co\u002Flearn\u002Fagents-course\u002Funit0\u002Fintroduction)：一门免费的在线代理入门课程。\n\n**用于构建LLM AI代理的开源框架**\n\n- [LangChain](https:\u002F\u002Fgithub.com\u002Flangchain-ai\u002Flangchain)：LLM智能体开发中最广泛使用的框架，提供模块化和可扩展的架构、统一的LLM接口、预构建的智能体工具包（用于CSV、JSON、SQL）、Python和Pandas集成以及向量存储能力。它支持React风格的智能体，并提供用于维护上下文的记忆模块。\n- [CrewAI](https:\u002F\u002Fgithub.com\u002FcrewAIInc\u002FcrewAI)：一个用于编排角色扮演型AI智能体的开源框架，强调通过定义的角色和共同目标实现多智能体协作。该框架独立、精简且高度可定制，支持“Crew”（团队）和“Flow”（事件驱动的工作流）。\n- [Dify](https:\u002F\u002Fgithub.com\u002Flanggenius\u002Fdify)：一个面向LLM应用的开源框架，具备可视化提示编排界面、长上下文集成、基于API的开发、多模型支持以及RAG流水线等功能。\n- [OpenAI Agent Demo](https:\u002F\u002Fgithub.com\u002Fopenai\u002Fopenai-cs-agents-demo)：OpenAI官方提供的用于搭建Agent客户端服务的平台（可视化平台，无需额外代码）。\n\n更多框架请参阅[Awesome LLM Agent Frameworks](https:\u002F\u002Fgithub.com\u002Fkaushikb11\u002Fawesome-llm-agents\u002Fblob\u002Fmain\u002FREADME.md)。\n\n**针对复杂智能体轨迹的端到端强化学习**\n\n- [Agent-R1](https:\u002F\u002Fgithub.com\u002F0russwest0\u002FAgent-R1)：一个开源框架，旨在加速强化学习与智能体交叉领域的研发。它采用端到端强化学习在特定环境中训练智能体，允许开发者在无需复杂流程工程的情况下定义领域特定的工具和奖励函数。该框架支持多轮工具调用及多工具协同。\n- 
[RAGEN](https:\u002F\u002Fgithub.com\u002FRAGEN-AI\u002FRAGEN)：一个在交互式、随机性和多轮环境中利用强化学习训练LLM推理智能体的框架。它引入了StarPO（状态-思考-行动-奖励策略优化）框架，该框架具有交错的展开与更新阶段，以实现轨迹级别的优化。\n\n**强化学习增强的工具使用与搜索能力**\n\n- [ReCall](https:\u002F\u002Fgithub.com\u002FAgent-RL\u002FReCall)：一个新颖的框架，通过强化学习训练LLM进行工具调用推理，而无需关于工具使用轨迹或推理步骤的监督数据。它旨在使LLM能够以类似智能体的方式使用和组合任何用户自定义的工具。\n- [OpenManus-RL](https:\u002F\u002Fgithub.com\u002FOpenManus\u002FOpenManus-RL)：OpenManus框架的扩展版本，专门通过GRPO等强化学习技术来增强AI智能体，从而实现跨多种环境的训练及针对特定任务的性能调优。\n- [R1-Searcher](https:\u002F\u002Fgithub.com\u002FRUCAIBox\u002FR1-Searcher)、[Search-R1](github.com\u002FPeterGriffinJin\u002FSearch-R1)：研究探索如何利用强化学习提升LLM的搜索能力。\n\n\n**精彩博客**\n\n- [Neptune.ai博客](https:\u002F\u002Fneptune.ai\u002Fblog\u002Fbuilding-llm-agents-with-autogen)：提供了详细的分步指南，例如“如何使用AutoGen构建LLM智能体”，涵盖组件、RAG流水线、规划、工具和记忆集成等内容。\n- [n8n.io博客](https:\u002F\u002Fblog.n8n.io\u002Fllm-agents\u002F)：深入探讨了LLM智能体的能力（如战略规划、记忆和工具集成），并附有构建智能体的实用教程。\n- [NVIDIA开发者博客](https:\u002F\u002Fdeveloper.nvidia.com\u002Fblog\u002Fan-easy-introduction-to-llm-reasoning-ai-agents-and-test-time-scaling\u002F)：发表了一篇关于LLM推理和AI智能体的入门文章。\n- [Botpress博客](https:\u002F\u002Fbotpress.com\u002Fblog\u002Fchain-of-thought)：解释了思维链提示机制，并讨论了各种AI智能体框架。\n- [SuperAnnotate博客](https:\u002F\u002Fwww.superannotate.com\u002Fblog\u002Fllm-agents)：全面概述了LLM智能体、其能力及未来发展方向。\n- [Smythos博客](https:\u002F\u002Fsmythos.com\u002Fdevelopers\u002Fagent-development\u002Fllm-agents\u002F)：探讨了LLM智能体如何革新任务自动化和AI集成。\n- [Unite.ai](https:\u002F\u002Fwww.unite.ai\u002Freinforcement-learning-meets-chain-of-thought-transforming-llms-into-autonomous-reasoning-agents\u002F)：详细讨论了强化学习与思维链结合如何将LLM转化为自主推理智能体。\n- [Holistic AI博客](https:\u002F\u002Fwww.holisticai.com\u002Fblog\u002Fllm-agents-use-cases-risks)：深入分析了LLM智能体的架构，包括多模态增强、工具使用和记忆等方面。\n- [ProjectPro](https:\u002F\u002Fwww.projectpro.io\u002Farticle\u002Fagentic-ai-design-patterns\u002F1126)和[Lightrains博客](https:\u002F\u002Flightrains.com\u002Fblogs\u002Fai-agent-design-patterns-cxo\u002F)：讨论了多种AI智能体设计模式，包括反思、工具使用和规划模式等。\n\n**精选GitHub仓库**\n\n- [Awesome-LLM-Agents](https:\u002F\u002Fgithub.com\u002Fkaushikb11\u002Fawesome-llm-agents\u002Fblob\u002Fmain\u002FREADME.md)：一份精心整理的各类LLM智能体框架列表，是探索该生态系统的宝贵起点。\n- [Awesome-LLM-Agents-Scientific-Discovery](https:\u002F\u002Fgithub.com\u002Fzhoujieli\u002FAwesome-LLM-Agents-Scientific-Discovery)：一份聚焦于LLM驱动的AI智能体在生物医学研究及更广泛科学发现中应用的论文精选列表。\n- [Awesome-Agent-RL](https:\u002F\u002Fgithub.com\u002F0russwest0\u002FAwesome-Agent-RL)：一个专门收集论文和资源的合集，重点在于通过强化学习释放AI智能体的潜力。\n- [Awesome-LLM-APPs](https:\u002F\u002Fgithub.com\u002FShubhamsaboo\u002Fawesome-llm-apps)：一份精选的优秀LLM应用集合，涵盖了基于RAG、AI智能体、多智能体团队、MCP、语音智能体等多种技术构建的应用。\n\n\u003C!-- ## 4. 行为分析与理由 -->\n\n## 5. 
数据集\n\n### 5.1 基准测试\n\n#### 5.1.1 评估框架\n\n- **LLM评估框架：**\n\n  - [OpenCompass](https:\u002F\u002Fgithub.com\u002Fopen-compass\u002Fopencompass) 是一个综合性的大语言模型（LLM）评估平台，支持对超过100个数据集上的多种开放和闭源模型进行测评。它覆盖语言理解、推理和代码生成等多个维度，并支持零样本、少样本、思维链（CoT）等多种评估方式，同时还具备分布式评估能力。\n  - [DeepEval](https:\u002F\u002Fgithub.com\u002Fconfident-ai\u002Fdeepeval) 是一个易于使用的开源LLM评估框架，专为评估和测试大型语言模型系统而设计。其目标是帮助开发者根据相关性、事实一致性、偏见和毒性等关键指标，高效评估模型生成内容的质量。它的使用方式类似于Python单元测试框架Pytest。\n\n- **MLLM评估框架：**\n\n  - [VLMEvalKit](https:\u002F\u002Fgithub.com\u002Fopen-compass\u002Fvlmevalkit) 是 OpenCompass 推出的开源工具包，专门用于评估大型视觉-语言模型。它支持在 80 多个基准测试上对 220 多种视觉-语言模型进行一键式评估，涵盖图像问答、图文匹配和视觉推理等任务。该工具包既能基于精确匹配提供评估结果，也能通过大语言模型抽取答案来生成评估结果。\n  - [EvalScope](https:\u002F\u002Fgithub.com\u002Fmodelscope\u002Fevalscope) 是 ModelScope（魔搭）社区推出的一个模型评估框架，支持对多种类型模型的性能基准测试，包括大型语言模型、多模态语言模型、嵌入模型以及 AIGC 模型。\n\n- **CoT 评估框架：**\n\n  - [ROSCOE](https:\u002F\u002Fgithub.com\u002Ffacebookresearch\u002FParlAI\u002Ftree\u002Fmain\u002Fprojects\u002Froscoe) 旨在提供一套自动化指标，用于在无需参考答案的情况下评估模型的推理质量。\n  - [ReCEval](https:\u002F\u002Fgithub.com\u002Farchiki\u002FReCEval) 是由 Archiki Prasad 及其同事提出的一种推理链评估框架。它通过“正确性”和“信息量”两个维度，对大型语言模型生成的多步推理过程进行详细分析。\n\n#### 5.1.2 结果基准\n\n本节重点从整体视角评估 Long CoT 推理的最终表现，强调推理链是否最终合理且准确。\n\n- **复杂数学**\n\n| 名称 | 题目数量 | 发布日期 | 作者 | 描述 | 相关链接 |\n| :---: | :---: | :---: | :---: | :---: | :---: |\n| GSM8K | \~8,500 | 2021 | OpenAI | OpenAI 提供的 K-12 数学文字应用题数据集，每道题都配有详细的解题步骤。题目涵盖基础算术、文字应用题等，需要多步推理才能解答。 | 🤗[数据集](https:\u002F\u002Fhuggingface.co\u002Fdatasets\u002Fopenai\u002Fgsm8k) |\n| MATH | 12,500 | 2021 | Hendrycks 等（加州大学伯克利分校） | 来自数学竞赛的高难度数学问题数据集，每道题都附有完整的分步解答。内容涵盖代数、几何、概率等领域，旨在评估模型的数学推理能力。 | 🌐[仓库](https:\u002F\u002Fgithub.com\u002Fhendrycks\u002Fmath)
      |\n|   AIME 2024   |         30         |     2024     |                               AI-MO 项目组                               |                                                                                     2024 年美国邀请数学竞赛，一个高水平的高中数学竞赛数据集，包含 2024 年 AIME I 和 II 的所有题目。题目侧重于整数解和组合推理。                                                                                    |                             🤗[数据集](https:\u002F\u002Fhuggingface.co\u002Fdatasets\u002FAI-MO\u002Faimo-validation-aime)                             |\n|   AIME 2025   |         30         |     2025     |                                   OpenCompass                                   |                                                                                                                     2025 年 AIME I 和 II 的题目合集。难度与 2024 年 AIME 相似，用于评估高中生解决复杂数学问题的能力。                                                                                                                    |                                🤗[数据集](https:\u002F\u002Fhuggingface.co\u002Fdatasets\u002Fopencompass\u002FAIME2025)                                |\n|    AMC 2023   |         83         |     2024     |                               AI-MO 项目组                               |                                                                                               2023 年美国数学竞赛，由 AMC12 竞赛中的 83 道题组成的验证集。题目涵盖 2022–2023 年 AMC12 中的代数、几何等内容。                                                                                              |                              🤗[数据集](https:\u002F\u002Fhuggingface.co\u002Fdatasets\u002FAI-MO\u002Faimo-validation-amc)                             |\n|   USAMO 2025  |          6         |     2025     |                          Balunović 等（苏黎世联邦理工学院）                          |                                                                                             2025 年美国数学奥林匹克竞赛的题目数据集。这些是 USAMO 的决赛试题，通常为难度较高的证明题，考察深度的数学推理和证明能力。                                                                                             |                   🌐[网站](https:\u002F\u002Fmatharena.ai\u002F) \u003Cbr> 🌐[仓库](https:\u002F\u002Fgithub.com\u002Feth-sri\u002Fmatharena)                  |\n| OlympiadBench |        8,476       |     2024     |                     贺朝晖等（清华大学）                     |                                                    奥林匹克级别的双语多模态科学问题数据集。包含来自数学、物理等学科竞赛的 8,476 道题目，每道题都有专家提供的分步解答，用于全面评估模型的跨学科深度推理能力。                                                    | 🤗[数据集](https:\u002F\u002Fhuggingface.co\u002Fdatasets\u002FHothan\u002FOlympiadBench) \u003Cbr> 🌐[仓库](https:\u002F\u002Fgithub.com\u002FOpenBMB\u002FOlympiadBench) |\n|  OlympicArena |       11,163       |     2024     | 黄震等（上海交通大学 & 上海研究院） |                                    又称 OlympiadArena，这一综合性基准涵盖了数学、物理、化学、生物等 7 个大类中的 62 种“奥林匹克”挑战。包含 11,163 道奥林匹克级别题目，按学科和题型分类，旨在推动通用人工智能的推理发展。                                    |   🤗[数据集](https:\u002F\u002Fhuggingface.co\u002Fdatasets\u002FGAIR\u002FOlympicArena) \u003Cbr> 🌐[仓库](https:\u002F\u002Fgair-nlp.github.io\u002FOlympicArena)   |\n|  Putnam-AXIOM |      236 + 52      |     2024     |                       Gulati 等（斯坦福大学）                       |                                                                      来自普特南数学竞赛的数据集，包括 236 道普特南竞赛题目以及 52 道普特南 AIME 的交叉题目。每道题都配有详细的解题步骤，用于评估模型的数学推理能力。                                                                     |                                      📄[论文](https:\u002F\u002Fopenreview.net\u002Fforum?id=t1mAXb4Cop)                                      |\n|  
FrontierMath |          -         |     2024     |                             Glazer 等（Epoch AI）                            | 由数十位数学家协作创建的一系列前沿数学问题集。涵盖现代数学的主要分支，从数论、实分析到代数几何。这些问题若手动求解，往往需要数小时甚至数天。收录了数百道原创高难度题目，且均未公开发布，以避免训练泄露。 |                                           📄[论文](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2411.04872)                                           |\n|   ThinkBench  |        2,912       |     2025     |      黄书林等（上海科技大学）     |                                                   一套动态挑战题集，用于评估大型语言模型（LLM）的稳健推理能力。包含 2,912 个通过对外部分布扰动生成的推理任务，旨在测试模型在陌生情境下的推理准确性。                                                   |                                           📄[论文](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2502.16268)                                           |\n|  MATH-Perturb |      279 \\* 2      |     2025     |                    黄凯旋等（普林斯顿大学 & Google）                    |                    MATH 数据集中最难题目的扰动集。从 MATH 中选取 279 道最困难的 Level 5 题目，并通过“简单扰动”和“困难扰动”为每道题生成 279 个变体。模型在这些扰动题目上的表现会显著下降，从而反映其真实的数学泛化能力。                   |                                      📄[论文](https:\u002F\u002Fopenreview.net\u002Fforum?id=IkmD3fKBPQ)                                      |\n\n- **复杂编码**\n\n\n|      名称     | 题目数量 | 发布日期 |                                     作者                                     |                                                                                                                                                                                               描述                                                                                                                                                                                              |                                                          相关链接                                                         |\n|:-------:|:-----------:|:--------:|:------------------------:|:------------------------------------------------------------------------------------------:|:---------:|\n| SWE-bench     | 2,294                       | 2024         | 陈天乐等（普林斯顿NLP）                              | 软件工程基准数据集，从GitHub上真实的软件项目问题-补丁对中提取。收集了12个流行Python库中的2,294个问题及其对应的Pull Request修复。该数据集用于评估模型自动修复真实代码缺陷的能力。                                                                                          | 🤗[数据集](https:\u002F\u002Fhuggingface.co\u002Fdatasets\u002FSWE-bench\u002FSWE-bench) \u003Cbr> 🌐[仓库](https:\u002F\u002Fgithub.com\u002FSWE-bench\u002FSWE-bench)              |\n| CodeContests  | \\~10,000                    | 2022         | 李等人（DeepMind）                                            | DeepMind为训练AlphaCode而提出的竞技编程数据集。它汇集了来自Codeforces、AtCoder等平台的大量题目和测试用例。数据集包含约10,000道多语言编程题目，可用于代码生成模型的训练和评估。                                                                             | 🤗[数据集](https:\u002F\u002Fhuggingface.co\u002Fdatasets\u002Fdeepmind\u002Fcode_contests)                                                                       |\n| LiveCodeBench | \\~400（逐年增加）           | 2024         | 贾因等人（加州大学伯克利分校 & MIT）                                 | 一个“实时”代码基准。持续收集LeetCode、AtCoder和Codeforces上最新的公开题目，总计约400道高质量编程题目。除了代码生成外，还评估模型在代码调试、自我修复及单元测试生成方面的能力。                                                                           | 🤗[数据集](https:\u002F\u002Fhuggingface.co\u002Flivecodebench) \u003Cbr> 🌐[仓库](https:\u002F\u002Fgithub.com\u002FLiveCodeBench\u002FLiveCodeBench)                     |\n| MHPP          | 210                         | 2025         | 戴建波等人                                               | 
“Mostly Hard Python Problems”（最难的 Python 编程问题），由人工设计的一组高难度Python编程任务。数据集包含七个挑战类别共210道题目，每道题都需要多步推理或复杂的算法来解决。用于评估大语言模型在代码推理效率和准确性方面的极限。 | 📄[论文](https:\u002F\u002Fopenreview.net\u002Fforum?id=TVFVx8TUbN) |\n| ProBench | - | 2025 | 杨磊等人（上海科技大学） | 专为竞技编程设计的基准。收集了2024年下半年来自Codeforces、洛谷和Nowcoder平台的竞赛题目，统一了难度和算法标签。数据集包含数百道题目，填补了高级代码推理评估的空白。 | 🤗[数据集](https:\u002F\u002Fhuggingface.co\u002Fdatasets\u002Fyl-9\u002Fprobench) \u003Cbr> 🌐[仓库](https:\u002F\u002Fgithub.com\u002FYL-9\u002Fprobench) |\n| HumanEval Pro | 164 | 2024 | 于兆健等人（微软人工智能研究院） | OpenAI HumanEval数据集的增强版。在原有的164道编程题目基础上，增加了“子问题”，要求模型先解决一个较简单的子问题，再利用其结果去解决更复杂的问题。与原始的HumanEval相比，Pro版本使模型的准确率降低了约20%。 | 🤗[数据集](https:\u002F\u002Fhuggingface.co\u002Fdatasets\u002FCodeEval-Pro\u002Fhumaneval-pro) \u003Cbr> 🌐[仓库](https:\u002F\u002Fgithub.com\u002FCodeEval-Pro\u002FCodeEval-Pro) |\n| MBPP Pro | 378 | 2024 | 于兆健等人（微软人工智能研究院） | Google MBPP编程题数据集的进阶版。从MBPP测试集中选取378道题目，并构建了类似于HumanEval Pro的附加问题，使题目更具层次性和综合性。用于更严格地评估模型在基础编程任务中的多步推理能力。 | 🤗[数据集](https:\u002F\u002Fhuggingface.co\u002Fdatasets\u002FCodeEval-Pro\u002Fmbpp-pro) \u003Cbr> 🌐[仓库](https:\u002F\u002Fgithub.com\u002FCodeEval-Pro\u002FCodeEval-Pro) |\n| EquiBench | 2,400 | 2025 | 魏安江等人（斯坦福大学 & 纽约大学） | 代码语义理解基准。通过等价性验证任务评估大语言模型对程序执行语义的理解能力。数据集提供了四种编程语言中2,400对功能等价或不等价的程序。要求模型判断两段程序的输出是否相同，以此测试其对深层代码逻辑的理解。 | 🤗[数据集](https:\u002F\u002Fhuggingface.co\u002Fdatasets\u002Fanjiangwei\u002FEquiBench-Datasets) \u003Cbr> 🌐[仓库](https:\u002F\u002Fgithub.com\u002FAnjiang-Wei\u002FEquiBench) |\n\n\n- **常识谜题**\n\n| 名称 | 题目数量 | 发布日期 | 作者 | 描述 | 相关链接 |\n| :---: | :---: | :---: | :---: | :---: | :---: |\n| LiveBench | 动态更新 | 2025 | White et al. 
(NYU & Meta AI) | 一个在线更新的、全面的LLM评估框架。每月都会新增任务，以确保测试集不会被模型的训练数据污染。任务涵盖数学、逻辑、编程和常识问答等领域。它采用自动评分和可验证的标准答案，以确保评估的公正性和客观性。 | 🤗[数据集](https:\u002F\u002Fhuggingface.co\u002Fcollections\u002Flivebench\u002Flivebench-67eaef9bb68b45b17a197a98) \u003Cbr> 🌐[仓库](https:\u002F\u002Fgithub.com\u002Flivebench\u002Flivebench) \u003Cbr> 🌐[网站](https:\u002F\u002Flivebench.ai\u002F) |\n
| BIG-Bench Hard (BBH) | 23个任务（超过2,000道题） | 2023 | Suzgun et al. (Google Research) | 从BIG-Bench大规模通用基准中挑选出的23个最具挑战性的任务集合。这些任务在GPT-3等模型上的表现远低于人类平均水平，涵盖布尔表达式求值、因果推理、日期理解以及复杂的常识\u002F逻辑问题等领域。它常被用作思维链（CoT）增强实验的基准。 | 🤗[数据集](https:\u002F\u002Fhuggingface.co\u002Fdatasets\u002Fmaveriq\u002Fbigbenchhard) \u003Cbr> 🌐[仓库](https:\u002F\u002Fgithub.com\u002Fsuzgunmirac\u002FBIG-Bench-Hard) |\n
| ZebraLogic | - | 2024 | Lin et al. (AI2 & UW) | 受“斑马谜题”启发的逻辑推理数据集。它包含一组复杂的演绎推理问题，由程序自动生成、答案唯一且可自动验证。该数据集用于测试模型在纯逻辑线索下的推理一致性。 | 🤗[数据集](https:\u002F\u002Fhuggingface.co\u002Fdatasets\u002FWildEval\u002FZebraLogic) \u003Cbr> 🌐[仓库](https:\u002F\u002Fgithub.com\u002FWildEval\u002FZeroEval) \u003Cbr> 🌐[网站](https:\u002F\u002Fhuggingface.co\u002Fspaces\u002FWildEval\u002FZebraLogic) |\n
| ARC | 7,787 | 2018 | Clark et al. (AI2) | AI2推理挑战赛，一个针对自然常识和科学问题的选择题数据集。题目来源于美国K-12科学考试，共7,787道题，分为简单集（5,197道）与挑战集（2,590道）两部分。挑战集专门收录了检索算法与词共现算法均无法正确回答的题目，因此它常被用作通用常识智能测试的基准。 | 🤗[数据集](https:\u002F\u002Fhuggingface.co\u002Fdatasets\u002Fallenai\u002Fai2_arc) |\n
| JustLogic | 4,900 | 2024 | Michael Chen et al. (USYD) | 纯粹的演绎逻辑推理基准。它包含由合成算法自动生成的4,900道命题逻辑推理问题，不依赖任何常识知识，仅专注于测试模型进行形式化逻辑推导的能力。每个任务提供一组前提和一个结论命题，模型需要判断结论的真值：真、假或不确定。 | 🌐[仓库](https:\u002F\u002Fgithub.com\u002Fmichaelchen-lab\u002FJustLogic) |\n
| QuestBench | ~600 | 2025 | Li et al. (DeepMind) | DeepMind发布的信息检索推理评估。它包含四类“不完整问题”：逻辑、规划、数学（GSM）和公式问题，每道题都缺少一个关键条件。模型必须识别出最需要澄清的问题，并利用该信息来回答原问题。数据集中约有600道此类常识\u002F推理问题，旨在评估LLM识别和询问关键信息的能力。 | 🌐[仓库](https:\u002F\u002Fgithub.com\u002Fgoogle-deepmind\u002Fquestbench) |\n\n\n- **科学推理**\n\n| 名称 | 题目数量 | 发布日期 | 作者 | 描述 | 相关链接 |\n|:-------:|:-----------:|:--------:|:------------------------:|:------------------------------------------------------------------------------------------:|:---------:|\n
| GPQA Diamond | 198 | 2024 | Rein et al. 
(NYU)                   | 研究生级别的物理\u002F生物\u002F化学问答数据集中难度极高的子集。GPQA数据集筛选出198道专家能正确回答但普通人会答错的问题。“钻石级”题目几乎达到研究生水平，要求模型具备跨学科的深度推理能力。                                                                      | 🤗[数据集](https:\u002F\u002Fhuggingface.co\u002Fdatasets\u002FIdavidrein\u002Fgpqa) \u003Cbr> 🌐[仓库](https:\u002F\u002Fgithub.com\u002Fidavidrein\u002Fgpqa)                                       |\n| MMLU-Pro                   | ~12,000            | 2024         | Wang Yubo等                    | 原始MMLU基准的增强版。包含来自14个主要领域的12,000道高质量学术考试题（选项数由4个增至10个），侧重于综合知识与复杂推理。相比原始MMLU，Pro版本显著提升了难度，模型准确率平均下降约20%。 | 🤗[数据集](https:\u002F\u002Fhuggingface.co\u002Fdatasets\u002FTIGER-Lab\u002FMMLU-Pro) \u003Cbr> 🌐[仓库](https:\u002F\u002Fgithub.com\u002FTIGER-AI-Lab\u002FMMLU-Pro)                              |\n| SuperGPQA                  | 26,529              | 2025         | Doubao (Seed)团队                  | 大规模研究生级别知识推理基准。涵盖285个学术领域，包含26,529道高难度专业考试题。超过42%的题目需要数学计算或形式化推理，旨在测试模型在长尾学科中的推理极限。                                                                                        | 🤗[数据集](https:\u002F\u002Fhuggingface.co\u002Fdatasets\u002Fm-a-p\u002FSuperGPQA) \u003Cbr> 🌐[仓库](https:\u002F\u002Fgithub.com\u002FSuperGPQA\u002FSuperGPQA)                                   |\n| Humanity’s Last Exam (HLE) | 2,500               | 2025         | CAIS & Scale AI                     | “人类最后一考”，被设计为人类知识的最后一场闭卷考试。包含数学、自然科学和人文科学等数十个领域的2,500道选择题或简答题。由全球专家协作创建，其难度超越了以往所有基准，被认为是当前AI面临的最困难的综合性考试。    | 🤗[数据集](https:\u002F\u002Fhuggingface.co\u002Fdatasets\u002Fcais\u002Fhle) \u003Cbr> 🌐[仓库](https:\u002F\u002Fgithub.com\u002Fcenterforaisafety\u002Fhle) \u003Cbr> 🌐[网站](https:\u002F\u002Flastexam.ai\u002F) |\n| TPBench                    | -                   | 2024         | Daniel J.H. Chung等 (DeepMind) | 用于评估模型解决高级理论物理问题能力的理论物理基准。该基准由Chung等人提出，收集了一系列需要高级知识和复杂推导的理论物理问题，以测试模型在物理定律和方程推导方面的推理极限。                                                       | 🤗[数据集](https:\u002F\u002Fhuggingface.co\u002Fdatasets\u002FZhiqiGao\u002FTPBench) \u003Cbr> 🌐[网站](https:\u002F\u002Ftpbench.org\u002F)                                                       |\n\n\n- **医学推理**\n\n| 名称 | 题目数量 | 发布日期 | 作者 | 描述 | 相关链接 |\n|:-------:|:-----------:|:--------:|:------------------------:|:------------------------------------------------------------------------------------------:|:---------:|\n|          MedQA          |       12,723       |     2020     |    Jin et al. (清华大学)    | 医学考试问答数据集。来源于美国医师执照考试（USMLE）的选择题，涵盖解剖学、生理学、病理学等科目。包括英文版（12,723题）以及简体中文和繁体中文版本（总计约5万题）。用于评估模型运用医学知识和诊断推理的能力。 |                    🌐[Google Drive](https:\u002F\u002Fdrive.google.com\u002Ffile\u002Fd\u002F1ImYUSLk9JbgHXOemfvyiDiirluZHPeQw\u002Fview) \u003Cbr> 🌐[仓库](https:\u002F\u002Fgithub.com\u002Fjind11\u002FMedQA)                   |\n| JAMA临床挑战 |        1,524       |     2024     | Chen et al. (约翰霍普金斯大学) |             美国医学会杂志（JAMA）的临床挑战病例集。汇集了该期刊发表的1,524例具有挑战性的临床病例，每例都配有详细的病例描述、问题、四个选项及专业解析。重点评估模型在真实、复杂的临床场景中进行诊断决策和解释的能力。            |                                                      🌐[网站](https:\u002F\u002Fjamanetwork.com\u002Fcollections\u002F44038\u002Fclinical-challenge)                                                     |\n|        Medbullets       |         308        |     2024     | Chen et al. 
(约翰霍普金斯大学) |    模拟临床问答数据集。由308道USMLE Step 2\u002F3风格的选择题组成，取自Twitter上的医学问答账号。每道题都包含病例情景、五个选项及详细解析。尽管基于常见临床场景，这些题目仍颇具挑战性，用于评估模型在临床决策和可解释性方面的能力。    |                                                           🌐[网站](https:\u002F\u002Fgithub.com\u002FHanjieChen\u002FChallengeClinicalQA)                                                           |\n|        MedXpertQA       |        4,460       |     2024     |            清华C3I团队           |                                    一套“专家级”医学推理的综合性基准。包含4,460道高难度临床知识问答，覆盖17个专科和11个身体系统。提供纯文本（病例+问答）和多模态（含医学影像）两种格式，用于评估模型对医学文本和图像的联合推理能力。                                    | 🤗[数据集](https:\u002F\u002Fhuggingface.co\u002Fdatasets\u002FTsinghuaC3I\u002FMedXpertQA) \u003Cbr> 🌐[仓库](https:\u002F\u002Fgithub.com\u002FTsinghuaC3I\u002FMedXpertQA) \u003Cbr> 🌐[网站](https:\u002F\u002Fmedxpertqa.github.io\u002F) |\n\n#### 5.1.3 能力基准测试\n重点在于长链式思维（Long CoT）推理过程中模型的局部视角或个体能力，通过考察模型每一步推理是否正确且逻辑自洽，来实现更细粒度的评估。例如，模型能否准确识别错误并加以修正，或者能否逐步完成复杂任务。\n\n- **深度推理**\n\n| 名称 | 题目数量 | 发布日期 | 作者 | 描述 | 相关链接 |\n|:-------:|:-----------:|:--------:|:------------------------:|:------------------------------------------------------------------------------------------:|:---------:|\n| ZebraLogic |       \\~1,000      |     2024     |     Bill Yuchen Lin等     |                               ZebraLogic 是一个专注于逻辑推理的 AI 基准测试，包含复杂的数学和语言推理问题，用于评估模型的高级推理能力。其题目设计类似于“斑马谜题”，挑战模型在约束条件下进行逻辑推理和问题解决。                              | 🤗[数据集](https:\u002F\u002Fhuggingface.co\u002Fspaces\u002Fallenai\u002FZebraLogic) \u003Cbr> 🌐[仓库](https:\u002F\u002Fgithub.com\u002FWildEval\u002FZeroEval) \u003Cbr> 🌐[官网](https:\u002F\u002Fhuggingface.co\u002Fblog\u002Fyuchenlin\u002Fzebra-logic) |\n|   BigGSM   |         610        |     2025     | Qiguang Chen 等 (HIT-SCIR) |                                  一个用于评估大型语言模型在多步数学问题上表现的数学推理基准测试。它扩展了经典的 GSM8K 数据集，加入了更具挑战性的数学应用题，要求模型进行更为复杂的逻辑推理和计算。                                  |                          🤗[数据集](https:\u002F\u002Fhuggingface.co\u002Fdatasets\u002FLightChen2333\u002FBigGSM) \u003Cbr> 🌐[仓库](https:\u002F\u002Fgithub.com\u002FLightChen233\u002Freasoning-boundary)                          |\n| GSM-Ranges |        30.1k       |     2025     |   Safal Shrestha 等 (NYU)  | GSM-Ranges 是基于 GSM8K 基准构建的数据集生成器。它系统性地修改数学文字题中的数值，以评估大型语言模型在广泛数值范围内的鲁棒性。通过引入数值扰动，GSM-Ranges 可以检验 LLM 在超出其训练分布范围的数值上进行数学推理的能力。 |                              🤗[数据集](https:\u002F\u002Fhuggingface.co\u002Fdatasets\u002Fguactastesgood\u002FGSM-Ranges) \u003Cbr> 🌐[仓库](https:\u002F\u002Fgithub.com\u002Fminwukim\u002FGSM-Ranges)                             |\n\n\n- **探索基准测试**\n\n| 名称 | 题目数量 | 发布日期 | 作者 | 描述 | 相关链接 |\n|:-------:|:-----------:|:--------:|:------------------------:|:------------------------------------------------------------------------------------------:|:---------:|\n|  Sys2Bench  |          -         |     2025     |         Shubham Parashar 等        | Sys2Bench 旨在系统性地测试大型语言模型在各类推理与规划任务中的表现。该基准涵盖五种主要类型的推理：算法推理、规划、算术推理、逻辑推理和常识推理，共包含 11 个子任务，从 NP 难问题（如魔方和装箱问题）到多步数学问题（如 GSM8K）。Sys2Bench 特别强调推理过程中的中间步骤，突出推理路径的质量与效率。此外，该项目还引入了 AutoHD（自动化启发式发现）方法，允许模型在推理过程中自主生成启发式函数，从而提升复杂任务的规划能力。 | 🤗[数据集](https:\u002F\u002Fhuggingface.co\u002Fdatasets\u002Fdive-lab\u002FSys2Bench) \u003Cbr> 🌐[仓库](https:\u002F\u002Fgithub.com\u002Fdivelab\u002Fsys2bench) |\n| BanditBench |          -         |     2025     | Allen Nie 等 (斯坦福大学) |            BanditBench 旨在评估大型语言模型在多臂老虎机（MAB）和上下文老虎机（CB）环境下的探索与决策能力。该基准将 LLM 模拟为智能体，仅依靠上下文信息进行多轮交互而不更新参数，以此衡量其在不确定性环境中的表现。BanditBench 提供多种任务场景，包括基于 MovieLens 
数据集的电影推荐任务，覆盖不同的动作数量和奖励分布类型（如高斯分布和伯努利分布）。此外，研究人员还引入了算法引导的推理支持和算法蒸馏方法，以提升 LLM 的探索效率。           |                           🌐[仓库](https:\u002F\u002Fgithub.com\u002Fallenanie\u002FEVOLvE?tab=readme-ov-file)                          |\n\n- **反思基准测试**\n\n| 名称 | 问题数量 | 发布日期 | 作者 | 描述 | 相关链接 |\n|:-------:|:-----------:|:--------:|:------------------------:|:------------------------------------------------------------------------------------------:|:---------:|\n|  RewardBench |          2,958         |       2024       |                Nathan Lambert 等 (AI2)               |                                                                                                                                RewardBench 是首个系统的奖励模型评估基准，由 AI2 和华盛顿大学联合发布，旨在从对齐质量、推理能力、安全性及指令遵循等多个维度，分析和比较不同训练方法下奖励模型的性能，提供统一的评估框架。                                                                                                                               |       🤗[数据集](https:\u002F\u002Fhuggingface.co\u002Fdatasets\u002Fallenai\u002Freward-bench) \u003Cbr> 🌐[仓库](https:\u002F\u002Fgithub.com\u002Fallenai\u002Freward-bench) \u003Cbr> 🌐[网站](https:\u002F\u002Fhuggingface.co\u002Fspaces\u002Fallenai\u002Freward-bench)       |\n| ProcessBench |          3,400         |       2024       |              郑楚杰 等 (通义千问团队)             | ProcessBench 是阿里巴巴通义千问团队提出的一个数学推理过程评估基准，包含 3,400 道带有分步解答的奥数级题目，每一步都经过人工标注是否存在错误。该基准要求模型识别推理过程中最早出现的错误步骤，侧重于过程监督而非仅关注最终答案。评估结果显示，通用语言模型（如 QwQ-32B-Preview）在分步批评任务中表现优于专门训练的过程奖励模型（PRM），接近 GPT-4o 的水平。 |                                            🤗[数据集](https:\u002F\u002Fhuggingface.co\u002Fdatasets\u002FQwen\u002FProcessBench) \u003Cbr> 🌐[仓库](https:\u002F\u002Fgithub.com\u002FQwenLM\u002FProcessBench)                                           |\n|   PRMBench   |          6,216         |       2025       | Mingyang Song 等 (复旦大学、上海人工智能实验室) |                                                                PRMBench 旨在填补现有基准主要关注步骤正确性而缺乏对 PRM 系统性评估的空白，提供一个涵盖简洁性、鲁棒性和敏感性等多个维度的统一评估框架。基准中的每个样本包括一道题目、一段含有错误的推理过程、错误步骤的标注以及错误原因，以评估 PRM 对细粒度错误的检测能力。                                                                |                   🤗[数据集](https:\u002F\u002Fhuggingface.co\u002Fdatasets\u002Fhitsmy\u002FPRMBench_Preview) \u003Cbr> 🌐[仓库](https:\u002F\u002Fgithub.com\u002Fssmisya\u002FPRMBench) \u003Cbr> 🌐[网站](https:\u002F\u002Fprmbench.github.io\u002F)                   |\n|  CriticBench |         \\~3,800        |       2024       |           天兰 等 (清华大学)          |                        CriticBench 由清华大学等机构提出，是一个用于评估大型语言模型批评与修正能力的综合性基准。它覆盖数学、常识、符号推理、编程和算法五大推理领域，整合了 15 个数据集，用于评估 17 种 LLM 在生成、批评和修正三个阶段的表现。研究发现，专门为批评任务训练的模型在“生成-批评-修正”（GQC）任务中表现更好，且规模更大的模型具有更高的批评一致性。                       |               🤗[数据集](https:\u002F\u002Fhuggingface.co\u002Fdatasets\u002Fllm-agents\u002FCriticBench) \u003Cbr> 🌐[仓库](https:\u002F\u002Fgithub.com\u002FCriticBench\u002FCriticBench) \u003Cbr> 🌐[网站](https:\u002F\u002Fcriticbench.github.io\u002F)              |\n|  DeltaBench  |          1,236         |       2025       |                      OpenStellarTeam                     |                                                                              DeltaBench 由 OpenStellar Team 发布，是一个用于评估大型语言模型在长链式思维（CoT）推理任务中错误检测能力的基准。它包含 1,236 个样本，涵盖数学、编程、物理-化学-生物（PCB）推理以及通用推理等领域。每个样本都配有详细的人工标注，标明错误步骤、策略转变及反思效率。                                                                              |    🤗[数据集](https:\u002F\u002Fhuggingface.co\u002Fdatasets\u002FOpenStellarTeam\u002FDeltaBench) \u003Cbr> 
🌐[仓库](https:\u002F\u002Fgithub.com\u002FOpenStellarTeam\u002FDeltaBench) \u003Cbr> 🌐[网站](https:\u002F\u002Fopenstellarteam.github.io\u002FDeltaBench\u002F)   |\n|  ErrorRadar  |          2,500         |       2024       |               Yan Yibo 等 (Squirrel AI)              |                                                                 ErrorRadar 是一个多模态数学推理错误检测基准，旨在评估多模态大型语言模型识别和分类学生解题过程中错误的能力。该基准包含来自真实教育场景的 2,500 道 K-12 数学题目，融合文本和图像信息，并标注错误步骤及错误类型。评估任务包括定位错误步骤和分类错误类型。                                                                | 🤗[数据集](https:\u002F\u002Fhuggingface.co\u002Fdatasets\u002FErrorRadar\u002FErrorRadar) \u003Cbr> 🌐[仓库](https:\u002F\u002Fanonymous.4open.science\u002Fr\u002FError-Radar\u002Freadme.md) \u003Cbr> 🌐[网站](https:\u002F\u002Fanonymous.4open.science\u002Fr\u002FError-Radar) |\n|     MEDEC    |          3,848         |       2024       |            Ben Abacha Asma 等 (微软)            |                                                                                                                              MEDEC 是首个公开的医疗错误检测与纠正基准，由微软和华盛顿大学联合发布。它包含 3,848 篇临床文本，涵盖诊断、治疗和用药等五类错误，为提升医疗文档生成的准确性和安全性提供了重要工具。                                                                                                                              |                                                                                 🌐[仓库](https:\u002F\u002Fgithub.com\u002Fabachaa\u002FMEDEC)                                                                                |\n\n#### 5.1.4 高级基准测试\n专门设计用于评估大型语言模型在复杂推理、跨领域知识整合和多模态理解方面能力的基准测试。随着基础评估逐渐被顶尖模型所饱和，研究人员开始开发更具挑战性的基准测试，以更准确地衡量模型在现实世界复杂任务中的表现。\n\n\n\n- **智能体与具身推理**\n\n| 名称 | 问题数量 | 发布日期 | 作者 | 描述 | 相关链接 |\n|:-------:|:-----------:|:--------:|:------------------------:|:------------------------------------------------------------------------------------------:|:---------:|\n|  ToolComp  |                  485                 |       2025       |          Vaskar Nath等（Scale AI）          |                                               ToolComp旨在评估大型语言模型在复杂的多步骤工具使用任务中进行推理和过程监督的能力。该基准测试包含485个经过人工编辑和验证的提示，涉及11种不同工具的使用，以及1,731个分步监督标签，能够全面评估模型在多工具推理任务中的表现。                                               |   🌐[官网](https:\u002F\u002Fscale.com\u002Fresearch\u002Ftoolcomp-a-multi-tool-reasoning-and-process-supervision-benchmark)   |\n|   OSWorld  |                  369                 |       2025       |   谢天宝等（香港大学）  |                                        OSWorld是由香港大学、Salesforce Research等机构联合发布的多模态智能体评估基准，旨在测试AI在真实计算机环境中完成开放式任务的能力。该基准包含文件操作、网页浏览、办公软件使用等场景下的369个任务，支持Ubuntu、Windows和macOS系统。                                       |       🌐[仓库](https:\u002F\u002Fgithub.com\u002Fxlang-ai\u002FOSWorld) \u003Cbr> 🌐[官网](https:\u002F\u002Fos-world.github.io\u002F)       |\n|   WebShop  | 12,087条指令 \u002F 118万件商品 |       2022       |     姚顺宇等（普林斯顿大学）    | WebShop模拟了一个电子商务网站环境，用于评估大型语言模型在真实网络交互中的能力。该基准包含118万件真实商品和12,087条用户指令，要求智能体根据自然语言指令浏览网页、搜索、筛选并完成购买任务。WebShop专注于评估模型对复杂指令的理解能力、处理网络干扰的能力以及策略探索能力。 |   🌐[仓库](https:\u002F\u002Fgithub.com\u002Fprinceton-nlp\u002FWebShop) \u003Cbr> 🌐[官网](https:\u002F\u002Fwebshop-pnlp.github.io\u002F)  |\n|  WebArena  |                  812                 |       2024       | 周淑燕等（卡内基梅隆大学） |                                              WebArena是由卡内基梅隆大学发布的一个高保真网络环境，旨在评估大型语言模型在真实网络任务中的智能体能力。该基准包含812个任务，涵盖电子商务、社交论坛、内容管理和协作开发等领域，要求模型通过自然语言指令完成多步骤的网络交互。                                              |        🌐[仓库](https:\u002F\u002Fgithub.com\u002Fweb-arena-x\u002Fwebarena) \u003Cbr> 🌐[官网](https:\u002F\u002Fwebarena.dev\u002F)        |\n|  
WebGames | 50+ | 2025 | Thomas George等（Convergence AI） | WebGames是一个网页浏览智能体基准测试，涵盖了基本浏览操作、复杂输入处理、认知任务以及工作流自动化等内容。WebGames提供了一个轻量级、可验证的测试环境，支持快速迭代和评估，非常适合用于开发更强大的网络智能体。 | 🌐[仓库](https:\u002F\u002Fgithub.com\u002Fconvergence-ai\u002Fwebgames) \u003Cbr> 🌐[官网](https:\u002F\u002Fwebgames.convergence.ai\u002F) |\n
| Text2World | 103 | 2025 | 胡孟康等（香港大学） | Text2World是由香港大学等机构提出的一个基准测试，旨在评估大型语言模型从自然语言生成符号化世界模型的能力。该基准基于规划域定义语言（PDDL），覆盖103个不同的领域，采用多维度、基于执行的评估方法，以提供更为稳健的评估结果。 | 🌐[仓库](https:\u002F\u002Fgithub.com\u002FAaron617\u002Ftext2world) \u003Cbr> 🌐[官网](https:\u002F\u002Ftext-to-world.github.io\u002F) |\n\n- **多模态推理**\n    - **复杂数学：**\n\n| 名称 | 题目数量 | 发布日期 | 作者 | 描述 | 相关链接 |\n    |:-------:|:-----------:|:--------:|:------------------------:|:------------------------------------------------------------------------------------------:|:---------:|\n
    | MathVista | 6,141 | 2023 | 潘陆等（加州大学洛杉矶分校） | MathVista是由加州大学洛杉矶分校、华盛顿大学和微软研究院联合发布的多模态数学推理评估基准。它旨在系统地评估大型语言模型和多模态模型在视觉情境下的数学推理能力。 | 🤗[数据集](https:\u002F\u002Fhuggingface.co\u002Fdatasets\u002FAI4Math\u002FMathVista) \u003Cbr> 🌐[仓库](https:\u002F\u002Fgithub.com\u002Flupantech\u002FMathVista) \u003Cbr> 🌐[官网](https:\u002F\u002Fmathvista.github.io\u002F) |\n
    | MathVision | 3,040 | 2024 | 王科等（香港中文大学） | MathVision（MATH-V）是由香港中文大学等机构发布的多模态数学推理评估基准。其目标是在视觉情境中系统地评估大型视觉-语言模型的数学推理能力。该基准包含来自16个数学领域的3,040道题目，分为五个难度级别，题目均来源于真实的数学竞赛。 | 🤗[数据集](https:\u002F\u002Fhuggingface.co\u002Fdatasets\u002FMathLLMs\u002FMathVision) \u003Cbr> 🌐[仓库](https:\u002F\u002Fgithub.com\u002Fmathllm\u002FMATH-V) \u003Cbr> 🌐[官网](https:\u002F\u002Fmathllm.github.io\u002Fmathvision\u002F) |\n
    | MathVerse | \~15,000 | 2024 | 陆子木等（香港中文大学） | MathVerse是由香港中文大学MMLab与上海人工智能实验室联合发布的多模态数学推理评估基准。它旨在全面评估多模态大型语言模型对数学图表的理解能力。该基准包含2,612道题目，涵盖平面几何、立体几何和函数等领域，并由专家进行标注。基准生成六种版本的多模态信息，总计约15,000个测试样本。MathVerse引入了思维链（CoT）评估策略，利用GPT-4V对模型的推理过程进行细粒度分析。 | 🤗[数据集](https:\u002F\u002Fhuggingface.co\u002Fdatasets\u002FAI4Math\u002FMathVerse) \u003Cbr> 🌐[仓库](https:\u002F\u002Fgithub.com\u002FZrrSkywalker\u002FMathVerse) |\n\n    - **复杂代码：**\n\n    | 名称 | 题目数量 | 发布日期 | 作者 | 描述 | 相关链接 |\n    |:-------:|:-----------:|:--------:|:------------------------:|:------------------------------------------------------------------------------------------:|:---------:|\n
    | HumanEval-V | 253 | 2024 | 张峰吉等（香港城市大学） | 
HumanEval-V是由香港城市大学提出的一种多模态代码生成评估基准，旨在测试大型多模态模型在复杂图表理解及代码生成任务中的能力。该基准包含253个Python编程任务，每个任务都配有关键图表和函数签名，要求模型根据视觉信息生成可执行代码。 | 🤗[数据集](https:\u002F\u002Fhuggingface.co\u002Fdatasets\u002FHumanEval-V\u002FHumanEval-V-Benchmark) \u003Cbr> 🌐[仓库](https:\u002F\u002Fgithub.com\u002FHumanEval-V\u002FHumanEval-V-Benchmark) \u003Cbr> 🌐[官网](https:\u002F\u002Fhumaneval-v.github.io\u002F) |\n
    | Code-Vision | 1,000+ | 2025 | 王汉斌等（北京大学） | Code-Vision是由北京大学、东北大学和香港大学联合发布的多模态代码生成评估基准。其目标是测试多模态大型语言模型理解流程图并生成相应代码的能力。该基准填补了现有基准的空白——现有基准大多侧重于文本推理，缺乏对视觉情境下代码生成的系统性评估——并提供了一个统一的评估框架。 | 🌐[仓库](https:\u002F\u002Fgithub.com\u002Fwanghanbinpanda\u002FCodeVision) \u003Cbr> 🌐[官网](https:\u002F\u002Fpingshengren0901.github.io\u002Fcodevision.io\u002F) |\n
    | ChartMimic | 4,800 | 2024 | 杨成等（清华大学） | ChartMimic是由清华大学、腾讯AI实验室等机构联合发布的多模态代码生成评估基准。它旨在评估大型多模态模型在图表理解和代码生成方面的跨模态推理能力，以弥补现有基准主要关注文本推理而缺乏对图表理解和代码生成系统性评估的不足。该基准包含两种任务类型：直接模仿和定制模仿，数据来源于多个领域的科学论文。 | 🤗[数据集](https:\u002F\u002Fhuggingface.co\u002Fdatasets\u002FChartMimic\u002FChartMimic) \u003Cbr> 🌐[仓库](https:\u002F\u002Fgithub.com\u002FChartMimic\u002FChartMimic) \u003Cbr> 🌐[官网](https:\u002F\u002Fchartmimic.github.io\u002F) |\n\n    - **复杂科学：**\n\n| 名称 | 题目数量 | 发布日期 | 作者 | 描述 | 相关链接 |\n    |:-------:|:-----------:|:--------:|:------------------------:|:------------------------------------------------------------------------------------------:|:---------:|\n
    | ScienceQA | 21,208 | 2022 | Pan Lu et al. (UCLA) | ScienceQA 是一个包含自然科学、语言科学和社会科学领域共 21,208 道题目的多模态选择题数据集，面向 K-12 年级学生。该数据集提供图像与文本相结合的背景信息、解题说明及详细答案，支持链式思维（CoT）推理，旨在评估并提升 AI 模型的多步推理能力和可解释性。 | 🤗[数据集](https:\u002F\u002Fhuggingface.co\u002Fdatasets\u002FTheMrguiller\u002FScienceQA) \u003Cbr> 🌐[仓库](https:\u002F\u002Fgithub.com\u002Flupantech\u002FScienceQA) \u003Cbr> 🌐[网站](https:\u002F\u002Fscienceqa.github.io\u002F) |\n
    | M3CoT | 11,459 | 2024 | Qiguang Chen et al. (HIT-SCIR Lab) | M3CoT 是基于 ScienceQA 构建的一个多模态、多领域、多步推理数据集，用于评估 AI 模型在复杂推理任务中的能力。与 ScienceQA 相比，M3CoT-Science 的平均推理步骤从 2.5 增加到 10.9，平均文本长度也由 48 字增至 294 字，显著提升了任务难度。该数据集涵盖科学、常识和数学等领域，强调图像与文本信息之间的交叉推理，对现有多模态大模型的推理能力提出了更高挑战。 | 🤗[数据集](https:\u002F\u002Fhuggingface.co\u002Fdatasets\u002FLightChen2333\u002FM3CoT) \u003Cbr> 🌐[仓库](https:\u002F\u002Fgithub.com\u002FLightChen233\u002FM3CoT) \u003Cbr> 🌐[网站](https:\u002F\u002Flightchen233.github.io\u002Fm3cot.github.io\u002F) |\n
    | MolPuzzle | 234 | 2024 | Kehan Guo et al. 
|                       MolPuzzle 是一个用于评估大型语言模型在分子结构分析任务中表现的多模态、多步推理数据集。该数据集包含多种光谱数据类型，包括红外光谱（IR）、质谱（MS）以及核磁共振（1H-NMR 和 13C-NMR），同时还提供分子式信息。任务分为三个阶段：分子理解、光谱分析和分子构建，模拟真实的化学推理过程。                       |   🤗[数据集](https:\u002F\u002Fhuggingface.co\u002Fdatasets\u002Fkguo2\u002FMolPuzzle_data) \u003Cbr> 🌐[仓库](https:\u002F\u002Fgithub.com\u002FKehanGuo2\u002FMolPuzzle) \u003Cbr> 🌐[网站](https:\u002F\u002Fkehanguo2.github.io\u002FMolpuzzle.io\u002F)   |\n\n    - **常识推理数据集:**\n\n    | 名称 | 题目数量 | 发布日期 | 作者 | 描述 | 相关链接 |\n    |:-------:|:-----------:|:--------:|:------------------------:|:------------------------------------------------------------------------------------------:|:---------:|\n    |   PuzzleVQA  |        2,000       |     2024     |          Yew Ken Chia et al.         | PuzzleVQA 是一个包含 2,000 道抽象图形谜题的多模态推理数据集，旨在评估大型多模态模型在颜色、数字、形状和大小等基本概念上的视觉感知、归纳和演绎能力。实验表明，即使是 GPT-4V 等先进模型，在单一概念谜题上的平均准确率也仅为 46.4%，远低于人类水平，这暴露出其在抽象模式识别和多步推理方面的局限性。 | 🤗[数据集](https:\u002F\u002Fhuggingface.co\u002Fdatasets\u002Fdeclare-lab\u002FPuzzleVQA) \u003Cbr> 🌐[仓库](https:\u002F\u002Fgithub.com\u002Fdeclare-lab\u002FLLM-PuzzleTest\u002Ftree\u002Fmaster\u002FPuzzleVQA) \u003Cbr> 🌐[网站](https:\u002F\u002Fpuzzlevqa.github.io\u002F) |\n    | LEGO-Puzzles |        1,100       |     2025     | Kexian Tang et al. (Shanghai AI Lab) |                                                                                                  LEGO-Puzzles 数据集旨在评估大型多模态语言模型在多步空间推理任务中的能力。该数据集包含 1,100 道基于乐高积木的视觉问答（VQA）题目，覆盖 11 种任务类型，包括空间理解、单步及多步序列推理等。                                                                                                 |      🤗[数据集](https:\u002F\u002Fhuggingface.co\u002Fdatasets\u002FKexianTang\u002FLEGO-Puzzles) \u003Cbr> 🌐[仓库](https:\u002F\u002Fgithub.com\u002FTangkexian\u002FLEGO-Puzzles) \u003Cbr> 🌐[网站](https:\u002F\u002Ftangkexian.github.io\u002FLEGO-Puzzles\u002F)     |\n    |     CVQA     |       10,374       |     2024     |     David Romero et al. 
(MBZUAI) | CVQA（Culturally-diverse Multilingual VQA）是一个文化多样性的多语言视觉问答基准，题目由世界各地的母语者参与构建，并配有本地文化情境下的图像，覆盖数十个国家和多种语言，用于评估模型在不同文化与语言背景下的视觉理解与推理能力。 | 🤗[数据集](https:\u002F\u002Fhuggingface.co\u002Fdatasets\u002Fafaji\u002Fcvqa) \u003Cbr> 🌐[网站](https:\u002F\u002Fcvqa-benchmark.org\u002F) |\n\n- **AI4Research:**\n\n| 名称 | 题目数量 | 发布日期 | 作者 | 描述 | 相关链接 |\n|:-------:|:-----------:|:--------:|:------------------------:|:------------------------------------------------------------------------------------------:|:---------:|\n
| SciWorld | 30个任务 \u002F 6,000+个实例 | 2022 | 王若瑶等 | SciWorld（ScienceWorld）是一个基于文本的交互式科学实验环境，旨在评估智能体在基础科学任务中的理解与多步推理能力。智能体需要在模拟环境中通过一系列操作完成实验目标，以此考察其整合知识、进行因果推理及给出可解释行为的能力。它包含30个任务，每个任务又有多种变体，总计超过6,000个实例。 | 🌐[仓库](https:\u002F\u002Fgithub.com\u002Fallenai\u002FScienceWorld) \u003Cbr> 🌐[官网](https:\u002F\u002Fsciworld.apps.allenai.org\u002F) |\n
| HardML | 100 | 2025 | Tidor-Vlad Pricope | HardML是一个专门用于评估人工智能在数据科学和机器学习领域知识与推理能力的基准数据集。它由独立机器学习工程师Tidor-Vlad Pricope创建，包含100道精心设计的选择题，涵盖自然语言处理、计算机视觉、统计建模以及经典机器学习算法等主题。这些题目难度极高，即便是经验丰富的机器学习工程师也难以全部答对。为避免数据污染，大部分题目均为原创，反映了过去两年来机器学习领域的最新进展。目前最先进的AI模型在HardML上的错误率约为30%，是MMLU-ML的三倍，这充分证明了HardML在区分模型能力方面的有效性。此外，作者还发布了稍易一些的EasyML数据集，专为参数量较少的模型设计。 | 📄[论文](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2501.15627) |\n
| MLE-BENCH | 75 | 2024 | OpenAI | MLE-bench是由OpenAI发布的基准数据集，旨在评估AI智能体在机器学习工程（MLE）任务中的实际能力。该基准从Kaggle平台上精选了75个多样化的竞赛任务，涵盖自然语言处理、计算机视觉、信号处理等领域，测试模型在数据预处理、模型训练和实验执行等方面的工程技能。在评测中，结合AIDE框架的OpenAI o1-preview模型在16.9%的任务上达到了Kaggle铜牌水平。研究还探讨了资源扩展对性能的影响以及预训练数据污染相关问题。 | 🌐[仓库](https:\u002F\u002Fgithub.com\u002Fopenai\u002Fmle-bench\u002F) \u003Cbr> 🌐[官网](https:\u002F\u002Fopenai.com\u002Findex\u002Fmle-bench\u002F) |\n
| SolutionBench | 1,053 | 2025 | 李卓群等（中国科学院软件研究所） | SolutionBench是一个用于评估AI系统在复杂工程解决方案设计方面能力的基准数据集。它旨在填补当前检索增强生成（RAG）方法在处理多约束工程问题上的空白，其特点是使用真实数据源和结构化数据。此外，作者还提出了一种名为SolutionRAG的系统，通过结合树状搜索和双点思维机制，在SolutionBench上取得了领先性能。
 | 🤗[数据集](https:\u002F\u002Fhuggingface.co\u002Fdatasets\u002Flzq2021\u002FSolutionBench) \u003Cbr> 🌐[仓库](https:\u002F\u002Fgithub.com\u002Ficip-cas\u002FDeepSolution) |\n\n\n\n### 5.2 训练数据集\n\n为了构建并提升具备强大Long CoT能力的模型，众多开源训练数据集应运而生。这些数据集为数学、科学、医学、编程以及通用推理等多个领域提供了基础性的监督信号。根据其构建方式，我们将这些数据集分为四大类：人工标注、直接蒸馏、基于搜索的蒸馏以及验证蒸馏。\n\n在本节中，我们系统地列出了每一类中具有代表性的数据集，涵盖了它们的来源、模态、适用领域及数据规模等关键信息，为寻求合适训练资源的研究人员和开发者提供全面的指南与便捷的参考。\n\n#### 5.2.1 人工标注\n\n这类数据集通过人工标注或规则构建而成，通常提供高质量且推理路径可解释的样本。尽管规模较小，但它们对于引导初始模型的对齐与评估至关重要。\n\n
- [R1-OneVision](https:\u002F\u002Fhuggingface.co\u002Fdatasets\u002FFancy-MLLM\u002FR1-Onevision) 将 LLaVA-OneVision 的高质量数据与特定领域的数据集相结合，弥合了视觉与文本理解之间的鸿沟，提供了涵盖自然场景、科学、数学、基于 OCR 的内容以及复杂图表等丰富且具备上下文感知的推理任务。\n- [M3CoT](https:\u002F\u002Fgithub.com\u002FLightChen233\u002FM3CoT) 为多领域、多步骤、多模态的思维链研究奠定了基础。\n- [Big-Math-RL-Verified](https:\u002F\u002Fhuggingface.co\u002Fdatasets\u002FSynthLabsAI\u002FBig-Math-RL-Verified) 专为使用大型语言模型（LLMs）进行强化学习训练而设计，例如 [PPO](https:\u002F\u002Farxiv.org\u002Fabs\u002F1707.06347)、[GRPO](https:\u002F\u002Farxiv.org\u002Fabs\u002F2402.03300) 等。\n- [GSM8K](https:\u002F\u002Fgithub.com\u002Fopenai\u002Fgrade-school-math) 是一个高质量、语言多样性丰富的小学数学应用题数据集。\n\n| 名称                 | 类别             | 来源   | 模态      | 数量 |\n| :------------------- | :------------------- | :----- | :------------ | :------- |\n| R1-OneVision         | 数学、科学       | 规则   | 视觉 + 文本 | 119K     |\n| M3CoT                | 数学、科学       | 人工   | 视觉 + 文本 | 11K      |\n| Big-Math-RL-Verified | 数学             | 人工   | 文本          | 251K     |\n| GSM8K                | 数学             | 人工   | 文本          | 8K       |\n\n#### 5.2.2 直接蒸馏\n\n该类方法直接利用大型语言模型，通过提示引导其生成思维链式推理数据。这些数据集可以扩展到数百万个样本，覆盖广泛的领域（其中“采样—验证—过滤”的典型流程可参考本小节表格后的示意代码）。\n\n
- [NaturalReasoning](https:\u002F\u002Farxiv.org\u002Fabs\u002F2502.13124) 通过知识蒸馏实验验证，NaturalReasoning 能有效从强大的教师模型中提取并迁移推理能力。它在使用外部奖励模型进行无监督自训练或采用自我奖励机制时同样高效。\n
- [NuminaMath-CoT](https:\u002F\u002Fhuggingface.co\u002Fdatasets\u002FAI-MO\u002FNuminaMath-CoT) 采用思维链（CoT）格式来解答每一道题目。该数据集涵盖了中国高中数学练习题、美国及国际数学奥林匹克竞赛题目等，数据来源于在线考试 PDF 文件和数学论坛。处理步骤包括：(a) 对原始 PDF 文件进行 OCR 识别；(b) 将内容分割为题目-解答对；(c) 翻译成英文；(d) 重新组织以生成思维链推理格式；以及 (e) 最终答案的格式化。\n
- [NuminaMath-TIR](https:\u002F\u002Fhuggingface.co\u002Fdatasets\u002FAI-MO\u002FNuminaMath-TIR) 专注于从 NuminaMath-CoT 数据集中选取的需输出数值结果的题目。研究团队利用 GPT-4 构建了一条流水线，用于生成类似 TORA 的推理路径、执行代码并得出结果，直至完成最终解答，并过滤掉最终答案与参考答案不符的解法。\n
- [DART-Math-uniform](https:\u002F\u002Fhuggingface.co\u002Fdatasets\u002Fhkust-nlp\u002Fdart-math-uniform) 通过应用 DARS-Uniform 方法构建数据集。\n
- [DART-Math-hard](https:\u002F\u002Fhuggingface.co\u002Fdatasets\u002Fhkust-nlp\u002Fdart-math-hard) 是通过应用 DARS-Prop2Diff 方法，基于 MATH 和 GSM8K 训练数据集中的查询集构建的数学问答样本。它在多个具有挑战性的数学推理基准上取得了 SOTA 效果；与传统的拒绝采样方法不同，该数据集刻意偏好难度较高的问题。\n
- [DART-Math-pool-math](https:\u002F\u002Fhuggingface.co\u002Fdatasets\u002Fhkust-nlp\u002Fdart-math-pool-math) 是由 MATH 训练数据集的查询集合成的数据池，包含所有正确答案的样本以及过程中生成的额外元数据。DART-Math-* 系列数据集均是从 DART-Math-pool-* 数据池中抽取的。\n
- [DART-Math-pool-gsm8k](https:\u002F\u002Fhuggingface.co\u002Fdatasets\u002Fhkust-nlp\u002Fdart-math-pool-gsm8k) 是由 GSM8K 训练数据集的查询集合成的数据池，包含所有正确答案的样本以及额外的元数据。DART-Math-* 系列数据集也从中抽取。\n
- [OpenO1-SFT](https:\u002F\u002Fhuggingface.co\u002Fdatasets\u002FO1-OPEN\u002FOpenO1-SFT) 是一个用于通过 SFT 微调语言模型以激活思维链推理的数据集。\n- [OpenO1-SFT-Pro](https:\u002F\u002Fhuggingface.co\u002Fdatasets\u002FO1-OPEN\u002FOpenO1-SFT-Pro) 
是 OpenO1-SFT 的扩充版本，同样用于通过 SFT 微调语言模型以激活思维链推理。\n
- [OpenO1-SFT-Ultra](https:\u002F\u002Fhuggingface.co\u002Fdatasets\u002FO1-OPEN\u002FOpenO1-SFT-Ultra) 基于现有的开源数据集，使用 openo1-qwen-sft 模型合成而成。\n
- [Medical-o1](https:\u002F\u002Fhuggingface.co\u002Fdatasets\u002FFreedomIntelligence\u002Fmedical-o1-reasoning-SFT) 是一个基于可医学验证的问题及 LLM 验证器构建的 SFT 医学推理数据集。\n
- [AoPS-Instruct](https:\u002F\u002Fhuggingface.co\u002Fdatasets\u002FDeepStudentLlama\u002FAoPS-Instruct) 是一个大规模高质量的高级数学推理问答数据集，采用可扩展的方法创建并维护。\n
- [Orca-Math](https:\u002F\u002Farxiv.org\u002Fabs\u002F2402.14830) 是一个高质量的合成数据集，包含 20 万道数学题，这些题目是在多智能体协作的设置下生成的。\n
- [MATH-plus](https:\u002F\u002Fhuggingface.co\u002Fdatasets\u002FTIGER-Lab\u002FWebInstructSub) 从预训练网络语料库中收集了 1000 万条自然发生的指令数据。\n
- [UltraInteract-SFT](https:\u002F\u002Fhuggingface.co\u002Fcollections\u002Fopenbmb\u002Feurus-660bc40bec5376b3adc9d1c5) 专为复杂推理任务设计，有助于探索推理任务中的偏好学习，适用于监督微调和偏好学习。每条指令都包含一个偏好树，由以下部分组成：(1) 具有多重规划策略且格式一致的推理链条；(2) 与环境的多轮交互轨迹及评论；以及 (3) 用于促进偏好学习的配对数据。\n
- [MathCodeInstruct](https:\u002F\u002Fhuggingface.co\u002Fdatasets\u002FMathLLMs\u002FMathCodeInstruct) 是一个新颖的高质量数据集，包含数学问题及其基于代码的解答。\n
- MathCodeInstruct-Plus（[论文¹](https:\u002F\u002Farxiv.org\u002Fabs\u002F2310.03731)、[论文²](https:\u002F\u002Farxiv.org\u002Fabs\u002F2308.07921)）是其同系列的扩展数据集，同样包含数学问题及其基于代码的解答。\n
- [OpenMathInstruct-1](https:\u002F\u002Fhuggingface.co\u002Fdatasets\u002Fnvidia\u002FOpenMathInstruct-1) 是一个数学指令调整数据集，利用 Mixtral-8x7B 模型生成了 180 万个题目-解答对。题目来源于 GSM8K 和 MATH 训练子集；解答则由 Mixtral 模型通过文本推理和 Python 解释器执行的代码块合成。\n
- [OpenMathInstruct-2](https:\u002F\u002Fgithub.com\u002FNVIDIA\u002FNeMo-Skills) 是一个大规模的数学推理数据集，用于训练大型语言模型（LLM)。\n
- [AceMath-Instruct](https:\u002F\u002Fhuggingface.co\u002Fcollections\u002Fnvidia\u002Facemath-678917d12f09885479d549fe) 是 AceMath 中用于训练尖端数学推理模型的训练数据集。\n
- [QwQ-LongCoT](https:\u002F\u002Fhuggingface.co\u002Fdatasets\u002FPowerInfer\u002FQWQ-LONGCOT-500K) 整合了来自多个高质量来源的提示，以创建多样化且全面的训练数据。\n
- [SCP-116K](https:\u002F\u002Fhuggingface.co\u002Fdatasets\u002FEricLu\u002FSCP-116K) 是一套高质量的科学问答对，由网络爬取的文档中自动提取而来。每个问题都配有从原始资料中提取的匹配答案，以及由先进语言模型生成的回答和推理过程。\n
- [R1-Distill-SFT](https:\u002F\u002Fhuggingface.co\u002Fdatasets\u002FServiceNow-AI\u002FR1-Distill-SFT) 使用 DeepSeek-R1-32B 进行蒸馏；数据由 NuminaMath 和 Tulu 提供；每个提示都会采样一条回答。\n
- [Sky-T1-Data](https:\u002F\u002Fhuggingface.co\u002Fdatasets\u002FNovaSky-AI\u002FSky-T1_data_17k) 包含来自 APPs 和 TACO 的 5000 条编码数据，以及来自 NuminaMath 数据集中 AIME、MATH 和奥林匹克竞赛子集的 1 万条数学数据。此外，还保留了 STILL-2 中的 1000 条科学和谜题数据。\n
- [Bespoke-Stratos-17k](https:\u002F\u002Fhuggingface.co\u002Fdatasets\u002Fbespokelabs\u002FBespoke-Stratos-17k) 是一个包含问题、推理轨迹和答案的推理数据集。它复制并改进了 Berkeley Sky-T1 数据流水线，使用 DeepSeek-R1 蒸馏后的 SFT 数据。\n
- [s1K](https:\u002F\u002Fhuggingface.co\u002Fdatasets\u002Fsimplescaling\u002Fs1K) 包含 1000 个多样、高质量且困难的例题（源自 Gemini Thinking），用于优化推理路径和解答。\n
- MedThoughts-8K\n
- [SYNTHETIC-1](https:\u002F\u002Fwww.primeintellect.ai\u002Fblog\u002Fsynthetic-1-release) 是 DeepSeek-R1 生成的最大开放式推理数据集，涵盖数学、编程、科学等领域的推理轨迹，并由特定任务的验证器确认其正确性。\n
- [Medical-R1-Distill-Data](https:\u002F\u002Fhuggingface.co\u002Fdatasets\u002FFreedomIntelligence\u002FMedical-R1-Distill-Data) 是一个基于 HuatuoGPT-o1 可医学验证问题的 SFT 数据集，由 DeepSeek-R1（全功率版）蒸馏而来。\n
- [Medical-R1-Distill-Data-Chinese](https:\u002F\u002Fhuggingface.co\u002Fdatasets\u002FFreedomIntelligence\u002FMedical-R1-Distill-Data-Chinese) 是一个基于 HuatuoGPT-o1 可医学验证问题的 SFT 中文版数据集，由 DeepSeek-R1（全功率版）蒸馏而来。\n
- [RLVR-GSM-MATH](https:\u002F\u002Fgithub.com\u002Fallenai\u002Fopen-instruct) 用于训练 Tulu3 模型。\n- 
[LIMO](https:\u002F\u002Fhuggingface.co\u002Fdatasets\u002FGAIR\u002FLIMO) 即“Less is More”（少即是多）：仅以 817 条精选样本用于激发模型的数学推理能力。\n
- [OpenThoughts-114k](https:\u002F\u002Fhuggingface.co\u002Fdatasets\u002Fopen-thoughts\u002FOpenThoughts-114k) 是一个开放的合成推理数据集，包含 11.4 万个高质量示例，覆盖数学、科学、代码和谜题等领域。\n
- [Magpie-Reasoning-V2](https:\u002F\u002Fgithub.com\u002Fmagpie-align\u002Fmagpie) 通过使用预查询模板提示对齐 LLM，生成高质量的对齐数据。\n
- [Dolphin-R1](https:\u002F\u002Fhuggingface.co\u002Fdatasets\u002Fcognitivecomputations\u002Fdolphin-r1) 是一个包含 80 万条样本的数据集，其构成与用于训练 DeepSeek-R1 Distill 模型的数据集相似。\n\n| 名称                            | 类别                           | 来源                            | 模态 | 数量 |\n| :------------------------------ | :--------------------------------- | :-------------------------------- | :------- | :------- |\n
| NaturalReasoning | 科学、通用 | Llama3.3-70B | 语言 | 100万 |\n| NuminaMath-CoT | 数学 | GPT-4o | 语言 | 86万 |\n| NuminaMath-TIR | 数学 | GPT-4o | 语言 | 7.3万 |\n| DART-Math-uniform | 数学 | DeepSeekMath-7B-RL | 语言 | 59.1万 |\n| DART-Math-hard | 数学 | DeepSeekMath-7B-RL | 语言 | 58.5万 |\n| DART-Math-pool-math | 数学 | DeepSeekMath-7B-RL | 语言 | 160万 |\n| DART-Math-pool-gsm8k | 数学 | DeepSeekMath-7B-RL | 语言 | 270万 |\n| OpenO1-SFT | 数学、科学、通用 | - | 语言 | 7.8万 |\n| OpenO1-SFT-Pro | 数学、科学、通用 | - | 语言 | 12.6万 |\n| OpenO1-SFT-Ultra | 数学、科学、通用 | - | 语言 | 2800万 |\n| Medical-o1 | 医学 | DeepSeek R1 | 语言 | 5万 |\n| AoPS-Instruct | 数学 | Qwen2.5-72B | 语言 | 64.7万 |\n| Orca-Math | 数学 | GPT-4 | 语言 | 20万 |\n| MATH-plus | 数学 | GPT-4 | 语言 | 89.4万 |\n| UltraInteract-SFT | 数学、代码、逻辑 | GPT-4 CoT + PoT | 语言 | 28.9万 |\n| MathCodeInstruct | 数学 | GPT-4 + Codellama PoT | 语言 | 7.9万 |\n| MathCodeInstruct-Plus | 数学 | - | 语言 | 8.8万 |\n| OpenMathInstruct-1 | 数学 | Mixtral-8x7B PoT | 语言 | 500万 |\n| OpenMathInstruct-2 | 数学 | Llama3.1-405B | 语言 | 1400万 |\n| AceMath-Instruct | 数学、通用 | Qwen2.5-Math-72B + GPT-4o-mini | 语言 | 500万 |\n| QwQ-LongCoT | 通用 | QwQ | 语言 | 28.6万 |\n| SCP-116K | 科学 | QwQ + O1-mini | 语言 | 11.7万 |\n| R1-Distill-SFT | 数学 | DeepSeek-R1-32B | 语言 | 17.2万 |\n| Sky-T1-Data | 数学、代码、科学、谜题 | QwQ | 语言 | 1.7万 |\n| Bespoke-Stratos-17k | 数学、代码、科学、谜题 | DeepSeek R1 | 语言 | 1.7万 |\n| s1K | 数学 | DeepSeek R1 | 语言 | 1000 |\n| MedThoughts-8K | 医学 | DeepSeek R1 | 语言 | 8000 |\n| SYNTHETIC-1 | 数学、代码、科学 | DeepSeek R1 | 语言 | 89.4万 |\n| Medical-R1-Distill-Data | 医学 | DeepSeek R1 | 语言 | 2.2万 |\n| Medical-R1-Distill-Data-Chinese | 医学 | DeepSeek R1 | 语言 | 1.7万 |\n| RLVR-GSM-MATH | 数学 | - | 语言 | 3万 |\n| LIMO | 数学 | 人类 + DeepSeek R1 + Qwen2.5-32B | 语言 | 817 |\n| OpenThoughts-114k | 数学、代码、科学、谜题 | - | 语言 | 11.4万 |\n| Magpie-Reasoning-V2 | 数学、代码 | DeepSeek-R1 + Llama-70B | 语言 | 25万 |\n| Dolphin-R1 | 数学、科学 | DeepSeek R1 + Gemini2 + Dolphin | 语言 | 81.4万 |
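\n\n上表中的多数直接蒸馏数据集都遵循类似的“采样—验证—过滤”流程（如 NuminaMath-TIR 按参考答案过滤解法的做法）。下面给出一个极简的 Python 示意，其中 teacher_generate 为本文假设的教师模型调用接口，并非任何上述项目的官方实现：\n\n```python\n# 直接蒸馏的最小示意：教师模型对每题采样多条 CoT，仅保留最终答案与参考答案一致的轨迹\n# 注意：teacher_generate() 是本文假设的接口，使用时需替换为实际的模型调用\n\ndef extract_answer(cot):\n    # 约定解答末尾以 '#### 答案' 的形式给出最终结果（GSM8K 风格）\n    return cot.split('####')[-1].strip() if '####' in cot else ''\n\ndef distill(problems, teacher_generate, n_samples=4):\n    kept = []\n    for prob in problems:\n        for _ in range(n_samples):  # 每题采样多条推理轨迹\n            cot = teacher_generate(prob['question'])\n            if extract_answer(cot) == prob['answer']:  # 答案不一致的轨迹直接丢弃\n                kept.append({'question': prob['question'], 'cot': cot})\n    return kept\n```\n\n实际流水线（如 NuminaMath-TIR）还会在过滤前执行解答中的代码块并核对运行结果，这里从略。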
\n\n#### 5.2.3 基于搜索的蒸馏\n\n基于搜索的数据集是通过自动化搜索算法构建的，该算法会探索推理树以生成最优的推理轨迹。尽管规模有限，但这些数据集通常能够提供高质量且有深度的推理样本。\n\n- [STILL-1](https:\u002F\u002Farxiv.org\u002Fabs\u002F2411.11694) 通过奖励引导的树搜索算法提升了大型语言模型（LLM）的推理能力。\n\n| 名称    | 类别                           | 来源                       | 模态 | 数量 |\n| :------ | :--------------------------------- | :--------------------------- | :------- | :------- |\n| STILL-1 | 数学、代码、科学、谜题        | LLaMA-3.1-8B-Instruct + MCTS | 语言     | 5000       |\n\n#### 5.2.4 验证蒸馏\n\n经过验证的数据集借助基于规则的过滤、测试用例验证或大型语言模型验证来确保质量，在可扩展性和可靠性之间取得了平衡（测试用例验证的典型做法可参考本小节表格后的示意代码）。\n\n
- [KodCode-V1](https:\u002F\u002Fhuggingface.co\u002Fdatasets\u002FKodCode\u002FKodCode-V1) 提供可验证的代码任务解决方案和测试用例；专为监督微调（SFT）和强化学习（RL）优化设计；涵盖多个领域（从算法到特定领域的软件知识）以及不同难度级别（从基础编码练习到面试和竞赛编程挑战）。\n
- [KodCode-V1-SFT-R1](https:\u002F\u002Fhuggingface.co\u002Fdatasets\u002FKodCode\u002FKodCode-V1-SFT-R1) 与 KodCode-V1 同源，但解答与测试用例由 DeepSeek R1 生成（见下表“来源”列），同样面向 SFT 与 RL 优化场景。\n
- [OpenR1-Math](https:\u002F\u002Fhuggingface.co\u002Fdatasets\u002Fopen-r1\u002FOpenR1-Math-220k) 是一个大规模数学推理数据集，由 DeepSeek R1 为 NuminaMath 1.5 版本的问题生成，每道题包含两到四条推理路径。\n
- [Chinese-DeepSeek-R1-Distill-Data](https:\u002F\u002Fhuggingface.co\u002Fdatasets\u002FCongliu\u002FChinese-DeepSeek-R1-Distill-data-110k) 是来自 DeepSeek-R1 的中文开源蒸馏数据集，不仅包含数学数据，还包含大量通用类型的数据。\n
- [AM-DeepSeek-R1-Distilled](https:\u002F\u002Fhuggingface.co\u002Fdatasets\u002Fa-m-team\u002FAM-DeepSeek-R1-Distilled-1.4M) 包含来自众多开源数据集的问题，这些问题经过语义去重和清洗，以避免测试集污染。答案是从推理模型（主要是 DeepSeek-R1）中提取的，并经过严格验证：数学问题通过核对答案进行验证，代码问题通过测试用例验证，其他任务则通过奖励模型评估。\n\n\n| 名称                             | 类别                      | 来源                               | 模态 | 数量 |\n| :------------------------------- | :---------------------------- | :----------------------------------- | :------- | :------- |\n| KodCode-V1                       | 代码                          | GPT-4 + 测试用例验证         | 文本     | 447K     |\n| KodCode-V1-SFT-R1                | 代码                          | DeepSeek R1 + 测试用例验证   | 文本     | 443K     |\n| OpenR1-Math                      | 数学                   | DeepSeek R1 + 规则与大语言模型验证  | 文本 | 225K |\n| Chinese-DeepSeek-R1-Distill-Data | 数学、科学、通用            | DeepSeek R1 + 规则与大语言模型验证  | 文本     | 110K     |\n| AM-DeepSeek-R1-Distilled         | 数学、代码、通用            | 奖励模型 + 规则与大语言模型验证 | 文本     | 1.4M     |
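\n\n作为上述“测试用例验证”环节的直观说明，下面给出一个示意性的最小实现（passes_tests、verify_dataset 等函数名均为本文假设，并非任何数据集的官方代码）：只有通过全部单元测试的蒸馏解答才会进入训练集：\n\n```python\n# 验证蒸馏中“测试用例验证”的最小示意（实际流水线应在沙箱中隔离执行不可信代码）\n\ndef passes_tests(solution_code, test_cases):\n    env = {}\n    try:\n        exec(solution_code, env)  # 加载候选解答中定义的函数\n        for case in test_cases:  # 例如 {'call': 'add(1, 2)', 'expected': 3}\n            if eval(case['call'], env) != case['expected']:\n                return False\n        return True\n    except Exception:  # 任何运行错误都视为验证失败\n        return False\n\ndef verify_dataset(samples):\n    # samples 中每条样本含 'code'（模型生成的解答）与 'tests'（测试用例列表）\n    return [s for s in samples if passes_tests(s['code'], s['tests'])]\n```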
\n\n\n## 7. 论文列表 & 精选资源\n- [Awesome-Long-Chain-of-Thought-Reasoning](pages\u002Fpaper.md)（我们的官方论文列表，1000+篇）\n\n- [Awesome-System2-Reasoning-LLM](https:\u002F\u002Fgithub.com\u002Fzzli2022\u002FAwesome-System2-Reasoning-LLM)\n\n\u003Cimg src=\"https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002FLightChen233_Awesome-Long-Chain-of-Thought-Reasoning_readme_7dd87192ac55.jpg\" style=\"width: 580pt\">\n\n\n# 🎁 引用\n如果您觉得这项工作有用，请引用我们。\n```bib\n@misc{chen2025reasoning,\n      title={Towards Reasoning Era: A Survey of Long Chain-of-Thought for Reasoning Large Language Models}, \n      author={Qiguang Chen and Libo Qin and Jinhao Liu and Dengyun Peng and Jiannan Guan and Peng Wang and Mengkang Hu and Yuhang Zhou and Te Gao and Wanxiang Che},\n      year={2025},\n      eprint={2503.09567},\n      archivePrefix={arXiv},\n      primaryClass={cs.AI},\n      url={https:\u002F\u002Farxiv.org\u002Fabs\u002F2503.09567}, \n}\n```\n\n\n# 贡献\n如果您有关于长链式思维的有趣新闻，也可以在 Twitter 上 @Qiguang_Chen（[QiguangChen](https:\u002F\u002Ftwitter.com\u002FQiguangChen)）或发送邮件至 [charleschen2333@gmail.com](mailto:charleschen2333@gmail.com)，以便我们在 GitHub 仓库中跟进并更新。\n\n希望大家都能享受长链式思维的时代 :)\n\n\u003C!-- omit in toc -->\n# ⭐ 星标历史\n\n\u003Ca href=\"https:\u002F\u002Fstar-history.com\u002F#LightChen233\u002FAwesome-Long-Chain-of-Thought-Reasoning&Date\">\n \u003Cpicture>\n   \u003Csource media=\"(prefers-color-scheme: dark)\" srcset=\"https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002FLightChen233_Awesome-Long-Chain-of-Thought-Reasoning_readme_cee7767d680e.png&theme=dark\" \u002F>\n   \u003Csource media=\"(prefers-color-scheme: light)\" srcset=\"https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002FLightChen233_Awesome-Long-Chain-of-Thought-Reasoning_readme_cee7767d680e.png\" \u002F>\n   \u003Cimg alt=\"星标历史图表\" src=\"https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002FLightChen233_Awesome-Long-Chain-of-Thought-Reasoning_readme_cee7767d680e.png\" \u002F>\n \u003C\u002Fpicture>\n\u003C\u002Fa>","# Awesome-Long-Chain-of-Thought-Reasoning 快速上手指南\n\n本项目并非一个可直接安装运行的单一软件包，而是一个**长思维链（Long CoT）推理大模型的研究资源汇总库**。它包含了最新的论文综述、经典模型列表以及技术分类指南。以下是如何快速获取并利用这些资源的步骤。\n\n## 1. 环境准备\n\n本项目主要提供文档、论文链接和代码库索引，因此对系统环境要求极低。\n\n*   **操作系统**：Windows \u002F macOS \u002F Linux 均可。\n*   **前置依赖**：\n    *   **Git**：用于克隆仓库。\n    *   **浏览器**：用于查看整理的论文列表和技术图表。\n    *   **Python (可选)**：如果你计划运行仓库中链接的具体模型代码（如 DeepSeek-R1, Qwen3 等），建议安装 Python 3.8+ 及对应的深度学习框架（PyTorch\u002FTensorFlow）。\n\n## 2. 安装步骤\n\n通过 Git 克隆仓库到本地，即可获取完整的中文教程、论文列表及技术分类图。\n\n```bash\ngit clone https:\u002F\u002Fgithub.com\u002FLightChen233\u002FAwesome-Long-Chain-of-Thought-Reasoning.git\ncd Awesome-Long-Chain-of-Thought-Reasoning\n```\n\n> **国内加速建议**：\n> 如果直接克隆速度较慢，可使用 Gitee 镜像（如有）或配置 Git 代理：\n> ```bash\n> git clone https:\u002F\u002Fgitee.com\u002Fmirrors\u002FAwesome-Long-Chain-of-Thought-Reasoning.git \n> # 注：若官方未同步 Gitee 镜像，请使用以下命令配置临时代理\n> export GIT_PROXY_COMMAND=\"connect-proxy -S 127.0.0.1:1080 %h %p\" \n> # 或者在 git clone 时指定 config\n> git -c http.proxy=http:\u002F\u002F127.0.0.1:7890 clone https:\u002F\u002Fgithub.com\u002FLightChen233\u002FAwesome-Long-Chain-of-Thought-Reasoning.git\n> ```\n\n克隆完成后，直接在本地浏览器打开 `README-zh.md` 或 `pages\u002Fpaper.md` 即可查看详细的中文分类资源。
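\n\n如果想在不打开浏览器的情况下快速检索论文列表，也可以用几行 Python 脚本完成（仅为示意，假设 `pages\u002Fpaper.md` 是逐行组织的 Markdown 列表）：\n\n```python\n# 在本地论文列表中按关键词检索（示意脚本，非仓库自带工具）\nimport pathlib\n\ndef search_papers(keyword, path='pages\u002Fpaper.md'):\n    text = pathlib.Path(path).read_text(encoding='utf-8')\n    return [line for line in text.splitlines() if keyword.lower() in line.lower()]\n\nfor hit in search_papers('test-time scaling')[:10]:  # 只看前 10 条命中\n    print(hit)\n```\n\n## 3. 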
基本使用\n\n本项目的核心用法是**查阅资料**与**定位模型**。\n\n### 3.1 查阅长思维链（Long CoT）技术图谱\n打开项目根目录下的 `README-zh.md` 文件，你将看到按能力分类的技术全景图：\n*   **深度推理 (Deep Reasoning)**：查看自然语言、结构化语言（代码\u002F符号）及隐空间推理的代表性工作。\n*   **广泛探索与反思**：了解模型如何进行多路径探索和自我修正。\n*   **经典模型列表**：快速访问 OpenAI-o1, DeepSeek-R1, Qwen3, Kimi-k1.5 等模型的官方仓库或论文链接。\n\n### 3.2 运行具体的推理模型（示例）\n本项目索引了多个开源模型。以列表中提到的 **DeepSeek-R1** 为例，若要实际体验 Long CoT 推理，需前往其对应仓库进行部署：\n\n```bash\n# 1. 进入你喜欢的目录\ncd ~\u002Fprojects\n\n# 2. 克隆具体模型仓库 (以 DeepSeek-R1 为例)\ngit clone https:\u002F\u002Fgithub.com\u002Fdeepseek-ai\u002FDeepSeek-R1.git\ncd DeepSeek-R1\n\n# 3. 安装依赖 (参考该具体模型的 requirements.txt)\npip install -r requirements.txt\n\n# 4. 运行推理示例 (具体命令需参照该模型文档)\npython inference.py --prompt \"请解决这个复杂的数学问题...\"\n```\n\n### 3.3 追踪最新论文\n项目维护了一份动态更新的论文列表。你可以定期拉取最新内容以跟进前沿研究：\n\n```bash\ncd Awesome-Long-Chain-of-Thought-Reasoning\ngit pull origin main\n```\n随后查看 `pages\u002Fpaper.md` 获取超过 1000 篇已评审的 Long CoT 相关论文链接。","某顶尖 AI 实验室的研究团队正致力于复现并优化类似 OpenAI-o1 的复杂推理模型，以解决高难度的数学证明与代码生成任务。\n\n### 没有 Awesome-Long-Chain-of-Thought-Reasoning 时\n- **文献梳理困难**：面对海量且分散的 arXiv 论文，研究人员难以区分“长思维链（Long CoT）”与传统“短思维链”的本质差异，导致技术选型迷茫。\n- **核心概念混淆**：团队在调试模型时，无法准确理解“过度思考（overthinking）”和“测试时扩展（test-time scaling）”等现象的成因，浪费大量算力进行无效尝试。\n- **缺乏系统指引**：新手研究员缺乏统一的知识分类体系，难以快速掌握深度推理、广泛探索等关键特性，入门门槛极高且容易走弯路。\n- **前沿动态滞后**：由于缺少实时更新的资源库，团队难以及时获取关于多模态推理整合等最新研究方向，错失创新机会。\n\n### 使用 Awesome-Long-Chain-of-Thought-Reasoning 后\n- **知识体系清晰**：依托其收录的超 1000 篇综述论文及新颖分类法，团队迅速厘清了 Long CoT 的独特范式，精准锁定了适合当前任务的技术路线。\n- **现象洞察深入**：通过研读其中关于“过度思考”等关键现象的深度分析，研究人员成功调整了推理策略，显著提升了模型在复杂任务中的效率与连贯性。\n- **上手效率倍增**：利用其专为初学者设计的双语教程和结构化概览，新成员能在短时间内建立完整的领域认知，快速投入到核心算法改进中。\n- **紧跟前沿趋势**：团队借助其持续的月度更新，第一时间掌握了多模态推理与效率优化的最新进展，为下一代模型架构设计指明了方向。\n\nAwesome-Long-Chain-of-Thought-Reasoning 将碎片化的前沿研究转化为系统化的行动指南，极大地加速了从理论调研到高性能推理模型落地的全过程。","https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002FLightChen233_Awesome-Long-Chain-of-Thought-Reasoning_221787a5.png","LightChen233","Qiguang Chen","https:\u002F\u002Foss.gittoolsai.com\u002Favatars\u002FLightChen233_4f1fc272.png","A happy NLPer and Software Developer",null,"https:\u002F\u002Fgithub.com\u002FLightChen233",623,27,"2026-04-06T10:54:53",1,"","未说明",{"notes":86,"python":84,"dependencies":87},"该项目是一个综述资源列表（Awesome List），用于整理和介绍长思维链（Long CoT）相关的论文、模型和技术，本身不是一个可直接运行的软件工具或代码库，因此没有具体的运行环境、依赖库或硬件需求。用户需根据列表中提到的具体模型（如 DeepSeek-R1, Qwen3 等）去查阅其各自仓库的环境要求。",[],[35,13],[90,91,92,93,94,95,96,97,98,99,100,101,102,103,104],"agent","chain-of-thought","deepseek-r1","long","o1","o3","openai-o1","r1","reasoning","reasoning-language-models","reinforcement-learning","rl","system-2","thinking","long-chain-of-thought","2026-03-27T02:49:30.150509","2026-04-07T09:54:20.798587",[],[]]