[{"data":1,"prerenderedAt":-1},["ShallowReactive",2],{"similar-DSXiangLi--DecryptPrompt":3,"tool-DSXiangLi--DecryptPrompt":64},[4,17,27,35,43,56],{"id":5,"name":6,"github_repo":7,"description_zh":8,"stars":9,"difficulty_score":10,"last_commit_at":11,"category_tags":12,"status":16},3808,"stable-diffusion-webui","AUTOMATIC1111\u002Fstable-diffusion-webui","stable-diffusion-webui 是一个基于 Gradio 构建的网页版操作界面，旨在让用户能够轻松地在本地运行和使用强大的 Stable Diffusion 图像生成模型。它解决了原始模型依赖命令行、操作门槛高且功能分散的痛点，将复杂的 AI 绘图流程整合进一个直观易用的图形化平台。\n\n无论是希望快速上手的普通创作者、需要精细控制画面细节的设计师，还是想要深入探索模型潜力的开发者与研究人员，都能从中获益。其核心亮点在于极高的功能丰富度：不仅支持文生图、图生图、局部重绘（Inpainting）和外绘（Outpainting）等基础模式，还独创了注意力机制调整、提示词矩阵、负向提示词以及“高清修复”等高级功能。此外，它内置了 GFPGAN 和 CodeFormer 等人脸修复工具，支持多种神经网络放大算法，并允许用户通过插件系统无限扩展能力。即使是显存有限的设备，stable-diffusion-webui 也提供了相应的优化选项，让高质量的 AI 艺术创作变得触手可及。",162132,3,"2026-04-05T11:01:52",[13,14,15],"开发框架","图像","Agent","ready",{"id":18,"name":19,"github_repo":20,"description_zh":21,"stars":22,"difficulty_score":23,"last_commit_at":24,"category_tags":25,"status":16},1381,"everything-claude-code","affaan-m\u002Feverything-claude-code","everything-claude-code 是一套专为 AI 编程助手（如 Claude Code、Codex、Cursor 等）打造的高性能优化系统。它不仅仅是一组配置文件，而是一个经过长期实战打磨的完整框架，旨在解决 AI 代理在实际开发中面临的效率低下、记忆丢失、安全隐患及缺乏持续学习能力等核心痛点。\n\n通过引入技能模块化、直觉增强、记忆持久化机制以及内置的安全扫描功能，everything-claude-code 能显著提升 AI 在复杂任务中的表现，帮助开发者构建更稳定、更智能的生产级 AI 代理。其独特的“研究优先”开发理念和针对 Token 消耗的优化策略，使得模型响应更快、成本更低，同时有效防御潜在的攻击向量。\n\n这套工具特别适合软件开发者、AI 研究人员以及希望深度定制 AI 工作流的技术团队使用。无论您是在构建大型代码库，还是需要 AI 协助进行安全审计与自动化测试，everything-claude-code 都能提供强大的底层支持。作为一个曾荣获 Anthropic 黑客大奖的开源项目，它融合了多语言支持与丰富的实战钩子（hooks），让 AI 真正成长为懂上",138956,2,"2026-04-05T11:33:21",[13,15,26],"语言模型",{"id":28,"name":29,"github_repo":30,"description_zh":31,"stars":32,"difficulty_score":23,"last_commit_at":33,"category_tags":34,"status":16},2271,"ComfyUI","Comfy-Org\u002FComfyUI","ComfyUI 是一款功能强大且高度模块化的视觉 AI 引擎，专为设计和执行复杂的 Stable Diffusion 图像生成流程而打造。它摒弃了传统的代码编写模式，采用直观的节点式流程图界面，让用户通过连接不同的功能模块即可构建个性化的生成管线。\n\n这一设计巧妙解决了高级 AI 绘图工作流配置复杂、灵活性不足的痛点。用户无需具备编程背景，也能自由组合模型、调整参数并实时预览效果，轻松实现从基础文生图到多步骤高清修复等各类复杂任务。ComfyUI 拥有极佳的兼容性，不仅支持 Windows、macOS 和 Linux 全平台，还广泛适配 NVIDIA、AMD、Intel 及苹果 Silicon 等多种硬件架构，并率先支持 SDXL、Flux、SD3 等前沿模型。\n\n无论是希望深入探索算法潜力的研究人员和开发者，还是追求极致创作自由度的设计师与资深 AI 绘画爱好者，ComfyUI 都能提供强大的支持。其独特的模块化架构允许社区不断扩展新功能，使其成为当前最灵活、生态最丰富的开源扩散模型工具之一，帮助用户将创意高效转化为现实。",107662,"2026-04-03T11:11:01",[13,14,15],{"id":36,"name":37,"github_repo":38,"description_zh":39,"stars":40,"difficulty_score":23,"last_commit_at":41,"category_tags":42,"status":16},3704,"NextChat","ChatGPTNextWeb\u002FNextChat","NextChat 是一款轻量且极速的 AI 助手，旨在为用户提供流畅、跨平台的大模型交互体验。它完美解决了用户在多设备间切换时难以保持对话连续性，以及面对众多 AI 模型不知如何统一管理的痛点。无论是日常办公、学习辅助还是创意激发，NextChat 都能让用户随时随地通过网页、iOS、Android、Windows、MacOS 或 Linux 端无缝接入智能服务。\n\n这款工具非常适合普通用户、学生、职场人士以及需要私有化部署的企业团队使用。对于开发者而言，它也提供了便捷的自托管方案，支持一键部署到 Vercel 或 Zeabur 等平台。\n\nNextChat 的核心亮点在于其广泛的模型兼容性，原生支持 Claude、DeepSeek、GPT-4 及 Gemini Pro 等主流大模型，让用户在一个界面即可自由切换不同 AI 能力。此外，它还率先支持 MCP（Model Context Protocol）协议，增强了上下文处理能力。针对企业用户，NextChat 提供专业版解决方案，具备品牌定制、细粒度权限控制、内部知识库整合及安全审计等功能，满足公司对数据隐私和个性化管理的高标准要求。",87618,"2026-04-05T07:20:52",[13,26],{"id":44,"name":45,"github_repo":46,"description_zh":47,"stars":48,"difficulty_score":23,"last_commit_at":49,"category_tags":50,"status":16},2268,"ML-For-Beginners","microsoft\u002FML-For-Beginners","ML-For-Beginners 是由微软推出的一套系统化机器学习入门课程，旨在帮助零基础用户轻松掌握经典机器学习知识。这套课程将学习路径规划为 12 周，包含 26 节精炼课程和 52 
道配套测验，内容涵盖从基础概念到实际应用的完整流程，有效解决了初学者面对庞大知识体系时无从下手、缺乏结构化指导的痛点。\n\n无论是希望转型的开发者、需要补充算法背景的研究人员，还是对人工智能充满好奇的普通爱好者，都能从中受益。课程不仅提供了清晰的理论讲解，还强调动手实践，让用户在循序渐进中建立扎实的技能基础。其独特的亮点在于强大的多语言支持，通过自动化机制提供了包括简体中文在内的 50 多种语言版本，极大地降低了全球不同背景用户的学习门槛。此外，项目采用开源协作模式，社区活跃且内容持续更新，确保学习者能获取前沿且准确的技术资讯。如果你正寻找一条清晰、友好且专业的机器学习入门之路，ML-For-Beginners 将是理想的起点。",84991,"2026-04-05T10:45:23",[14,51,52,53,15,54,26,13,55],"数据工具","视频","插件","其他","音频",{"id":57,"name":58,"github_repo":59,"description_zh":60,"stars":61,"difficulty_score":10,"last_commit_at":62,"category_tags":63,"status":16},3128,"ragflow","infiniflow\u002Fragflow","RAGFlow 是一款领先的开源检索增强生成（RAG）引擎，旨在为大语言模型构建更精准、可靠的上下文层。它巧妙地将前沿的 RAG 技术与智能体（Agent）能力相结合，不仅支持从各类文档中高效提取知识，还能让模型基于这些知识进行逻辑推理和任务执行。\n\n在大模型应用中，幻觉问题和知识滞后是常见痛点。RAGFlow 通过深度解析复杂文档结构（如表格、图表及混合排版），显著提升了信息检索的准确度，从而有效减少模型“胡编乱造”的现象，确保回答既有据可依又具备时效性。其内置的智能体机制更进一步，使系统不仅能回答问题，还能自主规划步骤解决复杂问题。\n\n这款工具特别适合开发者、企业技术团队以及 AI 研究人员使用。无论是希望快速搭建私有知识库问答系统，还是致力于探索大模型在垂直领域落地的创新者，都能从中受益。RAGFlow 提供了可视化的工作流编排界面和灵活的 API 接口，既降低了非算法背景用户的上手门槛，也满足了专业开发者对系统深度定制的需求。作为基于 Apache 2.0 协议开源的项目，它正成为连接通用大模型与行业专有知识之间的重要桥梁。",77062,"2026-04-04T04:44:48",[15,14,13,26,54],{"id":65,"github_repo":66,"name":67,"description_en":68,"description_zh":69,"ai_summary_zh":69,"readme_en":70,"readme_zh":71,"quickstart_zh":72,"use_case_zh":73,"hero_image_url":74,"owner_login":75,"owner_name":76,"owner_avatar_url":77,"owner_bio":78,"owner_company":76,"owner_location":79,"owner_email":76,"owner_twitter":76,"owner_website":80,"owner_url":81,"languages":76,"stars":82,"forks":83,"last_commit_at":84,"license":76,"difficulty_score":85,"env_os":86,"env_gpu":87,"env_ram":88,"env_deps":89,"category_tags":102,"github_topics":103,"view_count":10,"oss_zip_url":76,"oss_zip_packed_at":76,"status":16,"created_at":118,"updated_at":119,"faqs":120,"releases":131},1182,"DSXiangLi\u002FDecryptPrompt","DecryptPrompt","总结Prompt&LLM论文，开源数据&模型，AIGC应用","DecryptPrompt是一个专注于整理和解析大模型相关技术内容的开源项目，旨在帮助开发者和研究者更高效地理解和应用提示工程、模型微调、对齐技术以及AIGC（人工智能生成内容）等前沿领域。它汇总了大量论文、开源模型、数据集、框架及实际应用场景，同时通过系列博客深入解读关键技术和方法。对于希望快速掌握LLM技术、进行模型优化或探索AI应用的用户来说，DecryptPrompt提供了系统化的学习资源和实践指南。其内容覆盖广泛，适合研究人员、开发者及对AI技术感兴趣的群体使用。","# DecryptPrompt\n> 如果LLM的突然到来让你感到沮丧，不妨读下主目录的Choose Your Weapon Survival Strategies for Depressed AI Academics\n持续更新以下内容，Star to keep updated~\n\n## LLM资源汇总\n- [开源模型和评测榜单](开源模型.MD)\n- [开源推理，微调，Agent，RAG，propmt 框架](开源框架.MD)\n- [开源SFT，RLHF，Pretrain 数据集](开源数据.MD)\n- [AIGC各领域应用汇总](AIGC各领域应用.MD)\n- [Prompt教程，经典博客和AI会议访谈](教程博客会议.MD)\n\n## 跟着博客读论文\n- [解密Prompt系列1. Tunning-Free Prompt：GPT2 & GPT3 & LAMA & AutoPrompt](https:\u002F\u002Fcloud.tencent.com\u002Fdeveloper\u002Farticle\u002F2215545?areaSource=&traceId=)\n- [解密Prompt系列2. 冻结Prompt微调LM： T5 & PET & LM-BFF](https:\u002F\u002Fcloud.tencent.com\u002Fdeveloper\u002Farticle\u002F2223355?areaSource=&traceId=)\n- [解密Prompt系列3. 冻结LM微调Prompt: Prefix-tuning & Prompt-tuning & P-tuning](https:\u002F\u002Fcloud.tencent.com\u002Fdeveloper\u002Farticle\u002F2237259?areaSource=&traceId=)\n- [解密Prompt系列4. 升级Instruction Tuning：Flan\u002FT0\u002FInstructGPT\u002FTKInstruct](https:\u002F\u002Fcloud.tencent.com\u002Fdeveloper\u002Farticle\u002F2245094?areaSource=&traceId=)\n- [解密prompt系列5. APE+SELF=自动化指令集构建代码实现](https:\u002F\u002Fcloud.tencent.com\u002Fdeveloper\u002Farticle\u002F2260697?areaSource=&traceId=)\n- [解密Prompt系列6. lora指令微调扣细节-请冷静,1个小时真不够~](https:\u002F\u002Fcloud.tencent.com\u002Fdeveloper\u002Farticle\u002F2276508)\n- [解密Prompt系列7. 
Preference alignment with RLHF: comparing OpenAI, DeepMind, and Anthropic](https:\u002F\u002Fcloud.tencent.com\u002Fdeveloper\u002Farticle\u002Fold\u002F2289566?areaSource=&traceId=)\n- [Decrypt Prompt Series 8. Training-free ultra-long inputs for LLMs: knowledge bases & Unlimiformer & PCW & NBCE](https:\u002F\u002Fcloud.tencent.com\u002Fdeveloper\u002Farticle\u002Fold\u002F2295783?areaSource=&traceId=)\n- [Decrypt Prompt Series 9. CoT: complex model reasoning, chain-of-thought basics and advanced usage](https:\u002F\u002Fcloud.tencent.com\u002Fdeveloper\u002Farticle\u002Fold\u002F2296079?areaSource=&traceId=)\n- [Decrypt Prompt Series 10. CoT: probing how chain-of-thought works](https:\u002F\u002Fcloud.tencent.com\u002Fdeveloper\u002Farticle\u002Fold\u002F2298660)\n- [Decrypt Prompt Series 11. CoT: small models can do CoT too, curing innate deficits with training](https:\u002F\u002Fcloud.tencent.com\u002Fdeveloper\u002Farticle\u002Fold\u002F2301999)\n- [Decrypt Prompt Series 12. Zero-fine-tuning LLM Agent paradigms: ReAct & Self-Ask](https:\u002F\u002Fcloud.tencent.com\u002Fdeveloper\u002Farticle\u002F2305421)\n- [Decrypt Prompt Series 13. Instruction-tuning schemes for LLM Agents: Toolformer & Gorilla](https:\u002F\u002Fcloud.tencent.com\u002Fdeveloper\u002Farticle\u002F2312674)\n- [Decrypt Prompt Series 14. LLM Agents for search applications: WebGPT & WebGLM & WebCPM](https:\u002F\u002Fcloud.tencent.com\u002Fdeveloper\u002Farticle\u002F2319879)\n- [Decrypt Prompt Series 15. LLM Agents for database applications: DIN & C3 & SQL-Palm & BIRD](https:\u002F\u002Fcloud.tencent.com\u002Fdeveloper\u002Farticle\u002F2328749)\n- [Decrypt Prompt Series 16. Alignment lessons: is less data better? LTD & LIMA & AlpaGasus](https:\u002F\u002Fcloud.tencent.com\u002Fdeveloper\u002Farticle\u002F2333495)\n- [Decrypt Prompt Series 17. Alignment schemes upgraded again: WizardLM & BackTranslation & SELF-ALIGN](https:\u002F\u002Fcloud.tencent.com\u002Fdeveloper\u002Farticle\u002F2338592)\n- [Decrypt Prompt Series 18. LLM Agents: a world with only agents](https:\u002F\u002Fcloud.tencent.com\u002Fdeveloper\u002Farticle\u002F2351540)\n- [Decrypt Prompt Series 19. LLM Agents for data analysis: Data-Copilot & InsightPilot](https:\u002F\u002Fcloud.tencent.com\u002Fdeveloper\u002Farticle\u002F2358413)\n- [Decrypt Prompt Series 20. RAG: revisiting recall diversity optimization](https:\u002F\u002Fcloud.tencent.com\u002Fdeveloper\u002Farticle\u002F2365050)\n- [Decrypt Prompt Series 21. RAG: revisiting recall information density and quality](https:\u002F\u002Fcloud.tencent.com\u002Fdeveloper\u002Farticle\u002F2369977)\n- [Decrypt Prompt Series 22. Reflections on RAG: if we give up compression, is it still intelligence?](https:\u002F\u002Fcloud.tencent.com\u002Fdeveloper\u002Farticle\u002F2375066)\n- [Decrypt Prompt Series 23. A complete mind-map of LLM hallucination taxonomy, attribution, detection, and mitigation](https:\u002F\u002Fcloud.tencent.com\u002Fdeveloper\u002Farticle\u002F2378383)\n- [Decrypt Prompt Series 24. New RLHF training strategies: SLiC-HF & DPO & RRHF & RSO](https:\u002F\u002Fcloud.tencent.com\u002Fdeveloper\u002Farticle\u002F2389619)\n- [Decrypt Prompt Series 25. Improved RLHF sample labeling: RLAIF & SALMON](https:\u002F\u002Fcloud.tencent.com\u002Fdeveloper\u002Farticle\u002F2398654)\n- [Decrypt Prompt Series 26. Human thinking vs model thinking: abstraction and divergent thought](https:\u002F\u002Fcloud.tencent.com\u002Fdeveloper\u002Farticle\u002F2394120)\n- [Decrypt Prompt Series 27. Alignment lessons: reducing the loss of general ability](https:\u002F\u002Fcloud.tencent.com\u002Fdeveloper\u002Farticle\u002F2406888)\n- [Decrypt Prompt Series 28. LLM Agents in finance: FinMem & FinAgent](https:\u002F\u002Fcloud.tencent.com\u002Fdeveloper\u002Farticle\u002F2411792)\n- [Decrypt Prompt Series 29. LLM Agents for massive real-world APIs: ToolLLM & AnyTool](https:\u002F\u002Fcloud.tencent.com\u002Fdeveloper\u002Farticle\u002F2415908)\n- [Decrypt Prompt Series 30. LLM Agents surfing the internet](https:\u002F\u002Fcloud.tencent.com\u002Fdeveloper\u002Farticle\u002F2419768)\n- [Decrypt Prompt Series 31. LLM Agents that keep learning from experience](https:\u002F\u002Fcloud.tencent.com\u002Fdeveloper\u002Farticle\u002F2425139)\n- [Decrypt Prompt Series 32. LLMs for table understanding: the text modality](https:\u002F\u002Fcloud.tencent.com\u002Fdeveloper\u002Farticle\u002F2429900)\n- [Decrypt Prompt Series 33. LLMs for chart understanding: the multimodal edition](https:\u002F\u002Fcloud.tencent.com\u002Fdeveloper\u002Farticle\u002F2433883)\n- [Decrypt Prompt Series 34. Off-the-beaten-path RLHF training: step by step & the student surpassing the teacher](https:\u002F\u002Fcloud.tencent.com\u002Fdeveloper\u002Farticle\u002F2437031)\n- [Decrypt Prompt Series 35. Prompt standardization in progress! A DSPy paper medley with code examples](https:\u002F\u002Fcloud.tencent.com\u002Fdeveloper\u002Farticle\u002F2441201)\n- [Decrypt Prompt Series 36. Structured prompt writing and the UNIPROMPT optimization algorithm](https:\u002F\u002Fcloud.tencent.com\u002Fdeveloper\u002Farticle\u002F2444167)\n- [Decrypt Prompt Series 37. RAG: strategies for deciding up front when to go online](https:\u002F\u002Fcloud.tencent.com\u002Fdeveloper\u002Farticle\u002F2448156)\n- [Decrypt Prompt Series 38. Multi-agent routing strategies](https:\u002F\u002Fcloud.tencent.com\u002Fdeveloper\u002Farticle\u002F2451000)\n- [Decrypt Prompt Series 39. RAG: using LLMs to optimize the re-ranking stage](https:\u002F\u002Fcloud.tencent.com\u002Fdeveloper\u002Farticle\u002F2453693)\n- [Decrypt Prompt Series 40. LLM inference scaling laws](https:\u002F\u002Fcloud.tencent.com\u002Fdeveloper\u002Farticle\u002F2456441)\n- [Decrypt Prompt Series 41. Is GraphRAG really a silver bullet?](https:\u002F\u002Fcloud.tencent.com\u002Fdeveloper\u002Farticle\u002F2461325)\n- [Decrypt Prompt Series 42. The LLM road to dynamic, complex chains of thought](https:\u002F\u002Fcloud.tencent.com\u002Fdeveloper\u002Farticle\u002F2464011)\n- [Decrypt Prompt Series 43. LLM Self-Critics](https:\u002F\u002Fcloud.tencent.com\u002Fdeveloper\u002Farticle\u002F2468406)\n- [Decrypt Prompt Series 44. RAG: exploration mode? Deep-thinking mode?](https:\u002F\u002Fcloud.tencent.com\u002Fdeveloper\u002Farticle\u002F2474048)\n- [Decrypt Prompt Series 45. Revisiting LLM scalable oversight: debate or games, which works best?](https:\u002F\u002Fcloud.tencent.com\u002Fdeveloper\u002Farticle\u002F2479401)\n- [Decrypt Prompt Series 46. LLM structured output: code examples and analysis](https:\u002F\u002Fcloud.tencent.com\u002Fdeveloper\u002Farticle\u002F2483500)\n- [Decrypt Prompt Series 47. Characterizing O1 Long Thought](https:\u002F\u002Fcloud.tencent.com\u002Fdeveloper\u002Farticle\u002F2487221)\n- [Decrypt Prompt Series 48. DeepSeek R1 & Kimi 1.5 long chains of thought: RL scaling](https:\u002F\u002Fcloud.tencent.com\u002Fdeveloper\u002Farticle\u002F2493924)\n- [Decrypt Prompt Series 49. A look back at chain-of-thought development before R1](https:\u002F\u002Fcloud.tencent.com\u002Fdeveloper\u002Farticle\u002F2497501)\n- [Decrypt Prompt Series 50. Ideas for using RL to optimize agent action paths](https:\u002F\u002Fcloud.tencent.com\u002Fdeveloper\u002Farticle\u002F2502322)\n- [Decrypt Prompt Series 51. Discussing some details of the R1 experiments](https:\u002F\u002Fcloud.tencent.com\u002Fdeveloper\u002Farticle\u002F2506684)\n- [Decrypt Prompt Series 52. A casual chat on what is still worth exploring in large models](https:\u002F\u002Fcloud.tencent.com\u002Fdeveloper\u002Farticle\u002F2510004)\n- [Decrypt Prompt Series 53. Revisiting large-model memory](https:\u002F\u002Fcloud.tencent.com\u002Fdeveloper\u002Farticle\u002F2514545)\n- [Decrypt Prompt Series 54. Context cache: code examples and analysis](https:\u002F\u002Fcloud.tencent.com\u002Fdeveloper\u002Farticle\u002F2522820)\n- [Decrypt Prompt Series 55. Engineering agent memory: Mem0 & LlamaIndex](https:\u002F\u002Fcloud.tencent.com\u002Fdeveloper\u002Farticle\u002F2528447)\n- [Decrypt Prompt Series 56. Agent context engineering: a single-agent code walkthrough](https:\u002F\u002Fcloud.tencent.com\u002Fdeveloper\u002Farticle\u002F2537040)\n- [Decrypt Prompt Series 57. Agent context engineering: a multi-agent code walkthrough](https:\u002F\u002Fcloud.tencent.com\u002Fdeveloper\u002Farticle\u002F2541926)\n- [Decrypt Prompt Series 58. MCP: tool evolution & MCP basics](https:\u002F\u002Fcloud.tencent.com\u002Fdeveloper\u002Farticle\u002F2549927)\n- [Decrypt Prompt Series 59. MCP in practice: from low-level to FastMCP](https:\u002F\u002Fcloud.tencent.com\u002Fdeveloper\u002Farticle\u002F2554794)\n- [Decrypt Prompt Series 60. Agents in practice: building a Jupyter data-analysis agent from scratch](https:\u002F\u002Fcloud.tencent.com\u002Fdeveloper\u002Farticle\u002F2563549)\n- [Decrypt Prompt Series 61. Hand-rolling a code sandbox, plus FastAPI-MCP in practice](https:\u002F\u002Fcloud.tencent.com\u002Fdeveloper\u002Farticle\u002F2570796)\n- [Decrypt Prompt Series 62. New perspectives on agent memory: MATTS & CFGM & MIRIX](https:\u002F\u002Fcloud.tencent.com\u002Fdeveloper\u002Farticle\u002F2577365)\n- [Decrypt Prompt Series 63. Agent training schemes: RStar2 & Early Experience etc.](https:\u002F\u002Fcloud.tencent.com\u002Fdeveloper\u002Farticle\u002F2581959)\n- [Decrypt Prompt Series 64. Further thoughts on Anthropic Skills](https:\u002F\u002Fcloud.tencent.com\u002Fdeveloper\u002Farticle\u002F2586667)\n- [Decrypt Prompt Series 65. Hardcore papers from the big three on the inner workings of large models](https:\u002F\u002Fcloud.tencent.com\u002Fdeveloper\u002Farticle\u002F2594738)\n- [Decrypt Prompt Series 66. Visual token explosion → DeepSeek-OCR optical compression](https:\u002F\u002Fcloud.tencent.com\u002Fdeveloper\u002Farticle\u002F2600104)\n- [Decrypt Prompt Series 67. The economics of agents: from architecture choices to tool budgets](https:\u002F\u002Fcloud.tencent.com\u002Fdeveloper\u002Farticle\u002F2610869)\n- [Decrypt Prompt Series 68. Goodbye to word-by-word generation: new inference paradigms for the Transformer](https:\u002F\u002Fcloud.tencent.com\u002Fdeveloper\u002Farticle\u002F2616180)\n\n## Paper Collection\n### Paper Lists\n- https:\u002F\u002Fgithub.com\u002Fdongguanting\u002FIn-Context-Learning_PaperList\n- https:\u002F\u002Fgithub.com\u002Fthunlp\u002FPromptPapers\n- https:\u002F\u002Fgithub.com\u002FTimothyxxx\u002FChain-of-ThoughtsPapers\n- https:\u002F\u002Fgithub.com\u002Fthunlp\u002FToolLearningPapers\n- https:\u002F\u002Fgithub.com\u002FMLGroupJLU\u002FLLM-eval-survey\n- https:\u002F\u002Fgithub.com\u002Fthu-coai\u002FPaperForONLG\n- https:\u002F\u002Fgithub.com\u002Fkhuangaf\u002FAwesome-Chart-Understanding\n- https:\u002F\u002Fgithub.com\u002Fsrush\u002Fawesome-o1\u002F?tab=readme-ov-file\n\n### Image Generation\n- Neural Discrete Representation Learning\n- Denoising Diffusion Probabilistic Models\n- Scalable Diffusion Models with Transformers\n- Photorealistic Text-to-Image Diffusion Models with Deep Language Understanding\n- High-Resolution Image Synthesis with Latent Diffusion Models\n\n### Post-Training (overlaps with CoT and RL)\n- Inference Scaling\n  - An Empirical Analysis of Compute-Optimal Inference for Problem-Solving with Language Models\n  - Are More LM Calls All You Need? 
Towards the Scaling Properties of Compound AI Systems\n  - Large Language Monkeys: Scaling Inference Compute with Repeated Sampling\n  - Scaling LLM Test-Time Compute Optimally can be More Effective than Scaling Model Parameters :star:\n  - Q*: Improving Multi-step Reasoning for LLMs with Deliberative Planning\n  - Planning In Natural Language Improves LLM Search For Code Generation\n  - ReST-MCTS∗: LLM Self-Training via Process Reward Guided Tree Search\n  - AlphaZero-Like Tree-Search can Guide Large Language Model Decoding and Training\n  - Smaller, Weaker, Yet Better: Training LLM Reasoners via Compute-Optimal Sampling\n  - The Surprising Effectiveness of Test-Time Training for Abstract Reasoning\n  - Inference Scaling for Long-Context Retrieval Augmented Generation\n  - Toward Self-Improvement of LLMs via Imagination, Searching, and Criticizing\n  - InfAlign: Inference-aware language model alignment\n  - Scaling up Test-Time Compute with Latent Reasoning: A Recurrent Depth Approach\n  - What type of inference is planning?\n  - Goedel-Prover: A Frontier Model for Open-Source Automated Theorem Proving\n  - PROVABLE SCALING LAWS OF FEATURE EMERGENCE FROM LEARNING DYNAMICS OF GROKKING\n  - Do Machine Learning Models Memorize or Generalize?\n- Slow-thinking CoT\n  - O1 Replication Journey: A Strategic Progress Report – Part 1 :star:\n  - Marco-o1: Towards Open Reasoning Models for Open-Ended Solutions\n  - A Comparative Study on Reasoning Patterns of OpenAI's o1 Model\n  - Imitate, Explore, and Self-Improve: A Reproduction Report on Slow-thinking Reasoning Systems\n  - Dualformer: Controllable Fast and Slow Thinking by Learning with Randomized Reasoning Traces\n  - Training Large Language Models to Reason in a Continuous Latent Space\n  - Beyond A∗: Better Planning with Transformers via Search Dynamics Bootstrapping\n  - o1-Coder: an o1 Replication for Coding\n  - Scaling of Search and Learning: A Roadmap to Reproduce o1 from Reinforcement Learning Perspective\n  - Sky-T1: Train your own O1 preview model within $450\n  - Towards System 2 Reasoning in LLMs: Learning How to Think With Meta Chain-of-Thought\n  - rStar-Math: Small LLMs Can Master Math Reasoning with Self-Evolved Deep Thinking :star:\n  - Demystifying Long Chain-of-Thought Reasoning in LLMs\n  - Towards Large Reasoning Models: A Survey of Reinforced Reasoning with Large Language Models\n  - [Huggingface Open R1](https:\u002F\u002Fhuggingface.co\u002Fblog\u002Fopen-r1\u002Fupdate-1)\n  - CODEI\u002FO: Condensing Reasoning Patterns via Code Input-Output Prediction\n  - Training Language Models to Reason Efficiently\n  - s1: Simple test-time scaling\n  - Inner Thinking Transformer: Leveraging Dynamic Depth Scaling to Foster Adaptive Internal Thinking\n  - ALPHAONE: Reasoning Models Thinking Slow and Fast at Test Time\n- O3 Related\n  - Competitive Programming with Large Reasoning Models\n- RL CoT mechanisms\n  - SFT Memorizes, RL Generalizes: A Comparative Study of Foundation Model Post-training\n  - Cognitive Behaviors that Enable Self-Improving Reasoners, or, Four Habits of Highly Effective STaRs\n  - Thoughts Are All Over the Place: On the Underthinking of o1-Like LLMs\n  - All Roads Lead to Likelihood: The Value of Reinforcement Learning in Fine-Tuning\n  - Does Reinforcement Learning Really Incentivize Reasoning Capacity in LLMs Beyond the Base Model?\n  - Think Deep, Not Just Long: Measuring LLM Reasoning Effort via Deep-Thinking Tokens\n- R1 Reproduce\n  - Logic-RL: Unleashing LLM Reasoning with Rule-Based Reinforcement Learning\n  - [SimpleR1](https:\u002F\u002Fhkust-nlp.notion.site\u002Fsimplerl-reason)\n  - [Huggingface Open R1](https:\u002F\u002Fhuggingface.co\u002Fblog\u002Fopen-r1\u002Fupdate-1)\n  - DianJin-R1: Evaluating and Enhancing Financial Reasoning in Large Language Models\n  - Think Only When You Need with Large Hybrid-Reasoning Models\n  - Topology of Reasoning: Understanding Large Reasoning Models through Reasoning Graph Properties\n  - Skywork Open Reasoner 1 Technical Report\n  - Learning to Reason: Training LLMs with GPT-OSS or DeepSeek R1 Reasoning Traces\n- RL Agents\n  - RAGEN: Understanding Self-Evolution in LLM Agents via Multi-Turn Reinforcement Learning\n  - ToolRL: Reward is All Tool Learning Needs\n  - ReTool: Reinforcement Learning for Strategic Tool Use in LLMs\n  - ReSearch: Learning to Reason with Search for LLMs via Reinforcement Learning\n  - [Improving Multi-Turn Tool Use with Reinforcement Learning](https:\u002F\u002Fwww.bespokelabs.ai\u002Fblog\u002Fimproving-multi-turn-tool-use-with-reinforcement-learning)\n  - WebThinker: Empowering Large Reasoning Models with Deep Research Capability\n  - Reinforcement Learning for Machine Learning Engineering Agents\n  - AgentGym-RL: Training LLM Agents for Long-Horizon Decision Making through Multi-Turn Reinforcement Learning\n  - rStar2-Agent: Agentic Reasoning Technical Report\n  - The Landscape of Agentic Reinforcement Learning for LLMs: A Survey\n  - IN-THE-FLOW AGENTIC SYSTEM OPTIMIZATION FOR EFFECTIVE PLANNING AND TOOL USE\n  - UI-TARS-2 Technical Report: Advancing GUI Agent with Multi-Turn Reinforcement Learning\n  - PokeeResearch: Effective Deep Research via Reinforcement Learning from AI Feedback and Robust Reasoning Scaffold\n  - DeepAnalyze: Agentic Large Language Models for Autonomous Data Science\n  - Thinking with Programming Vision: Towards a Unified View for Thinking with Images\n  - Scaling Agent Learning via Experience Synthesis\n  - CaveAgent: Transforming LLMs into Stateful Runtime Operators\n- Experience learning\n  - Welcome to the Era of Experience\n  - Agent Learning via Early Experience\n- Other training approaches\n  - QWENLONG-L1: Towards Long-Context Large Reasoning Models with Reinforcement Learning\n  - REWARDBENCH 2: Advancing Reward Model Evaluation\n  - Compute as Teacher: Turning Inference Compute Into Reference-Free Supervision\n  - DiffusionNFT: Online Diffusion Reinforcement with Forward Process\n  - EVOLUTION STRATEGIES AT SCALE: LLM FINETUNING BEYOND REINFORCEMENT LEARNING\n  - Learning to Reason Across Parallel Samples for LLM Reasoning\n  - PARAM∆ FOR DIRECT WEIGHT MIXING: POST-TRAIN LARGE LANGUAGE MODEL AT ZERO COST\n  - LaSeR: Reinforcement Learning with Last-Token Self-Rewarding\n  - The Delta Learning Hypothesis: Preference Tuning on Weak Data can Yield Strong Gains\n- RL Overview\n  - Reinforcement Learning: An Overview\n  - Towards a Unified View of Large Language Model Post-Training\n- RL datasets\n  - ReasonMed: A 370K Multi-Agent Generated Dataset for Advancing Medical Reasoning\n\n### Context Engineering\n- A Survey of Context Engineering for Large Language Models\n- Agentic Context Engineering: Evolving Contexts for Self-Improving Language Models\n- Scaling Long-Horizon LLM Agent via Context-Folding\n- Towards a Science of Scaling Agent Systems\n- Budget-Aware Tool-Use Enables Effective Agent Scaling\n- Context Engineering 2.0\n- End-to-End Test-Time Training for Long Context\n\n### New Model Architecture\n- SPG: Sandwiched Policy Gradient for Masked Diffusion Language Models\n- Less is More: Recursive Reasoning with 
Tiny Networks\n- Continuous Thought Machines\n- TiDAR: Think in Diffusion, Talk in Autoregression\n- Nested Learning: The Illusion of Deep Learning Architectures\n\n### Mainstream LLMs and Pretraining\n- GLM-130B: AN OPEN BILINGUAL PRE-TRAINED MODEL\n- PaLM: Scaling Language Modeling with Pathways\n- PaLM 2 Technical Report\n- GPT-4 Technical Report\n- Backpack Language Models\n- LLaMA: Open and Efficient Foundation Language Models\n- Llama 2: Open Foundation and Fine-Tuned Chat Models\n- Sheared LLaMA: Accelerating Language Model Pre-training via Structured Pruning\n- OpenBA: An Open-sourced 15B Bilingual Asymmetric seq2seq Model Pre-trained from Scratch\n- Mistral 7B\n- Ziya2: Data-centric Learning is All LLMs Need\n- MEGABLOCKS: EFFICIENT SPARSE TRAINING WITH MIXTURE-OF-EXPERTS\n- TUTEL: ADAPTIVE MIXTURE-OF-EXPERTS AT SCALE\n- Phi-1: Textbooks Are All You Need :star:\n- Phi-1.5: Textbooks Are All You Need II: phi-1.5 technical report\n- Phi-3 Technical Report: A Highly Capable Language Model Locally on Your Phone\n- Gemini: A Family of Highly Capable Multimodal Models\n- In-Context Pretraining: Language Modeling Beyond Document Boundaries\n- LLAMA PRO: Progressive LLaMA with Block Expansion\n- QWEN TECHNICAL REPORT\n- Fewer Truncations Improve Language Modeling\n- ChatGLM: A Family of Large Language Models from GLM-130B to GLM-4 All Tools\n- Phi-4 Technical Report\n- Byte Latent Transformer: Patches Scale Better Than Tokens\n- Qwen2.5 Technical Report\n- DeepSeek-V3 Technical Report\n- Mixtral of Experts\n- DeepSeek_R1 :star:\n- KIMI K1.5: SCALING REINFORCEMENT LEARNING WITH LLMS :star:\n- CWM: An Open-Weights LLM for Research on Code Generation with World Models\n- DeepSeek V3.2 Tech Report\n- DeepSeek-V3.2: Pushing the Frontier of Open Large Language Models\n\n### Chain of Thought (prompt_chain_of_thought)\n- Basics & advanced usage\n    - 【zero-shot-COT】 Large Language Models are Zero-Shot Reasoners :star:\n    - 【few-shot COT】 Chain of Thought Prompting Elicits Reasoning in Large Language Models :star:\n    - 【SELF-CONSISTENCY】 IMPROVES CHAIN OF THOUGHT REASONING IN LANGUAGE MODELS\n    - 【LEAST-TO-MOST】 PROMPTING ENABLES COMPLEX REASONING IN LARGE LANGUAGE MODELS :star:\n    - 【TOT】Tree of Thoughts: Deliberate Problem Solving with Large Language Models :star:\n    - 【Plan-and-Solve】 Prompting: Improving Zero-Shot Chain-of-Thought Reasoning by Large Language Models\n    - 【Verify-and-Edit】: A Knowledge-Enhanced Chain-of-Thought Framework\n    - 【GOT】Beyond Chain-of-Thought, Effective Graph-of-Thought Reasoning in Large Language Models\n    - 【TOMT】Tree-of-Mixed-Thought: Combining Fast and Slow Thinking for Multi-hop Visual Reasoning\n    - 【LAMBADA】: Backward Chaining for Automated Reasoning in Natural Language\n    - 【AOT】Algorithm of Thoughts: Enhancing Exploration of Ideas in Large Language Models :star:\n    - 【GOT】Graph of Thoughts: Solving Elaborate Problems with Large Language Models :star:\n    - 【PHP】Progressive-Hint Prompting Improves Reasoning in Large Language Models\n    - 【HtT】LARGE LANGUAGE MODELS CAN LEARN RULES :star:\n    - 【DIVSE】DIVERSITY OF THOUGHT IMPROVES REASONING ABILITIES OF LARGE LANGUAGE MODELS\n    - 【CogTree】From Complex to Simple: Unraveling the Cognitive Tree for Reasoning with Small Language Models\n    - 【Step-Back】Take a Step Back: Evoking Reasoning via Abstraction in Large Language Models :star:\n    - 【OPRO】LARGE LANGUAGE MODELS AS OPTIMIZERS :star:\n    - 【BOT】Buffer of Thoughts: Thought-Augmented Reasoning with Large Language Models\n    - Abstraction-of-Thought Makes Language Models Better Reasoners\n    - 【SymbCoT】Faithful Logical Reasoning via Symbolic Chain-of-Thought\n    - 【XOT】EVERYTHING OF THOUGHTS: DEFYING THE LAW OF PENROSE TRIANGLE FOR THOUGHT GENERATION\n    - 【IoT】Iteration of Thought: Leveraging Inner Dialogue for Autonomous Large Language Model Reasoning\n    - 【DOT】On the Diagram of Thought\n    - 【ROT】Reversal of Thought: Enhancing Large Language Models with Preference-Guided Reverse Reasoning Warm-up\n    - Thinking Forward and Backward: Effective Backward Planning with Large Language Models\n    - 【KR】K-Level Reasoning: Establishing Higher Order Beliefs in Large Language Models for Strategic Reasoning\n    - 【Self-Discover】SELF-DISCOVER: Large Language Models Self-Compose Reasoning Structures\n    - 【Theory-of-Mind】HOW FAR ARE LARGE LANGUAGE MODELS FROM AGENTS WITH THEORY-OF-MIND?\n    - 【PC-SUBQ】Prompting Strategies for Enabling Large Language Models to Infer Causation from Correlation\n    - Reverse Thinking Makes LLMs Stronger Reasoners\n    - Chain of Draft: Thinking Faster by Writing Less\n    - Atom of Thoughts for Markov LLM Test-Time Scaling\n- Non-traditional CoT: problem decomposition\n    - Decomposed Prompting: A MODULAR APPROACH FOR Solving Complex Tasks\n    - Successive Prompting for Decomposing Complex Questions\n- Domain-specific CoT [Math, Code, Tabular, QA]\n    - Solving Quantitative Reasoning Problems with Language Models\n    - SHOW YOUR WORK: SCRATCHPADS FOR INTERMEDIATE COMPUTATION WITH LANGUAGE MODELS\n    - Solving math word problems with process- and outcome-based feedback\n    - CodeRL: Mastering Code Generation through Pretrained Models and Deep Reinforcement Learning\n    - T-SciQ: Teaching Multimodal Chain-of-Thought Reasoning via Large Language Model Signals for Science Question Answering\n    - LEARNING PERFORMANCE-IMPROVING CODE EDITS\n    - Chain of Code: Reasoning with a Language Model-Augmented Code Emulator\n- Mechanistic analysis\n    - Chain of Thought Empowers Transformers to Solve Inherently Serial Problems :star:\n    - Towards Understanding Chain-of-Thought Prompting: An Empirical Study of What Matters :star:\n    - TEXT AND PATTERNS: FOR EFFECTIVE CHAIN OF THOUGHT IT TAKES TWO TO TANGO\n    - Towards Revealing the Mystery behind Chain of Thought: a Theoretical Perspective\n    - Large Language Models Can Be Easily Distracted by Irrelevant Context\n    - Chain-of-Thought Reasoning Without Prompting\n    - Inductive or Deductive? Rethinking the Fundamental Reasoning Abilities of LLMs\n    - Beyond Chain-of-Thought: A Survey of Chain-of-X Paradigms for LLMs\n    - To CoT or not to CoT? Chain-of-thought helps mainly on math and symbolic reasoning :star:\n    - Why think step by step? Reasoning emerges from the locality of experience\n    - Internal Consistency and Self-Feedback in Large Language Models: A Survey :star:\n    - Iteration Head: A Mechanistic Study of Chain-of-Thought :star:\n    - The Impact of Reasoning Step Length on Large Language Models :star:\n    - Do Large Language Models Perform Latent Multi-Hop Reasoning without Exploiting Shortcuts?\n    - Compressed Chain of Thought: Efficient Reasoning Through Dense Representations\n    - Do LLMs Really Think Step-by-step In Implicit Reasoning?\n    - Cognitive Foundations for Reasoning and Their Manifestation in LLMs\n- CoT distillation into small models\n    - Specializing Smaller Language Models towards Multi-Step Reasoning :star:\n    - Teaching Small Language Models to Reason\n    - Large Language Models are Reasoning Teachers\n    - Distilling Reasoning Capabilities into Smaller Language Models\n    - The CoT Collection: Improving Zero-shot and Few-shot Learning of Language Models via Chain-of-Thought Fine-Tuning\n    - Distilling System 2 into System 1\n- Automatic CoT example construction and selection\n    - AutoCOT: AUTOMATIC CHAIN OF THOUGHT PROMPTING IN LARGE LANGUAGE MODELS\n    - Active Prompting with Chain-of-Thought for Large Language Models\n    - COMPLEXITY-BASED PROMPTING FOR MULTI-STEP REASONING\n- Learning CoT ability\n    - Large Language Models Can Self-Improve\n    - Training Chain-of-Thought via Latent-Variable Inference\n    - Quiet-STaR: Language Models Can Teach Themselves to Think Before Speaking\n    - STaR: Self-Taught Reasoner Bootstrapping Reasoning With Reasoning\n    - V-STaR: Training Verifiers for Self-Taught Reasoners\n    - THINK BEFORE YOU SPEAK: TRAINING LANGUAGE MODELS WITH PAUSE TOKENS\n    - SELF-DIRECTED SYNTHETIC DIALOGUES AND REVISIONS TECHNICAL REPORT\n    - COT-SELF-INSTRUCT: BUILDING HIGH-QUALITY SYNTHETIC PROMPTS FOR REASONING AND NON-REASONING TASKS\n- Others\n    - OlaGPT: Empowering LLMs With Human-like Problem-Solving Abilities\n    - Challenging BIG-Bench tasks and whether chain-of-thought can solve them\n    - Large Language Models are Better Reasoners with Self-Verification\n    - ThoughtSource: A central hub for large language model reasoning data\n    - Two Failures of Self-Consistency in the Multi-Step Reasoning of LLMs\n\n### Self-Evolution\n- Darwin Godel Machine: Open-Ended Evolution of Self-Improving Agents\n- [Alpha Evolve](https:\u002F\u002Fdeepmind.google\u002Fdiscover\u002Fblog\u002Falphaevolve-a-gemini-powered-coding-agent-for-designing-advanced-algorithms\u002F)\n- Can Large Reasoning Models Self-Train\n- Unsupervised Post-Training for Multi-Modal LLM Reasoning via GRPO\n- Evolution Strategies at the Hyperscale\n- Guided Self-Evolving LLMs with Minimal Human Supervision\n\n### RLHF\n- DeepMind\n  - Teaching language models to support answers with verified quotes\n  - Sparrow: Improving alignment of dialogue agents via targeted human judgements :star:\n  - STATISTICAL REJECTION SAMPLING IMPROVES PREFERENCE OPTIMIZATION\n  - Reinforced Self-Training (ReST) for Language Modeling\n  - SLiC-HF: Sequence Likelihood Calibration with Human Feedback\n  - CALIBRATING SEQUENCE LIKELIHOOD IMPROVES CONDITIONAL LANGUAGE GENERATION\n  - REWARD DESIGN WITH LANGUAGE MODELS\n  - Final-Answer RL: Solving math word problems with process- and outcome-based feedback\n  - Beyond Human Data: Scaling Self-Training for Problem-Solving with Language Models\n  - BOND: Aligning LLMs with Best-of-N Distillation\n  - RL on Incorrect Synthetic Data Scales the Efficiency of LLM Math 
Reasoning by Eight-Fold\n  - Generative Verifiers: Reward Modeling as Next-Token Prediction\n  - Training Language Models to Self-Correct via Reinforcement Learning\n- OpenAI\n  - PPO: Proximal Policy Optimization Algorithms :star:\n  - Deep Reinforcement Learning from Human Preferences\n  - Fine-Tuning Language Models from Human Preferences\n  - Learning to Summarize from Human Feedback\n  - InstructGPT: Training language models to follow instructions with human feedback :star:\n  - Scaling Laws for Reward Model Overoptimization :star:\n  - WEAK-TO-STRONG GENERALIZATION: ELICITING STRONG CAPABILITIES WITH WEAK SUPERVISION :star:\n  - PRM: Let's Verify Step by Step :star:\n  - Training Verifiers to Solve Math Word Problems [a prerequisite for PRM]\n  - [OpenAI Super Alignment Blog](https:\u002F\u002Fopenai.com\u002Fblog\u002Fintroducing-superalignment)\n  - LLM Critics Help Catch LLM Bugs :star:\n  - PROVER-VERIFIER GAMES IMPROVE LEGIBILITY OF LLM OUTPUTS\n  - Rule Based Rewards for Language Model Safety\n  - Self-critiquing models for assisting human evaluators\n- Anthropic\n  - A General Language Assistant as a Laboratory for Alignment\n  - Measuring Progress on Scalable Oversight for Large Language Models\n  - Red Teaming Language Models to Reduce Harms: Methods, Scaling Behaviors and Lessons Learned\n  - Training a Helpful and Harmless Assistant with Reinforcement Learning from Human Feedback :star:\n  - Constitutional AI: Harmlessness from AI Feedback :star:\n  - Pretraining Language Models with Human Preferences\n  - The Capacity for Moral Self-Correction in Large Language Models\n  - Sleeper Agents: Training Deceptive LLMs that Persist Through Safety Training\n- AllenAI, RL4LM: IS REINFORCEMENT LEARNING (NOT) FOR NATURAL LANGUAGE PROCESSING? BENCHMARKS\n- Improved approaches\n  - RRHF: Rank Responses to Align Language Models with Human Feedback without tears\n  - Chain of Hindsight Aligns Language Models with Feedback\n  - AlpacaFarm: A Simulation Framework for Methods that Learn from Human Feedback\n  - RAFT: Reward rAnked FineTuning for Generative Foundation Model Alignment\n  - RLAIF: Scaling Reinforcement Learning from Human Feedback with AI Feedback\n  - Training Socially Aligned Language Models in Simulated Human Society\n  - RAIN: Your Language Models Can Align Themselves without Finetuning\n  - Generative Judge for Evaluating Alignment\n  - PEERING THROUGH PREFERENCES: UNRAVELING FEEDBACK ACQUISITION FOR ALIGNING LARGE LANGUAGE MODELS\n  - SALMON: SELF-ALIGNMENT WITH PRINCIPLE-FOLLOWING REWARD MODELS\n  - Large Language Model Unlearning :star:\n  - ADVERSARIAL PREFERENCE OPTIMIZATION :star:\n  - Preference Ranking Optimization for Human Alignment\n  - A Long Way to Go: Investigating Length Correlations in RLHF\n  - ENABLE LANGUAGE MODELS TO IMPLICITLY LEARN SELF-IMPROVEMENT FROM DATA\n  - REWARD MODEL ENSEMBLES HELP MITIGATE OVEROPTIMIZATION\n  - LEARNING OPTIMAL ADVANTAGE FROM PREFERENCES AND MISTAKING IT FOR REWARD\n  - ULTRAFEEDBACK: BOOSTING LANGUAGE MODELS WITH HIGH-QUALITY FEEDBACK\n  - MOTIF: INTRINSIC MOTIVATION FROM ARTIFICIAL INTELLIGENCE FEEDBACK\n  - STABILIZING RLHF THROUGH ADVANTAGE MODEL AND SELECTIVE REHEARSAL\n  - Shepherd: A Critic for Language Model Generation\n  - LEARNING TO GENERATE BETTER THAN YOUR LLM\n  - Fine-Grained Human Feedback Gives Better Rewards for Language Model Training\n  - Principle-Driven Self-Alignment of Language Models from Scratch with Minimal Human Supervision\n  - Direct Preference Optimization: Your Language Model is Secretly a Reward Model\n  - HIR: The Wisdom of Hindsight Makes Language Models Better Instruction Followers\n  - Aligner: Achieving Efficient Alignment through Weak-to-Strong Correction\n  - A Minimaximalist Approach to Reinforcement Learning from Human Feedback\n  - PANDA: Preference Adaptation for Enhancing Domain-Specific Abilities of LLMs\n  - Weak-to-Strong Search: Align Large Language Models via Searching over Small Language Models\n  - Weak-to-Strong Extrapolation Expedites Alignment\n  - Is DPO Superior to PPO for LLM Alignment? A Comprehensive Study\n  - Token-level Direct Preference Optimization\n  - SimPO: Simple Preference Optimization with a Reference-Free Reward\n  - AUTODETECT: Towards a Unified Framework for Automated Weakness Detection in Large Language Models\n  - META-REWARDING LANGUAGE MODELS: Self-Improving Alignment with LLM-as-a-Meta-Judge\n  - HELPSTEER: Multi-attribute Helpfulness Dataset for STEERLM\n  - Recursive Introspection: Teaching Language Model Agents How to Self-Improve\n  - Enhancing Multi-Step Reasoning Abilities of Language Models through Direct Q-Function Optimization\n  - DeepSeekMath: Pushing the Limits of Mathematical Reasoning in Open Language Models\n  - GLoRe: When, Where, and How to Improve LLM Reasoning via Global and Local Refinements\n  - REFT: Reasoning with REinforced Fine-Tuning\n  - SCPO: SELF-CONSISTENCY PREFERENCE OPTIMIZATION\n  - MONA: Myopic Optimization with Non-myopic Approval Can Mitigate Multi-step Reward Hacking\n  - Optimizing Test-Time Compute via Meta Reinforcement Fine-Tuning\n  - Pre-Trained Policy Discriminators are General Reward Models\n- RL analysis\n  - UNDERSTANDING THE EFFECTS OF RLHF ON LLM GENERALISATION AND DIVERSITY\n  - A LONG WAY TO GO: INVESTIGATING LENGTH CORRELATIONS IN RLHF\n  - THE TRICKLE-DOWN IMPACT OF REWARD (IN-)CONSISTENCY ON RLHF\n  - Open Problems and Fundamental Limitations of Reinforcement Learning from Human Feedback\n  - HUMAN FEEDBACK IS NOT GOLD STANDARD\n  - CONTRASTIVE POST-TRAINING LARGE LANGUAGE MODELS ON DATA CURRICULUM\n  - Language Models Resist Alignment\n  - Towards a Unified View of Preference Learning for Large Language Models: A Survey\n\n### Memory\n> Rethinking model memory beyond the narrow lens of context length\n- A-MEM: Agentic Memory for LLM Agents\n- MemInsight: Autonomous Memory Augmentation for LLM Agents\n- G-Memory: Tracing Hierarchical Memory for Multi-Agent Systems\n- AGENT WORKFLOW MEMORY\n- KBLAM: KNOWLEDGE BASE AUGMENTED LANGUAGE MODEL\n- MIRIX: Multi-Agent Memory System for LLM-Based Agents\n- M3-Agent: Seeing, Listening, Remembering, and Reasoning: A Multimodal Agent with Long-Term Memory\n- MemTool: Optimizing Short-Term Memory Management for Dynamic Tool Calling in LLM Agent Multi-Turn Conversations\n- Memory-R1: Enhancing Large Language Model Agents to Manage and Utilize Memories via Reinforcement Learning\n- Multiple Memory Systems for Enhancing the Long-term Memory of Agent\n- PerPilot: Personalizing VLM-based Mobile Agents via Memory and Exploration\n- Coarse-to-Fine Grounded Memory for LLM Agent Planning\n- Intrinsic Memory Agents: Heterogeneous Multi-Agent LLM Systems through Structured Contextual Memory\n- Memp: Exploring Agent Procedural Memory\n- RCR-Router: Efficient Role-Aware Context Routing for Multi-Agent LLM Systems with Structured Memory\n- MemoryBank: Enhancing Large Language Models with Long-Term Memory\n- Metacognitive Reuse: Turning Recurring LLM Reasoning Into Concise Behaviors\n- Cognitive Architectures for Language Agents\n- ReasoningBank: Scaling Agent Self-Evolving with Reasoning Memory\n- 
LIGHTMEM: LIGHTWEIGHT AND EFFICIENT MEMORY-AUGMENTED GENERATION\n- Titans: Learning to Memorize at Test Time\n- Learning to Reason from Feedback at Test-Time\n- Deep Researcher with Test-Time Diffusion\n- It’s All Connected: A Journey Through Test-Time Memorization, Attentional Bias, Retention, and Online Optimization\n- Agentic Memory: Learning Unified Long-Term and Short-Term Memory Management for Large Language Model Agents\n- MEMRL: SELF-EVOLVING AGENTS VIA RUNTIME REINFORCEMENT LEARNING ON EPISODIC MEMORY\n\n### Multi-turn Dialogue\n> We have recently been deep in multi-turn dialogue optimization ourselves, and have run into many problems such as role confusion and degraded understanding\n- LLMS GET LOST IN MULTI-TURN CONVERSATION\n\n### Instruction Tuning & Alignment (instruction_tunning)\n- Classic approaches\n   - Flan: FINETUNED LANGUAGE MODELS ARE ZERO-SHOT LEARNERS :star:\n   - Flan-T5: Scaling Instruction-Finetuned Language Models\n   - ExT5: Towards Extreme Multi-Task Scaling for Transfer Learning\n   - Instruct-GPT: Training language models to follow instructions with human feedback :star:\n   - T0: MULTITASK PROMPTED TRAINING ENABLES ZERO-SHOT TASK GENERALIZATION\n   - Natural Instructions: Cross-Task Generalization via Natural Language Crowdsourcing Instructions\n   - Tk-INSTRUCT: SUPER-NATURALINSTRUCTIONS: Generalization via Declarative Instructions on 1600+ NLP Tasks\n   - ZeroPrompt: Scaling Prompt-Based Pretraining to 1,000 Tasks Improves Zero-shot Generalization\n   - Unnatural Instructions: Tuning Language Models with (Almost) No Human Labor\n   - INSTRUCTEVAL: Towards Holistic Evaluation of Instruction-Tuned Large Language Models\n- SFT data scaling laws\n    - LIMA: Less Is More for Alignment :star:\n    - Maybe Only 0.5% Data is Needed: A Preliminary Exploration of Low Training Data Instruction Tuning\n    - AlpaGasus: Training A Better Alpaca with Fewer Data\n    - InstructionGPT-4: A 200-Instruction Paradigm for Fine-Tuning MiniGPT-4\n    - Instruction Mining: High-Quality Instruction Data Selection for Large Language Models\n    - Visual Instruction Tuning with Polite Flamingo\n    - Exploring the Impact of Instruction Data Scaling on Large Language Models: An Empirical Study on Real-World Use Cases\n    - Scaling Relationship on Learning Mathematical Reasoning with Large Language Models\n    - WHEN SCALING MEETS LLM FINETUNING: THE EFFECT OF DATA, MODEL AND FINETUNING METHOD\n- New alignment and fine-tuning approaches\n   - WizardLM: Empowering Large Language Models to Follow Complex Instructions :star:\n   - Becoming self-instruct: introducing early stopping criteria for minimal instruct tuning\n   - Self-Alignment with Instruction Backtranslation :star:\n   - Mixture-of-Experts Meets Instruction Tuning: A Winning Combination for Large Language Models\n   - Goat: Fine-tuned LLaMA Outperforms GPT-4 on Arithmetic Tasks\n   - PROMPT2MODEL: Generating Deployable Models from Natural Language Instructions\n   - OpinionGPT: Modelling Explicit Biases in Instruction-Tuned LLMs\n   - Improving Language Model Negotiation with Self-Play and In-Context Learning from AI Feedback\n   - Human-like systematic generalization through a meta-learning neural network\n   - Magicoder: Source Code Is All You Need\n   - Beyond Human Data: Scaling Self-Training for Problem-Solving with Language Models\n   - Generative Representational Instruction Tuning\n   - InsCL: A Data-efficient Continual Learning Paradigm for Fine-tuning Large Language Models with Instructions\n   - The Instruction Hierarchy: Training LLMs to Prioritize Privileged Instructions\n   - Magpie: Alignment Data Synthesis from Scratch by Prompting Aligned LLMs with Nothing\n- Instruction data generation\n  - APE: LARGE LANGUAGE MODELS ARE HUMAN-LEVEL PROMPT ENGINEERS :star:\n  - SELF-INSTRUCT: Aligning Language Model with Self Generated Instructions :star:\n  - iPrompt: Explaining Data Patterns in Natural Language via Interpretable Autoprompting\n  - Flipped Learning: Guess the Instruction! Flipped Learning Makes Language Models Stronger Zero-Shot Learners\n  - Fairness-guided Few-shot Prompting for Large Language Models\n  - Instruction induction: From few examples to natural language task descriptions\n  - SELF-QA: Unsupervised Knowledge-Guided Alignment\n  - GPT Self-Supervision for a Better Data Annotator\n  - The Flan Collection: Designing Data and Methods\n  - Self-Consuming Generative Models Go MAD\n  - InstructEval: Systematic Evaluation of Instruction Selection Methods\n  - Overwriting Pretrained Bias with Finetuning Data\n  - Improving Text Embeddings with Large Language Models\n  - MAGPIE: Alignment Data Synthesis from Scratch by Prompting Aligned LLMs with Nothing\n  - Scaling Synthetic Data Creation with 1,000,000,000 Personas\n  - UNLEASHING REASONING CAPABILITY OF LLMS VIA SCALABLE QUESTION SYNTHESIS FROM SCRATCH\n  - A Survey on Data Synthesis and Augmentation for Large Language Models\n  - AgentInstruct: Toward Generative Teaching with Agentic Flows\n  - Unveiling the Flaws: Exploring Imperfections in Synthetic Data and Mitigation Strategies for Large Language Models\n- Reducing loss of general ability\n  - How Abilities in Large Language Models are Affected by Supervised Fine-tuning Data Composition\n  - TWO-STAGE LLM FINE-TUNING WITH LESS SPECIALIZATION AND MORE GENERALIZATION\n- Fine-tuning experience and experiment reports\n    - BELLE: Exploring the Impact of Instruction Data Scaling on Large Language Models: An Empirical Study on Real-World Use Cases\n    - Baize: An Open-Source Chat Model with Parameter-Efficient Tuning on Self-Chat Data\n    - A Comparative Study between Full-Parameter and LoRA-based Fine-Tuning on Chinese Instruction Data for Large LM\n    - Exploring ChatGPT’s Ability to Rank Content: A Preliminary Study on Consistency with Human Preferences\n    - Towards Better Instruction Following Language Models for Chinese: Investigating the Impact of Training Data and Evaluation\n    - Fine-tuning LLMs for Enterprise: Practical Guidelines and Recommendations\n- Others\n   - Crosslingual Generalization through Multitask Finetuning\n   - Cross-Task Generalization via Natural Language Crowdsourcing Instructions\n   - UNIFIEDSKG: Unifying and Multi-Tasking Structured Knowledge Grounding with Text-to-Text Language Models\n   - PromptSource: An Integrated Development Environment and Repository for Natural Language Prompts\n   - ROLELLM: BENCHMARKING, ELICITING, AND ENHANCING ROLE-PLAYING ABILITIES OF LARGE LANGUAGE MODELS\n\n### LLM Agents: Letting Models Use Tools (llm_agent)\n- AGENT AI: SURVEYING THE HORIZONS OF MULTIMODAL INTERACTION\n- A Survey on Large Language Model based Autonomous Agents\n- PERSONAL LLM AGENTS: INSIGHTS AND SURVEY ABOUT THE CAPABILITY, EFFICIENCY AND SECURITY\n- General prompt-based approaches\n  - ReAct: SYNERGIZING REASONING AND ACTING IN LANGUAGE MODELS :star:\n  - Self-ask: MEASURING AND NARROWING THE COMPOSITIONALITY GAP IN LANGUAGE MODELS :star:\n  - MRKL Systems: A modular, neuro-symbolic architecture that combines large language models, external knowledge sources and discrete reasoning\n  - PAL: Program-aided Language Models\n  - ART: Automatic multi-step reasoning and tool-use for large language models\n  - ReWOO: Decoupling Reasoning from Observations for Efficient Augmented Language Models :star:\n  - Interleaving Retrieval with 
Chain-of-Thought Reasoning for Knowledge-Intensive Multi-Step Questions\n  - Chameleon: Plug-and-Play Compositional Reasoning with Large Language Models :star:\n  - Faithful Chain-of-Thought Reasoning\n  - Reflexion: Language Agents with Verbal Reinforcement Learning :star:\n  - Verify-and-Edit: A Knowledge-Enhanced Chain-of-Thought Framework\n  - RestGPT: Connecting Large Language Models with Real-World RESTful APIs\n  - ChatCoT: Tool-Augmented Chain-of-Thought Reasoning on Chat-based Large Language Models\n  - InstructTODS: Large Language Models for End-to-End Task-Oriented Dialogue Systems\n  - TPTU: Task Planning and Tool Usage of Large Language Model-based AI Agents\n  - ControlLLM: Augment Language Models with Tools by Searching on Graphs\n  - Reflexion: an autonomous agent with dynamic memory and self-reflection\n  - AutoAgents: A Framework for Automatic Agent Generation\n  - GitAgent: Facilitating Autonomous Agent with GitHub by Tool Extension\n  - PreAct: Predicting Future in ReAct Enhances Agent's Planning Ability\n  - TOOLLLM: FACILITATING LARGE LANGUAGE MODELS TO MASTER 16000+ REAL-WORLD APIS :star:\n  - AnyTool: Self-Reflective, Hierarchical Agents for Large-Scale API Calls\n  - AIOS: LLM Agent Operating System\n  - LLMCompiler: An LLM Compiler for Parallel Function Calling\n  - Re-Invoke: Tool Invocation Rewriting for Zero-Shot Tool Retrieval\n- General fine-tuning-based approaches\n  - TALM: Tool Augmented Language Models\n  - Toolformer: Language Models Can Teach Themselves to Use Tools :star:\n  - Tool Learning with Foundation Models\n  - Tool Maker: Large Language Models as Tool Makers\n  - TaskMatrix.AI: Completing Tasks by Connecting Foundation Models with Millions of APIs\n  - AgentTuning: Enabling Generalized Agent Abilities for LLMs\n  - SWIFTSAGE: A Generative Agent with Fast and Slow Thinking for Complex Interactive Tasks\n  - FireAct: Toward Language Agent Fine-tuning\n  - Pangu-Agent: A Fine-Tunable Generalist Agent with Structured Reasoning\n  - REST MEETS REACT: SELF-IMPROVEMENT FOR MULTI-STEP REASONING LLM AGENT\n  - Efficient Tool Use with Chain-of-Abstraction Reasoning\n  - Agent-FLAN: Designing Data and Methods of Effective Agent Tuning for Large Language Models\n  - AgentOhana: Design Unified Data and Training Pipeline for Effective Agent Learning\n  - Agent Lumos: Unified and Modular Training for Open-Source Language Agents\n  - ToolGen: Unified Tool Retrieval and Calling via Generation\n  - Scaling Agents via Continual Pre-training\n  - LIMI: Less is More for Agency\n- Model-calling approaches\n  - HuggingGPT: Solving AI Tasks with ChatGPT and its Friends in HuggingFace\n  - Gorilla: Large Language Model Connected with Massive APIs :star:\n  - OpenAGI: When LLM Meets Domain Experts\n- Vertical domains\n  - Data analysis\n    - DS-Agent: Automated Data Science by Empowering Large Language Models with Case-Based Reasoning\n    - InsightLens: Discovering and Exploring Insights from Conversational Contexts in Large-Language-Model-Powered Data Analysis\n    - Data-Copilot: Bridging Billions of Data and Humans with Autonomous Workflow\n    - Demonstration of InsightPilot: An LLM-Empowered Automated Data Exploration System\n    - TaskWeaver: A Code-First Agent Framework\n    - Automated Social Science: Language Models as Scientist and Subjects\n    - Data Interpreter: An LLM Agent For Data Science\n    - FDABench: A Benchmark for Data Agents on Analytical Queries over Heterogeneous Data\n  - Finance\n    - WeaverBird: Empowering Financial Decision-Making with Large Language Model, Knowledge Base, and Search Engine\n    - FinGPT: Open-Source Financial Large Language Models\n    - FinMem: A Performance-Enhanced LLM Trading Agent with Layered Memory and Character Design\n    - AlphaFin: Benchmarking Financial Analysis with Retrieval-Augmented Stock-Chain Framework\n    - FinAgent: A Multimodal Foundation Agent for Financial Trading: Tool-Augmented, Diversified, and Generalist :star:\n    - Can Large Language Models Beat Wall Street? Unveiling the Potential of AI in Stock Selection\n    - ENHANCING ANOMALY DETECTION IN FINANCIAL MARKETS WITH AN LLM-BASED MULTI-AGENT FRAMEWORK\n    - TRADINGGPT: MULTI-AGENT SYSTEM WITH LAYERED MEMORY AND DISTINCT CHARACTERS FOR ENHANCED FINANCIAL TRADING PERFORMANCE\n    - FinRobot: An Open-Source AI Agent Platform for Financial Applications using Large Language Models\n    - LLMFactor: Extracting Profitable Factors through Prompts for Explainable Stock Movement Prediction\n    - Alpha-GPT: Human-AI Interactive Alpha Mining for Quantitative Investment\n    - Advancing Anomaly Detection: Non-Semantic Financial Data Encoding with LLMs\n    - TradExpert: Revolutionizing Trading with Mixture of Expert LLMs\n    - FinVision: A Multi-Agent Framework for Stock Market Prediction\n    - AI in Investment Analysis: LLMs for Equity Stock Ratings\n    - AAPM: Large Language Model Agent-based Asset Pricing Models\n    - FinCon: A Synthesized LLM Multi-Agent System with Conceptual Verbal Reinforcement for Enhanced Financial Decision Making\n    - TradingAgents: Multi-Agents LLM Financial Trading Framework\n    - Pretrained LLM Adapted with LoRA as a Decision Transformer for Offline RL in Quantitative Trading\n    - FinRL-DeepSeek: LLM-Infused Risk-Sensitive Reinforcement Learning for Trading Agents\n    - FinSphere: A Conversational Stock Analysis Agent Equipped with Quantitative Tools based on Real-Time Database\n    - FLAG-Trader: Fusion LLM-Agent with Gradient-based Reinforcement Learning for Financial Trading\n    - Ploutos: Towards interpretable stock movement prediction with financial large language model\n    - HedgeAgents: A Balanced-aware Multi-agent Financial Trading System\n    - TIMERAG: BOOSTING LLM TIME SERIES FORECASTING VIA RETRIEVAL-AUGMENTED GENERATION\n    - CausalStock: Deep End-to-end Causal Discovery for News-driven Stock Movement Prediction\n    - Can LLM-based Financial Investing Strategies Outperform the Market in Long Run?\n    - Advancing Financial Engineering with Foundation Models: Progress, Applications, and Challenges\n    - AlphaAgents: Large Language Model based Multi-Agents for Equity Portfolio Constructions\n  - Biomedical\n    - GeneGPT: Augmenting Large Language Models with Domain Tools for Improved Access to Biomedical Information\n    - ChemCrow: Augmenting large language models with chemistry tools\n    - Generating Explanations in Medical Question-Answering by Expectation Maximization Inference over Evidence\n    - Agent Hospital: A Simulacrum of Hospital with Evolvable Medical Agents\n    - Integrating Chemistry Knowledge in Large Language Models via Prompt Engineering\n    - CHEMAGENT: SELF-UPDATING LIBRARY IN LARGE LANGUAGE MODELS IMPROVES CHEMICAL REASONING\n  - Web and mobile agents\n    - AutoWebGLM: Bootstrap And Reinforce A Large Language Model-based Web Navigating Agent\n    - A Real-World WebAgent with Planning, Long Context Understanding, and Program Synthesis\n    - Mind2Web: Towards a Generalist Agent for the Web\n    - MiniWoB++: Reinforcement Learning on Web Interfaces Using Workflow-Guided Exploration\n    - WEBARENA: A REALISTIC WEB ENVIRONMENT FOR BUILDING AUTONOMOUS AGENTS\n    - AutoCrawler: A 
Progressive Understanding Web Agent for Web Crawler Generation\n    - WebLINX: Real-World Website Navigation with Multi-Turn Dialogue\n    - WebVoyager: Building an End-to-end Web Agent with Large Multimodal Models\n    - CogAgent: A Visual Language Model for GUI Agents\n    - Mobile-Agent-v2: Mobile Device Operation Assistant with Effective Navigation via Multi-Agent Collaboration\n    - WebCanvas: Benchmarking Web Agents in Online Environments\n    - The Dawn of GUI Agent: A Preliminary Case Study with Claude 3.5 Computer Use\n    - UI-TARS: Pioneering Automated GUI Interaction with Native Agents\n    - Exposing Limitations of Language Model Agents in Sequential-Task Compositions on the Web\n    - WebSailor: Navigating Super-human Reasoning for Web Agent\n    - WebShaper: Agentically Data Synthesizing via Information-Seeking Formalization\n    - WebWatcher: Breaking New Frontiers of Vision-Language Deep Research Agent\n    - OS-Genesis: Automating GUI Agent Trajectory Construction via Reverse Task Synthesis\n    - Scalable Video-to-Dataset Generation for Cross-Platform Mobile Agents\n    - Explorer: Scaling Exploration-driven Web Trajectory Synthesis for Multimodal Web Agents\n    - Watch and Learn: Learning to Use Computers from Online Videos\n    - Fara-7B: An Efficient Agentic Model for Computer Use\n  - software engineer\n   - Agents in Software Engineering: Survey, Landscape, and Vision\n   - ChatDev: Communicative Agents for Software Development\n  - Research Agent\n    - PaSa: An LLM Agent for Comprehensive Academic Paper Search\n    - ResearchAgent: Iterative Research Idea Generation over Scientific Literature with Large Language Models\n    - Agent Laboratory: Using LLM Agents as Research Assistants\n    - Automated Hypothesis Validation with Agentic Sequential Falsifications\n    - Towards an AI co-scientist\n    - AI4Research: A Survey of Artificial Intelligence for Scientific Research\n    - Kosmos: An AI Scientist for Autonomous Discovery\n    - Knowledge-Informed Automatic Feature Extraction via Collaborative Large Language Model Agents\n  - 设计\n    - PosterGen: Aesthetic-Aware Paper-to-Poster Generation via Multi-Agent LLMs\n    - Paper2Poster: Towards Multimodal Poster Automation from Scientific Papers\n  - 其他\n    - WebShop: Towards Scalable Real-World Web Interaction with Grounded Language Agents\n    - ToolkenGPT: Augmenting Frozen Language Models with Massive Tools via Tool Embeddings\n    - PointLLM: Empowering Large Language Models to Understand Point Clouds\n    - Interpretable Long-Form Legal Question Answering with Retrieval-Augmented Large Language Models\n    - CarExpert: Leveraging Large Language Models for In-Car Conversational Question Answering\n    - SCIAGENTS: AUTOMATING SCIENTIFIC DISCOVERY THROUGH MULTI-AGENT INTELLIGENT GRAPH REASONING \n- 评估\n  - Evaluating Verifiability in Generative Search Engines\n  - Auto-GPT for Online Decision Making: Benchmarks and Additional Opinions\n  - API-Bank: A Benchmark for Tool-Augmented LLMs\n  - ToolLLM: Facilitating Large Language Models to Master 16000+ Real-world APIs\n  - Automatic Evaluation of Attribution by Large Language Models\n  - Benchmarking Large Language Models in Retrieval-Augmented Generation\n  - ARES: An Automated Evaluation Framework for Retrieval-Augmented Generation Systems\n  - Agent-as-a-Judge: Evaluate Agents with Agents\n- MultiAgent\n  - An Empirical Study of Agent Developer Practices in AI Agent Frameworks\n   - GENERATIVE AGENTS\n  - LET MODELS SPEAK CIPHERS: MULTIAGENT DEBATE THROUGH 
EMBEDDINGS\n  - War and Peace (WarAgent): Large Language Model-based Multi-Agent Simulation of World Wars\n  - Small LLMs Are Weak Tool Learners: A Multi-LLM Agent\n  - Merge, Ensemble, and Cooperate! A Survey on Collaborative Strategies in the Era of Large Language Models\n  - Generative Agents: Interactive Simulacra of Human Behavior  :star:\n  - AgentVerse: Facilitating Multi-Agent Collaboration and Exploring Emergent Behaviors in Agents \n  - System-1.x: Learning to Balance Fast and Slow Planning with Language Models\n  - Agents Thinking Fast and Slow: A Talker-Reasoner Architecture\n  - Generative Agent Simulations of 1,000 People\n  - Advanced Reasoning and Learning for Autonomous AI Agents\n  - Multi-Agent Design: Optimizing Agents with Better Prompts and Topologies\n  - Emergent Coordination in Multi-Agent Language Models\n  - TUMIX: Multi-Agent Test-Time Scaling with Tool-Use Mixture\n  - SOLVING A MILLION-STEP LLM TASK WITH ZERO ERRORS\n  - Latent Collaboration in Multi-Agent Systems\n  - 多智能体系统\n    - Internet of Agents: Weaving a Web of Heterogeneous Agents for Collaborative Intelligence \n    - MULTI-AGENT COLLABORATION: HARNESSING THE POWER OF INTELLIGENT LLM AGENTS\n    - Magentic-One: A Generalist Multi-Agent System for Solving Complex Tasks \n    - Assemble Your Crew: Automatic Multi-agent Communication Topology Design via Autoregressive Graph Generation\n  - 任务型智能体协作\n    - METAAGENTS: SIMULATING INTERACTIONS OF HUMAN BEHAVIORS FOR LLM-BASED TASK-ORIENTED COORDINATION VIA COLLABORATIVE GENERATIVE AGENTS\n    - CAMEL: Communicative Agents for \"Mind\" Exploration of Large Scale Language Model Society  :star:\n    - Exploring Large Language Models for Communication Games: An Empirical Study on Werewolf\n    - Communicative Agents for Software Development  :star:\n    - MedAgents: Large Language Models as Collaborators for Zero-shot Medical Reasoning\n    - METAGPT: META PROGRAMMING FOR A MULTI-AGENT COLLABORATIVE FRAMEWORK\n  - 智能体路由\n    - One Agent To Rule Them All: Towards Multi-agent Conversational AI\n    - A Multi-Agent Conversational Recommender System\n  - 基座模型路由&Ensemble\n    - Large Language Model Routing with Benchmark Datasets\n    - LLM-BLENDER: Ensembling Large Language Models with Pairwise Ranking and Generative Fusion\n    - RouteLLM: Learning to Route LLMs with Preference Data\n    - More Agents Is All You Need\n    - Routing to the Expert: Efficient Reward-guided Ensemble of Large Language Models\n- 自主学习和探索进化\n  - AppAgent: Multimodal Agents as Smartphone Users\n  - Investigate-Consolidate-Exploit: A General Strategy for Inter-Task Agent Self-Evolution\n  - LLMs in the Imaginarium: Tool Learning through Simulated Trial and Error\n  - Empowering Large Language Model Agents through Action Learning\n  - Trial and Error: Exploration-Based Trajectory Optimization for LLM Agents\n  - OS-COPILOT: TOWARDS GENERALIST COMPUTER AGENTS WITH SELF-IMPROVEMENT\n  - LLAMA RIDER: SPURRING LARGE LANGUAGE MODELS TO EXPLORE THE OPEN WORLD\n  - PAST AS A GUIDE: LEVERAGING RETROSPECTIVE LEARNING FOR PYTHON CODE COMPLETION\n  - AutoGuide: Automated Generation and Selection of State-Aware Guidelines for Large Language Model Agents\n  - A Survey on Self-Evolution of Large Language Models\n  - ExpeL: LLM Agents Are Experiential Learners\n  - ReAct Meets ActRe: When Language Agents Enjoy Training Data Autonomy\n  - PROACTIVE AGENT: SHIFTING LLM AGENTS FROM REACTIVE RESPONSES TO ACTIVE ASSISTANCE\n  - From Novice to Expert: LLM Agent Policy Optimization via Step-wise Reinforcement Learning\n 
 - AGILE: A Novel Reinforcement Learning Framework of LLM Agents\n  - Agent Q: Advanced Reasoning and Learning for Autonomous AI Agents\n  - ARMAP: SCALING AUTONOMOUS AGENTS VIA AUTOMATIC REWARD MODELING AND PLANNING\n  - Search-R1: Training LLMs to Reason and Leverage Search Engines with Reinforcement Learning\n  - Contextual Experience Replay for Continual Learning of Language Agents\n  - TaskCraft: Automated Generation of Agentic Tasks\n- MCP\n  - SCALEMCP: DYNAMIC AND AUTO-SYNCHRONIZING MODEL CONTEXT PROTOCOL TOOLS FOR LLM AGENTS\n  - LIVEMCP-101: STRESS TESTING AND DIAGNOSING MCP-ENABLED AGENTS ON CHALLENGING QUERIES\n- 其他\n  - LLM+P: Empowering Large Language Models with Optimal Planning Proficiency\n  - Inference with Reference: Lossless Acceleration of Large Language Models\n  - RecallM: An Architecture for Temporal Context Understanding and Question Answering\n  - LLaMA Rider: Spurring Large Language Models to Explore the Open World\n  - LLMs Can’t Plan, But Can Help Planning in LLM-Modulo Frameworks\n  - Routine: A Structural Planning Framework for LLM Agent System in Enterprise\n- Custom Agent\n  - Creating General User Models from Computer Use\n\n\n### RAG\n- 经典论文\n  - WebGPT：Browser-assisted question-answering with human feedback \n  - WebGLM: Towards An Efficient Web-Enhanced Question Answering System with Human Preferences \n  - WebCPM: Interactive Web Search for Chinese Long-form Question Answering :star:\n  - REPLUG: Retrieval-Augmented Black-Box Language Models :star:\n  - RETA-LLM: A Retrieval-Augmented Large Language Model Toolkit\n  - Atlas: Few-shot Learning with Retrieval Augmented Language Models\n  - RRAML: Reinforced Retrieval Augmented Machine Learning\n  - FRESHLLMS: REFRESHING LARGE LANGUAGE MODELS WITH SEARCH ENGINE AUGMENTATION\n- 微调\n  - RLCF：Aligning the Capabilities of Large Language Models with the Context of Information Retrieval via Contrastive Feedback\n  - RA-DIT: RETRIEVAL-AUGMENTED DUAL INSTRUCTION TUNING\n  - CHAIN-OF-NOTE: ENHANCING ROBUSTNESS IN RETRIEVAL-AUGMENTED LANGUAGE MODELS\n  - RAFT: Adapting Language Model to Domain Specific RAG\n  - Rich Knowledge Sources Bring Complex Knowledge Conflicts: Recalibrating Models to Reflect Conflicting Evidence\n- 其他论文\n  - Investigating the Factual Knowledge Boundary of Large Language Models with Retrieval Augmentation\n  - PDFTriage: Question Answering over Long, Structured Documents\n  - Walking Down the Memory Maze: Beyond Context Limit through Interactive Reading  :star:\n  - Active Retrieval Augmented Generation\n  - kNN-LM Does Not Improve Open-ended Text Generation\n  - Can Retriever-Augmented Language Models Reason? 
The Blame Game Between the Retriever and the Language Model\n  - DORIS-MAE: Scientific Document Retrieval using Multi-level Aspect-based Queries\n  - Factuality Enhanced Language Models for Open-Ended Text Generation  \n  - KwaiAgents: Generalized Information-seeking Agent System with Large Language Models\n  - Complex Claim Verification with Evidence Retrieved in the Wild\n  - Retrieval-Augmented Generation for Large Language Models: A Survey\n  - ChatQA: Building GPT-4 Level Conversational QA Models\n  - RAG vs Fine-tuning: Pipelines, Tradeoffs, and a Case Study on Agriculture\n  - Benchmarking Large Language Models in Retrieval-Augmented Generation\n  - T-RAG: Lessons from the LLM Trenches\n  - ARAGOG: Advanced RAG Output Grading\n  - ActiveRAG: Revealing the Treasures of Knowledge via Active Learning\n  - OpenResearcher: Unleashing AI for Accelerated Scientific Research\n  - [Contextual.ai-RAG2.0](https:\u002F\u002Fcontextual.ai\u002Fintroducing-rag2\u002F)\n  - Mindful-RAG: A Study of Points of Failure in Retrieval Augmented Generation\n  - Memory3 : Language Modeling with Explicit Memory\n- 优化检索\n  - IAG: Induction-Augmented Generation Framework for Answering Reasoning Questions\n  - HyDE：Precise Zero-Shot Dense Retrieval without Relevance Labels\n  - PROMPTAGATOR : FEW-SHOT DENSE RETRIEVAL FROM 8 EXAMPLES\n  - Query Rewriting for Retrieval-Augmented Large Language Models\n  - Query2doc: Query Expansion with Large Language Models  :star:\n  - Query Expansion by Prompting Large Language Models  :star:\n  - [Anthropic Contextual Retrieval](https:\u002F\u002Fwww.anthropic.com\u002Fnews\u002Fcontextual-retrieval)\n  - Multi-Level Querying using A Knowledge Pyramid\n  - A Survey of Query Optimization in Large Language Models\n- Ranking\n  - A Setwise Approach for Effective and Highly Efficient Zero-shot Ranking with Large Language Models\n  - RankVicuna: Zero-Shot Listwise Document Reranking with Open-Source Large Language Models\n  - Improving Passage Retrieval with Zero-Shot Question Generation\n  - Large Language Models are Effective Text Rankers with Pairwise Ranking Prompting\n  - RankRAG: Unifying Context Ranking with Retrieval-Augmented Generation in LLMs\n  - Ranking Manipulation for Conversational Search Engines\n  - Is ChatGPT Good at Search? 
Investigating Large Language Models as Re-Ranking Agents\n  - Open-source Large Language Models are Strong Zero-shot Query Likelihood Models for Document Ranking\n  - T2Ranking: A large-scale Chinese Benchmark for Passage Ranking\n  - Learning to Filter Context for Retrieval-Augmented Generation\n- 传统搜索方案\n  - ASK THE RIGHT QUESTIONS: ACTIVE QUESTION REFORMULATION WITH REINFORCEMENT LEARNING\n  - Query Expansion Techniques for Information Retrieval a Survey\n  - Learning to Rewrite Queries \n  - Managing Diversity in Airbnb Search\n- 新向量模型用于Recall和Ranking\n  - Augmented Embeddings for Custom Retrievals\n  - BGE M3-Embedding: Multi-Lingual, Multi-Functionality, Multi-Granularity Text Embeddings Through Self-Knowledge Distillation\n  - [网易为RAG设计的BCE Embedding技术报告](https:\u002F\u002Fzhuanlan.zhihu.com\u002Fp\u002F681370855)\n  - BGE Landmark Embedding: A Chunking-Free Embedding Method For Retrieval Augmented Long-Context Large Language Models\n  - D2LLM: Decomposed and Distilled Large Language Models for Semantic Search\n  - Piccolo2: General Text Embedding with Multi-task Hybrid Loss Training\n  - UniSearch: Rethinking Search System with a Unified Generative Architecture\n  - UniDex: Rethinking Search Inverted Indexing with Unified Semantic Modeling\n- 优化推理结果\n  - Speculative RAG: Enhancing Retrieval Augmented Generation through Drafting\n- 动态RAG（When to Search & Search Plan）\n  - SELF-RAG: LEARNING TO RETRIEVE, GENERATE, AND CRITIQUE THROUGH SELF-REFLECTION  :star:\n  - Self-Knowledge Guided Retrieval Augmentation for Large Language Models\n  - Self-DC: When to retrieve and When to generate Self Divide-and-Conquer for Compositional Unknown Questions\n  - Small Models, Big Insights: Leveraging Slim Proxy Models To Decide When and What to Retrieve for LLMs\n  - Adaptive-RAG: Learning to Adapt Retrieval-Augmented Large Language Models through Question Complexity\n  - REAPER: Reasoning based Retrieval Planning for Complex RAG Systems\n  - When to Retrieve: Teaching LLMs to Utilize Information Retrieval Effectively\n  - PlanRAG: A Plan-then-Retrieval Augmented Generation for Generative Large Language Models as Decision Makers\n  - ONEGEN: EFFICIENT ONE-PASS UNIFIED GENERATION AND RETRIEVAL FOR LLMS\n  - Probing-RAG: Self-Probing to Guide Language Models in Selective Document Retrieval\n- Graph RAG \n  - GRAPH Retrieval-Augmented Generation: A Survey\n  - From Local to Global: A Graph RAG Approach to Query-Focused Summarization\n  - GRAG: Graph Retrieval-Augmented Generation\n  - GNN-RAG: Graph Neural Retrieval for Large Language Model Reasoning \n  - THINK-ON-GRAPH: DEEP AND RESPONSIBLE REASONING OF LARGE LANGUAGE MODEL ON KNOWLEDGE GRAPH\n  - LightRAG: Simple and Fast Retrieval-Augmented Generation\n  - StructRAG: Boosting Knowledge Intensive Reasoning of LLMs via Inference-time Hybrid Information Structurization\n- Multistep RAG\n  - SYNERGISTIC INTERPLAY BETWEEN SEARCH AND LARGE LANGUAGE MODELS FOR INFORMATION RETRIEVAL\n  - Interleaving Retrieval with Chain-of-Thought Reasoning for Knowledge-Intensive Multi-Step Questions\n  - Enhancing Retrieval-Augmented Large Language Models with Iterative Retrieval-Generation Synergy\n  - RAT: Retrieval Augmented Thoughts Elicit Context-Aware Reasoning in Long-Horizon Generation\n  - IM-RAG: Multi-Round Retrieval-Augmented Generation Through Learning Inner Monologues\n  - Demonstrate-Search-Predict: Composing retrieval and language models for 
knowledge-intensive NLP\n  - Search-in-the-Chain: Towards Accurate, Credible and Traceable Large Language Models for Knowledge-intensive Tasks\n  - MindSearch 思·索: Mimicking Human Minds Elicits Deep AI Searcher\n  - RQ-RAG: LEARNING TO REFINE QUERIES FOR RETRIEVAL AUGMENTED GENERATION\n  - AutoPRM: Automating Procedural Supervision for Multi-Step Reasoning via Controllable Question Decomposition\n- Timeline RAG \n  - Unfolding the Headline: Iterative Self-Questioning for News Retrieval and Timeline Summarization\n- Fast RAG\n  - MINIRAG: TOWARDS EXTREMELY SIMPLE RETRIEVAL-AUGMENTED GENERATION\n  - EasyRAG: Efficient Retrieval-Augmented Generation Framework for Automated Network Operations\n- Deep Research\n  - Deep Researcher with Test-Time Diffusion\n\n\n### Other Prompt Engineering (prompt_engineer)\n- PDL: A Declarative Prompt Programming Language\n- Why Prompt Design Matters and Works: A Complexity Analysis of Prompt Search Space in LLMs\n- Prompting as Scientific Inquiry\n- Calibrate Before Use: Improving Few-Shot Performance of Language Models\n- In-Context Instruction Learning\n- LEARNING PERFORMANCE-IMPROVING CODE EDITS\n- Boosting Theory-of-Mind Performance in Large Language Models via Prompting\n- Generated Knowledge Prompting for Commonsense Reasoning\n- RECITATION-AUGMENTED LANGUAGE MODELS\n- kNN PROMPTING: BEYOND-CONTEXT LEARNING WITH CALIBRATION-FREE NEAREST NEIGHBOR INFERENCE\n- EmotionPrompt: Leveraging Psychology for Large Language Models Enhancement via Emotional Stimulus\n- Causality-aware Concept Extraction based on Knowledge-guided Prompting\n- LARGE LANGUAGE MODELS AS OPTIMIZERS\n- Prompts As Programs: A Structure-Aware Approach to Efficient Compile-Time Prompt Optimization\n- Set-of-Mark Prompting Unleashes Extraordinary Visual Grounding in GPT-4V\n- RePrompt: Automatic Prompt Editing to Refine AI-Generative Art Towards Precise Expressions\n- MedPrompt: Can Generalist Foundation Models Outcompete Special-Purpose Tuning? Case Study in Medicine\n- DSPy Assertions: Computational Constraints for Self-Refining Language Model Pipelines\n- Prompts as Auto-Optimized Training Hyperparameters: Training Best-in-Class IR Models from Scratch with 10 Gold Labels\n- In-Context Learning for Extreme Multi-Label Classification\n- Optimizing Instructions and Demonstrations for Multi-Stage Language Model Programs\n- DSPy: Compiling Declarative Language Model Calls into Self-Improving Pipelines\n- CONNECTING LARGE LANGUAGE MODELS WITH EVOLUTIONARY ALGORITHMS YIELDS POWERFUL PROMPT OPTIMIZERS\n- TextGrad: Automatic \"Differentiation\" via Text\n- Task Facet Learning: A Structured Approach to Prompt Optimization\n- LangGPT: Rethinking Structured Reusable Prompt Design Framework for LLMs from the Programming Language\n- PAS: Data-Efficient Plug-and-Play Prompt Augmentation System\n- Let Me Speak Freely? 
A Study on the Impact of Format Restrictions on Performance of Large Language Models\n- From Pen to Prompt: How Creative Writers Integrate AI into their Writing Practice\n- Does Prompt Formatting Have Any Impact on LLM Performance?\n- AUTO-DEMO PROMPTING: LEVERAGING GENERATED OUTPUTS AS DEMONSTRATIONS FOR ENHANCED BATCH PROMPTING\n- PROMPTBREEDER: SELF-REFERENTIAL SELF-IMPROVEMENT VIA PROMPT EVOLUTION\n- Psychologically Enhanced AI Agents\n- Attentive Reasoning Queries: A Systematic Method for Optimizing Instruction-Following in Large Language Models\n- Deterministic AI Agent Personality Expression through Standard Psychological Diagnostics\n\n### 大模型图表理解和生成\n- survey\n  - Table Meets LLM: Can Large Language Models Understand Structured Table Data? A Benchmark and Empirical Study\n  - Large Language Models (LLMs) on Tabular Data: Prediction, Generation, and Understanding - A Survey\n  - Exploring the Numerical Reasoning Capabilities of Language Models: A Comprehensive Analysis on Tabular Data\n- prompt \n  - Large Language Models are Versatile Decomposers: Decompose Evidence and Questions for Table-based Reasoning \n  - Tab-CoT: Zero-shot Tabular Chain of Thought\n  - Chain-of-Table: Evolving Tables in the Reasoning Chain for Table Understanding\n- finetuning\n  - TableLlama: Towards Open Large Generalist Models for Tables\n  - TableLLM: Enabling Tabular Data Manipulation by LLMs in Real Office Usage Scenarios\n- multimodal\n  - MMC: Advancing Multimodal Chart Understanding with Large-scale Instruction Tuning\n  - ChartLlama: A Multimodal LLM for Chart Understanding and Generation\n  - ChartAssistant: A Universal Chart Multimodal Language Model via Chart-to-Table Pre-training and Multitask Instruction Tuning\n  - ChartInstruct: Instruction Tuning for Chart Comprehension and Reasoning\n  - ChartX & ChartVLM: A Versatile Benchmark and Foundation Model for Complicated Chart Reasoning\n  - MATCHA: Enhancing Visual Language Pretraining with Math Reasoning and Chart Derendering\n  - UniChart: A Universal Vision-language Pretrained Model for Chart Comprehension and Reasoning\n  - TinyChart: Efficient Chart Understanding with Visual Token Merging and Program-of-Thoughts Learning\n  - Tables as Texts or Images: Evaluating the Table Reasoning Ability of LLMs and MLLMs\n  - TableVQA-Bench: A Visual Question Answering Benchmark on Multiple Table Domains\n  - TabPedia: Towards Comprehensive Visual Table Understanding with Concept Synergy\n- generative UI\n  - Generative UI: LLMs are Effective UI Generators\n\n### LLM+KG\n- 综述类\n  - Unifying Large Language Models and Knowledge Graphs: A Roadmap\n  - Large Language Models and Knowledge Graphs: Opportunities and Challenges\n  - [知识图谱与大模型融合实践研究报告2023](https:\u002F\u002Fblog.csdn.net\u002Fm0_37586850\u002Farticle\u002Fdetails\u002F132463508)\n- KG用于大模型推理\n  - Using Large Language Models for Zero-Shot Natural Language Generation from Knowledge Graphs\n  - MindMap: Knowledge Graph Prompting Sparks Graph of Thoughts in Large Language Models\n  - Knowledge-Augmented Language Model Prompting for Zero-Shot Knowledge Graph Question Answering\n  - Domain Specific Question Answering Over Knowledge Graphs Using Logical Programming and Large Language Models\n  - BRING YOUR OWN KG: Self-Supervised Program Synthesis for Zero-Shot KGQA\n  - StructGPT: A General Framework for Large Language Model to Reason over Structured Data\n- 大模型用于KG构建\n  - Enhancing Knowledge Graph Construction Using Large Language Models \n  - LLM-assisted Knowledge Graph Engineering: Experiments 
with ChatGPT\n  - ITERATIVE ZERO-SHOT LLM PROMPTING FOR KNOWLEDGE GRAPH CONSTRUCTION\n  - Exploring Large Language Models for Knowledge Graph Completion\n\n### Humanoid Agents\n- HABITAT 3.0: A CO-HABITAT FOR HUMANS, AVATARS AND ROBOTS\n- Humanoid Agents: Platform for Simulating Human-like Generative Agents\n- Voyager: An Open-Ended Embodied Agent with Large Language Models\n- [Shaping the future of advanced robotics](https:\u002F\u002Fdeepmind.google\u002Fdiscover\u002Fblog\u002Fshaping-the-future-of-advanced-robotics\u002F)\n- AUTORT: EMBODIED FOUNDATION MODELS FOR LARGE SCALE ORCHESTRATION OF ROBOTIC AGENTS\n- ROBOTIC TASK GENERALIZATION VIA HINDSIGHT TRAJECTORY SKETCHES\n- ALFWORLD: ALIGNING TEXT AND EMBODIED ENVIRONMENTS FOR INTERACTIVE LEARNING\n- MINEDOJO: Building Open-Ended  Embodied Agents with Internet-Scale Knowledge\n- LEGENT: Open Platform for Embodied Agents\n\n### pretrain_data & pretrain\n- DoReMi: Optimizing Data Mixtures Speeds Up Language Model Pretraining\n- The Pile: An 800GB Dataset of Diverse Text for Language Modeling\n- CCNet: Extracting High Quality Monolingual Datasets fromWeb Crawl Data\n- WanJuan: A Comprehensive Multimodal Dataset for Advancing English and Chinese Large Models\n- CLUECorpus2020: A Large-scale Chinese Corpus for Pre-training Language Model\n- In-Context Pretraining: Language Modeling Beyond Document Boundaries\n- Data Mixing Laws: Optimizing Data Mixtures by Predicting Language Modeling Performance \n- Zyda: A 1.3T Dataset for Open Language Modeling\n- Entropy Law: The Story Behind Data Compression and LLM Performance\n- Data, Data Everywhere: A Guide for Pretraining Dataset Construction\n- Data curation via joint example selection further accelerates multimodal learning\n- IMPROVING PRETRAINING DATA USING PERPLEXITY CORRELATIONS\n- AI models collapse when trained on recursively generated data\n\n### 领域模型SFT(domain_llms)\n- 金融\n  - BloombergGPT： A Large Language Model for Finance   \n  - FinVis-GPT: A Multimodal Large Language Model for Financial Chart Analysis\n  - CFGPT: Chinese Financial Assistant with Large Language Model\n  - CFBenchmark: Chinese Financial Assistant Benchmark for Large Language Model\n  - InvestLM: A Large Language Model for Investment using Financial Domain Instruction Tuning\n  - BBT-Fin: Comprehensive Construction of Chinese Financial Domain Pre-trained Language Model, Corpus and Benchmark\n  - PIXIU: A Large Language Model, Instruction Data and Evaluation Benchmark for Finance\n  - The FinBen: An Holistic Financial Benchmark for Large Language Models\n  - XuanYuan 2.0: A Large Chinese Financial Chat Model with Hundreds of Billions Parameters\n  - Towards Trustworthy Large Language Models in Industry Domains\n  - When AI Meets Finance (StockAgent): Large Language Model-based Stock Trading in Simulated Real-world Environments\n  - A Survey of Large Language Models for Financial Applications: Progress, Prospects and Challenges\n- 生物医疗\n  - MedGPT: Medical Concept Prediction from Clinical Narratives\n  - BioGPT：Generative Pre-trained Transformer for Biomedical Text Generation and Mining\n  - PubMed GPT: A Domain-specific large language model for biomedical text :star:\n  - ChatDoctor：Medical Chat Model Fine-tuned on LLaMA Model using Medical Domain Knowledge\n  - Med-PaLM：Large Language Models Encode Clinical Knowledge[V1,V2] :star:\n  - SMILE: Single-turn to Multi-turn Inclusive Language Expansion via ChatGPT for Mental Health Support\n  - Zhongjing: Enhancing the Chinese Medical Capabilities of Large Language Model 
through Expert Feedback and Real-world Multi-turn Dialogue\n- 其他\n  - Galactica: A Large Language Model for Science\n  - Augmented Large Language Models with Parametric Knowledge Guiding\n  - ChatLaw: Open-Source Legal Large Language Model :star:\n  - MediaGPT: A Large Language Model For Chinese Media\n  - KITLM: Domain-Specific Knowledge InTegration into Language Models for Question Answering\n  - EcomGPT: Instruction-tuning Large Language Models with Chain-of-Task Tasks for E-commerce \n  - TableGPT: Towards Unifying Tables, Nature Language and Commands into One GPT\n  - LLEMMA: AN OPEN LANGUAGE MODEL FOR MATHEMATICS\n  - MEDITAB: SCALING MEDICAL TABULAR DATA PREDICTORS VIA DATA CONSOLIDATION, ENRICHMENT, AND REFINEMENT\n  - PLLaMa: An Open-source Large Language Model for Plant Science\n  - ADAPTING LARGE LANGUAGE MODELS VIA READING COMPREHENSION\n\n\n### LLM超长文本处理 (long_input)\n- 位置编码、注意力机制优化\n  - Unlimiformer: Long-Range Transformers with Unlimited Length Input\n  - Parallel Context Windows for Large Language Models\n  - [苏剑林, NBCE：使用朴素贝叶斯扩展LLM的Context处理长度](https:\u002F\u002Fspaces.ac.cn\u002Farchives\u002F9617) :star:\n  - Structured Prompting: Scaling In-Context Learning to 1,000 Examples\n  - Vcc: Scaling Transformers to 128K Tokens or More by Prioritizing Important Tokens\n  - Scaling Transformer to 1M tokens and beyond with RMT\n  - TRAIN SHORT, TEST LONG: ATTENTION WITH LINEAR BIASES ENABLES INPUT LENGTH EXTRAPOLATION :star:\n  - Extending Context Window of Large Language Models via Positional Interpolation\n  - LongNet: Scaling Transformers to 1,000,000,000 Tokens\n  - https:\u002F\u002Fkaiokendev.github.io\u002Ftil#extending-context-to-8k\n  - [苏剑林,Transformer升级之路：10、RoPE是一种β进制编码](https:\u002F\u002Fspaces.ac.cn\u002Farchives\u002F9675) :star:\n  - [苏剑林,Transformer升级之路：11、将β进制位置进行到底](https:\u002F\u002Fspaces.ac.cn\u002Farchives\u002F9706)\n  - [苏剑林,Transformer升级之路：12、无限外推的ReRoPE？](https:\u002F\u002Fspaces.ac.cn\u002Farchives\u002F9708)\n  - [苏剑林,Transformer升级之路：15、Key归一化助力长度外推](https:\u002F\u002Fspaces.ac.cn\u002Farchives\u002F9859)\n  - EFFICIENT STREAMING LANGUAGE MODELS WITH ATTENTION SINKS\n  - Ring Attention with Blockwise Transformers for Near-Infinite Context\n  - YaRN: Efficient Context Window Extension of Large Language Models\n  - LM-INFINITE: SIMPLE ON-THE-FLY LENGTH GENERALIZATION FOR LARGE LANGUAGE MODELS\n  - Native Sparse Attention: Hardware-Aligned and Natively Trainable Sparse Attention\n- 上文压缩排序方案\n  - Lost in the Middle: How Language Models Use Long Contexts :star:\n  - LLMLingua: Compressing Prompts for Accelerated Inference of Large Language Models\n  - LongLLMLingua: Accelerating and Enhancing LLMs in Long Context Scenarios via Prompt Compression  :star:\n  - Learning to Compress Prompts with Gist Tokens\n  - Unlocking Context Constraints of LLMs: Enhancing Context Efficiency of LLMs with Self-Information-Based Content Filtering\n  - LongAgent: Scaling Language Models to 128k Context through Multi-Agent Collaboration\n  - PCToolkit: A Unified Plug-and-Play Prompt Compression Toolkit of Large Language Models\n  - Are Long-LLMs A Necessity For Long-Context Tasks?\n  - QwenLong-CPRS: Towards ∞-LLMs with Dynamic Context Optimization\n- 训练和模型架构方案\n  - Never Train from Scratch: FAIR COMPARISON OF LONG-SEQUENCE MODELS REQUIRES DATA-DRIVEN PRIORS\n  - Soaring from 4K to 400K: Extending LLM's Context with Activation Beacon\n  - Never Lost in the Middle: Improving Large Language Models via Attention 
Strengthening Question Answering\n  - Focused Transformer: Contrastive Training for Context Scaling\n  - Effective Long-Context Scaling of Foundation Models\n  - ON THE LONG RANGE ABILITIES OF TRANSFORMERS\n  - Efficient Long-Range Transformers: You Need to Attend More, but Not Necessarily at Every Layer\n  - POSE: EFFICIENT CONTEXT WINDOW EXTENSION OF LLMS VIA POSITIONAL SKIP-WISE TRAINING\n  - LONGLORA: EFFICIENT FINE-TUNING OF LONG-CONTEXT LARGE LANGUAGE MODELS\n  - LongAlign: A Recipe for Long Context Alignment of Large Language Models\n  - Data Engineering for Scaling Language Models to 128K Context\n  - MEGALODON: Efficient LLM Pretraining and Inference with Unlimited Context Length\n  - Make Your LLM Fully Utilize the Context\n  - Untie the Knots: An Efficient Data Augmentation Strategy for Long-Context Pre-Training in Language Models\n  - LIFT: Improving Long Context Understanding of Large Language Models through Long Input Fine-Tuning\n  - REFRAG: Rethinking RAG based Decoding\n- 效率优化\n  - Efficient Attention: Attention with Linear Complexities\n  - Transformers are RNNs: Fast Autoregressive Transformers with Linear Attention\n  - HyperAttention: Long-context Attention in Near-Linear Time\n  - FlashAttention: Fast and Memory-Efficient Exact Attention with IO-Awareness\n  - With Greater Text Comes Greater Necessity: Inference-Time Training Helps Long Text Generation\n- 评估\n  - NOLIMA: Long-Context Evaluation Beyond Literal Matching\n  - The Illusion of Diminishing Returns: Measuring Long Horizon Execution in LLMs\n- 原理分析\n  - Retrieval Head Mechanistically Explains Long-Context Factuality\n\n### LLM长文本生成 (long_output)\n- Re3: Generating Longer Stories With Recursive Reprompting and Revision\n- RECURRENTGPT: Interactive Generation of (Arbitrarily) Long Text \n- DOC: Improving Long Story Coherence With Detailed Outline Control\n- Weaver: Foundation Models for Creative Writing\n- Assisting in Writing Wikipedia-like Articles From Scratch with Large Language Models\n- Into the Unknown Unknowns: Engaged Human Learning through Participation in Language Model Agent Conversations\n- Beyond Outlining: Heterogeneous Recursive Planning for Adaptive Long-form Writing with Language Models\n\n### NL2SQL\n- 大模型方案\n  - DIN-SQL: Decomposed In-Context Learning of Text-to-SQL with Self-Correction :star:\n  - C3: Zero-shot Text-to-SQL with ChatGPT  :star:\n  - SQL-PALM: IMPROVED LARGE LANGUAGE MODEL ADAPTATION FOR TEXT-TO-SQL\n  - BIRD: Can LLM Already Serve as A Database Interface? 
A BIg Bench for Large-Scale Database Grounded Text-to-SQL :star:\n  - A Case-Based Reasoning Framework for Adaptive Prompting in Cross-Domain Text-to-SQL\n  - ChatDB: AUGMENTING LLMS WITH DATABASES AS THEIR SYMBOLIC MEMORY\n  - A comprehensive evaluation of ChatGPT’s zero-shot Text-to-SQL capability\n  - Few-shot Text-to-SQL Translation using Structure and Content Prompt Learning\n  - Tool-Assisted Agent on SQL Inspection and Refinement in Real-World Scenarios\n  - Agentar-Scale-SQL: Advancing Text-to-SQL through Orchestrated Test-Time Scaling\n- Domain Knowledge Intensive\n  - Towards Knowledge-Intensive Text-to-SQL Semantic Parsing with Formulaic Knowledge\n  - Bridging the Generalization Gap in Text-to-SQL Parsing with Schema Expansion\n  - Towards Robustness of Text-to-SQL Models against Synonym Substitution\n  - FinQA: A Dataset of Numerical Reasoning over Financial Data\n- others\n  - RESDSQL: Decoupling Schema Linking and Skeleton Parsing for Text-to-SQL\n  - MIGA: A Unified Multi-task Generation Framework for Conversational Text-to-SQL\n\n\n### Code Generation\n- Code Generation with AlphaCodium: From Prompt Engineering to Flow Engineering\n- Codeforces as an Educational Platform for Learning Programming in Digitalization\n- Competition-Level Code Generation with AlphaCode\n- CODECHAIN: TOWARDS MODULAR CODE GENERATION THROUGH CHAIN OF SELF-REVISIONS WITH REPRESENTATIVE SUB-MODULES\n- AI Coders Are Among Us: Rethinking Programming Language Grammar Towards Efficient Code Generation\n\n### 降低模型幻觉 (reliability)\n- Survey \n  - Large language models and the perils of their hallucinations\n  - Survey of Hallucination in Natural Language Generation\n  - Siren's Song in the AI Ocean: A Survey on Hallucination in Large Language Models\n  - A Survey of Hallucination in Large Foundation Models\n  - A Survey on Hallucination in Large Language Models: Principles, Taxonomy, Challenges, and Open Questions\n  - Calibrated Language Models Must Hallucinate\n  - Why Does ChatGPT Fall Short in Providing Truthful Answers?\n  - Why Language Models Hallucinate\n- Prompt or Tunning\n  - R-Tuning: Teaching Large Language Models to Refuse Unknown Questions\n  - PROMPTING GPT-3 TO BE RELIABLE\n  - ASK ME ANYTHING: A SIMPLE STRATEGY FOR PROMPTING LANGUAGE MODELS  :star:\n  - On the Advance of Making Language Models Better Reasoners\n  - RefGPT: Reference → Truthful & Customized Dialogues Generation by GPTs and for GPTs\n  - Rethinking with Retrieval: Faithful Large Language Model Inference\n  - GENERATE RATHER THAN RETRIEVE: LARGE LANGUAGE MODELS ARE STRONG CONTEXT GENERATORS\n  - Large Language Models Struggle to Learn Long-Tail Knowledge\n- Decoding Strategy\n  - Trusting Your Evidence: Hallucinate Less with Context-aware Decoding  :star:\n  - SELF-REFINE:ITERATIVE REFINEMENT WITH SELF-FEEDBACK  :star:\n  - Enhancing Self-Consistency and Performance of Pre-Trained Language Models through Natural Language Inference\n  - Inference-Time Intervention: Eliciting Truthful Answers from a Language Model\n  - Enabling Large Language Models to Generate Text with Citations\n  - Factuality Enhanced Language Models for Open-Ended Text Generation\n  - KL-Divergence Guided Temperature Sampling\n  - KCTS: Knowledge-Constrained Tree Search Decoding with Token-Level Hallucination Detection\n  - CONTRASTIVE DECODING IMPROVES REASONING IN LARGE LANGUAGE MODEL\n  - Contrastive Decoding: Open-ended Text Generation as Optimization\n- Probing and Detection\n  - Automatic Evaluation of Attribution by Large Language Models\n  - 
QAFactEval: Improved QA-Based Factual Consistency Evaluation for Summarization\n  - Zero-Resource Hallucination Prevention for Large Language Models\n  - LLM Lies: Hallucinations are not Bugs, but Features as Adversarial Examples\n  - Language Models (Mostly) Know What They Know  :star:\n  - LM vs LM: Detecting Factual Errors via Cross Examination\n  - Do Language Models Know When They’re Hallucinating References?\n  - SELFCHECKGPT: Zero-Resource Black-Box Hallucination Detection for Generative Large Language Models\n  - SELF-CONTRADICTORY HALLUCINATIONS OF LLMS: EVALUATION, DETECTION AND MITIGATION\n  - Self-consistency for open-ended generations\n  - Improving Factuality and Reasoning in Language Models through Multiagent Debate\n  - Selective-LAMA: Selective Prediction for Confidence-Aware Evaluation of Language Models\n  - Can LLMs Express Their Uncertainty? An Empirical Evaluation of Confidence Elicitation in LLMs\n- Reviewing and Calibration\n  - Truth-o-meter: Collaborating with llm in fighting its hallucinations\n  - RARR: Researching and Revising What Language Models Say, Using Language Models\n  - CRITIC: LARGE LANGUAGE MODELS CAN SELFCORRECT WITH TOOL-INTERACTIVE CRITIQUING\n  - VALIDATING LARGE LANGUAGE MODELS WITH RELM\n  - PURR: Efficiently Editing Language Model Hallucinations by Denoising Language Model Corruptions\n  - Check Your Facts and Try Again: Improving Large Language Models with External Knowledge and Automated Feedback\n  - Adaptive Chameleon or Stubborn Sloth: Unraveling the Behavior of Large Language Models in Knowledge Clashes\n  - Woodpecker: Hallucination Correction for Multimodal Large Language Models \n  - Zero-shot Faithful Factual Error Correction\n  - LARGE LANGUAGE MODELS CANNOT SELF-CORRECT REASONING YET\n  - Training Language Models to Self-Correct via Reinforcement Learning\n- Training LLMs for Honesty via Confessions\n\n\n### 大模型评估（evaluation）\n- 事实性评估\n  - TRUSTWORTHY LLMS: A SURVEY AND GUIDELINE FOR EVALUATING LARGE LANGUAGE MODELS’ ALIGNMENT\n  - TrueTeacher: Learning Factual Consistency Evaluation with Large Language Models\n  - TRUE: Re-evaluating Factual Consistency Evaluation\n  - FACTSCORE: Fine-grained Atomic Evaluation of Factual Precision in Long Form Text Generation\n  - KoLA: Carefully Benchmarking World Knowledge of Large Language Models\n  - When Not to Trust Language Models: Investigating Effectiveness of Parametric and Non-Parametric Memories\n  - FACTOOL: Factuality Detection in Generative AI A Tool Augmented Framework for Multi-Task and Multi-Domain Scenarios \n  - LONG-FORM FACTUALITY IN LARGE LANGUAGE MODELS\n- 检测任务\n  - Detecting Pretraining Data from Large Language Models\n  - Scalable Extraction of Training Data from (Production) Language Models\n  - Rethinking Benchmark and Contamination for Language Models with Rephrased Samples\n- 通用评估\n  - G-EVAL: NLG Evaluation using GPT-4 with Better Human Alignment\n- 工具调用评估\n  - ToolRM: Outcome Reward Models for Tool-Calling Large Language Models\n- Agent 评估\n  - SWE-Bench Pro: Can AI Agents Solve Long-Horizon Software Engineering Tasks?\n  - ALE-Bench: A Benchmark for Long-Horizon Objective-Driven Algorithm Engineering\n  - FinSearchComp: Towards a Realistic, Expert-Level Evaluation of Financial Search and Reasoning\n  - Supporting Our AI Overlords: Redesigning Data Systems to be Agent-First\n\n### 推理优化(inference)\n- Fast Transformer Decoding: One Write-Head is All You Need\n- Fast Inference from Transformers via Speculative Decoding\n- GQA: Training Generalized Multi-Query 
Transformer Models from Multi-Head Checkpoints\n- Skeleton-of-Thought: Large Language Models Can Do Parallel Decoding\n- SkipDecode: Autoregressive Skip Decoding with Batching and Caching for Efficient LLM Inference\n- BatchPrompt: Accomplish more with less\n- You Only Cache Once: Decoder-Decoder Architectures for Language Models\n- XGrammar: Flexible and Efficient Structured Generation Engine for Large Language Models\n- Precise Length Control in Large Language Models\n- Top-nσ: Not All Logits Are You Need\n- context cache\n  - Prompt Cache: Modular Attention Reuse for Low-Latency Inference \n  - SGLang: Efficient Execution of Structured Language Model Programs\n  - Efficient Prompt Caching via Embedding Similarity\n  - ChunkAttention: Efficient Self-Attention with Prefix-Aware KV Cache and Two-Phase Partition\n  - Hydragen: High-Throughput LLM Inference with Shared Prefixes\n  - Efficient Memory Management for Large Language Model Serving with PagedAttention\n\n### 模型知识编辑黑科技(model_edit)\n- ROME: Locating and Editing Factual Associations in GPT\n- Transformer Feed-Forward Layers Are Key-Value Memories\n- MEMIT: Mass-Editing Memory in a Transformer\n- MEND: Fast Model Editing at Scale\n- Editing Large Language Models: Problems, Methods, and Opportunities\n- Language Models are Super Mario: Absorbing Abilities from Homologous Models as a Free Lunch\n- Automata-based constraints for language model decoding\n- SGLang: Efficient Execution of Structured Language Model Programs\n\n### 模型合并和剪枝(model_merge)\n- Blending Is All You Need: Cheaper, Better Alternative to Trillion-Parameters LLM\n- DARE Language Models are Super Mario: Absorbing Abilities from Homologous Models as a Free Lunch\n- EDITING MODELS WITH TASK ARITHMETIC\n- TIES-Merging: Resolving Interference When Merging Models\n- LM-Cocktail: Resilient Tuning of Language Models via Model Merging\n- SLICEGPT: COMPRESS LARGE LANGUAGE MODELS BY DELETING ROWS AND COLUMNS\n- Checkpoint Merging via Bayesian Optimization in LLM Pretraining\n- Arcee's MergeKit: A Toolkit for Merging Large Language Models\n\n### MOE\n- Tricks for Training Sparse Translation Models\n- ST-MoE: Designing Stable and Transferable Sparse Expert Models\n- Switch Transformers: Scaling to Trillion Parameter Models with Simple and Efficient Sparsity \n- GLaM: Efficient Scaling of Language Models with Mixture-of-Experts\n- GShard: Scaling Giant Models with Conditional Computation and Automatic Sharding\n- OUTRAGEOUSLY LARGE NEURAL NETWORKS: THE SPARSELY-GATED MIXTURE-OF-EXPERTS LAYER\n- DeepSpeed-MoE: Advancing Mixture-of-Experts Inference and Training to Power Next-Generation AI Scale\n- Dense-to-Sparse Gate for Mixture-of-Experts\n- Efficient Large Scale Language Modeling with Mixtures of Experts\n\n### Multimodal\n- InstructBLIP: Towards General-purpose Vision-Language Models with Instruction Tuning\n- BLIP-2: Bootstrapping Language-Image Pre-training with Frozen Image Encoders and Large Language Models\n- Visual ChatGPT: Talking, Drawing and Editing with Visual Foundation Models\n- LLaVA: Visual Instruction Tuning\n- MiniGPT-4: Enhancing Vision-Language Understanding with Advanced Large Language Models\n- BLIVA: A Simple Multimodal LLM for Better Handling of Text-Rich Visual Questions\n- mPLUG-Owl: Modularization Empowers Large Language Models with Multimodality\n- LVLM eHub: A Comprehensive Evaluation Benchmark for Large Vision-Language Models\n- Mirasol3B: A Multimodal Autoregressive model for time-aligned and contextual modalities\n- PaLM-E: An Embodied Multimodal 
Language Model\n- TabLLM: Few-shot Classification of Tabular Data with Large Language Models\n- AnyGPT: Unified Multimodal LLM with Discrete Sequence Modeling\n- [Sora tech report](https:\u002F\u002Fopenai.com\u002Fresearch\u002Fvideo-generation-models-as-world-simulators)\n- Towards General Computer Control: A Multimodal Agent for Red Dead Redemption II as a Case Study\n- OCR\n  - Vary: Scaling up the Vision Vocabulary for Large Vision-Language Models\n  - Large OCR Model: An Empirical Study of Scaling Law for OCR\n  - ON THE HIDDEN MYSTERY OF OCR IN LARGE MULTIMODAL MODELS\n  - DeepSeek-OCR: Contexts Optical Compression\n- PreFLMR: Scaling Up Fine-Grained Late-Interaction Multi-modal Retrievers\n- Many-Shot In-Context Learning in Multimodal Foundation Models\n- Adding Conditional Control to Text-to-Image Diffusion Models\n- Ferret-UI: Grounded Mobile UI Understanding with Multimodal LLMs\n- ShowUI: One Vision-Language-Action Model for GUI Visual Agent\n- Flamingo: a Visual Language Model for Few-Shot Learning\n- Segment Anything\n- Monkey: Image Resolution and Text Label Are Important Things for Large Multi-modal Models\n- Learning Transferable Visual Models From Natural Language Supervision\n- AN IMAGE IS WORTH 16X16 WORDS: TRANSFORMERS FOR IMAGE RECOGNITION AT SCALE\n- InternVL1: Scaling up Vision Foundation Models and Aligning for Generic Visual-Linguistic Tasks\n- InternVL1.5: How Far Are We to GPT-4V? Closing the Gap to Commercial Multimodal Models with Open-Source Suites\n- Qwen-VL: A Versatile Vision-Language Model for Understanding, Localization, Text Reading, and Beyond\n- Qwen2-VL: Enhancing Vision-Language Model's Perception of the World at Any Resolution\n\n### 综述\n- A Survey of Large Language Models\n- Pre-train, Prompt, and Predict: A Systematic Survey of Prompting Methods in Natural Language Processing :star:\n- Paradigm Shift in Natural Language Processing\n- Pre-Trained Models: Past, Present and Future\n- What Language Model Architecture and Pretraining objective work best for zero shot generalization  :star:\n- Towards Reasoning in Large Language Models: A Survey\n- Reasoning with Language Model Prompting: A Survey :star:\n- An Overview on Language Models: Recent Developments and Outlook  :star:\n- A Survey of Large Language Models[6.29更新版]\n- Unifying Large Language Models and Knowledge Graphs: A Roadmap\n- Augmented Language Models: a Survey :star:\n- Domain Specialization as the Key to Make Large Language Models Disruptive: A Comprehensive Survey\n- Challenges and Applications of Large Language Models\n- The Rise and Potential of Large Language Model Based Agents: A Survey\n- Large Language Models for Information Retrieval: A Survey\n- AI Alignment: A Comprehensive Survey\n- Trends in Integration of Knowledge and Large Language Models: A Survey and Taxonomy of Methods, Benchmarks, and Applications\n- Large Models for Time Series and Spatio-Temporal Data: A Survey and Outlook\n- A Survey on Language Models for Code\n- Model-as-a-Service (MaaS): A Survey\n\n### 大模型能力探究\n- In Context Learning \n  - LARGER LANGUAGE MODELS DO IN-CONTEXT LEARNING DIFFERENTLY\n  - How does in-context learning work? A framework for understanding the differences from traditional supervised learning\n  - Why can GPT learn in-context? Language Models Secretly Perform Gradient Descent as Meta-Optimizers :star:\n  - Rethinking the Role of Demonstrations: What Makes In-Context Learning Work? 
:star:\n  - Trained Transformers Learn Linear Models In-Context\n  - In-Context Learning Creates Task Vectors\n  - FUNCTION VECTORS IN LARGE LANGUAGE MODELS\n  - Learning without training: The implicit dynamics of in-context learning\n  - LANGUAGE MODELS ARE INJECTIVE AND HENCE INVERTIBLE\n- 涌现能力\n  - Sparks of Artificial General Intelligence: Early experiments with GPT-4\n  - Emerging Ability of Large Language Models :star:\n  - LANGUAGE MODELS REPRESENT SPACE AND TIME\n  - Are Emergent Abilities of Large Language Models a Mirage?\n- 能力评估\n  - IS CHATGPT A GENERAL-PURPOSE NATURAL LANGUAGE PROCESSING TASK SOLVER?\n  - Can Large Language Models Infer Causation from Correlation?\n  - Holistic Evaluation of Language Model\n  - Harnessing the Power of LLMs in Practice: A Survey on ChatGPT and Beyond\n  - Theory of Mind May Have Spontaneously Emerged in Large Language Models\n  - Beyond The Imitation Game: Quantifying And Extrapolating The Capabilities Of Language Models\n  - Do Models Explain Themselves? Counterfactual Simulatability of Natural Language Explanations\n  - Demystifying GPT Self-Repair for Code Generation\n  - Evidence of Meaning in Language Models Trained on Programs\n  - Can Explanations Be Useful for Calibrating Black Box Models\n  - On the Robustness of ChatGPT: An Adversarial and Out-of-distribution Perspective \n  - Language acquisition: do children and language models follow similar learning stages?\n  - Language is primarily a tool for communication rather than thought\n- 领域能力\n  - Capabilities of GPT-4 on Medical Challenge Problems\n  - Can Generalist Foundation Models Outcompete Special-Purpose Tuning? Case Study in Medicine\n  - Persona Vectors: Monitoring and Controlling Character Traits in Language Models\n- 可解释性\n  - Understanding LLM Embeddings for Regression\n  - [When Models Manipulate Manifolds: The Geometry of a Counting Task](https:\u002F\u002Ftransformer-circuits.pub\u002F2025\u002Flinebreaks\u002Findex.html)\n  - Weight-sparse transformers have interpretable circuits\n \n### Prompt Tunning范式\n- Tunning Free Prompt\n  - GPT2: Language Models are Unsupervised Multitask Learners\n  - GPT3: Language Models are Few-Shot Learners   :star:\n  - LAMA: Language Models as Knowledge Bases?\n  - AutoPrompt: Eliciting Knowledge from Language Models\n- Fix-Prompt LM Tunning\n  - T5: Exploring the Limits of Transfer Learning with a Unified Text-to-Text Transformer\n  - PET-TC(a): Exploiting Cloze Questions for Few Shot Text Classification and Natural Language Inference  :star:\n  - PET-TC(b): PETSGLUE It’s Not Just Size That Matters Small Language Models are also few-shot learners\n  - GenPET: Few-Shot Text Generation with Natural Language Instructions\n  - LM-BFF: Making Pre-trained Language Models Better Few-shot Learners  :star:\n  - ADEPT: Improving and Simplifying Pattern Exploiting Training\n- Fix-LM Prompt Tunning \n  - Prefix-tuning: Optimizing continuous prompts for generation  \n  - Prompt-tunning: The power of scale for parameter-efficient prompt tuning :star:\n  - P-tunning: GPT Understands Too :star:\n  - WARP: Word-level Adversarial ReProgramming\n- LM + Prompt Tunning \n  - P-tunning v2: Prompt Tuning Can Be Comparable to Fine-tunning Universally Across Scales and Tasks\n  - PTR: Prompt Tuning with Rules for Text Classification\n  - PADA: Example-based Prompt Learning for on-the-fly Adaptation to Unseen Domains\n- Fix-LM Adapter Tunning\n  - LORA: LOW-RANK ADAPTATION OF LARGE LANGUAGE MODELS :star:\n  - LST: Ladder Side-Tuning for Parameter and Memory 
Efficient Transfer Learning\n  - Parameter-Efficient Transfer Learning for NLP\n  - INTRINSIC DIMENSIONALITY EXPLAINS THE EFFECTIVENESS OF LANGUAGE MODEL FINE-TUNING\n  - DoRA: Weight-Decomposed Low-Rank Adaptation\n- Representation Tuning\n  - ReFT: Representation Finetuning for Language Models\n\n\n### Timeseries LLM\n- TimeGPT-1\n- Large Models for Time Series and Spatio-Temporal Data: A Survey and Outlook\n- TIME-LLM: TIME SERIES FORECASTING BY REPROGRAMMING LARGE LANGUAGE MODELS\n- Large Language Models Are Zero-Shot Time Series Forecasters\n- TEMPO: PROMPT-BASED GENERATIVE PRE-TRAINED TRANSFORMER FOR TIME SERIES FORECASTING\n- Generative Pre-Training of Time-Series Data for Unsupervised Fault Detection in Semiconductor Manufacturing\n- Lag-Llama: Towards Foundation Models for Time Series Forecasting\n- PromptCast: A New Prompt-based Learning Paradigm for Time Series Forecasting\n\n### Quantization\n- AWQ: Activation-aware Weight Quantization for LLM Compression and Acceleration\n- LLM-QAT: Data-Free Quantization Aware Training for Large Language Models\n- LLM.int8(): 8-bit Matrix Multiplication for Transformers at Scale\n- SmoothQuant: Accurate and Efficient Post-Training Quantization for Large Language Models\n\n### Adversarial Attacking \n- Curiosity-driven Red-teaming for Large Language Models\n- Red Teaming Language Models with Language Models\n- EXPLORE, ESTABLISH, EXPLOIT: RED-TEAMING LANGUAGE MODELS FROM SCRATCH\n\n\n### 对话模型\n- LaMDA: Language Models for Dialog Applications\n- Sparrow: Improving alignment of dialogue agents via targeted human judgements :star:\n- BlenderBot 3: a deployed conversational agent that continually learns to responsibly engage\n- How NOT To Evaluate Your Dialogue System: An Empirical Study of Unsupervised Evaluation Metrics for Dialogue Response Generation\n- DialogStudio: Towards Richest and Most Diverse Unified Dataset Collection for Conversational AI\n- Enhancing Chat Language Models by Scaling High-quality Instructional Conversations\n- DiagGPT: An LLM-based Chatbot with Automatic Topic Management for Task-Oriented Dialogue\n\n\n### Others\n- Pretraining on the Test Set Is All You Need 哈哈作者你是懂讽刺文学的\n- Learnware: Small Models Do Big\n- The economic potential of generative AI\n- A PhD Student’s Perspective on Research in NLP in the Era of Very Large Language Models\n- How People Use ChatGPT\n","# 解密提示\n> 如果大语言模型的突然兴起让你感到沮丧，不妨阅读一下项目根目录下的《Choose Your Weapon：抑郁的AI研究者的生存策略》。\n以下内容将持续更新，请点赞以保持关注~\n\n## LLM资源汇总\n- [开源模型和评测榜单](开源模型.MD)\n- [开源推理、微调、Agent、RAG、Prompt框架](开源框架.MD)\n- [开源SFT、RLHF、预训练数据集](开源数据.MD)\n- [AIGC各领域应用汇总](AIGC各领域应用.MD)\n- [Prompt教程、经典博客和AI会议访谈](教程博客会议.MD)\n\n## 跟着博客读论文\n- [解密Prompt系列1. Tunning-Free Prompt：GPT2 & GPT3 & LAMA & AutoPrompt](https:\u002F\u002Fcloud.tencent.com\u002Fdeveloper\u002Farticle\u002F2215545?areaSource=&traceId=)\n- [解密Prompt系列2. 冻结Prompt微调LM： T5 & PET & LM-BFF](https:\u002F\u002Fcloud.tencent.com\u002Fdeveloper\u002Farticle\u002F2223355?areaSource=&traceId=)\n- [解密Prompt系列3. 冻结LM微调Prompt: Prefix-tuning & Prompt-tuning & P-tuning](https:\u002F\u002Fcloud.tencent.com\u002Fdeveloper\u002Farticle\u002F2237259?areaSource=&traceId=)\n- [解密Prompt系列4. 升级Instruction Tuning：Flan\u002FT0\u002FInstructGPT\u002FTKInstruct](https:\u002F\u002Fcloud.tencent.com\u002Fdeveloper\u002Farticle\u002F2245094?areaSource=&traceId=)\n- [解密prompt系列5. APE+SELF=自动化指令集构建代码实现](https:\u002F\u002Fcloud.tencent.com\u002Fdeveloper\u002Farticle\u002F2260697?areaSource=&traceId=)\n- [解密Prompt系列6. 
lora指令微调扣细节-请冷静,1个小时真不够~](https:\u002F\u002Fcloud.tencent.com\u002Fdeveloper\u002Farticle\u002F2276508)\n- [解密Prompt系列7. 偏好对齐RLHF-OpenAI·DeepMind·Anthropic对比分析](https:\u002F\u002Fcloud.tencent.com\u002Fdeveloper\u002Farticle\u002Fold\u002F2289566?areaSource=&traceId=)\n- [解密Prompt系列8. 无需训练让LLM支持超长输入:知识库 & Unlimiformer & PCW & NBCE ](https:\u002F\u002Fcloud.tencent.com\u002Fdeveloper\u002Farticle\u002Fold\u002F2295783?areaSource=&traceId=)\n- [解密Prompt系列9. COT：模型复杂推理-思维链基础和进阶玩法](https:\u002F\u002Fcloud.tencent.com\u002Fdeveloper\u002Farticle\u002Fold\u002F2296079?areaSource=&traceId=)\n- [解密Prompt系列10. COT：思维链COT原理探究](https:\u002F\u002Fcloud.tencent.com\u002Fdeveloper\u002Farticle\u002Fold\u002F2298660)\n- [解密Prompt系列11. COT：小模型也能COT，先天不足后天补](https:\u002F\u002Fcloud.tencent.com\u002Fdeveloper\u002Farticle\u002Fold\u002F2301999)\n- [解密Prompt系列12. LLM Agent零微调范式 ReAct & Self Ask](https:\u002F\u002Fcloud.tencent.com\u002Fdeveloper\u002Farticle\u002F2305421)\n- [解密Prompt系列13. LLM Agent指令微调方案: Toolformer & Gorilla](https:\u002F\u002Fcloud.tencent.com\u002Fdeveloper\u002Farticle\u002F2312674)\n- [解密Prompt系列14. LLM Agent之搜索应用设计：WebGPT & WebGLM & WebCPM](https:\u002F\u002Fcloud.tencent.com\u002Fdeveloper\u002Farticle\u002F2319879)\n- [解密Prompt系列15. LLM Agent之数据库应用设计：DIN & C3 & SQL-Palm & BIRD](https:\u002F\u002Fcloud.tencent.com\u002Fdeveloper\u002Farticle\u002F2328749)\n- [解密Prompt系列16. LLM对齐经验之数据越少越好？LTD & LIMA & AlpaGasus](https:\u002F\u002Fcloud.tencent.com\u002Fdeveloper\u002Farticle\u002F2333495)\n- [解密Prompt系列17. LLM对齐方案再升级 WizardLM & BackTranslation & SELF-ALIGN](https:\u002F\u002Fcloud.tencent.com\u002Fdeveloper\u002Farticle\u002F2338592)\n- [解密Prompt系列18. LLM Agent之只有智能体的世界](https:\u002F\u002Fcloud.tencent.com\u002Fdeveloper\u002Farticle\u002F2351540)\n- [解密Prompt系列19. LLM Agent之数据分析领域的应用：Data-Copilot & InsightPilot](https:\u002F\u002Fcloud.tencent.com\u002Fdeveloper\u002Farticle\u002F2358413)\n- [解密Prompt系列20. RAG之再谈召回多样性优化](https:\u002F\u002Fcloud.tencent.com\u002Fdeveloper\u002Farticle\u002F2365050)\n- [解密Prompt系列21. RAG之再谈召回信息密度和质量](https:\u002F\u002Fcloud.tencent.com\u002Fdeveloper\u002Farticle\u002F2369977)\n- [​解密Prompt系列22. RAG的反思：放弃了压缩还是智能么？](https:\u002F\u002Fcloud.tencent.com\u002Fdeveloper\u002Farticle\u002F2375066)\n- [解密Prompt系列23.大模型幻觉分类&归因&检测&缓解方案脑图全梳理](https:\u002F\u002Fcloud.tencent.com\u002Fdeveloper\u002Farticle\u002F2378383)\n- [解密prompt系列24. RLHF新方案之训练策略：SLiC-HF & DPO & RRHF & RSO](https:\u002F\u002Fcloud.tencent.com\u002Fdeveloper\u002Farticle\u002F2389619)\n- [解密prompt系列25. RLHF改良方案之样本标注：RLAIF & SALMON](https:\u002F\u002Fcloud.tencent.com\u002Fdeveloper\u002Farticle\u002F2398654)\n- [解密prompt系列26. 人类思考vs模型思考：抽象和发散思维](https:\u002F\u002Fcloud.tencent.com\u002Fdeveloper\u002Farticle\u002F2394120)\n- [解密prompt系列27. LLM对齐经验之如何降低通用能力损失](https:\u002F\u002Fcloud.tencent.com\u002Fdeveloper\u002Farticle\u002F2406888)\n- [解密Prompt系列28. LLM Agent之金融领域智能体：FinMem & FinAgent](https:\u002F\u002Fcloud.tencent.com\u002Fdeveloper\u002Farticle\u002F2411792)\n- [解密Prompt系列29. LLM Agent之真实世界海量API解决方案：ToolLLM & AnyTool](https:\u002F\u002Fcloud.tencent.com\u002Fdeveloper\u002Farticle\u002F2415908)\n- [解密Prompt系列30. LLM Agent之互联网冲浪智能体们](https:\u002F\u002Fcloud.tencent.com\u002Fdeveloper\u002Farticle\u002F2419768)\n- [​解密Prompt系列31. LLM Agent之从经验中不断学习的智能体](https:\u002F\u002Fcloud.tencent.com\u002Fdeveloper\u002Farticle\u002F2425139)\n- [解密Prompt系列32. LLM之表格理解任务-文本模态](https:\u002F\u002Fcloud.tencent.com\u002Fdeveloper\u002Farticle\u002F2429900)\n- [解密Prompt系列33. 
LLM之图表理解任务-多模态篇](https:\u002F\u002Fcloud.tencent.com\u002Fdeveloper\u002Farticle\u002F2433883)\n- [​解密prompt系列34. RLHF之训练另辟蹊径：循序渐进 & 青出于蓝](https:\u002F\u002Fcloud.tencent.com\u002Fdeveloper\u002Farticle\u002F2437031)\n- [解密prompt系列35. Prompt标准化进行时！ DSPy论文串烧和代码示例](https:\u002F\u002Fcloud.tencent.com\u002Fdeveloper\u002Farticle\u002F2441201)\n- [解密Prompt系列36. Prompt结构化编写和最优化算法UNIPROMPT](https:\u002F\u002Fcloud.tencent.com\u002Fdeveloper\u002Farticle\u002F2444167)\n- [解密Prompt系列37. RAG之前置决策何时联网的多种策略](https:\u002F\u002Fcloud.tencent.com\u002Fdeveloper\u002Farticle\u002F2448156)\n- [解密Prompt系列38. 多Agent路由策略](https:\u002F\u002Fcloud.tencent.com\u002Fdeveloper\u002Farticle\u002F2451000)\n- [解密prompt系列39. RAG之借助LLM优化精排环节](https:\u002F\u002Fcloud.tencent.com\u002Fdeveloper\u002Farticle\u002F2453693)\n- [解密prompt系列40. LLM推理scaling Law](https:\u002F\u002Fcloud.tencent.com\u002Fdeveloper\u002Farticle\u002F2456441)\n- [解密prompt系列41. GraphRAG真的是Silver Bullet？](https:\u002F\u002Fcloud.tencent.com\u002Fdeveloper\u002Farticle\u002F2461325)\n- [解密prompt系列42. LLM通往动态复杂思维链之路](https:\u002F\u002Fcloud.tencent.com\u002Fdeveloper\u002Farticle\u002F2464011)\n- [解密prompt系列43. LLM Self Critics](https:\u002F\u002Fcloud.tencent.com\u002Fdeveloper\u002Farticle\u002F2468406)\n- [解密prompt系列44. RAG探索模式？深度思考模式？](https:\u002F\u002Fcloud.tencent.com\u002Fdeveloper\u002Farticle\u002F2474048)\n- [解密Prompt系列45. 再探LLM Scalable Oversight -辩论、博弈哪家强](https:\u002F\u002Fcloud.tencent.com\u002Fdeveloper\u002Farticle\u002F2479401)\n- [解密prompt系列46. LLM结构化输出代码示例和原理分析](https:\u002F\u002Fcloud.tencent.com\u002Fdeveloper\u002Farticle\u002F2483500)\n- [解密prompt系列47. O1 Long Thought的一些特征分析](https:\u002F\u002Fcloud.tencent.com\u002Fdeveloper\u002Farticle\u002F2487221)\n- [​解密prompt系列48. DeepSeek R1 & Kimi 1.5长思维链 - RL Scaling](https:\u002F\u002Fcloud.tencent.com\u002Fdeveloper\u002Farticle\u002F2493924)\n- [​解密prompt系列49. 回顾R1之前的思维链发展](https:\u002F\u002Fcloud.tencent.com\u002Fdeveloper\u002Farticle\u002F2497501)\n- [解密prompt系列50. RL用于优化Agent行为路径的一些思路](https:\u002F\u002Fcloud.tencent.com\u002Fdeveloper\u002Farticle\u002F2502322)\n- [解密prompt系列51. R1实验的一些细节讨论](https:\u002F\u002Fcloud.tencent.com\u002Fdeveloper\u002Farticle\u002F2506684)\n- [解密prompt系列52. 闲聊大模型还有什么值得探索的领域](https:\u002F\u002Fcloud.tencent.com\u002Fdeveloper\u002Farticle\u002F2510004)\n- [解密prompt系列53. 再谈大模型Memory](https:\u002F\u002Fcloud.tencent.com\u002Fdeveloper\u002Farticle\u002F2514545)\n- [解密prompt系列54. Context Cache代码示例和原理分析](https:\u002F\u002Fcloud.tencent.com\u002Fdeveloper\u002Farticle\u002F2522820)\n- [解密prompt系列55. Agent Memory的工程实现 - Mem0 & LlamaIndex](https:\u002F\u002Fcloud.tencent.com\u002Fdeveloper\u002Farticle\u002F2528447)\n- [解密prompt系列56. Agent context Engineering - 单智能体代码剖析](https:\u002F\u002Fcloud.tencent.com\u002Fdeveloper\u002Farticle\u002F2537040)\n- [​解密prompt系列57. Agent Context Engineering - 多智能体代码剖析](https:\u002F\u002Fcloud.tencent.com\u002Fdeveloper\u002Farticle\u002F2541926)\n- [解密prompt系列58. MCP - 工具演变 & MCP基础](https:\u002F\u002Fcloud.tencent.com\u002Fdeveloper\u002Farticle\u002F2549927)\n- [解密prompt系列59. MCP实战：从Low-Level到FastMCP的搭建演进](https:\u002F\u002Fcloud.tencent.com\u002Fdeveloper\u002Farticle\u002F2554794)\n- [​解密prompt系列60. Agent实战：从0搭建Jupter数据分析智能体](https:\u002F\u002Fcloud.tencent.com\u002Fdeveloper\u002Farticle\u002F2563549)\n- [​解密prompt系列61. 手搓代码沙箱与FastAPI-MCP实战](https:\u002F\u002Fcloud.tencent.com\u002Fdeveloper\u002Farticle\u002F2570796)\n- [​解密prompt系列62. 
Agent Memory新视角 - MATTS&CFGM&MIRIX](https:\u002F\u002Fcloud.tencent.com\u002Fdeveloper\u002Farticle\u002F2577365)\n- [解密prompt系列63. Agent训练方案: RStar2 & Early Experience etc](https:\u002F\u002Fcloud.tencent.com\u002Fdeveloper\u002Farticle\u002F2581959)\n- [解密Prompt系列64. Anthropic Skils的延伸思考](https:\u002F\u002Fcloud.tencent.com\u002Fdeveloper\u002Farticle\u002F2586667)\n- [解密Prompt系列65. 三巨头关于大模型内景的硬核论文](https:\u002F\u002Fcloud.tencent.com\u002Fdeveloper\u002Farticle\u002F2594738) \n- [解密Prompt系列66. 视觉Token爆炸→DeepSeek-OCR光学压缩](https:\u002F\u002Fcloud.tencent.com\u002Fdeveloper\u002Farticle\u002F2600104)\n- [解密Prompt系列67. 智能体的经济学：从架构选型到工具预算](https:\u002F\u002Fcloud.tencent.com\u002Fdeveloper\u002Farticle\u002F2610869)\n- [解密Prompt系列68. 告别逐词蹦字 - Transformer 的新推理范式](https:\u002F\u002Fcloud.tencent.com\u002Fdeveloper\u002Farticle\u002F2616180)\n\n## 论文汇总\n\n### 论文列表\n- https:\u002F\u002Fgithub.com\u002Fdongguanting\u002FIn-Context-Learning_PaperList\n- https:\u002F\u002Fgithub.com\u002Fthunlp\u002FPromptPapers\n- https:\u002F\u002Fgithub.com\u002FTimothyxxx\u002FChain-of-ThoughtsPapers\n- https:\u002F\u002Fgithub.com\u002Fthunlp\u002FToolLearningPapers\n- https:\u002F\u002Fgithub.com\u002FMLGroupJLU\u002FLLM-eval-survey\n- https:\u002F\u002Fgithub.com\u002Fthu-coai\u002FPaperForONLG\n- https:\u002F\u002Fgithub.com\u002Fkhuangaf\u002FAwesome-Chart-Understanding\n- https:\u002F\u002Fgithub.com\u002Fsrush\u002Fawesome-o1\u002F?tab=readme-ov-file\n\n### 图像生成\n- 神经离散表征学习\n- 去噪扩散概率模型\n- 基于Transformer的可扩展扩散模型\n- 具有深度语言理解能力的逼真文本到图像扩散模型\n- 基于潜在扩散模型的高分辨率图像合成\n\n### 后训练（与COT、RL有交集）\n- 推理扩展\n  - 针对语言模型问题解决的计算最优推理的实证分析\n  - 更多的LM调用就是全部所需吗？迈向复合AI系统的扩展特性\n  - 大型语言猴子：通过重复采样扩展推理计算\n  - 在测试时以最优方式扩展LLM的计算量，可能比单纯增加模型参数更有效   :star:\n  - Q*：通过深思熟虑的规划提升LLM的多步推理能力\n  - 以自然语言进行规划可改善LLM在代码生成中的搜索表现\n  - ReST-MCTS∗：通过过程奖励引导的树搜索实现LLM自我训练\n  - 类AlphaZero的树搜索可以指导大型语言模型的解码和训练\n  - 更小、更弱但更好：通过计算最优采样训练LLM推理器\n  - 测试时训练在抽象推理中的惊人效果\n  - 长上下文检索增强生成的推理扩展\n  - 通过想象、搜索和批判，迈向LLM的自我改进\n  - InfAlign：推理感知的语言模型对齐\n  - 利用潜在推理扩大测试时计算规模：一种递归深度方法\n  - 规划属于哪种类型的推理？\n  - Goedel-Prover：开源自动定理证明的前沿模型\n  - 从Grokking的学习动力学中涌现特征的可证明扩展规律\n  - 机器学习模型是记忆还是泛化？\n- 慢速思维COT\n  - O1复现之旅：战略进展报告——第一部分  :star:\n  - Marco-o1：迈向用于开放式解决方案的开放推理模型\n  - OpenAI o1模型推理模式的比较研究\n  - 模仿、探索与自我改进：慢速思维推理系统的复现报告\n  - Dualformer：通过随机化推理轨迹学习，实现可控的快慢思维\n  - 训练大型语言模型在连续潜在空间中进行推理\n  - 超越A∗：利用搜索动态自举，通过Transformer实现更好的规划\n  - o1-Coder：面向编码的o1复现版\n  - 搜索与学习的扩展：从强化学习视角复现o1的路线图\n  - Sky-T1：仅需450美元即可训练自己的O1预览模型\n  - 向LLM中的系统2式推理迈进：通过元思维链学习如何思考\n  - rStar-Math：小型LLM可通过自我进化式深度思考掌握数学推理 :star:\n  - 解密LLM中的长思维链推理\n  - 向大型推理模型迈进：大型语言模型强化推理综述\n  - [Huggingface Open R1](https:\u002F\u002Fhuggingface.co\u002Fblog\u002Fopen-r1\u002Fupdate-1)\n  - CODEI\u002FO：通过代码输入输出预测凝练推理模式\n  - 训练语言模型高效推理\n  - s1：简单的测试时扩展\n  - 内省Transformer：利用动态深度缩放促进自适应内部思考\n  - ALPHAONE：测试时兼具慢速与快速思维的推理模型\n- O3相关\n  - 使用大型推理模型进行竞技编程\n- RL COT原理\n  - SFT记忆，RL泛化：基础模型后训练的比较研究\n  - 赋予推理模型自我改进能力的认知行为，或高效STaR的四种习惯\n  - 思维四处游离：关于o1类LLM的思考不足\n  - 条条大路通似然：强化学习在微调中的价值\n  - 强化学习是否真的能在基础模型之外激励LLM的推理能力？\n  - 不要只追求长度，更要深入思考：通过深度思考标记衡量LLM的推理努力\n- R1复现\n  - LogicRL：基于规则的强化学习释放LLM推理潜能\n  - [SimpleR1](https:\u002F\u002Fhkust-nlp.notion.site\u002Fsimplerl-reason)\n  - [Huggingface Open R1](https:\u002F\u002Fhuggingface.co\u002Fblog\u002Fopen-r1\u002Fupdate-1)\n  - DianJin-R1：评估并提升大型语言模型的金融推理能力\n  - 大型混合推理模型仅在需要时才进行思考\n  - 推理拓扑：通过推理图属性理解大型推理模型\n  - Skywork开放推理者1技术报告\n  - 学习推理：使用GPT-OSS或DeepSeek R1推理轨迹训练LLM\n- RL智能体\n  - RAGEN：通过多轮强化学习理解LLM智能体的自我进化\n  - ToolRL：奖励是工具学习所需要的全部\n  - ReTool：针对LLM的战略性工具使用强化学习\n  - ReSearch：通过强化学习让LLM学会利用搜索进行推理\n  - 
[利用强化学习改进多轮工具使用](https:\u002F\u002Fwww.bespokelabs.ai\u002Fblog\u002Fimproving-multi-turn-tool-use-with-reinforcement-learning)\n  - WebThinker：赋予大型推理模型深度研究能力\n  - 面向机器学习工程领域的强化学习智能体\n  - AgentGym-RL：通过多轮强化学习训练LLM智能体进行长期决策\n  - rStar2-Agent：代理式推理技术报告\n  - LLM代理式强化学习的现状：综述\n  - 流畅的代理系统优化，实现高效规划与工具使用\n  - UI-TARS-2技术报告：利用多轮强化学习推进GUI智能体发展\n  - PokeeResearch：通过来自AI的反馈和稳健的推理框架，借助强化学习实现有效的深度研究\n  - DeepAnalyze：面向自主数据科学的代理式大型语言模型\n  - 以编程视觉思考：迈向统一的图像思考视角\n  - 通过经验合成扩展智能体学习\n  - CaveAgent：将LLM转变为有状态的运行时操作员\n- 经验学习\n  - 欢迎来到经验时代\n  - 通过早期经验学习智能体\n- 其他训练方式\n  - QWENLONG-L1：迈向具有强化学习的长上下文大型推理模型\n  - REWARDBENCH 2：推进奖励模型评估\n  - 计算即教师：将推理计算转化为无参考监督\n  - DiffusionNFT：利用前向过程进行在线扩散强化\n  - 大规模进化策略：超越强化学习的LLM微调\n  - 在平行样本间学习推理，以提升LLM的推理能力\n  - PARAM∆用于直接权重混合：零成本后训练大型语言模型\n  - LaSeR：基于最后一令牌自我奖励的强化学习\n  - Delta学习假设：在弱数据上进行偏好调整也能带来显著收益\n- RL概述\n  - 强化学习：概述\n  - 迈向大型语言模型后训练的统一视角\n- RL数据集\n  - ReasonMed：一个包含37万个多智能体生成的数据集，用于推动医学推理的发展\n\n### 上下文工程\n- 针对大型语言模型的上下文工程综述\n- 代理式上下文工程：为自我改进的语言模型演化上下文\n- 通过上下文折叠扩展长周期LLM智能体\n- 向代理系统扩展科学迈进\n- 预算意识下的工具使用有助于有效扩展智能体\n- 上下文工程2.0\n- 面向长上下文的端到端测试时训练\n\n### 新模型架构\n- SPG：用于掩码扩散语言模型的夹层策略梯度\n- 少即是多：基于小型网络的递归推理\n- 连续思维机器\n- TiDAR：在扩散过程中思考，在自回归过程中生成\n- 嵌套学习：深度学习架构的幻象\n\n### 主流大语言模型及预训练\n- GLM-130B：一款开放的双语预训练模型\n- PaLM：通过Pathways扩展语言建模\n- PaLM 2 技术报告\n- GPT-4 技术报告\n- Backpack 语言模型\n- LLaMA：开放且高效的基座语言模型\n- Llama 2：开放的基座模型与微调后的对话模型\n- Sheared LLaMA：通过结构化剪枝加速语言模型预训练\n- OpenBA：一款从头开始预训练的开源150亿参数双语非对称序列到序列模型\n- Mistral 7B\n- Ziya2：以数据为中心的学习是所有大语言模型所需要的\n- MEGABLOCKS：使用专家混合实现高效的稀疏训练\n- TUTEL：大规模下的自适应专家混合\n- Phi1——教科书就是全部所需 :star:\n- Phi1.5——教科书就是全部所需 II：phi-1.5技术报告\n- Phi-3技术报告：一款可在您手机本地运行的强大语言模型\n- Gemini：一系列强大的多模态模型家族\n- 上下文预训练：超越文档边界的语言建模\n- LLAMA PRO：具有块扩展能力的渐进式LLaMA\n- QWEN 技术报告\n- 减少截断可提升语言建模效果\n- ChatGLM：从GLM-130B到GLM-4的一系列大型语言模型，涵盖所有工具\n- Phi-4技术报告\n- Byte潜在变换器：补丁的扩展性优于标记\n- Qwen2.5技术报告\n- DeepSeek-V3技术报告\n- 混合专家模型\n- DeepSeek_R1 :star:\n- KIMI K1.5：利用大语言模型扩展强化学习 :star:\n- CWM：一款权重公开的大语言模型，用于研究结合世界模型的代码生成\n- DeepSeek V3.2技术报告\n- DeepSeek-V3.2：推动开源大型语言模型的前沿发展\n\n### 思维链 (prompt_chain_of_thought)\n- 基础&进阶用法\n    - 【zero-shot-COT】 大型语言模型是零样本推理者 :star:\n    - 【few-shot COT】 思维链提示在大型语言模型中激发推理能力 :star:\n    - 【SELF-CONSISTENCY】提升语言模型中的思维链推理\n    - 【LEAST-TO-MOST】 提示使大型语言模型具备复杂推理能力 :star:\n    - 【TOT】思维之树：利用大型语言模型进行深思熟虑的问题解决 :star:\n    - 【Plan-and-Solve】 提示：通过大型语言模型改进零样本思维链推理\n    - 【Verify-and-Edit】：一种知识增强的思维链框架\n    - 【GOT】超越思维链，大型语言模型中的有效图式推理\n    - 【TOMT】混合思维之树：结合快思考与慢思考的多跳视觉推理\n    - 【LAMBADA】：自然语言中的自动推理反向链\n    - 【AOT】思想算法：增强大型语言模型中的创意探索 :star:\n    - 【GOT】思想图谱：利用大型语言模型解决复杂问题 :star:\n    - 【PHP】渐进式提示改进大型语言模型的推理能力\n    - 【HtT】大型语言模型可以学习规则 :star:\n    - 【DIVSE】思想多样性提升大型语言模型的推理能力\n    - 【CogTree】从复杂到简单：为小型语言模型解开认知树以进行推理\n    - 【Step-Back】退一步：通过抽象化激发大型语言模型的推理能力 :star:\n    - 【OPRO】大型语言模型作为优化器 :star:\n    - 【BOT】思想缓冲区：基于思想增强的大型语言模型推理\n    - 思想抽象让语言模型成为更好的推理者\n    - 【SymbCoT】通过符号化思维链实现忠实的逻辑推理\n    - 【XOT】一切关于思想：违背彭罗斯三角定律生成思想\n    - 【IoT】思想迭代：利用内部对话实现大型语言模型的自主推理\n    - 【DOT】论思想图谱\n    - 【ROT】思想反转：通过偏好引导的逆向推理预热来增强大型语言模型\n    - 正向与逆向思考：利用大型语言模型进行有效的逆向规划\n    - 【KR】K级推理：在大型语言模型中建立高阶信念以进行战略推理\n    - 【Self-Discover】自我发现：大型语言模型自行构建推理结构\n    - 【Theory-of-Mind】大型语言模型距离具有心智理论的智能体还有多远？\n    - 【PC-SUBQ】提示策略，使大型语言模型能够从相关性中推断因果关系\n    - 逆向思维使LLM成为更强大的推理者\n    - 草稿链：通过减少写作加快思考速度\n    - 马尔可夫LLM测试时缩放的思想原子\n- 非传统COT问题分解方向\n    - 分解式提示：一种模块化方法来解决复杂任务\n    - 连续提示用于分解复杂问题\n- 分领域COT [数学、代码、表格、问答]\n    - 利用语言模型解决定量推理问题\n    - 展示你的解题过程：语言模型的中间计算草稿本\n    - 通过过程和结果反馈解决数学应用题\n    - CodeRL：通过预训练模型和深度强化学习掌握代码生成\n    - T-SciQ：通过大型语言模型信号教授科学问答中的多模态思维链推理\n    - 学习性能提升的代码编辑\n    - 代码链：使用语言模型增强的代码模拟器进行推理\n- 原理分析\n    - 
思维链赋予Transformer解决固有串行问题的能力 :star:\n    - 向理解思维链提示迈进：一项关于关键因素的实证研究 :star:\n    - 文本与模式：有效的思维链需要双方配合\n    - 揭示思维链背后的奥秘：理论视角\n    - 大型语言模型很容易被无关上下文分散注意力\n    - 无需提示的思维链推理\n    - 归纳还是演绎？重新思考LLM的基本推理能力\n    - 超越思维链：面向LLM的Chain-of-X范式综述\n    - 用不用思维链？思维链主要帮助数学和符号推理 :star:\n    - 为什么需要逐步思考？推理源于经验的局部性\n   - 大型语言模型中的内部一致性与自我反馈：综述 :star:\n   - 迭代之头：思维链的机制性研究 :star:\n   - 推理步骤长度对大型语言模型的影响 :star:\n   - 大型语言模型是否会在不利用捷径的情况下进行潜在的多跳推理？\n   - 压缩思维链：通过密集表示实现高效推理\n   - LLM真的会在隐式推理中逐步思考吗？\n   - 推理的认知基础及其在LLM中的体现\n- 小模型COT蒸馏\n    - 专门化小型语言模型以实现多步推理 :star:\n    - 教导小型语言模型进行推理\n    - 大型语言模型是推理教师\n    - 将推理能力蒸馏到小型语言模型中\n    - CoT合集：通过思维链微调提升语言模型的零样本和少样本学习能力\n    - 将系统2蒸馏到系统1中\n- COT样本自动构建\u002F选择\n    - AutoCOT：大型语言模型中的自动思维链提示\n    - 针对大型语言模型的思维链主动提示\n    - 基于复杂度的多步推理提示\n- COT能力学习\n   - 大型语言模型可以自我提升\n   - 通过潜变量推理训练思维链\n   - Quiet-STaR：语言模型可以自学在开口前思考\n   - STaR：自学者推理者通过推理自我启动推理\n   - V-STaR：为自学者推理者培训验证者\n   - 先思考再说话：用暂停标记训练语言模型\n   - 自主合成对话与修订技术报告\n   - COT-SELF-INSTRUCT：为推理和非推理任务构建高质量合成提示\n- others\n    - OlaGPT 赋予LLM类人般的问题解决能力\n    - 挑战BIG-Bench任务以及思维链能否解决它们\n    - 大型语言模型在自我验证后成为更好的推理者\n    - ThoughtSource 大型语言模型推理数据的中心枢纽\n    - LLM多步推理中自我一致性失败的两个案例\n\n### 自我进化\n- 达尔文哥德尔机器：自我改进智能体的开放式进化\n- [Alpha Evolve](https:\u002F\u002Fdeepmind.google\u002Fdiscover\u002Fblog\u002Falphaevolve-a-gemini-powered-coding-agent-for-designing-advanced-algorithms\u002F)\n- 大型推理模型能否自我训练\n- 通过GRPO对多模态LLM推理进行无监督后训练\n- 超大规模下的进化策略\n- 在极少人类监督下引导LLM自我进化\n\n### RLHF\n- DeepMind\n  - 教导语言模型以经验证的引用支持答案\n  - Sparrow：通过目标性人类判断改进对话代理的一致性 :star:\n  - 统计拒绝采样提升偏好优化效果\n  - 面向语言建模的强化自训练（ReST）\n  - SLiC-HF：结合人类反馈的序列似然校准\n  - 序列似然校准改善条件语言生成\n  - 基于语言模型的奖励设计\n  - 最终答案RL：利用过程与结果反馈解决数学应用题\n  - 利用过程与结果反馈解决数学应用题\n  - 超越人类数据：扩展语言模型的问题解决自训练规模\n  - BOND：通过最佳N个蒸馏对齐大语言模型\n  - 在错误的合成数据上进行RL可将LLM数学推理效率提升8倍\n  - 生成式验证器：将奖励建模视为下一个标记预测\n  - 通过强化学习训练语言模型自我修正\n- OpenAI\n  - PPO：近端策略优化算法 :star:\n  - 面向人类偏好的人工智能深度强化学习\n  - 基于人类偏好微调语言模型\n  - 从人类反馈中学习总结\n  - InstructGPT：利用人类反馈训练语言模型遵循指令 :star:\n  - 奖励模型过度优化的规模法则 :star:\n  - 弱到强泛化：以弱监督激发强大能力 :star:\n  - PRM：让我们逐步验证 :star:\n  - 训练验证者解决数学应用题 [PRM的前置依赖]\n  - [OpenAI超级对齐博客](https:\u002F\u002Fopenai.com\u002Fblog\u002Fintroducing-superalignment)\n  - LLM批评家有助于发现LLM中的漏洞 :star:\n  - 证明者-验证者游戏提升LLM输出的可读性\n  - 基于规则的语言模型安全奖励\n  - 自我批评模型辅助人类评估者\n- Anthropic\n  - 通用语言助手作为对齐研究的实验室\n  - 衡量大规模语言模型可扩展监管的进展\n  - 红队测试语言模型以减少危害：方法、规模化行为及经验教训\n  - 使用来自人类反馈的强化学习训练有益且无害的助手 :star:\n  - 宪法AI：基于AI反馈实现无害性 :star:\n  - 使用人类偏好预训练语言模型\n  - 大型语言模型的道德自我修正能力\n  - 潜伏代理：训练能够通过安全训练持续存在的欺骗性LLM\n- AllenAI，RL4LM：强化学习（不）适用于自然语言处理基准测试\n- 改良方案\n  - RRHF：无需泪水地按响应排名使语言模型与人类反馈对齐\n  - 后见之明链使语言模型与反馈对齐\n  - AlpacaFarm：用于学习人类反馈方法的仿真框架\n  - RAFT：针对生成式基础模型对齐的奖励排序微调\n  - RLAIF：借助AI反馈扩大来自人类反馈的强化学习规模\n  - 在模拟人类社会中训练具有社会一致性的语言模型\n  - RAIN：您的语言模型无需微调即可自行对齐\n  - 用于评估对齐的生成式法官\n  - 透过偏好窥探：解开大规模语言模型对齐的反馈获取难题\n  - SALMON：遵循原则的奖励模型实现自我对齐\n  - 大型语言模型遗忘机制 :star:\n  - 对抗性偏好优化 :star:\n  - 面向人类对齐的偏好排序优化\n  - 道路漫漫：探究RLHF中的长度相关性\n  - 让语言模型从数据中隐式学习自我提升\n  - 奖励模型集成有助于缓解过度优化\n  - 从偏好中学习最优优势并误以为是奖励\n  - 超级反馈：用高质量反馈增强语言模型\n  - MOTIF：来自人工智能反馈的内在动机\n  - 通过优势模型和选择性复习稳定RLHF\n  - Shepherd：语言模型生成的批评家\n  - 学习生成比您的LLM更好的内容\n  - 细粒度人类反馈为语言模型训练提供更好奖励\n  - 从零开始、在最少人类监督下实现语言模型的原则驱动自我对齐\n  - 直接偏好优化：您的语言模型其实是奖励模型\n  - HIR：后见之智让语言模型更善于遵循指令\n  - Aligner：通过弱到强纠正实现高效对齐\n  - 基于最小化主义的人类反馈强化学习方法\n  - PANDA：偏好适应以增强LLM的特定领域能力\n  - 弱到强搜索：通过小型语言模型搜索对齐大型语言模型\n  - 弱到强外推加速对齐进程\n  - DPO是否优于PPO用于LLM对齐？一项全面研究\n  - 标记级直接偏好优化\n  - SimPO：无参考奖励的简单偏好优化\n  - AUTODETECT：迈向大型语言模型自动化弱点检测的统一框架\n  - 元奖励语言模型：以LLM作为元评判者实现自我改进的对齐\n  - HELPSTEER：面向STEERLM的多属性助益数据集\n  - 递归内省：教导语言模型代理如何自我改进\n  - 通过直接Q函数优化提升语言模型的多步推理能力\n  - 
DeepSeekMath：推动开放语言模型的数学推理极限\n  - GLoRe：何时、何地以及如何通过全局和局部优化提升LLM推理能力\n  - REFT：强化微调下的推理\n  - SCPO：自我一致性偏好优化\n  - MONA：采用非近视批准的近视优化可缓解多步奖励作弊\n  - 通过元强化微调优化测试时计算资源\n  - 预训练策略判别器是通用奖励模型\n- RL探究\n  - 理解RLHF对LLM泛化能力和多样性的影响\n  - 道路漫漫：探究RLHF中的长度相关性\n  - 奖励（不）一致性对RLHF的涓滴效应\n  - 来自人类反馈的强化学习的未解决问题和根本局限性\n  - 人类反馈并非金标准\n  - 基于数据课程对比训练大型语言模型\n  - 语言模型抵制对齐\n  - 探索大型语言模型偏好学习的统一视角：综述\n\n### 记忆\n> 摆脱仅从长度这一狭隘视角，重新审视模型记忆\n- A-MEM：面向LLM智能体的主体性记忆\n- MemInsight：面向LLM智能体的自主记忆增强\n- G-Memory：用于多智能体系统的层次化追踪记忆\n- 智能体工作流记忆\n- KBLAM：基于知识库增强的语言模型\n- MIRIX：基于LLM的多智能体系统记忆框架\n- M3-Agent：看见、倾听、记忆与推理——具备长期记忆的多模态智能体\n- MemTool：优化LLM智能体多轮对话中动态工具调用的短期记忆管理\n- Memory-R1：通过强化学习提升大型语言模型智能体的记忆管理与利用能力\n- 多重记忆系统以增强智能体的长期记忆\n- PerPilot：通过记忆与探索实现基于VLM的移动智能体个性化\n- 由粗到精的具身化记忆用于LLM智能体规划\n- 内在记忆智能体：通过结构化情境记忆构建异构多智能体LLM系统\n- Memp：探索智能体程序性记忆\n- RCR-Router：面向具有结构化记忆的多智能体LLM系统的高效角色感知上下文路由\n- A-MEM：面向LLM智能体的主体性记忆\n- MemoryBank：为大型语言模型增添长期记忆功能\n- 元认知复用：将重复性LLM推理转化为简洁行为\n- 面向语言智能体的认知架构\n- ReasoningBank：借助推理记忆推动智能体自我进化规模化\n- LIGHTMEM：轻量高效的记忆增强生成\n- 泰坦：在测试时学习记忆\n- 在测试时根据反馈学习推理\n- 带有测试时扩散机制的深度研究员\n- 一切相互关联：一场关于测试时记忆、注意偏向、保持与在线优化的旅程\n- 主体性记忆：为大型语言模型智能体学习统一的长短期记忆管理\n- MEMRL：基于情景记忆的运行时强化学习实现智能体自我进化\n\n### 多轮对话\n>- 近期我们也陷入多轮对话优化，发现了角色混乱、理解下降等很多问题 \n- LLM在多轮对话中迷失方向\n\n### 指令微调&对齐 (instruction_tunning)\n- 经典方案\n   - Flan: 微调后的语言模型是零样本学习者 :star:\n   - Flan-T5: 扩展指令微调语言模型\n   - ExT5: 面向迁移学习的极端多任务扩展\n   - Instruct-GPT: 通过人类反馈训练语言模型遵循指令 :star:\n   - T0: 多任务提示训练实现零样本任务泛化\n   - Natural Instructions: 通过自然语言众包指令实现跨任务泛化\n   - Tk-INSTRUCT: 超自然指令：基于1600多个NLP任务的声明式指令实现泛化\n   - ZeroPrompt: 将基于提示的预训练扩展到1000个任务，提升零样本泛化能力\n   - Unnatural Instructions: 几乎无需人工劳动的语言模型微调\n   - INSTRUCTEVAL：迈向指令微调大型语言模型的全面评估\n- SFT数据缩放定律\n    - LIMA: 对齐时“少即是多” :star:\n    - 或许仅需0.5%的数据：低训练数据指令微调的初步探索\n    - AlpaGasus: 用更少的数据训练更好的Alpaca\n    - InstructionGPT-4：用于微调MiniGPT-4的200条指令范式\n    - 指令挖掘：面向大型语言模型的高质量指令数据选择\n    - 带礼貌的Flamingo进行视觉指令微调\n    - 探索指令数据规模对大型语言模型的影响：基于真实场景的实证研究\n    - 大型语言模型学习数学推理的规模效应\n    - 当规模效应遇上LLM微调：数据、模型与微调方法的影响\n- 新对齐\u002F微调方案\n   - WizardLM: 赋能大型语言模型遵循复杂指令 :star:\n   - 自我指令化：引入早停准则以实现最小化指令微调\n   - 基于指令反译的自我对齐 :star:\n   - 混合专家模型与指令微调：大型语言模型的制胜组合\n   - Goat: 经过微调的LLaMA在算术任务上表现超越GPT-4\n   - PROMPT2MODEL: 从自然语言指令生成可部署模型\n   - OpinionGPT: 在指令微调的LLM中建模显式偏见\n   - 通过自我博弈和基于AI反馈的上下文学习提升语言模型谈判能力\n   - 通过元学习神经网络实现类人系统的泛化能力\n   - Magicoder: 只需要源代码\n   - 超越人类数据：利用语言模型进行问题解决的自训练扩展\n   - 生成式表征指令微调\n   - InsCL：一种数据高效的持续学习范式，用于指令微调大型语言模型\n   - 指令层级：训练LLM优先处理特权指令\n   - Magpie：通过使用无任何输入的对齐LLM生成提示，从零开始合成对齐数据\n- 指令数据生成\n  - APE: 大型语言模型是人类级别的提示工程师 :star:\n  - SELF-INSTRUCT: 使用自动生成的指令对齐语言模型 :star:\n  - iPrompt: 通过可解释的自动提示功能，用自然语言解释数据模式\n  - 翻转学习：猜指令！翻转学习让语言模型成为更强的零样本学习者\n  - 面向大型语言模型的公平性导向少量示例提示\n  - 指令归纳：从少量示例到自然语言任务描述\n  - SELF-QA：无监督知识引导的对齐\n  - GPT自我监督以成为更好的数据标注者\n  - Flan数据集的设计与方法\n  - 自消费生成模型走向疯狂\n  - InstructEval：系统性评估指令选择方法\n  - 用微调数据覆盖预训练偏置\n  - 利用大型语言模型改进文本嵌入\n  - MAGPIE：通过使用无任何输入的对齐LLM生成提示，从零开始合成对齐数据\n  - 通过10亿个人物角色扩展合成数据的创建\n  - 通过可扩展的全新问题合成释放LLM的推理能力\n  - 关于大型语言模型数据合成与增强的综述\n  - AgentInstruct：迈向基于代理流的生成式教学\n  - 揭示缺陷：探索大型语言模型中合成数据的不足及缓解策略\n- 如何降低通用能力损失\n  - 大型语言模型的能力如何受监督微调数据组成的影响\n  - 两阶段LLM微调：减少专业化，增加泛化能力\n- 微调经验\u002F实验报告\n    - BELLE: 探索指令数据规模对大型语言模型的影响：基于真实场景的实证研究\n    - Baize: Baize：一款开源聊天模型，采用参数高效微调技术，在自我对话数据上进行训练\n    - 面向中文指令数据的大型语言模型全参数微调与LoRA微调对比研究\n    - 探讨ChatGPT的内容排序能力：一项关于其与人类偏好一致性的初步研究\n    - 为中文用户打造更好的指令遵循语言模型：探究训练数据与评估的影响\n    - 面向企业的LLM微调：实用指南与建议\n- 其他\n   - 通过多任务微调实现跨语言泛化\n   - 通过自然语言众包指令实现跨任务泛化\n   - UNIFIEDSKG：利用文本到文本的语言模型实现结构化知识的统一与多任务应用\n   - PromptSource：一个集成的自然语言提示开发环境与资源库\n   - ROLELLM：大型语言模型的角色扮演能力的基准测试、激发与提升\n\n### LLM代理让模型使用工具 (llm_agent)\n- AGENT 
AI: 探索多模态交互的前沿\n- 基于大型语言模型的自主代理综述\n- 个人LLM代理：能力、效率与安全性的洞察与调查\n- 基于prompt通用方案\n  - ReAct: 在语言模型中协同推理与行动  :star:\n  - Self-ask: 衡量并缩小语言模型中的组合性差距  :star:\n  - MRKL系统：一种模块化、神经符号架构，结合了大型语言模型、外部知识源和离散推理\n  - PAL：程序辅助语言模型\n  - ART：大型语言模型的自动多步推理与工具使用\n  - ReWOO：为高效增强型语言模型解耦推理与观察  :star:\n  - 将检索与思维链推理交织用于知识密集型多步问题\n  - Chameleon：使用大型语言模型进行即插即用的组合式推理  :star:\n  - 忠实的思维链推理\n  - Reflexion：具有言语强化学习的语言代理  :star:\n  - Verify-and-Edit：一种知识增强型思维链框架\n  - RestGPT：将大型语言模型与现实世界的RESTful API连接起来\n  - ChatCoT：基于聊天的大型语言模型上的工具增强型思维链推理\n  - InstructTODS：面向端到端任务导向对话系统的大型语言模型\n  - TPTU：基于大型语言模型的AI代理的任务规划与工具使用\n  - ControlLLM：通过图搜索为语言模型添加工具\n  - Reflexion：一种具有动态记忆和自我反思的自主代理\n  - AutoAgents：一个自动代理生成框架\n  - GitAgent：通过工具扩展促进GitHub上的自主代理\n  - PreAct：在ReAct中预测未来以增强代理的规划能力\n  - TOOLLLM：助力大型语言模型掌握16000+真实世界API  :star:\n  - AnyTool：用于大规模API调用的自省式分层代理\n  - AIOS：LLM代理操作系统\n  - LLMCompiler：用于并行函数调用的LLM编译器\n  - Re-Invoke：用于零样本工具检索的工具调用重写\n- 基于微调通用方案\n  - TALM：工具增强型语言模型\n  - Toolformer：语言模型可以自我教授如何使用工具  :star:\n  - 使用基础模型进行工具学习\n  - Tool Maker：大型语言模型作为工具制造者\n  - TaskMatrix.AI：通过连接基础模型与数百万个API来完成任务\n  - AgentTuning：为LLM赋予通用代理能力\n  - SWIFTSAGE：一种具有快慢思维的生成式代理，适用于复杂的交互任务\n  - FireAct：迈向语言代理的微调\n  - Pangu-Agent：一种可微调的通用代理，具备结构化推理能力\n  - REST遇见REACT：多步推理LLM代理的自我改进\n  - 通过抽象链推理实现高效的工具使用\n  - Agent-FLAN：为大型语言模型设计有效的代理微调数据与方法\n  - AgentOhana：设计统一的数据与训练管道，以实现高效的代理学习\n  - Agent Lumos：面向开源语言代理的统一且模块化的训练\n  - ToolGen：通过生成实现统一的工具检索与调用\n  - 通过持续预训练扩展代理规模\n  - LIMI：少即是多的代理之道\n- 调用模型方案\n  - HuggingGPT：利用ChatGPT及其在HuggingFace中的伙伴解决AI任务\n  - Gorilla：与海量API连接的大语言模型  :star:\n  - OpenAGI：当LLM遇到领域专家时\n- 垂直领域\n  - 数据分析\n    - DS-Agent：通过案例推理赋能大型语言模型实现自动化数据科学\n    - InsightLens：在大型语言模型驱动的数据分析中从对话上下文中发现并探索洞见\n    - Data-Copilot：通过自主工作流连接数十亿条数据与人类\n    - InsightPilot演示：一个由LLM赋能的自动化数据探索系统\n    - TaskWeaver：一个以代码优先的代理框架\n    - 自动化社会科学：语言模型作为科学家和研究对象\n    - Data Interpreter：一款用于数据科学的LLM代理\n    - FDABench：针对异构数据上分析查询的数据代理基准测试\n  - 金融\n    - WeaverBird：利用大型语言模型、知识库和搜索引擎赋能金融决策\n    - FinGPT：开源的金融大型语言模型\n    - FinMem：一种性能增强的LLM交易代理，具有分层记忆和角色设计\n    - AlphaFin：使用检索增强的股票链框架对财务分析进行基准测试\n    - FinAgent：一种多模态基础代理，用于金融交易：工具增强、多样化且通用  :star:\n    - 大型语言模型能否击败华尔街？揭示AI在选股方面的潜力\n    - 利用基于LLM的多智能体框架提升金融市场异常检测能力\n    - TRADINGGPT：具有分层记忆和不同角色的多智能体系统，以提升金融交易表现\n    - FinRobot：一个开源的AI代理平台，用于基于大型语言模型的金融应用\n    - LLMFactor：通过提示提取盈利因子，实现可解释的股票走势预测\n    - Alpha-GPT：人机交互的Alpha挖掘，用于量化投资\n    - 推进异常检测：利用LLM对非语义金融数据进行编码\n    - TradExpert：混合专家LLM彻底革新交易方式\n    - FinVision：一个用于股市预测的多智能体框架\n    - 投资分析中的AI：LLM用于股票评级\n    - AAPM：基于大型语言模型代理的资产定价模型\n    - FinCon：一个合成的LLM多智能体系统，采用概念性言语强化，以提升金融决策\n    - TradingAgents：多智能体LLM金融交易框架\n    - 经LoRA适配的预训练LLM作为决策转换器，用于量化交易中的离线强化学习\n    - FinRL-DeepSeek：为交易代理注入LLM的敏感风险强化学习\n    - FinSphere：一款配备实时数据库定量工具的会话式股票分析代理\n    - FLAG-Trader：将LLM代理与基于梯度的强化学习融合，用于金融交易\n    - Ploutos：朝着可解释的股票走势预测迈进，利用金融大型语言模型\n    - HedgeAgents：一个注重平衡的多智能体金融交易系统\n    - TIMERAG：通过检索增强生成提升LLM时间序列预测\n    - CausalStock：深度端到端因果发现，用于新闻驱动的股票走势预测\n    - 基于LLM的金融投资策略能否在长期中跑赢市场？\n    - 利用基础模型推进金融工程：进展、应用与挑战\n    - AlphaAgents：基于大型语言模型的多智能体，用于构建股票投资组合\n  - 生物医疗\n    - GeneGPT：通过领域工具增强大型语言模型，以改善生物医学信息的获取\n    - ChemCrow：用化学工具增强大型语言模型\n    - 通过证据上的期望最大化推断，在医学问答中生成解释\n    - Agent Hospital：一个拥有可进化医疗代理的医院模拟器\n    - 通过提示工程将化学知识整合到大型语言模型中\n    - CHEMAGENT：大型语言模型中的自我更新库提升了化学推理能力\n  - web\u002Fmobile代理\n    - AutoWebGLM：启动并强化一个基于大型语言模型的网页导航代理\n    - 一个具备规划、长上下文理解及程序合成能力的现实世界Web代理\n    - Mind2Web：迈向通用的网络代理\n    - MiniWoB++：利用工作流引导的探索，在网页界面上进行强化学习\n    - WEBARENA：一个用于构建自主代理的真实网络环境\n    - AutoCrawler：一个渐进式理解网络的代理，用于生成网络爬虫\n    - WebLINX：通过多轮对话进行现实网站导航\n    - WebVoyager：利用大型多模态模型构建端到端网络代理\n    - 
CogAgent：一款用于GUI代理的视觉语言模型\n    - Mobile-Agent-v2：通过多智能体协作实现高效导航的移动设备操作助手\n    - WebCanvas：在网络环境中对网络代理进行基准测试\n    - GUI代理的黎明：Claude 3.5计算机使用的初步案例研究\n    - UI-TARS：率先使用原生代理实现自动化GUI交互\n    - 揭示语言模型代理在网页上顺序任务组合中的局限性\n    - WebSailor：以超人般的推理能力导航网络代理\n    - WebShaper：通过信息寻求的形式化进行数据合成的代理\n    - WebWatcher：突破视觉-语言深度研究代理的新边界\n    - OS-Genesis：通过逆向任务合成自动化GUI代理轨迹构建\n    - 可扩展的视频到数据集生成，用于跨平台移动代理\n    - Explorer：为多模态网络代理扩展探索驱动的网络轨迹合成\n    - 观看并学习：从在线视频中学习使用电脑\n    - Fara-7B：一款高效的用于计算机使用的代理模型\n  - 软件工程师\n    - 软件工程中的代理：调查、现状与展望\n    - ChatDev：用于软件开发的沟通型代理\n  - 研究代理\n    - PaSa：一款用于全面学术论文搜索的LLM代理\n    - ResearchAgent：利用大型语言模型在科学文献中迭代生成研究想法\n    - Agent Laboratory：将LLM代理用作研究助理\n    - 自动假设验证：通过代理式的顺序证伪\n    - 迈向AI联合科学家\n    - AI4Research：人工智能在科学研究中的综述\n    - Kosmos：一位用于自主发现的AI科学家\n    - 通过协作的大型语言模型代理进行知识驱动的自动特征提取\n  - 设计\n    - PosterGen：一款考虑美学的纸张到海报生成工具，由多智能体LLM提供支持\n    - Paper2Poster：迈向从科学论文出发的多模态海报自动化\n  - 其他\n    - WebShop：迈向与具身语言代理的可扩展现实世界互动\n    - ToolkenGPT：通过工具嵌入为冻结语言模型增添大量工具\n    - PointLLM：赋能大型语言模型理解点云数据\n    - 可解释的长篇法律问答：借助检索增强型大型语言模型\n    - CarExpert：利用大型语言模型实现车载会话式问答\n    - SCIAGENTS：通过多智能体智能图推理自动化科学发现\n- 评估\n  - 评估生成式搜索引擎的可验证性\n  - Auto-GPT用于在线决策：基准测试与补充意见\n  - API-Bank：一个用于工具增强型LLM的基准测试\n  - ToolLLM：助力大型语言模型掌握16000+真实世界API\n  - 大型语言模型对归属权的自动评估\n  - 大型语言模型在检索增强生成中的基准测试\n  - ARES：一个用于检索增强生成系统的自动化评估框架\n  - 代理作为法官：用代理评估代理\n- 多智能体\n  - 对AI代理框架中代理开发者实践的实证研究\n  - 生成式代理\n  - 让模型说密码：通过嵌入进行多智能体辩论\n  - 战争与和平（WarAgent）：基于大型语言模型的世界大战多智能体模拟\n  - 小型LLM是薄弱的工具学习者：一个多LLM代理\n  - 合并、集成与合作！大型语言模型时代下协作策略的调查\n  - 生成式代理：互动的人类行为模拟  :star:\n  - AgentVerse：促进多智能体协作并探索代理中的涌现行为\n  - System-1.x：学习如何用语言模型平衡快速与慢速规划\n  - 思考快与慢的代理：一种说话者-推理者架构\n  - 生成式代理对1000人的模拟\n  - 面向自主AI代理的高级推理与学习\n  - 多智能体设计：通过更好的提示和拓扑优化代理\n  - 多智能体语言模型中的涌现协调\n  - TUMIX：多智能体在测试时通过工具使用混合进行扩展\n  - 以零错误解决一百万步的LLM任务\n  - 多智能体系统中的潜在协作\n  - 智能体互联网：编织异质智能体网络，实现协作智能\n  - 多智能体协作：释放智能LLM代理的力量\n  - Magentic-One：一个通用的多智能体系统，用于解决复杂任务\n  - 组建你的团队：通过自回归图生成自动设计多智能体通信拓扑\n  - 任务型智能体协作\n    - METAAGENTS：通过协作模拟人类行为，实现基于LLM的任务导向协调\n    - CAMEL：用于探索大型语言模型社会“心智”的沟通型代理  :star:\n    - 探索大型语言模型在交流游戏中的应用：关于狼人杀的实证研究\n    - 用于软件开发的沟通型代理  :star:\n    - MedAgents：大型语言模型作为零样本医学推理的合作者\n    - METAGPT：用于多智能体协作框架的元编程\n  - 智能体路由\n     - 一个代理统治一切：迈向多智能体会话式AI\n     - 一个多智能体会话式推荐系统\n  - 基础模型路由&Ensemble\n     - 大型语言模型路由与基准数据集\n     - LLM-BLENDER：通过两两排名和生成式融合对大型语言模型进行集成\n     - RouteLLM：学习如何根据偏好数据路由LLM\n     - 更多代理就是你需要的一切\n     - 路由至专家：高效奖励引导的大型语言模型集成\n- 自主学习和探索进化\n  - AppAgent：多模态代理作为智能手机用户\n  - 调查-整合-利用：代理跨任务自我进化的通用策略\n  - LLMs在想象空间中：通过模拟试错进行工具学习\n  - 通过行动学习赋能大型语言模型代理\n  - 试错：基于探索的LLM代理轨迹优化\n  - OS-COPILOT：迈向具有自我改进能力的通用计算机代理\n  - LLAMA RIDER：激励大型语言模型探索开放世界\n  - 以过去为指导：利用回顾性学习完成Python代码\n  - AutoGuide：为大型语言模型代理自动生成和选择状态感知指南\n  - 关于大型语言模型自我进化的调查\n  - ExpeL：LLM代理是体验式学习者\n  - ReAct遇见ActRe：当语言代理享受训练数据自主权时\n  - 主动代理：将LLM代理从被动响应转向主动协助\n  - 从新手到专家：通过逐步强化学习优化LLM代理政策\n  - AGILE：一种新颖的LLM代理强化学习框架\n  - Agent Q：面向自主AI代理的高级推理与学习\n  - ARMAP：通过自动奖励建模和计划扩大自主代理规模\n  - Search-R1：通过强化学习训练LLM进行推理并利用搜索引擎\n  - 语言代理的持续学习：情境经验回放\n  - TaskCraft：自动生成代理任务\n- MCP\n  - SCALEMCP：为LLM代理提供动态且自动同步的模型上下文协议工具\n  - LIVEMCP-101：对启用MCP的代理在挑战性查询上的压力测试与诊断\n- 其他\n  - LLM+P：赋能大型语言模型最佳的规划能力\n  - 带参考的推理：无损加速大型语言模型\n  - RecallM：一种用于时间上下文理解和问答的架构\n  - LLaMA Rider：激励大型语言模型探索开放世界\n  - LLM无法规划，但在LLM模组框架中可以帮助规划\n  - Routine：一种用于企业中LLM代理系统的结构性规划框架\n- 自定义代理\n  - 从计算机使用中创建通用用户模型\n\n### RAG\n- 经典论文\n  - WebGPT：浏览器辅助的带人类反馈的问题回答 \n  - WebGLM：面向高效网络增强型问题回答系统的、结合人类偏好的方法 \n  - WebCPM：用于中文长文本问题回答的交互式网络搜索 :star:\n  - REPLUG：检索增强的黑盒语言模型 :star:\n  - RETA-LLM：一个检索增强型大语言模型工具包\n  - Atlas：基于检索增强语言模型的小样本学习\n  - RRAML：强化学习驱动的检索增强机器学习\n  - 
FRESHLLMS：利用搜索引擎增强刷新大型语言模型\n- 微调\n  - RLCF：通过对比反馈对齐大语言模型能力与信息检索上下文\n  - RA-DIT：检索增强的双指令微调\n  - CHAIN-OF-NOTE：提升检索增强语言模型的鲁棒性\n  - RAFT：将语言模型适配到特定领域的RAG场景\n  - 丰富的知识来源带来复杂的知识冲突：重新校准模型以反映相互矛盾的证据\n- 其他论文\n  - 探究检索增强下大语言模型的事实知识边界\n  - PDFTriage：针对长篇结构化文档的问题回答\n  - 穿越记忆迷宫：通过交互式阅读突破上下文限制 :star:\n  - 主动检索增强生成\n  - kNN-LM并不能改善开放式文本生成\n  - 检索增强语言模型能进行推理吗？检索器与语言模型之间的责任归属\n  - DORIS-MAE：使用多层级基于方面的查询进行科学文献检索\n  - 面向开放式文本生成的事实性增强语言模型  \n  - KwaiAgents：基于大语言模型的通用信息搜索代理系统\n  - 基于野外检索证据的复杂主张验证\n  - 大语言模型的检索增强生成综述\n  - ChatQA：构建GPT-4级别的对话式问答模型\n  - RAG与微调：流程、权衡及农业领域的案例研究\n  - 检索增强生成中大语言模型的基准测试\n  - T-RAG：来自大语言模型实战的经验教训\n  - ARAGOG：高级RAG输出评分\n  - ActiveRAG：通过主动学习揭示知识宝藏\n  - OpenResearcher：释放AI潜力加速科学研究\n  - [Contextual.ai-RAG2.0](https:\u002F\u002Fcontextual.ai\u002Fintroducing-rag2\u002F)\n  - Mindful-RAG：检索增强生成中的失效点研究\n  - Memory3：带有显式记忆的语言建模\n- 优化检索\n  - IAG：用于解答推理型问题的归纳增强生成框架\n  - HyDE：无需相关性标签的精准零样本密集检索\n  - PROMPTAGATOR：仅需8个示例即可实现的少样本密集检索\n  - 检索增强型大语言模型的查询重写\n  - Query2doc：利用大语言模型进行查询扩展 :star:\n  - 通过提示词引导大语言模型进行查询扩展 :star:\n  - [Anthropic情境检索](https:\u002F\u002Fwww.anthropic.com\u002Fnews\u002Fcontextual-retrieval)\n  - 基于知识金字塔的多层级查询\n  - 大语言模型中查询优化综述\n- 排序\n  - 使用大语言模型进行有效且高效的零样本排序的集合式方法\n  - RankVicuna：利用开源大语言模型进行零样本列表式文档重排\n  - 通过零样本问题生成改进段落检索\n  - 大语言模型在配对排序提示下是有效的文本排序器\n  - RankRAG：统一上下文排序与大语言模型中的检索增强生成\n  - 针对对话式搜索引擎的排序操控\n  - ChatGPT擅长搜索吗？探究大语言模型作为重排序代理的作用\n  - 开源大语言模型是强大的零样本查询似然模型，可用于文档排序\n  - T2Ranking：大规模中文段落排序基准\n  - 学习为检索增强生成过滤上下文\n- 传统搜索方案\n  - 提出正确的问题：利用强化学习进行主动问题重述\n  - 信息检索中的查询扩展技术综述\n  - 学习如何改写查询\n  - 管理Airbnb搜索中的多样性\n- 新型向量模型用于召回和排序\n  - 面向定制检索的增强嵌入\n  - BGE M3-Embedding：通过自我知识蒸馏实现的多语言、多功能、多粒度文本嵌入\n  - [网易为RAG设计的BCE嵌入技术报告](https:\u002F\u002Fzhuanlan.zhihu.com\u002Fp\u002F681370855)\n  - BGE Landmark Embedding：一种无需分块的嵌入方法，适用于检索增强型长上下文大语言模型\n  - D2LLM：分解并蒸馏的大语言模型，用于语义搜索\n  - Piccolo2：采用多任务混合损失训练的通用文本嵌入\n  - UniSearch：用统一生成式架构重新思考搜索系统\n  - UniDex：用统一语义建模重新思考搜索倒排索引\n- 优化推理结果\n  - Speculative RAG：通过草稿机制增强检索增强生成\n- 动态RAG（何时检索与检索计划）\n  - SELF-RAG：通过自我反思学习检索、生成和批判 :star:\n  - 自我知识引导的大语言模型检索增强\n  - Self-DC：何时检索、何时生成——针对组合型未知问题的自我分割与征服策略\n  - 小模型，大洞察：利用精简代理模型决定大语言模型何时以及检索什么内容\n  - Adaptive-RAG：通过问题复杂度学习适应检索增强型大语言模型\n  - REAPER：面向复杂RAG系统的基于推理的检索规划\n  - 何时检索：教导大语言模型有效利用信息检索\n  - PlanRAG：一种先规划后检索的生成式大语言模型增强生成方式，作为决策者\n  - ONEGEN：面向大语言模型的高效单次统一生成与检索\n  - Probing-RAG：自我探测引导语言模型进行选择性文档检索\n- 图RAG\n  - 图形检索增强生成综述\n  - 从局部到全局：面向查询聚焦摘要的图RAG方法\n  - GRAG：图检索增强生成\n  - GNN-RAG：用于大语言模型推理的图神经网络检索 \n  - THINK-ON-GRAPH：大语言模型在知识图谱上的深度且负责任的推理\n  - LightRAG：简单快速的检索增强生成\n  - StructRAG：通过推理时的混合信息结构化提升大语言模型的知识密集型推理能力\n- 多步RAG\n  - 搜索与大语言模型之间在信息检索中的协同作用\n  - 在知识密集型多步问题中穿插检索与思维链推理\n  - 通过迭代的检索-生成协同作用增强检索增强型大语言模型\n  - RAT：检索增强思维激发长跨度生成中的上下文感知推理\n  - IM-RAG：通过学习内部独白实现多轮检索增强生成\n  - 展示-搜索-预测：将检索与语言模型结合用于知识密集型自然语言处理\n  - 链中搜索：迈向准确、可信且可追溯的大语言模型，以应对知识密集型任务\n  - MindSearch 思·索：模仿人类思维激发深度AI搜索者\n  - RQ-RAG：学习为检索增强生成提炼查询\n  - AutoPRM：通过可控的问题分解自动化多步推理的过程性监督\n- 时间线RAG\n  - 展开头条：新闻检索与时间线摘要的迭代自问\n- 快速RAG\n  - MINIRAG：迈向极其简单的检索增强生成\n  - EasyRAG：面向自动化网络运营的高效检索增强生成框架\n- 深度研究\n  - 带有测试时扩散的深度研究员\n\n### 其他提示工程师(prompt_engineer)\n- PDL：一种声明式提示编程语言\n- 为什么提示设计很重要且有效：大型语言模型中提示搜索空间的复杂性分析\n- 提示作为科学探究\n- 使用前校准：提升语言模型的少样本性能\n- 上下文指令学习\n- 学习提升性能的代码编辑\n- 通过提示增强大型语言模型的心智理论能力\n- 基于生成知识的提示用于常识推理\n- 背诵增强的语言模型\n- kNN提示：无需校准的最近邻推理实现超越上下文的学习\n- EmotionPrompt：利用心理学通过情感刺激增强大型语言模型\n- 基于知识引导提示的因果感知概念提取\n- 大型语言模型作为优化器\n- 提示即程序：一种结构感知的高效编译时提示优化方法\n- 集合标记提示释放GPT-4V中的非凡视觉接地能力\n- RePrompt：自动提示编辑以将AI生成艺术细化为精确表达\n- MedPrompt：通用基础模型能否胜过专用微调？以医学为例\n- DSPy断言：用于自我改进语言模型流水线的计算约束\n- 提示作为自动优化的训练超参数：仅用10个黄金标签从头训练一流的信息检索模型\n- 
极端多标签分类的上下文学习\n- 优化多阶段语言模型程序的指令和演示\n- DSPy：将声明式语言模型调用编译成自我改进的流水线\n- 将大型语言模型与进化算法结合可产生强大的提示优化器\n- TextGrad：通过文本实现自动“微分”\n- 任务面学习：一种结构化的提示优化方法\n- LangGPT：从编程语言角度重新思考LLM的结构化可重用提示设计框架\n- PAS：数据高效的即插即用提示增强系统\n- 让我自由表达吗？格式限制对大型语言模型性能影响的研究\n- 从笔到提示：创意作家如何将AI融入写作实践\n- 提示格式是否会影响LLM性能？\n- 自动演示提示：利用生成输出作为演示以增强批量提示效果\n- PROMPTBREEDER：通过提示进化实现自指式的自我改进\n- 心理学增强的AI智能体\n- 注意力推理查询：一种系统化的方法来优化大型语言模型的指令遵循能力\n- 通过标准心理诊断实现确定性的AI智能体人格表达\n\n### 大模型图表理解和生成\n- 综述\n  - 表格遇见LLM：大型语言模型能否理解结构化表格数据？基准测试与实证研究\n  - 大型语言模型(LLMs)在表格数据上的应用：预测、生成与理解——综述\n  - 探索语言模型的数值推理能力：基于表格数据的全面分析\n- 提示\n  - 大型语言模型是多功能分解者：分解证据和问题以进行基于表格的推理\n  - Tab-CoT：零样本表格思维链\n  - 表格链：在推理链中演化表格以理解表格\n- 微调\n  - TableLlama：迈向开放的大型通用表格模型\n  - TableLLM：使LLM能够在实际办公场景中操作表格数据\n- 多模态\n  - MMC：通过大规模指令微调推进多模态图表理解\n  - ChartLlama：一款用于图表理解和生成的多模态LLM\n  - ChartAssisstant：通过图表转表格预训练和多任务指令微调打造的通用图表多模态语言模型\n  - ChartInstruct：用于图表理解和推理的指令微调\n  - ChartX & ChartVLM：一套功能强大且适用于复杂图表推理的基准和基础模型\n  - MATCHA：借助数学推理和图表去渲染技术增强视觉语言预训练\n  - UniChart：一款用于图表理解和推理的通用视觉-语言预训练模型\n  - TinyChart：通过视觉令牌合并和思维程序学习实现高效图表理解\n  - 表格作为文本或图像：评估LLM和MLLM的表格推理能力\n  - TableVQA-Bench：一个多表格领域的视觉问答基准测试\n  - TabPedia：通过概念协同迈向全面的视觉表格理解\n- 生成式UI\n  - 生成式UI：LLM是有效的UI生成器\n\n### LLM+KG\n- 综述类\n  - 统一大型语言模型和知识图谱：路线图\n  - 大型语言模型和知识图谱：机遇与挑战\n  - [知识图谱与大模型融合实践研究报告2023](https:\u002F\u002Fblog.csdn.net\u002Fm0_37586850\u002Farticle\u002Fdetails\u002F132463508)\n- KG用于大模型推理\n  - 利用大型语言模型从知识图谱中进行零样本自然语言生成\n  - MindMap：知识图谱提示激发大型语言模型中的思维图\n  - 知识增强的语言模型提示用于零样本知识图谱问答\n  - 利用逻辑编程和大型语言模型进行特定领域的知识图谱问答\n  - 携带你的KG：自监督程序合成实现零样本KGQA\n  - StructGPT：大型语言模型处理结构化数据的通用框架\n- 大模型用于KG构建\n  - 利用大型语言模型增强知识图谱构建\n  - LLM辅助的知识图谱工程：使用ChatGPT的实验\n  - 迭代式零样本LLM提示用于知识图谱构建\n  - 探索大型语言模型用于知识图谱补全\n\n### 类人智能体\n- HABITAT 3.0：人类、化身和机器人的共居地\n- 类人智能体：模拟类人生成式智能体的平台\n- Voyager：一个基于大型语言模型的开放式具身智能体\n- [塑造先进机器人技术的未来](https:\u002F\u002Fdeepmind.google\u002Fdiscover\u002Fblog\u002Fshaping-the-future-of-advanced-robotics\u002F)\n- AUTORT：用于大规模协调机器人智能体的具身基础模型\n- 基于事后轨迹草图的机器人任务泛化\n- ALFWORLD：对齐文本与具身环境以实现交互式学习\n- MINEDOJO：利用互联网规模的知识构建开放式具身智能体\n- LEGENT：具身智能体的开放平台\n\n### 预训练数据与预训练\n- DoReMi：优化数据混合加速语言模型预训练\n- The Pile：用于语言建模的800GB多样化文本数据集\n- CCNet：从网络爬取数据中提取高质量单语数据集\n- WanJuan：一个用于推动英中文大型模型发展的综合性多模态数据集\n- CLUECorpus2020：用于预训练语言模型的大规模中文语料库\n- 上下文内预训练：超越文档边界的语言建模\n- 数据混合法则：通过预测语言建模性能来优化数据混合\n- Zyda：一个用于开放语言建模的1.3T数据集\n- 熵定律：数据压缩与大语言模型性能背后的故事\n- 无处不在的数据：预训练数据集构建指南\n- 通过联合示例选择进行数据筛选进一步加速多模态学习\n- 利用困惑度相关性改进预训练数据\n- 当AI模型在递归生成的数据上训练时会崩溃\n\n### 领域模型SFT（domain_llms）\n- 金融\n  - BloombergGPT：面向金融领域的大型语言模型\n  - FinVis-GPT：用于金融图表分析的多模态大型语言模型\n  - CFGPT：基于大型语言模型的中文金融助手\n  - CFBenchmark：大型语言模型中文金融助手基准测试\n  - InvestLM：利用金融领域指令微调的大型投资语言模型\n  - BBT-Fin：全面构建中文金融领域预训练语言模型、语料库和基准测试\n  - PIXIU：面向金融领域的大型语言模型、指令数据及评估基准\n  - The FinBen：面向大型语言模型的综合性金融基准\n  - XuanYuan 2.0：一款拥有数千亿参数的大型中文金融聊天模型\n  - 向工业领域可信大型语言模型迈进\n  - 当AI遇见金融（StockAgent）：基于大型语言模型的股票交易，模拟真实环境\n  - 大型语言模型在金融领域的应用综述：进展、前景与挑战\n- 生物医疗\n  - MedGPT：从临床叙述中预测医学概念\n  - BioGPT：用于生物医学文本生成与挖掘的生成式预训练Transformer\n  - PubMed GPT：一种面向生物医学文本的领域专用大型语言模型 :star:\n  - ChatDoctor：基于LLaMA模型，结合医学领域知识微调的医疗聊天模型\n  - Med-PaLM：大型语言模型编码临床知识[V1,V2] :star:\n  - SMILE：通过ChatGPT实现从单轮到多轮的包容性语言扩展，用于心理健康支持\n  - Zhongjing：通过专家反馈和真实世界多轮对话提升大型语言模型的中文医疗能力\n- 其他\n  - Galactia：面向科学领域的大型语言模型\n  - 带有参数化知识引导的增强型大型语言模型\n  - ChatLaw：开源法律大型语言模型 :star:\n  - MediaGPT：面向中国媒体的大型语言模型\n  - KITLM：将领域特定知识整合到语言模型中以进行问答\n  - EcomGPT：使用任务链任务对大型语言模型进行指令微调，用于电子商务\n  - TableGPT：致力于将表格、自然语言和指令统一到一个GPT中\n  - LLEMMA：面向数学的开放语言模型\n  - MEDITAB：通过数据整合、丰富和精炼扩展医疗表格数据预测器\n  - PLLaMa：面向植物科学的开源大型语言模型\n  - 通过阅读理解调整大型语言模型\n\n### LLM超长文本处理 (long_input)\n- 位置编码、注意力机制优化\n  - Unlimiformer: 
长范围Transformer，支持无限长度输入\n  - 大型语言模型的并行上下文窗口\n  - [苏剑林, NBCE：使用朴素贝叶斯扩展LLM的Context处理长度](https:\u002F\u002Fspaces.ac.cn\u002Farchives\u002F9617) :star:\n  - 结构化提示：将上下文学习扩展到1,000个示例\n  - Vcc：通过优先处理重要token，将Transformer扩展到128K token或更多\n  - 使用RMT将Transformer扩展到1M token及以上\n  - 短训练，长测试：带有线性偏置的注意力机制实现输入长度外推 :star:\n  - 通过位置插值扩展大型语言模型的上下文窗口\n  - LongNet：将Transformer扩展到1,000,000,000个token\n  - https:\u002F\u002Fkaiokendev.github.io\u002Ftil#extending-context-to-8k\n  - [苏剑林, Transformer升级之路：10、RoPE是一种β进制编码](https:\u002F\u002Fspaces.ac.cn\u002Farchives\u002F9675) :star:\n  - [苏剑林, Transformer升级之路：11、将β进制位置进行到底](https:\u002F\u002Fspaces.ac.cn\u002Farchives\u002F9706)\n  - [苏剑林, Transformer升级之路：12、无限外推的ReRoPE？](https:\u002F\u002Fspaces.ac.cn\u002Farchives\u002F9708)\n  - [苏剑林, Transformer升级之路：15、Key归一化助力长度外推](https:\u002F\u002Fspaces.ac.cn\u002Farchives\u002F9859)\n  - 带有注意力汇流的高效流式语言模型\n  - 带有块状Transformer的环形注意力，用于近乎无限的上下文\n  - YaRN：大型语言模型的高效上下文窗口扩展\n  - LM-INFINITE：大型语言模型的简单在线长度泛化\n  - 带有注意力汇流的高效流式语言模型\n  - 原生稀疏注意力：与硬件对齐且可原生训练的稀疏注意力\n- 上文压缩排序方案\n  - 迷失在中间：语言模型如何利用长上下文 :star:\n  - LLMLingua：压缩提示以加速大型语言模型的推理\n  - LongLLMLingua：通过提示压缩加速和增强长上下文场景下的LLM :star:\n  - 使用要点token学习压缩提示\n  - 解锁LLM的上下文限制：基于自信息的内容过滤提升LLM的上下文效率\n  - LongAgent：通过多智能体协作将语言模型的上下文扩展到128k\n  - PCToolkit：大型语言模型的统一即插即用提示压缩工具包\n  - 长LLM是否是长上下文任务的必要条件？\n  - QwenLong-CPRS：通过动态上下文优化迈向\\infty-LLMs\n- 训练和模型架构方案\n  - 永远不要从零开始训练：对长序列模型的公平比较需要数据驱动的先验\n  - 从4K到400K飞跃：用激活信标扩展LLM的上下文\n  - 永远不会迷失在中间：通过强化问答任务改进大型语言模型\n  - 聚焦Transformer：用于上下文扩展的对比训练\n  - 基础模型的有效长上下文扩展\n  - 关于Transformer的长距离能力\n  - 高效长距离Transformer：你需要关注更多，但不一定要每一层都关注\n  - POSE：通过位置跳过式训练高效扩展LLM的上下文窗口\n  - LONGLORA：长上下文大型语言模型的高效微调\n  - LongAlign：大型语言模型长上下文对齐的配方\n  - 数据工程：将语言模型扩展到128K上下文\n  - MEGALODON：具有无限上下文长度的高效LLM预训练和推理\n  - 让你的LLM充分利用上下文\n  - 解开结：一种高效的长上下文预训练数据增强策略\n  - LIFT：通过长输入微调提升大型语言模型的长上下文理解能力\n  - REFRAG：重新思考基于RAG的解码\n- 效率优化\n  - 高效注意力：线性复杂度的注意力\n  - Transformers就是RNNs：具有线性注意力的快速自回归Transformer\n  - HyperAttention：近线性时间内的长上下文注意力\n  - FlashAttention：具有IO感知的快速且内存高效的精确注意力\n  - 文本越长，需求越大：推理时训练有助于长文本生成\n- 评估\n  - NOLIMA：超越字面匹配的长上下文评估\n  - 边际收益递减的幻觉：衡量LLM的长 horizon 执行能力\n- 原理分析\n  - 检索头机制性地解释了长上下文的事实性\n\n### LLM长文本生成（long_output）\n- Re3：通过递归重提和修订生成更长的故事\n- RECURRENTGPT：交互式生成（任意）长文本\n- DOC：通过详细的大纲控制提升长篇故事的连贯性\n- Weaver：面向创意写作的基础模型\n- 利用大型语言模型从零开始协助撰写维基百科式文章\n- 进入未知的未知领域：通过参与语言模型代理对话进行主动式人类学习\n- 不止于大纲：语言模型用于自适应长文写作的异构递归规划\n\n### NL2SQL\n- 大模型方案\n  - DIN-SQL：带自我修正的分解式上下文学习文本转SQL :star:\n  - C3：使用ChatGPT进行零样本文本转SQL :star:\n  - SQL-PALM：改进大型语言模型的文本转SQL适应能力\n  - BIRD LLM是否已经可以作为数据库接口？大规模数据库驱动的文本转SQL大基准测试 :star:\n  - 用于跨领域文本转SQL中自适应提示的一致性推理框架\n  - ChatDB：将数据库作为符号记忆增强LLM\n  - 对ChatGPT零样本文本转SQL能力的全面评估\n  - 使用结构和内容提示学习进行少样本文本转SQL翻译\n  - 工具辅助代理在真实场景下进行SQL检查和精炼\n  - Agentar-Scale-SQL：通过协调的测试时扩展推进文本转SQL\n- 领域知识密集型\n  - 基于公式化知识迈向知识密集型文本转SQL语义解析\n  - 通过模式扩展弥合文本转SQL解析中的泛化差距\n  - 努力提高文本转SQL模型对同义词替换的鲁棒性\n  - FinQA：金融数据上的数值推理数据集\n- 其他\n  - RESDSQL：将模式链接与骨架解析分离用于文本转SQL\n  - MIGA：用于会话式文本转SQL的统一多任务生成框架\n\n\n### 代码生成\n- 使用AlphaCodium进行代码生成：从提示工程到流程工程\n- Codeforces作为数字化时代学习编程的教育平台\n- 使用AlphaCode进行竞赛级代码生成\n- CODECHAIN：迈向模块化代码生成，通过代表子模块的自我修订链\n- AI程序员就在我们身边：重新思考编程语言语法以实现高效代码生成\n\n### 降低模型幻觉 (reliability)\n- 调查研究\n  - 大型语言模型及其幻觉的危险性\n  - 自然语言生成中的幻觉综述\n  - AI海洋中的塞壬之歌：大型语言模型中幻觉的综述\n  - 大型基础模型中的幻觉综述\n  - 大型语言模型中幻觉的综述：原理、分类、挑战与开放问题\n  - 校准后的语言模型必然会产生幻觉\n  - 为什么ChatGPT在提供真实答案方面有所不足？\n  - 语言模型为何会产生幻觉\n- 提示工程或微调\n  - R-Tuning：教会大型语言模型拒绝回答未知问题\n  - 通过提示使GPT-3更加可靠\n  - 随便问：一种简单的语言模型提示策略  :star:\n  - 关于提升语言模型推理能力的研究进展\n  - RefGPT：参考→由GPT生成并为GPT定制的真实对话\n  - 基于检索的反思：忠实的大语言模型推理\n  - 生成而非检索：大型语言模型是强大的上下文生成器\n  - 
大型语言模型难以学习长尾知识\n- 解码策略\n  - 相信你的证据：通过上下文感知解码减少幻觉  :star:\n  - 自我精炼：基于自我反馈的迭代优化  :star:\n  - 通过自然语言推理提升预训练语言模型的自洽性和性能\n  - 推理时干预：从语言模型中获取真实答案\n  - 使大型语言模型能够生成带有引用的文本\n  - 面向开放式文本生成的事实性增强语言模型\n  - 基于KL散度的温度采样\n  - KCTS：基于知识约束的树搜索解码，具备令牌级别的幻觉检测功能\n  - 对比解码提升大型语言模型的推理能力\n  - 对比解码：将开放式文本生成视为优化问题\n- 探测与检测\n  - 大型语言模型归属关系的自动评估\n  - QAFactEval：改进的基于问答的事实一致性评估，用于摘要生成\n  - 零资源大型语言模型幻觉预防\n  - LLM谎言：幻觉并非缺陷，而是作为对抗样本的特性\n  - 语言模型（大多）知道自己知道什么  :star:\n  - LM vs LM：通过交叉检验检测事实错误\n  - 语言模型是否知道自己在编造引用？\n  - SELFCHECKGPT：面向生成式大型语言模型的零资源黑盒幻觉检测\n  - 大型语言模型的自相矛盾的幻觉：评估、检测与缓解\n  - 开放式生成中的自洽性\n  - 通过多智能体辩论提升语言模型的事实性和推理能力\n  - Selective-LAMA：面向信心感知评估的语言模型选择性预测\n  - LLM能否表达其不确定性？LLM中置信度 elicitation 的实证评估\n- 审查与校准\n  - 真相计量器：与LLM合作对抗其幻觉\n  - RARR：利用语言模型研究和修订语言模型所说内容\n  - CRITIC：大型语言模型可通过工具交互式批评实现自我修正\n  - 使用RELM验证大型语言模型\n  - PURR：通过去噪语言模型的污染来高效编辑语言模型的幻觉\n  - 核实事实并重试：借助外部知识和自动化反馈改进大型语言模型\n  - 自适应变色龙还是顽固的树懒？揭示大型语言模型在知识冲突中的行为\n  - 啄木鸟：针对多模态大型语言模型的幻觉修正\n  - 零样本忠实的事实错误修正\n  - 大型语言模型目前仍无法自我修正推理\n  - 通过强化学习训练语言模型进行自我修正\n- 通过忏悔训练LLM保持诚实\n\n\n### 大模型评估（evaluation）\n- 事实性评估\n  - 可靠的LLMs：大型语言模型对齐性的调查与评估指南\n  - TrueTeacher：与大型语言模型一起学习事实一致性评估\n  - TRUE：重新评估事实一致性评估\n  - FACTSCORE：长篇文本生成中事实精确度的细粒度原子级评估\n  - KoLA：精心基准测试大型语言模型的世界知识\n  - 何时不应信任语言模型：探究参数化与非参数化记忆的有效性\n  - FACTOOL：生成式AI中的事实性检测——一个适用于多任务、多领域场景的工具增强框架\n  - 大型语言模型中的长篇事实性\n- 检测任务\n  - 从大型语言模型中检测预训练数据\n  - 可扩展地从（生产用）语言模型中提取训练数据\n  - 重新思考语言模型的基准测试与污染问题：使用改写样本\n- 通用评估\n  - G-EVAL：使用GPT-4进行NLG评估，具有更好的人类对齐性\n- 工具调用评估\n  - ToolRM：面向调用工具的大型语言模型的结果奖励模型\n- 代理评估\n  - SWE-Bench Pro：AI代理能否解决长期软件工程任务？\n  - ALE-Bench：面向长期目标驱动算法工程的基准测试\n  - FinSearchComp：迈向现实且专家级的金融搜索与推理评估\n  - 支持我们的AI霸主：重新设计数据系统以代理优先\n\n### 推理优化(inference)\n- 快速Transformer解码：一个写头就足够了\n- 通过推测解码实现Transformer的快速推理\n- GQA：从多头检查点训练通用多查询Transformer模型\n- 思维骨架：大型语言模型可以进行并行解码\n- SkipDecode：带批处理和缓存的自回归跳过解码，用于高效LLM推理\n- BatchPrompt：用更少的资源完成更多工作\n- 你只需缓存一次：面向语言模型的解码器-解码器架构\n- XGrammar：灵活高效的大型语言模型结构化生成引擎\n- 大型语言模型中的精确长度控制\n- Top-nσ：并非所有logits都是你需要的\n- 上下文缓存\n  - 提示缓存：模块化注意力复用，实现低延迟推理\n  - SGLang：高效执行结构化语言模型程序\n  - 通过嵌入相似性实现高效的提示缓存\n  - ChunkAttention：带有前缀感知KV缓存和两阶段分区的高效自注意力机制\n  - Hydragen：使用共享前缀实现高吞吐量的LLM推理\n  - 使用PagedAttention实现大型语言模型服务中的高效内存管理\n\n### 模型知识编辑黑科技(model_edit)\n- ROME：定位并编辑GPT中的事实关联\n- Transformer前馈层是键值记忆\n- MEMIT：大规模编辑Transformer中的记忆\n- MEND：大规模快速模型编辑\n- 编辑大型语言模型：问题、方法与机遇\n- 语言模型就像超级马里奥：可以免费吸收同源模型的能力\n- 基于自动机的约束条件用于语言模型解码\n- SGLang：高效执行结构化语言模型程序\n\n### 模型合并和剪枝(model_merge)\n- 融合就是全部：更便宜、更好的万亿参数大模型替代方案\n- DARE语言模型就像超级马里奥：免费吸收同源模型的能力\n- 使用任务算术编辑模型\n- TIES-Merging：解决模型合并时的干扰问题\n- LM-Cocktail：通过模型合并实现语言模型的稳健微调\n- SLICEGPT：通过删除行和列压缩大型语言模型\n- 基于贝叶斯优化的LLM预训练检查点合并\n- Arcee's MergeKit：用于合并大型语言模型的工具包\n\n### MOE\n- 训练稀疏翻译模型的技巧\n- ST-MoE：设计稳定且可迁移的稀疏专家模型\n- Switch Transformers：通过简单高效的稀疏性扩展到万亿参数模型\n- GLaM：利用专家混合高效扩展语言模型\n- GShard：通过条件计算和自动分片扩展巨型模型\n- 极其庞大的神经网络：稀疏门控专家混合层\n- DeepSpeed-MoE：推进专家混合的推理和训练，以支持下一代AI规模\n- 专家混合的稠密到稀疏门控机制\n- 利用专家混合高效进行大规模语言建模\n\n### 多模态\n- InstructBLIP：通过指令微调迈向通用视觉-语言模型\n- BLIP-2：利用冻结的图像编码器和大型语言模型启动语言-图像预训练\n- Visual ChatGPT：与视觉基础模型对话、绘图和编辑\n- LLava视觉指令微调\n- MiniGPT-4：借助先进大型语言模型增强视觉-语言理解能力\n- BLIVA：一款简单的多模态LLM，更好地处理富含文本的视觉问题\n- mPLUG-Owl：模块化赋能大型语言模型多模态能力\n- LVLM eHub：大型视觉语言模型的综合评估基准\n- Mirasol3B：一种多模态自回归模型，适用于时间对齐和上下文相关的模态\n- PaLM-E：具身多模态语言模型\n- TabLLM：利用大型语言模型进行少样本表格数据分类\n- AnyGPT：统一的多模态LLM，支持离散序列建模\n- [Sora技术报告](https:\u002F\u002Fopenai.com\u002Fresearch\u002Fvideo-generation-models-as-world-simulators)\n- 向通用计算机控制迈进：以《荒野大镖客2》为例的多模态智能体\n- OCR\n  - Vary：为大型视觉-语言模型扩展视觉词汇量\n  - 大型OCR模型：OCR缩放规律的实证研究\n  - 大型多模态模型中OCR的隐藏奥秘\n  - DeepSeek-OCR：上下文光学压缩\n- PreFLMR：扩展细粒度晚期交互式多模态检索器\n- 多模态基础模型中的多样本上下文学习\n- 
为文生图扩散模型添加条件控制\n- Ferret-UI：基于多模态LLM的接地移动UI理解\n- ShowUI：一个视觉-语言-行动模型，用于GUI视觉智能体\n- Flamingo：用于少样本学习的视觉语言模型\n- Segment Anything\n- Monkey：图像分辨率和文本标签对大型多模态模型至关重要\n- 从自然语言监督中学习可迁移的视觉模型\n- 一张图片胜过16×16个词：大规模图像识别的Transformer\n- InternVL1：扩展视觉基础模型并对其对齐，以应对通用视觉-语言任务\n- Vary：为大型视觉-语言模型扩展视觉词汇量\n- InternVL1.5：我们距离GPT-4V还有多远？通过开源套件缩小与商用多模态模型的差距\n- Qwen-VL：一款多功能视觉-语言模型，可用于理解、定位、文本阅读等任务\n- Qwen2-VL：在任何分辨率下提升视觉-语言模型的世界感知能力\n\n### 综述\n- 大型语言模型综述\n- 预训练、提示与预测：自然语言处理中提示方法的系统性综述 :star:\n- 自然语言处理的范式转变\n- 预训练模型：过去、现在与未来\n- 哪种语言模型架构和预训练目标最适合零样本泛化 :star:\n- 向大型语言模型推理迈进：综述\n- 利用语言模型提示进行推理：综述 :star:\n- 语言模型概述：最新进展与展望 :star:\n- 大型语言模型综述[6.29更新版]\n- 统一大型语言模型与知识图谱：路线图\n- 增强型语言模型：综述 :star:\n- 领域专业化是使大型语言模型具有颠覆性的关键：全面综述\n- 大型语言模型的挑战与应用\n- 基于大型语言模型的智能体的兴起与潜力：综述\n- 大型语言模型在信息检索中的应用：综述\n- AI对齐：全面综述\n- 知识与大型语言模型融合的趋势：方法、基准和应用的综述与分类\n- 用于时间序列和时空数据的大模型：综述与展望\n- 代码相关语言模型综述\n- 模型即服务(MaaS)：综述\n\n### 大模型能力探究\n- 上下文学习\n  - 更大的语言模型以不同方式执行上下文学习\n  - 上下文学习是如何工作的？理解其与传统监督学习差异的框架\n  - 为什么GPT能在上下文中学习？语言模型作为元优化器秘密地执行梯度下降 :star:\n  - 重新思考示范的作用：是什么让上下文学习奏效？ :star:\n  - 训练后的Transformer在上下文中学习线性模型\n  - 上下文学习会生成任务向量\n  - 大型语言模型中的函数向量\n  - 无需训练的学习：上下文学习的隐式动态\n  - 语言模型是单射的，因此可逆的\n- 涌现能力\n  - 人工通用智能的火花：GPT-4的早期实验\n  - 大型语言模型的涌现能力 :star:\n  - 语言模型表征空间和时间\n  - 大型语言模型的涌现能力是幻象吗？\n- 能力评估\n  - ChatGPT是否为通用自然语言处理任务求解器？\n  - 大型语言模型能否从相关性中推断出因果关系？\n  - 语言模型的整体评估\n  - 在实践中利用LLM的力量：关于ChatGPT及更广泛领域的综述\n  - 理解他人心理的能力可能已在大型语言模型中自发涌现\n  - 超越模仿游戏：量化并外推语言模型的能力\n  - 模型能自我解释吗？自然语言解释的反事实可模拟性\n  - 揭秘GPT用于代码生成的自我修复机制\n  - 在基于程序训练的语言模型中发现意义的证据\n  - 解释是否有助于校准黑箱模型\n  - 关于ChatGPT的鲁棒性：对抗性和分布外视角\n  - 语言习得：儿童和语言模型是否遵循相似的学习阶段？\n  - 语言主要是交流工具，而非思维工具\n- 领域能力\n  - GPT-4在医学挑战性问题上的能力\n  - 通用基础模型能否胜过特定用途的微调？以医学为例\n  - 人格向量：监控和控制语言模型中的角色特征\n- 可解释性\n  - 理解用于回归的LLM嵌入\n  - [当模型操纵流形时：计数任务的几何结构](https:\u002F\u002Ftransformer-circuits.pub\u002F2025\u002Flinebreaks\u002Findex.html)\n  - 权重稀疏的Transformer具有可解释的电路\n\n### Prompt调优范式\n- 无调优Prompt\n  - GPT2：语言模型是无监督的多任务学习者\n  - GPT3：语言模型是少样本学习者 :star:\n  - LAMA：语言模型是知识库吗？\n  - AutoPrompt：从语言模型中提取知识\n- 固定Prompt的LM调优\n  - T5：用统一的文本到文本Transformer探索迁移学习的极限\n  - PET-TC(a)：利用完形填空题进行少样本文本分类和自然语言推理 :star:\n  - PET-TC(b)：PETSGLUE 不只是规模的问题，小型语言模型也是少样本学习者\n  - GenPET：通过自然语言指令进行少样本文本生成\n  - LM-BFF：使预训练语言模型成为更好的少样本学习者 :star:\n  - ADEPT：改进并简化模式挖掘训练\n- 固定LM的Prompt调优\n  - Prefix-tuning：优化连续提示以用于生成\n  - Prompt调优：规模的力量实现参数高效的提示调优 :star:\n  - P-tuning：GPT也懂 :star:\n  - WARP：词级对抗性重编程\n- LM + Prompt调优\n  - P-tuning v2：提示调优在跨规模和任务上均可与微调相媲美\n  - PTR：用于文本分类的规则驱动提示调优\n  - PADA：基于示例的提示学习，用于即时适应未见领域\n- 固定LM的Adapter调优\n  - LORA：大型语言模型的低秩适配 :star:\n  - LST：阶梯式侧向调优，实现参数和内存高效的迁移学习\n  - NLP中的参数高效迁移学习\n  - 内在维度解释了语言模型微调的有效性\n  - DoRA：基于权重分解的低秩适配\n- 表征调优\n  - ReFT：语言模型的表征微调\n\n### 时间序列LLM\n- TimeGPT-1\n- 用于时间序列和时空数据的大模型：综述与展望\n- TIME-LLM：通过重编程大型语言模型进行时间序列预测\n- 大型语言模型是零样本时间序列预测器\n- TEMPO：基于Prompt的生成式预训练Transformer用于时间序列预测\n- 用于半导体制造中无监督故障检测的时间序列数据生成式预训练\n- Lag-Llama：迈向时间序列预测的基础模型\n- PromptCast：一种新的基于Prompt的学习范式，用于时间序列预测\n\n### 量化\n- AWQ：激活感知的权重量化，用于LLM压缩和加速\n- LLM-QAT：面向大型语言模型的数据无关量化感知训练\n- LLM.int8()：大规模Transformer的8位矩阵乘法\n- SmoothQuant：针对大型语言模型的准确且高效的后训练量化\n\n### 对抗性攻击\n- 基于好奇心的大型语言模型红队测试\n- 使用语言模型对语言模型进行红队测试\n- 探索、建立、利用：从零开始的语言模型红队测试\n\n### 对话模型\n- LaMDA：用于对话应用的语言模型\n- Sparrow：通过有针对性的人类判断改善对话代理的一致性 :star:\n- BlenderBot 3：一款持续学习并负责任地互动的已部署对话代理\n- 如何不要评估你的对话系统：对话响应生成无监督评估指标的实证研究\n- DialogStudio：迈向最丰富、最多样化的统一对话AI数据集\n- 通过扩大高质量指令性对话规模来增强聊天语言模型\n- DiagGPT：一个基于LLM的聊天机器人，具备自动话题管理功能，适用于任务导向型对话\n\n### 其他\n- 在测试集上预训练就是你所需要的 哈哈作者你是懂讽刺文学的\n- Learnware：小模型也能做大事\n- 生成式AI的经济潜力\n- 一位博士生对超大规模语言模型时代NLP研究的看法\n- 人们如何使用ChatGPT","# DecryptPrompt 快速上手指南\n\n## 环境准备\n- **系统要求**：支持 Linux、macOS 或 
Windows（推荐使用 Linux 或 macOS）\n- **前置依赖**：\n  - Python 3.8 或以上版本\n  - pip（Python 包管理工具）\n  - Git（用于克隆仓库）\n\n## 安装步骤\n1. 克隆 DecryptPrompt 项目到本地：\n   ```bash\n   git clone https:\u002F\u002Fgithub.com\u002FDSXiangLi\u002FDecryptPrompt.git\n   ```\n2. 进入项目目录：\n   ```bash\n   cd DecryptPrompt\n   ```\n3. 安装依赖包（可选，仅当仓库提供 requirements.txt 时需要）：\n   ```bash\n   pip install -r requirements.txt\n   ```\n\n## 基本使用\n1. 查看 README 中的资源链接，例如：\n   - [开源模型和评测榜单](开源模型.MD)\n   - [解密Prompt系列1. Tunning-Free Prompt](https:\u002F\u002Fcloud.tencent.com\u002Fdeveloper\u002Farticle\u002F2215545)\n2. 根据需求访问对应的链接，阅读相关文章或文档。\n3. 如需运行代码示例，请参考各文章中的具体实现。","某AI实验室的研究员正在开发一个基于大语言模型的智能客服系统，需要快速了解当前Prompt工程、微调方法、对齐策略以及AIGC应用的最新进展，以便优化模型性能并提升用户体验。\n\n### 没有 DecryptPrompt 时  \n- 需要手动搜索大量论文和博客，耗时且容易遗漏关键信息  \n- 缺乏系统的资源整理，难以快速定位到具体技术方向  \n- 对Prompt工程、微调方法等概念理解不深入，影响实际应用效果  \n- 无法及时跟进最新的AIGC应用场景和案例  \n\n### 使用 DecryptPrompt 后  \n- 快速获取全面的Prompt工程、微调、对齐等技术总结，节省大量时间  \n- 通过结构化的资源分类，精准找到所需技术资料和实践案例  \n- 系统学习Prompt设计与优化方法，提升模型表现和用户交互体验  \n- 及时掌握AIGC在客服领域的最新应用，指导产品迭代方向  \n\nDecryptPrompt 帮助研究人员高效整合前沿知识，显著提升了模型开发与应用落地的效率和质量。","https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002FDSXiangLi_DecryptPrompt_4a058669.png","DSXiangLi",null,"https:\u002F\u002Foss.gittoolsai.com\u002Favatars\u002FDSXiangLi_a1a130ed.jpg","风雨中的小七\r\n","北京","https:\u002F\u002Fwww.cnblogs.com\u002FgogoSandy\u002F","https:\u002F\u002Fgithub.com\u002FDSXiangLi",3385,318,"2026-04-05T09:37:28",1,"Linux, macOS, Windows","需要 NVIDIA GPU，显存 8GB+，CUDA 11.7+","16GB+",{"notes":90,"python":91,"dependencies":92},"建议使用 conda 管理环境，首次运行需下载约 5GB 模型文件","3.8+",[93,94,95,96,97,98,99,100,101],"torch>=2.0","transformers>=4.30","accelerate","datasets","sentencepiece","peft","evaluate","deepspeed","wandb",[13,26,15],[104,105,106,107,108,109,110,111,112,113,114,115,116,117],"demonstration","in-context-learning","prompt","few-shot-learning","zero-shot-learning","aigc","papers","prompt-tuning","instruction-tuning","chain-of-thought","chatgpt","llm","llm-agent","prompt-engineering","2026-03-27T02:49:30.150509","2026-04-06T06:45:47.523812",[121,126],{"id":122,"question_zh":123,"answer_zh":124,"source_url":125},5359,"作者是如何跟上LLM发展的？","作者每天固定留2-3个小时，1天看1篇，1年能看365篇。一般早上1-2小时读，晚上1-2小时写。焦虑是正常的，因为全世界的NLP工程师在写，而你只是在读，生产者和消费者不平衡会导致知识积压。","https:\u002F\u002Fgithub.com\u002FDSXiangLi\u002FDecryptPrompt\u002Fissues\u002F1",{"id":127,"question_zh":128,"answer_zh":129,"source_url":130},5360,"Llama Index 框架是否应该被加入？","有用户建议加入 Llama Index 框架，维护者回应会将其加入。","https:\u002F\u002Fgithub.com\u002FDSXiangLi\u002FDecryptPrompt\u002Fissues\u002F4",[]]