[{"data":1,"prerenderedAt":-1},["ShallowReactive",2],{"similar-onejune2018--Awesome-LLM-Eval":3,"tool-onejune2018--Awesome-LLM-Eval":64},[4,17,27,35,43,56],{"id":5,"name":6,"github_repo":7,"description_zh":8,"stars":9,"difficulty_score":10,"last_commit_at":11,"category_tags":12,"status":16},3808,"stable-diffusion-webui","AUTOMATIC1111\u002Fstable-diffusion-webui","stable-diffusion-webui 是一个基于 Gradio 构建的网页版操作界面，旨在让用户能够轻松地在本地运行和使用强大的 Stable Diffusion 图像生成模型。它解决了原始模型依赖命令行、操作门槛高且功能分散的痛点，将复杂的 AI 绘图流程整合进一个直观易用的图形化平台。\n\n无论是希望快速上手的普通创作者、需要精细控制画面细节的设计师，还是想要深入探索模型潜力的开发者与研究人员，都能从中获益。其核心亮点在于极高的功能丰富度：不仅支持文生图、图生图、局部重绘（Inpainting）和外绘（Outpainting）等基础模式，还独创了注意力机制调整、提示词矩阵、负向提示词以及“高清修复”等高级功能。此外，它内置了 GFPGAN 和 CodeFormer 等人脸修复工具，支持多种神经网络放大算法，并允许用户通过插件系统无限扩展能力。即使是显存有限的设备，stable-diffusion-webui 也提供了相应的优化选项，让高质量的 AI 艺术创作变得触手可及。",162132,3,"2026-04-05T11:01:52",[13,14,15],"开发框架","图像","Agent","ready",{"id":18,"name":19,"github_repo":20,"description_zh":21,"stars":22,"difficulty_score":23,"last_commit_at":24,"category_tags":25,"status":16},1381,"everything-claude-code","affaan-m\u002Feverything-claude-code","everything-claude-code 是一套专为 AI 编程助手（如 Claude Code、Codex、Cursor 等）打造的高性能优化系统。它不仅仅是一组配置文件，而是一个经过长期实战打磨的完整框架，旨在解决 AI 代理在实际开发中面临的效率低下、记忆丢失、安全隐患及缺乏持续学习能力等核心痛点。\n\n通过引入技能模块化、直觉增强、记忆持久化机制以及内置的安全扫描功能，everything-claude-code 能显著提升 AI 在复杂任务中的表现，帮助开发者构建更稳定、更智能的生产级 AI 代理。其独特的“研究优先”开发理念和针对 Token 消耗的优化策略，使得模型响应更快、成本更低，同时有效防御潜在的攻击向量。\n\n这套工具特别适合软件开发者、AI 研究人员以及希望深度定制 AI 工作流的技术团队使用。无论您是在构建大型代码库，还是需要 AI 协助进行安全审计与自动化测试，everything-claude-code 都能提供强大的底层支持。作为一个曾荣获 Anthropic 黑客大奖的开源项目，它融合了多语言支持与丰富的实战钩子（hooks），让 AI 真正成长为懂上",138956,2,"2026-04-05T11:33:21",[13,15,26],"语言模型",{"id":28,"name":29,"github_repo":30,"description_zh":31,"stars":32,"difficulty_score":23,"last_commit_at":33,"category_tags":34,"status":16},2271,"ComfyUI","Comfy-Org\u002FComfyUI","ComfyUI 是一款功能强大且高度模块化的视觉 AI 引擎，专为设计和执行复杂的 Stable Diffusion 图像生成流程而打造。它摒弃了传统的代码编写模式，采用直观的节点式流程图界面，让用户通过连接不同的功能模块即可构建个性化的生成管线。\n\n这一设计巧妙解决了高级 AI 绘图工作流配置复杂、灵活性不足的痛点。用户无需具备编程背景，也能自由组合模型、调整参数并实时预览效果，轻松实现从基础文生图到多步骤高清修复等各类复杂任务。ComfyUI 拥有极佳的兼容性，不仅支持 Windows、macOS 和 Linux 全平台，还广泛适配 NVIDIA、AMD、Intel 及苹果 Silicon 等多种硬件架构，并率先支持 SDXL、Flux、SD3 等前沿模型。\n\n无论是希望深入探索算法潜力的研究人员和开发者，还是追求极致创作自由度的设计师与资深 AI 绘画爱好者，ComfyUI 都能提供强大的支持。其独特的模块化架构允许社区不断扩展新功能，使其成为当前最灵活、生态最丰富的开源扩散模型工具之一，帮助用户将创意高效转化为现实。",107662,"2026-04-03T11:11:01",[13,14,15],{"id":36,"name":37,"github_repo":38,"description_zh":39,"stars":40,"difficulty_score":23,"last_commit_at":41,"category_tags":42,"status":16},3704,"NextChat","ChatGPTNextWeb\u002FNextChat","NextChat 是一款轻量且极速的 AI 助手，旨在为用户提供流畅、跨平台的大模型交互体验。它完美解决了用户在多设备间切换时难以保持对话连续性，以及面对众多 AI 模型不知如何统一管理的痛点。无论是日常办公、学习辅助还是创意激发，NextChat 都能让用户随时随地通过网页、iOS、Android、Windows、MacOS 或 Linux 端无缝接入智能服务。\n\n这款工具非常适合普通用户、学生、职场人士以及需要私有化部署的企业团队使用。对于开发者而言，它也提供了便捷的自托管方案，支持一键部署到 Vercel 或 Zeabur 等平台。\n\nNextChat 的核心亮点在于其广泛的模型兼容性，原生支持 Claude、DeepSeek、GPT-4 及 Gemini Pro 等主流大模型，让用户在一个界面即可自由切换不同 AI 能力。此外，它还率先支持 MCP（Model Context Protocol）协议，增强了上下文处理能力。针对企业用户，NextChat 提供专业版解决方案，具备品牌定制、细粒度权限控制、内部知识库整合及安全审计等功能，满足公司对数据隐私和个性化管理的高标准要求。",87618,"2026-04-05T07:20:52",[13,26],{"id":44,"name":45,"github_repo":46,"description_zh":47,"stars":48,"difficulty_score":23,"last_commit_at":49,"category_tags":50,"status":16},2268,"ML-For-Beginners","microsoft\u002FML-For-Beginners","ML-For-Beginners 是由微软推出的一套系统化机器学习入门课程，旨在帮助零基础用户轻松掌握经典机器学习知识。这套课程将学习路径规划为 12 周，包含 26 节精炼课程和 52 
<div align="center">
    <h1>Awesome LLM Eval</h1>
    <a href="https://awesome.re"><img src="https://awesome.re/badge.svg"/></a>
</div>

[English](README_EN.md) | [中文](README_CN.md)

Awesome-LLM-Eval: a curated list of tools, datasets/benchmarks, demos, leaderboards, papers, docs and models, mainly for the evaluation of Large Language Models and for exploring the boundaries and limits of Generative AI.

This is the official project of our survey: [Beyond Benchmark: LLMs Evaluation with an Anthropomorphic and Value-oriented Roadmap](https://arxiv.org/abs/2508.18646).

**NOTE:** Since we cannot update the arXiv paper in real time, please refer to this repo for the latest updates; the paper may be revised later. We also welcome pull requests and issues that help us improve this work.
Your contributions will be acknowledged in <a href="#acknowledgements">acknowledgements</a>.

If you find our survey useful, please kindly cite our paper:

```bibtex
@misc{wang2025llmevalroadmap,
      title={Beyond Benchmark: LLMs Evaluation with an Anthropomorphic and Value-oriented Roadmap},
      author={Jun Wang and Ninglun Gu and Kailai Zhang and Zijiao Zhang and Yelun Bao and Jin Yang and Xu Yin and Liwei Liu and Yihuan Liu and Pengyong Li and Gary G. Yen and Junchi Yan},
      year={2025},
      eprint={2508.18646},
      archivePrefix={arXiv},
      primaryClass={cs.AI},
      url={https://arxiv.org/abs/2508.18646},
}
```

![](https://oss.gittoolsai.com/images/onejune2018_Awesome-LLM-Eval_readme_31e6bafb0a3c.png)

## Table of Contents

- [News](#News)
- [Tools](#Tools)
- [Datasets / Benchmark](#Datasets-or-Benchmark)
  - [General](#General)
  - [Domain](#Domain)
  - [RAG-Evaluation](#RAG-Evaluation)
  - [Agent-Capabilities](#Agent-Capabilities)
  - [Coding-Capabilities](#Coding-Capabilities)
  - [Multimodal/Cross-modal](#Multimodal-Cross-modal)
  - [Long-Context](#Long-Context)
  - [Inference-Speed](#Inference-Speed)
  - [Quantization-and-Compression](#Quantization-and-Compression)
- [Demos](#Demos)
- [Leaderboards](#Leaderboards)
- [Papers](#Papers)
- [LLM-List](#LLM-List)
  - [Pre-trained LLM](#Pre-trained-LLM)
  - [Instruction Fine-tuned LLM](#Instruction-finetuned-LLM)
  - [Aligned LLM](#Aligned-LLM)
  - [Open LLM](#Open-LLM)
  - [Popular LLM](#Popular-LLM)
- [LLMOps](#LLMOps)
- [Frameworks for Training](#Frameworks-for-Training)
- [Courses](#Courses)
- [Others](#Others)
- [Other Awesome Lists](#Other-Awesome-Lists)
- [Licenses](#Licenses)
- [Citation](#Citation)

![](https://oss.gittoolsai.com/images/onejune2018_Awesome-LLM-Eval_readme_a285cd76374e.gif)

## News

- [2025/08/20] We added the [Anthropomorphic-Taxonomy](#Anthropomorphic-Taxonomy) section.
- [2024/04/26] We added the [Inference-Speed](#Inference-Speed) section.
- [2024/02/26] We added the [Coding-Evaluation](#Coding-Capabilities) section.
- [2024/02/08] We added the [lighteval](https://github.com/huggingface/lighteval) tool from Huggingface.
- [2024/01/15] We added [CRUXEval](https://arxiv.org/abs/2401.03065), [DebugBench](https://github.com/thunlp/DebugBench), [OpenFinData](https://opencompass.org.cn), and [LAiW](https://github.com/Dai-shen/LAiW).
- [2023/12/20] We added the [RAG-Evaluation](#RAG-Evaluation) section.
- [2023/11/15] We added [Instruction-Following-Evaluation](https://github.com/google-research/google-research/tree/master/instruction_following_eval) and [LLMBar](https://github.com/princeton-nlp/LLMBar) for evaluating the instruction-following capabilities of LLMs.
- [2023/10/20] We added [SuperCLUE-Agent](https://github.com/CLUEbenchmark/SuperCLUE-Agent) for LLM agent evaluation.
- [2023/09/25] We added [ColossalEval](https://github.com/hpcaitech/ColossalAI/tree/main/applications/ColossalEval) from Colossal-AI.
- [2023/09/22] We added the [LeaderboardFinder](#Leaderboards) section.
- [2023/09/20] We added [DeepEval](https://github.com/mr-gpt/deepeval), [FinEval](https://github.com/SUFE-AIFLM-Lab/FinEval), and [SuperCLUE-Safety](https://github.com/CLUEbenchmark/SuperCLUE-Safety) from CLUEbenchmark.
- [2023/09/18] We added [OpenCompass](https://github.com/InternLM/opencompass/tree/main) from Shanghai AI Lab.
- [2023/08/03] We added new Chinese LLMs: [Baichuan](https://github.com/baichuan-inc/Baichuan-13B) and [Qwen](https://github.com/QwenLM/Qwen-7B).
- [2023/06/28] We added [AlpacaEval](https://github.com/tatsu-lab/alpaca_eval) and multiple tools.
- [2023/04/26] We released the V0.1 evaluation list with multiple benchmarks.

## Anthropomorphic-Taxonomy

### Typical Intelligence Quotient (IQ)-General Intelligence evaluation benchmarks

| Name | Year | Task Type | Institution | Evaluation Focus | Datasets | URL |
| --- | --- | --- | --- | --- | --- | --- |
| MMLU-Pro | 2024 | Multi-Choice Knowledge | TIGER-AI-Lab | Subtle Reasoning, Less Noise | MMLU-Pro | [link](https://github.com/TIGER-AI-Lab/MMLU-Pro) |
| DyVal | 2024 | Dynamic Evaluation | Microsoft | Data Pollution, Complexity Control | DyVal | [link](https://github.com/microsoft/promptbench) |
| PertEval | 2024 | General | USTC | Knowledge Capacity | PertEval | [link](https://github.com/aigc-apps/PertEval) |
| LV-Eval | 2024 | Long Text QA | Infinigence-AI | Length Variability, Factuality | 11 Subsets | [link](https://github.com/infinigence/LVEval) |
| LLM-Uncertainty-Bench | 2024 | NLP Tasks | Tencent | Uncertainty Quantification | 5 NLP Tasks | [link](https://github.com/smartyfh/LLM-Uncertainty-Bench) |
| CommonGen-Eval | 2024 | Generation | AI2 | Common Sense | CommonGen-lite | [link](https://github.com/allenai/CommonGen-Eval) |
| MathBench | 2024 | Math | Shanghai AI Lab | Theoretical and Practical Problem-Solving | Various | [link](https://github.com/open-compass/MathBench) |
| AIME | 2024 | Math | MAA | American Invitational Mathematics Examination | Various | [link](https://www.kaggle.com/datasets/hemishveeraboina/aime-problem-set-1983-2024) |
| FrontierMath | 2024 | Math | Epoch AI | Original, Challenging Mathematics Problems | Various | [link](https://epochai.org/files/sample_question_transcripts.zip) |
| FELM | 2023 | Factuality | HKUST | Factuality | 847 Questions | [link](https://github.com/hkust-nlp/felm) |
| Just-Eval-Instruct | 2023 | General | AI2 Mosaic | Helpfulness, Explainability | Various | [link](https://github.com/Re-Align/just-eval) |
| MLAgentBench | 2023 | ML Research | snap-stanford | End-to-End ML Tasks | 15 Tasks | [link](https://github.com/snap-stanford/MLAgentBench) |
| UltraEval | 2023 | General | OpenBMB | Lightweight, Flexible, Fast | Various | [link](https://github.com/OpenBMB/UltraEval) |
| FMTI | 2023 | Transparency | Stanford | Model Transparency | 100 Metrics | [link](https://crfm.stanford.edu/fmti/) |
| BAMBOO | 2023 | Long Text | RUCAIBox | Long Text Modeling | 10 Datasets | [link](https://github.com/RUCAIBox/BAMBOO) |
| TRACE | 2023 | Continuous Learning | Fudan University | Continuous Learning | 8 Datasets | [link](https://arxiv.org/abs/2310.06762) |
| ColossalEval | 2023 | General | Colossal-AI | Unified Evaluation | Various | [link](https://github.com/hpcaitech/ColossalAI/tree/main/applications/ColossalEval) |
| LLMEval² | 2023 | General | AlibabaResearch | Wide and Deep Evaluation | 2,553 Samples | [link](https://github.com/AlibabaResearch/DAMO-ConvAI/tree/main/WideDeep) |
| BigBench | 2023 | General | Google | Knowledge, Language, Reasoning | Various | [link](https://github.com/google/BIG-bench) |
| LucyEval | 2023 | General | Besteasy | Maturity Assessment | Various | [link](http://lucyeval.besteasy.com/) |
| Zhujiu | 2023 | General | IACAS | Comprehensive Evaluation | 51 Tasks | [link](http://www.zhujiu-benchmark.com) |
| ChatEval | 2023 | Chat | THU-NLP | Human-like Evaluation | Various | [link](https://github.com/thunlp/ChatEval) |
| FlagEval | 2023 | General | BAAI | Subjective and Objective Scoring | Various | [link](https://flageval.baai.ac.cn/) |
| AlpacaEval | 2023 | General | tatsu-lab | Automatic Evaluation | Various | [link](https://tatsu-lab.github.io/alpaca_eval/) |
| GPQA | 2023 | General | NYU | Graduate-Level Google-Proof QA | Various | [link](https://github.com/idavidrein/gpqa) |
| MuSR | 2023 | Reasoning | Zayne Sprague | Narrative-Based Reasoning | 756 | [link](https://github.com/Zayne-sprague/MuSR) |
| FreshQA | 2023 | Knowledge | FreshLLMs | Current World Knowledge | 599 | [link](https://github.com/freshllms/freshqa) |
| AGIEval | 2023 | General | Microsoft | Human-Centric Reasoning | NA | [link](https://github.com/ruixiangcui/AGIEval) |
| SummEdits | 2023 | General | Salesforce | Inconsistency Detection | 6,348 | [link](https://github.com/salesforce/factualNLG) |
| ScienceQA | 2022 | Reasoning | UCLA | Science Reasoning | 21,208 | [link](https://github.com/lupantech/ScienceQA) |
| e-CARE | 2022 | Reasoning | HIT | Explainable Causality | 21,000 | [link](https://github.com/Waste-Wood/e-CARE) |
| BigBench Hard | 2022 | Reasoning | BigBench | Challenging Subtasks | 6,500 | [link](https://github.com/suzgunmirac/BIG-Bench-Hard) |
| PlanBench | 2022 | Reasoning | ASU | Action Planning | 11,113 | [link](https://github.com/karthikv792/LLMs-Planning) |
| MGSM | 2022 | Math | Google | Grade-School Math Problems in 10 Languages | Various | [link](https://github.com/google-research/url-nlp/tree/main/mgsm) |
| MATH | 2021 | Math | UC Berkeley | Mathematical Problem Solving | Various | [link](https://github.com/hendrycks/math/) |
| GSM8K | 2021 | Math | OpenAI | Diverse Grade-School Math Word Problems | Various | [link](https://github.com/openai/grade-school-math) |
| SVAMP | 2021 | Math | Microsoft | Arithmetic Reasoning | 1,000 | [link](https://github.com/arkilpatel/SVAMP) |
| SpartQA | 2021 | Reasoning | MSU | Textual Spatial QA | 510 | [link](https://github.com/HLR/SpartQA-baselines) |
| MLSUM | 2020 | General | Thomas Scialom | News Summarization | 535,062 | [link](https://github.com/ThomasScialom/MLSUM) |
| Natural Questions | 2019 | Language, Reasoning | Google | Search-Based QA | 300,000 | [link](https://github.com/google-research-datasets/natural-questions) |
| ANLI | 2019 | Language, Reasoning | Facebook AI | Adversarial Reasoning | 169,265 | [link](https://github.com/facebookresearch/anli) |
| BoolQ | 2019 | Language, Reasoning | Google | Binary QA | 16,000 | [link](https://github.com/google-research-datasets/boolean-questions) |
| SuperGLUE | 2019 | Language, Reasoning | NYU | Advanced GLUE Tasks | NA | [link](https://github.com/nyu-mll/jiant) |
| DROP | 2019 | Language, Reasoning | UCI NLP | Paragraph-Level Reasoning | 96,000 | [link](https://github.com/EleutherAI/lm-evaluation-harness) |
| HellaSwag | 2019 | Language, Reasoning | AI2 | Commonsense Inference | 59,950 | [link](https://github.com/rowanz/hellaswag) |
| Winogrande | 2019 | Language, Reasoning | AI2 | Pronoun Disambiguation | 44,000 | [link](https://github.com/allenai/winogrande) |
| PIQA | 2019 | Language, Reasoning | AI2 | Physical Interaction QA | 18,000 | [link](https://github.com/ybisk/ybisk.github.io/tree/master/piqa) |
| HotpotQA | 2018 | Language, Reasoning | HotpotQA | Explainable QA | 113,000 | [link](https://github.com/hotpotqa/hotpot) |
| GLUE | 2018 | Language, Reasoning | NYU | Foundational NLU Tasks | NA | [link](https://github.com/nyu-mll/GLUE-baselines) |
| OpenBookQA | 2018 | Language, Reasoning | AI2 | Open Book Exams | 12,000 | [link](https://github.com/allenai/OpenBookQA) |
| SQuAD2.0 | 2018 | Language, Reasoning | Stanford University | Unanswerable Questions | 150,000 | [link](https://rajpurkar.github.io/SQuAD-explorer/) |
| ARC | 2018 | Language, Reasoning | AI2 | AI2 Reasoning Challenge | 7,787 | [link](https://github.com/allenai/aristo-leaderboard) |
| SWAG | 2018 | Language, Reasoning | AI2 | Adversarial Commonsense | 113,000 | [link](https://github.com/rowanz/swagaf) |
| CommonsenseQA | 2018 | Language, Reasoning | AI2 | Commonsense Reasoning | 12,102 | [link](https://github.com/jonathanherzig/commonsenseqa) |
| RACE | 2017 | Language, Reasoning | CMU | Exam-Style QA | 100,000 | [link](https://www.cs.cmu.edu/~glai1/data/race/) |
| SciQ | 2017 | Language, Reasoning | AI2 | Crowd-Sourced Science | 13,700 | [link](https://huggingface.co/datasets/allenai/sciq) |
| TriviaQA | 2017 | Language, Reasoning | AI2 | Distant Supervision | 650,000 | [link](https://github.com/mandarjoshi90/triviaqa) |
| MultiNLI | 2017 | Language, Reasoning | NYU | Cross-Genre Entailment | 433,000 | [link](https://github.com/nyu-mll/multiNLI) |
| SQuAD | 2016 | Language, Reasoning | Stanford University | Wikipedia-Based QA | 100,000 | [link](https://rajpurkar.github.io/SQuAD-explorer/) |
| LAMBADA | 2016 | Language, Reasoning | CIMEC | Discourse Context | 12,684 | [link](https://huggingface.co/datasets/cimec/lambada) |
| MS MARCO | 2016 | Language, Reasoning | Microsoft | Search-Based QA | 1,112,939 | [link](https://microsoft.github.io/msmarco/) |
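
Most of the benchmarks in the table above are distributed through the Hugging Face Hub, so scoring a model usually starts with a few lines of `datasets` code. Below is a minimal sketch for MMLU-Pro, assuming the `TIGER-Lab/MMLU-Pro` dataset id and its `question`/`options`/`answer_index` fields (check the dataset card, since split and field names vary across benchmarks); the `predict` function is a hypothetical stand-in for a real model call.

```python
# Minimal scoring sketch for one of the IQ benchmarks above.
# Assumes the Hugging Face `datasets` library and the dataset id
# "TIGER-Lab/MMLU-Pro"; field/split names may differ per benchmark version.
from datasets import load_dataset

ds = load_dataset("TIGER-Lab/MMLU-Pro", split="test")

def predict(question: str, options: list[str]) -> int:
    """Hypothetical stand-in: replace with an LLM call returning an option index."""
    return 0

correct = sum(
    predict(row["question"], row["options"]) == row["answer_index"] for row in ds
)
print(f"accuracy = {correct / len(ds):.3f}")
```
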
### Typical Professional Quotient (PQ)-Professional Expertise evaluation benchmarks

| Domain | Name | Institution | Scope of Tasks | Unique Contributions | URL |
| --- | --- | --- | --- | --- | --- |
| | BLURB | Mindrank AI | Six diverse NLP tasks, thirteen datasets | A macro-average score across all tasks | [link](https://microsoft.github.io/BLURB/index.html) |
| | Seismometer | Epic | Using local data and workflows | Patient demographics, clinical interventions, and outcomes | [link](https://github.com/epic-open-source/seismometer) |
| Healthcare | Medbench | OpenMEDLab | Emphasizes scientific rigor and fairness | 40,041 questions from medical exams and reports | [link](https://github.com/open-compass/opencompass/tree/main/opencompass/datasets/medbench/) |
| | GenMedicalEval | SJTU | 16 majors, 3 training stages, 6 clinical scenarios | Open-ended metrics and automated assessment models | [link](https://github.com/MediaBrain-SJTU/GenMedicalEval) |
| | PsyEval | SJTU | Six subtasks covering three dimensions | Customized benchmark for mental health LLMs | [link](https://arxiv.org/abs/2311.09189) |
| | Fin-Eva | Ant Group | Wealth management, insurance, investment research | Both industrial and academic financial evaluations | [link](https://github.com/alipay/financial_evaluation_dataset) |
| Finance | FinEval | SUFE-AIFLM-Lab | Multiple-choice QA on finance, economics, accounting | Focuses on high-quality evaluation questions | [link](https://github.com/SUFE-AIFLM-Lab/FinEval) |
| | OpenFinData | Shanghai AI Lab | Multi-scenario financial tasks | First comprehensive finance evaluation dataset | [link](https://opencompass.org.cn) |
| | FinBen | FinAI | 35 datasets across 23 financial tasks | Inductive reasoning, quantitative reasoning | [link](https://github.com/The-FinAI/PIXIU) |
| | LAiW | Sichuan University | 13 fundamental legal NLP tasks | Divides legal NLP capabilities into three major abilities | [link](https://github.com/Dai-shen/LAiW) |
| Legal | LawBench | Nanjing University | Legal entity recognition, reading comprehension | Real-world tasks, "abstention rate" metric | [link](https://github.com/open-compass/lawbench) |
| | LegalBench | Stanford University | 162 tasks covering six types of legal reasoning | Enables interdisciplinary conversations | [link](https://github.com/HazyResearch/legalbench/) |
| | LexEval | Tsinghua University | Legal cognitive abilities to organize different tasks | Larger legal evaluation dataset, examining ethical issues | [link](https://github.com/CSHaitao/LexEval) |
| | SPEC5G | Purdue University | Security-related text classification and summarization | 5G protocol analysis automation | [link](https://github.com/Imtiazkarimik23/SPEC5G) |
| Telecom | TeleQnA | Huawei (Paris) | General telecom inquiries | Proficiency in telecom-related questions | [link](https://github.com/netop-team/TeleQnA) |
| | OpsEval | Tsinghua University | Wired network ops, 5G, database ops | Focus on AIOps, evaluates proficiency | [link](https://arxiv.org/abs/2310.07637) |
| | TelBench | SK Telecom | Math modeling, open-ended QA, code generation | Holistic evaluation in telecom | [link](https://arxiv.org/abs/2407.09424v1) |
| | TelecomGPT | UAE | Telecom math modeling, open QnA, and code tasks | Holistic evaluation in telecom | [link](https://arxiv.org/abs/2407.09424v1) |
| | Linguistic | Queen's University | Multiple language-centric tasks | Zero-shot evaluation | [link](https://arxiv.org/abs/2402.15818) |
| | TelcoLM | Orange | Multiple-choice questionnaires | Domain-specific data (800M tokens, 80K instructions) | [link](https://arxiv.org/abs/2412.15891) |
| | ORAN-Bench-13K | GMU | Multiple-choice questions | Open Radio Access Networks (O-RAN) | [link](https://github.com/prnshv/ORAN-Bench-13K) |
| | Open-Telco Benchmarks | GSMA | Multiple language-centric tasks | Zero-shot evaluation | [link](https://www.gsma.com/get-involved/gsma-foundry/gsma-open-telco-llm-benchmarks/) |
| | FullStackBench | ByteDance | Code writing, debugging, code review | Featuring the most recent Stack Overflow QA | [link](https://github.com/bytedance/FullStackBench) |
| Coding | StackEval | Prosus AI | 11 real-world scenarios, 16 languages | Evaluation across diverse, practical coding environments | [link](https://github.com/ProsusAI/stack-eval) |
| | CodeBenchGen | Various Institutions | Execution-based code generation tasks | Benchmarks scaling with size and complexity | [link](https://arxiv.org/abs/2404.00566) |
| | HumanEval | OpenAI | Rigorous testing | Stricter protocol for assessing correctness of generated code | [link](https://arxiv.org/abs/2107.03374) |
| | APPS | UC Berkeley | Coding challenges from competitive platforms | Checking problem-solving of generated code on test cases | [link](https://github.com/hendrycks/apps) |
| | MBPP | Google Research | Programming problems sourced from various origins | Diverse programming tasks | [link](https://github.com/google-research/google-research/tree/master/mbpp) |
| | ClassEval | Fudan University | Class-level code generation | Manually crafted, object-oriented programming concepts | [link](https://github.com/FudanSELab/ClassEval) |
| | CoderEval | Peking University | Pragmatic code generation | Proficiency in generating functional code patches for described issues | [link](https://github.com/CoderEval/CoderEval) |
| | MultiPL-E | Northeastern University | Neural code generation | Benchmarking neural code generation models | [link](https://github.com/nuprl/MultiPL-E) |
| | CodeXGLUE | Microsoft | Code intelligence | Wide task coverage: code-code, text-code, code-text, and text-text | [link](https://github.com/microsoft/CodeXGLUE) |
| | EvoCodeBench | Peking University | Evolving code generation benchmark | Aligned with real-world code repositories, evolving over time | [link](https://github.com/seketeam/EvoCodeBench) |
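
The coding rows above (HumanEval, APPS, MBPP, and their successors) typically report pass@k: the probability that at least one of k sampled completions passes the unit tests. The unbiased estimator published in the HumanEval paper (arXiv 2107.03374) is worth keeping on hand; the sketch below implements that formula, with the sample counts in the example chosen purely for illustration.

```python
# Unbiased pass@k estimator from the HumanEval paper: given n sampled
# completions of which c pass the tests, pass@k = 1 - C(n-c, k) / C(n, k),
# computed as a numerically stable product.
import numpy as np

def pass_at_k(n: int, c: int, k: int) -> float:
    """Probability that at least one of k completions (drawn without replacement) passes."""
    if n - c < k:
        return 1.0  # every size-k subset must contain a passing sample
    return float(1.0 - np.prod(1.0 - k / np.arange(n - c + 1, n + 1)))

# Illustrative numbers: 200 samples per task, 37 pass -> pass@1 = 37/200 = 0.185.
print(pass_at_k(n=200, c=37, k=1))
print(pass_at_k(n=200, c=37, k=10))
```
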
### Typical Emotional Quotient (EQ)-Alignment Ability evaluation benchmarks

| Name | Year | Task Type | Institution | Category | Datasets | URL |
| --- | --- | --- | --- | --- | --- | --- |
| DiffAware | 2025 | Bias | Stanford | General Bias | 8 datasets | [link](https://github.com/Angelina-Wang/difference_awareness) |
| CASE-Bench | 2025 | Safety | Cambridge | Context-Aware Safety | CASE-Bench | [link](https://github.com/BriansIDP/CASEBench) |
| Fairness | 2025 | Fairness | PSU | Distributive Fairness | - | - |
| HarmBench | 2024 | Safety | UIUC | Adversarial Behaviors | 510 | [link](https://github.com/centerforaisafety/HarmBench) |
| SimpleQA | 2024 | Safety | OpenAI | Factuality | 4,326 | [link](https://github.com/openai/simple-evals) |
| AgentHarm | 2024 | Safety | BEIS | Malicious Agent Tasks | 110 | [link](https://github.com/UKGovernmentBEIS/inspect_evals) |
| StrongReject | 2024 | Safety | dsbowen | Attack Resistance | n/a | [link](https://github.com/dsbowen/strong_reject) |
| LLMBar | 2024 | Instruction | Princeton | Instruction Following | 419 Instances | [link](https://github.com/princeton-nlp/LLMBar) |
| AIR-Bench | 2024 | Safety | Stanford | Regulatory Alignment | 5,694 | [link](https://github.com/stanford-crfm/air-bench-2024) |
| TrustLLM | 2024 | General | TrustLLM | Trustworthiness | 30+ | [link](https://trustllmbenchmark.github.io/TrustLLM-Website/) |
| RewardBench | 2024 | Alignment | AI2 | Human Preference | RewardBench | [link](https://github.com/allenai/reward-bench) |
| EQ-Bench | 2024 | Emotion | Paech | Emotional Intelligence | 171 Questions | [link](https://github.com/EQ-bench/EQ-Bench) |
| Forbidden | 2023 | Safety | CISPA | Jailbreak Detection | 15,140 | [link](https://github.com/verazuo/jailbreak_llms) |
| MaliciousInstruct | 2023 | Safety | Princeton | Malicious Intentions | 100 | [link](https://github.com/Princeton-SysML/Jailbreak_LLM) |
| SycophancyEval | 2023 | Safety | Anthropic | Opinion Alignment | n/a | [link](https://github.com/meg-tong/sycophancy-eval) |
| DecodingTrust | 2023 | Safety | UIUC | Trustworthiness | 243,877 | [link](https://github.com/AI-secure/DecodingTrust) |
| AdvBench | 2023 | Safety | CMU | Adversarial Attacks | 1,000 | [link](https://github.com/llm-attacks/llm-attacks) |
| XSTest | 2023 | Safety | Bocconi | Safety Overreach | 450 | [link](https://github.com/paul-rottger/exaggerated-safety) |
| OpinionQA | 2023 | Safety | tatsu-lab | Demographic Alignment | 1,498 | [link](https://github.com/tatsu-lab/opinions_qa) |
| SafetyBench | 2023 | Safety | THU | Content Safety | 11,435 | [link](https://github.com/thu-coai/SafetyBench) |
| HarmfulQA | 2023 | Safety | declare-lab | Harmful Topics | 1,960 | [link](https://github.com/declare-lab/red-instruct) |
| QHarm | 2023 | Safety | vinid | Safety Sampling | 100 | [link](https://github.com/vinid/safety-tuned-llamas) |
| BeaverTails | 2023 | Safety | PKU | Red Teaming | 334,000 | [link](https://github.com/PKU-Alignment/beavertails) |
| DoNotAnswer | 2023 | Safety | Libr-AI | Safety Mechanisms | 939 | [link](https://github.com/Libr-AI/do-not-answer) |
| AlignBench | 2023 | Alignment | THUDM | Alignment, Reliability | Various | [link](https://github.com/THUDM/AlignBench) |
| IFEval | 2023 | Instruction | Google | Instruction Following | 500 Prompts | [link](https://github.com/google-research/google-research/tree/master/instruction_following_eval) |
| ToxiGen | 2022 | Safety | Microsoft | Toxicity Detection | 274,000 | [link](https://github.com/microsoft/TOXIGEN) |
| HHH | 2022 | Safety | Anthropic | Human Preferences | 44,849 | [link](https://github.com/anthropics/hh-rlhf) |
| RedTeam | 2022 | Safety | Anthropic | Red Teaming | 38,921 | [link](https://github.com/anthropics/hh-rlhf) |
| BOLD | 2021 | Bias | Amazon | Bias in Generation | 23,679 | [link](https://github.com/amazon-science/bold) |
| BBQ | 2021 | Bias | NYU | Social Bias | 58,492 | [link](https://github.com/nyu-mll/BBQ) |
| StereoSet | 2020 | Bias | McGill | Stereotype Detection | 4,229 | [link](https://github.com/moinnadeem/StereoSet) |
| ETHICS | 2020 | Ethics | Berkeley | Moral Judgement | 134,400 | [link](https://github.com/hendrycks/ethics) |
| ToxicityPrompt | 2020 | Safety | AllenAI | Toxicity Assessment | 99,442 | [link](https://github.com/allenai/real-toxicity-prompts) |
| CrowS-Pairs | 2020 | Bias | NYU | Stereotype Measurement | 1,508 | [link](https://github.com/nyu-mll/crows-pairs) |
| SEAT | 2019 | Bias | Princeton | Encoder Bias | n/a | [link](https://github.com/W4ngatang/sent-bias) |
| WinoGender | 2018 | Bias | UMass | Gender Bias | 720 | [link](https://github.com/rudinger/winogender-schemas) |
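
Operationally, most of the safety benchmarks in this table reduce to the same loop: run a fixed prompt set through the target model, judge each response, and report an aggregate rate (attack success, refusal, or toxicity). The sketch below shows that loop in generic form; `target_model` and `is_harmful` are hypothetical placeholders, and the keyword judge in the usage example is a toy, since real benchmarks ship their own prompts and a model-based judge.

```python
# Generic scoring loop shared by safety benchmarks such as AdvBench, HarmBench,
# and DoNotAnswer: attack success rate over an adversarial prompt set.
# `target_model` and `is_harmful` are hypothetical stand-ins, not any
# benchmark's actual interface.
from typing import Callable

def attack_success_rate(
    prompts: list[str],
    target_model: Callable[[str], str],
    is_harmful: Callable[[str, str], bool],
) -> float:
    """Fraction of adversarial prompts that elicit a harmful completion."""
    hits = sum(is_harmful(p, target_model(p)) for p in prompts)
    return hits / len(prompts)

# Toy usage with a trivial refusal-keyword judge (real judges are model-based).
REFUSAL_PREFIXES = ("i can't", "i cannot", "i won't", "as an ai")

def keyword_judge(prompt: str, response: str) -> bool:
    return not response.lower().startswith(REFUSAL_PREFIXES)

print(attack_success_rate(
    prompts=["<adversarial prompt here>"],
    target_model=lambda p: "I can't help with that.",
    is_harmful=keyword_judge,
))  # -> 0.0
```
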
## Tools

| Name | Organization | Website | Description |
| :---: | :---: | :---: | :---: |
| prometheus-eval | prometheus-eval | [prometheus-eval](https://github.com/prometheus-eval/prometheus-eval) | Prometheus 2 is an open-source language model specialized for evaluation, more powerful than its predecessor. It closely imitates the judgments of humans and GPT-4, handles both direct assessment and pairwise ranking formats, and works with user-defined evaluation criteria. On four direct assessment benchmarks and four pairwise ranking benchmarks, Prometheus 2 achieves the highest correlation and agreement with human evaluators and proprietary language models among all tested open-source evaluator LMs (2024-05-04). |
| athina-evals | athina-ai | [athina-ai](https://github.com/athina-ai/athina-evals) | Athina-ai is an open-source library that provides plug-and-play preset evaluations and a modular, extensible framework for writing and running evaluations. It helps engineers systematically improve the reliability and performance of their large language models through evaluation-driven development, overcoming the limitations of traditional workflows, enabling rapid experimentation, and providing customizable evaluators with consistent metrics. |
| LeaderboardFinder | Huggingface | [LeaderboardFinder](https://huggingface.co/spaces/leaderboards/LeaderboardFinder) | LeaderboardFinder helps you find suitable leaderboards for specific scenarios, a leaderboard of leaderboards (2024-04-02). |
| LightEval | Huggingface | [lighteval](https://github.com/huggingface/lighteval) | LightEval is a lightweight framework developed by Hugging Face for evaluating large language models (LLMs). Originally designed as an internal tool for assessing Hugging Face's LLM data processing library datatrove and LLM training library nanotron, it is now open-sourced for community use and improvement. Key features of LightEval include: (1) lightweight design, making it easy to use and integrate; (2) an evaluation suite supporting multiple tasks and models; (3) compatibility with evaluation on CPUs or GPUs, and integration with Hugging Face's acceleration library (Accelerate) and frameworks like Nanotron; (4) support for distributed evaluation, which is particularly useful for evaluating large models; (5) applicability to all benchmarks on the Open LLM Leaderboard; and (6) customizability, allowing users to add new metrics and tasks to meet specific evaluation needs (2024-02-08). |
| LLM Comparator | Google | [LLM Comparator](https://arxiv.org/html/2402.10524v1) | A visual analytics tool for comparing and evaluating large language models (LLMs). Compared to traditional human evaluation methods, it offers a scalable automated approach to comparative evaluation. It leverages another LLM as an evaluator to surface quality differences between models and provide reasons for those differences. Through interactive tables and summary visualizations, the LLM Comparator helps users understand why models perform well or poorly in specific contexts, as well as the qualitative differences between model responses. Developed in collaboration with Google researchers and engineers, this tool has been widely used internally at Google, attracting over 400 users and evaluating more than 1,000 experiments within three months (2024-02-16). |
| Arthur Bench | Arthur-AI | [Arthur Bench](https://github.com/arthur-ai/bench) | Arthur Bench is an open-source evaluation tool designed to compare and analyze the performance of large language models (LLMs). It supports various evaluation tasks, including question answering, summarization, translation, and code generation, and provides detailed reports on LLM performance across these tasks. Key features and advantages of Arthur Bench include: (1) model comparison, enabling the evaluation of different suppliers, versions, and training datasets of LLMs; (2) prompt and hyperparameter evaluation, assessing the impact of different prompts on LLM performance and testing the control of model behavior through various hyperparameter settings; (3) task definition and model selection, allowing users to define specific evaluation tasks and select evaluation targets from a range of supported LLM models; (4) parameter configuration, enabling users to adjust prompts and hyperparameters to finely control LLM behavior; (5) automated evaluation workflows, simplifying the execution of evaluation tasks; and (6) application scenarios such as model selection and validation, budget and privacy optimization, and the translation of academic benchmarks into real-world performance evaluations. Additionally, it offers comprehensive scoring metrics, supports both local and cloud versions, and encourages community collaboration and project development (2023-10-06). |
| llm-benchmarker-suite | FormulaMonks | [llm-benchmarker-suite](https://github.com/FormulaMonks/llm-benchmarker-suite) | This open-source initiative aims to address fragmentation and ambiguity in LLM benchmarking. The suite provides a structured methodology, a collection of diverse benchmarks, and toolkits to streamline the assessment of LLM performance. By offering a common platform, this project seeks to promote collaboration, transparency, and high-quality research in NLP. |
| autoevals | braintrust | [autoevals](https://github.com/braintrustdata/autoevals) | AutoEvals is an AI model output evaluation tool that leverages best practices to quickly and easily assess AI model outputs. It integrates multiple automatic evaluation methods, supports customizable evaluation prompts and custom scorers, and simplifies the evaluation of model outputs. AutoEvals incorporates model-graded evaluation for various subjective tasks, including fact-checking, safety, and more. Many of these evaluations are adapted from OpenAI's evals project but are implemented flexibly, allowing users to tweak prompts and debug outputs. |
| Evals | OpenAI | [Evals](https://github.com/openai/evals) | Evals is a framework developed by OpenAI for evaluating large language models (LLMs). It can test the performance and generalization capabilities of models across different tasks and datasets. |
| lm-evaluation-harness | EleutherAI | [lm-evaluation-harness](https://github.com/EleutherAI/lm-evaluation-harness) | lm-evaluation-harness is a tool developed by EleutherAI for evaluating large language models (LLMs). It can test the performance and generalization capabilities of models across different tasks and datasets. |
| lm-evaluation | AI21Labs | [lm-evaluation](https://github.com/AI21Labs/lm-evaluation) | Evaluations and reproduction of the results from the Jurassic-1 Technical [Paper](https://www.ai21.com/blog/announcing-ai21-studio-and-jurassic-1), with current support for running tasks via both the AI21 Studio API and OpenAI's GPT-3 API. |
| OpenCompass | Shanghai AI Lab | [OpenCompass](https://github.com/InternLM/opencompass/tree/main) | OpenCompass is a one-stop platform for evaluating large models. Its main features include: open-source and reproducible evaluation schemes; comprehensive capability dimensions covering five major areas with over 50 datasets and approximately 300,000 questions to assess model capabilities; support for over 20 Hugging Face and API models; distributed and efficient evaluation with one-line task splitting, enabling full evaluation of trillion-parameter models within hours; and diverse evaluation paradigms supporting zero-shot, few-shot, and chain-of-thought evaluations with standard or conversational prompt templates to easily elicit peak model performance. |
| Large language model evaluation and workflow framework from Phase AI | wgryc | [phasellm](https://github.com/wgryc/phasellm) | A framework provided by Phase AI for evaluating and managing LLMs, helping users select appropriate models, datasets, and metrics, as well as visualize and analyze results. |
| Evaluation benchmark for LLM | FreedomIntelligence | [LLMZoo](https://github.com/FreedomIntelligence/LLMZoo) | LLMZoo is an evaluation benchmark for LLMs developed by FreedomIntelligence, featuring multiple domain and task datasets, metrics, and pre-trained models with results. |
| Holistic Evaluation of Language Models (HELM) | Stanford | [HELM](https://github.com/stanford-crfm/helm) | HELM is a comprehensive evaluation method for LLMs proposed by the Stanford research team, considering multiple aspects such as language ability, knowledge, reasoning, fairness, and safety. |
| A lightweight evaluation tool for question-answering | Langchain | [auto-evaluator](https://github.com/rlancemartin/auto-evaluator) | auto-evaluator is a lightweight tool developed by Langchain for evaluating question-answering systems. It can automatically generate questions and answers and calculate metrics such as accuracy, recall, and F1 score. |
| PandaLM | WeOpenML | [PandaLM](https://github.com/WeOpenML/PandaLM) | PandaLM is an LLM assessment tool developed by WeOpenML for automated and reproducible evaluation. It allows users to select appropriate datasets, metrics, and models based on their needs and preferences, and generates reports and charts. |
| FlagEval | BAAI | [FlagEval](https://github.com/FlagOpen/FlagEval) | FlagEval is an evaluation platform for LLMs developed by BAAI (Beijing Academy of Artificial Intelligence), offering multiple tasks and datasets, as well as online testing, leaderboards, and analysis functions. |
| AlpacaEval | tatsu-lab | [alpaca_eval](https://github.com/tatsu-lab/alpaca_eval) | AlpacaEval is an evaluation tool for LLMs developed by tatsu-lab, capable of testing models across various languages, domains, and tasks, and providing explainability, robustness, and credibility metrics. |
| Prompt flow | Microsoft | [promptflow](https://github.com/microsoft/promptflow) | A set of development tools designed by Microsoft to simplify the end-to-end development cycle of LLM-based AI applications, from conception, prototyping, testing, and evaluation to production deployment and monitoring. It makes prompt engineering easier and enables the development of product-grade LLM applications. |
| DeepEval | mr-gpt | [DeepEval](https://github.com/confident-ai/deepeval) | DeepEval is a simple-to-use, open-source LLM evaluation framework. Similar to Pytest but specialized for unit testing LLM outputs, it incorporates the latest research to evaluate LLM outputs on metrics such as G-Eval, hallucination, answer relevancy, and RAGAS, using LLMs and various other NLP models that run locally on your machine. |
| CONNER | Tencent AI Lab | [CONNER](https://github.com/ChanLiang/CONNER) | CONNER is a comprehensive large-model knowledge evaluation framework designed to systematically and automatically assess generated information from six critical perspectives: factuality, relevance, coherence, informativeness, usefulness, and validity. |
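
Several of the tools above (prometheus-eval, LLM Comparator, PandaLM, AlpacaEval) are built around the same LLM-as-judge primitive: prompt a judge model to compare two answers, then swap the answer order and ask again to cancel position bias. The sketch below shows that pattern in isolation; the prompt wording and the `judge` callable are illustrative assumptions, not the interface of any listed tool.

```python
# Pairwise LLM-as-judge with position-swap debiasing. `judge` is a hypothetical
# callable that wraps whatever judge LLM you use and returns "A" or "B".
from typing import Callable

PROMPT = """You are comparing two answers to the same instruction.
Instruction: {instruction}
Answer A: {a}
Answer B: {b}
Reply with exactly "A" or "B" for the better answer."""

def pairwise_verdict(
    judge: Callable[[str], str], instruction: str, ans1: str, ans2: str
) -> str:
    """Return 'ans1', 'ans2', or 'tie' (tie when the verdict flips with order)."""
    first = judge(PROMPT.format(instruction=instruction, a=ans1, b=ans2))
    second = judge(PROMPT.format(instruction=instruction, a=ans2, b=ans1))
    if first == "A" and second == "B":
        return "ans1"
    if first == "B" and second == "A":
        return "ans2"
    return "tie"  # order-dependent verdicts are treated as a tie
```
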
## Datasets or Benchmarks

### General

| Name | Organization | Website | Description |
| :---: | :---: | :---: | :---: |
| MMLU-Pro | TIGER-AI-Lab | [MMLU-Pro](https://github.com/TIGER-AI-Lab/MMLU-Pro) | MMLU-Pro is an improved version of the MMLU dataset. MMLU has long been a reference multiple-choice knowledge benchmark, but recent studies have shown that it contains noise (some questions are unanswerable) and has become too easy (due to evolving model capabilities and increased contamination). MMLU-Pro provides ten options instead of four, requires reasoning on more questions, and has undergone expert review to reduce noise, making it higher quality and more challenging than the original. MMLU-Pro also reduces the impact of prompt variations on model performance, a common issue with its predecessor. Research indicates that models using chain-of-thought reasoning perform better on this new benchmark, suggesting that MMLU-Pro is better suited for evaluating the subtle reasoning abilities of AI (2024-05-20). |
| TrustLLM Benchmark | TrustLLM | [TrustLLM](https://trustllmbenchmark.github.io/TrustLLM-Website/) | TrustLLM is a benchmark for evaluating the trustworthiness of large language models. It covers six dimensions of trustworthiness and includes over 30 datasets to comprehensively assess the functional capabilities of LLMs, ranging from simple classification tasks to complex generative tasks. Each dataset presents unique challenges, and 16 mainstream LLMs (both commercial and open-source) have been benchmarked on it. |
| DyVal | Microsoft | [DyVal](https://github.com/microsoft/promptbench) | Concerns have been raised about potential data contamination in the vast training corpora of LLMs, and the static nature and fixed complexity of current benchmarks may not adequately measure their evolving capabilities. DyVal is a general and flexible protocol for dynamically evaluating LLMs. Leveraging the advantages of directed acyclic graphs, DyVal dynamically generates evaluation samples with controllable complexity, and has produced challenging evaluation sets for reasoning tasks such as mathematics, logical reasoning, and algorithmic problems. Various LLMs, from Flan-T5-large to GPT-3.5-Turbo and GPT-4, have been evaluated. Experiments show that LLMs perform worse on DyVal-generated samples of different complexities, highlighting the importance of dynamic evaluation. The authors also analyze failure cases and the results of different prompting methods. Furthermore, DyVal-generated samples not only serve as evaluation sets but also aid in fine-tuning to enhance LLM performance on existing benchmarks (2024-04-20). |
| RewardBench | AI2 | [RewardBench](https://github.com/allenai/reward-bench) | RewardBench is an evaluation benchmark for language model reward models, assessing the strengths and weaknesses of various models. It reveals that existing models still exhibit significant shortcomings in reasoning and instruction following. It includes a [Leaderboard](https://hf.co/spaces/allenai/reward-bench), [Code](https://github.com/allenai/reward-bench), and [Dataset](https://hf.co/datasets/allenai/reward-bench) (2024-03-20). |
| LV-Eval | Infinigence-AI | [LVEval](https://github.com/infinigence/LVEval) | LV-Eval is a long-text evaluation benchmark featuring five length tiers (16k, 32k, 64k, 128k, and 256k), with a maximum test length of 256k. The average text length of LV-Eval is 102,380 characters, with a minimum/maximum text length of 11,896/387,406 characters. LV-Eval consists of two types of evaluation tasks, single-hop QA and multi-hop QA, encompassing 11 sub-datasets in Chinese and English. Its design introduces three key techniques: Confusion Facts Insertion (CFI) to increase difficulty, Keyword and Phrase Replacement (KPR) to reduce information leakage, and Answer Keyword (AK)-based evaluation metrics (combining answer keywords and word blacklists) to improve the objectivity of evaluation results (2024-02-06). |
| LLM-Uncertainty-Bench | Tencent | [LLM-Uncertainty-Bench](https://github.com/smartyfh/LLM-Uncertainty-Bench) | A new benchmarking method for LLMs that incorporates uncertainty quantification. Based on nine LLMs tested across five representative NLP tasks, it found that: (I) more accurate LLMs may exhibit lower certainty; (II) larger-scale LLMs may display greater uncertainty than smaller models; and (III) instruction fine-tuning tends to increase the uncertainty of LLMs. These findings underscore the importance of including uncertainty in LLM evaluations (2024-01-22). |
| Psychometrics Eval | Microsoft Research Asia | [Psychometrics Eval](https://arxiv.org/abs/2310.16379) | Microsoft Research Asia has proposed a generalized evaluation method for AI based on psychometrics, aiming to address limitations of traditional evaluation methods in predictive power, information volume, and test tool quality. This approach draws on psychometric theory to identify key psychological constructs of AI, design targeted tests, and apply Item Response Theory for precise scoring. It also introduces the concepts of reliability and validity to ensure evaluation dependability and accuracy. The framework extends psychometric methods to assess AI performance on unknown complex tasks, but it also faces open questions such as distinguishing between AI "individuals" and "populations," addressing prompt sensitivity, and evaluating differences between human and AI constructs (2023-10-19). |
|\n|    CommonGen-Eval     |         AllenAI         | [CommonGen-Eval](https:\u002F\u002Fgithub.com\u002Fallenai\u002FCommonGen-Eval)  | A study using the CommonGen-lite dataset to evaluate LLMs, employing GPT-4 for assessment and comparing the performance of different models, with results listed on the leaderboard (2024-01-04). |\n|         felm          |          HKUST          |          [felm](https:\u002F\u002Fgithub.com\u002Fhkust-nlp\u002Ffelm)           | FELM is a meta-benchmark for evaluating the factual assessment of large language models. The benchmark comprises 847 questions spanning five distinct domains: world knowledge, science\u002Ftechnology, writing\u002Frecommendation, reasoning, and mathematics. Prompts corresponding to each domain are gathered from various sources, including standard datasets like TruthfulQA, online platforms like GitHub repositories, ChatGPT-generated prompts, or those drafted by authors. For each response, fine-grained annotation at the segment level is employed, including reference links, identified error types, and reasons behind these errors as provided by annotators (2023-10-03). |\n|       just-eval       |       AI2 Mosaic        |      [just-eval](https:\u002F\u002Fgithub.com\u002FRe-Align\u002Fjust-eval)      | A GPT-based evaluation tool for multi-faceted and explainable assessment of LLMs, capable of evaluating aspects such as helpfulness, clarity, factuality, depth, and engagement (2023-12-05). |\n|       EQ-Bench        |        EQ-Bench         |       [EQ-Bench](https:\u002F\u002Fgithub.com\u002FEQ-bench\u002FEQ-Bench)       | A benchmark for evaluating the emotional intelligence of language models, featuring 171 questions (compared to 60 in v1) and a new scoring system that better distinguishes performance differences among models (2023-12-20). |\n|       CRUXEval        |        MIT CSAIL        |         [CRUXEval](https:\u002F\u002Farxiv.org\u002Fabs\u002F2401.03065)         | CRUXEval is a benchmark for evaluating code reasoning, understanding, and execution. It includes 800 Python functions and their input-output pairs, testing input prediction and output prediction tasks. Many models that perform well on HumanEval underperform on CRUXEval, highlighting the need for improved code reasoning capabilities. The best model, GPT-4 with chain-of-thought (CoT), achieved pass@1 rates of 75% and 81% for input prediction and output prediction, respectively. The benchmark exposes gaps between open-source and closed-source models. GPT-4 failed to fully pass CRUXEval, providing insights into its limitations and directions for improvement (2024-01-05). |\n|     MLAgentBench      |      snap-stanford      | [MLAgentBench](https:\u002F\u002Fgithub.com\u002Fsnap-stanford\u002FMLAgentBench) | MLAgentBench is a suite of end-to-end machine learning (ML) research tasks for benchmarking AI research agents. These agents aim to autonomously develop or improve an ML model based on a given dataset and ML task description. Each task represents an interactive environment that directly reflects what human researchers encounter. Agents can read available files, run multiple experiments on compute clusters, and analyze results to achieve the specified research objectives. Specifically, it includes 15 diverse ML engineering tasks that can be accomplished by attempting different ML methods, data processing, architectures, and training processes (2023-10-05). 
|\n|      AlignBench       |          THUDM          |      [AlignBench](https:\u002F\u002Fgithub.com\u002FTHUDM\u002FAlignBench)       | AlignBench is a comprehensive and multi-dimensional benchmark for evaluating the alignment performance of Chinese large language models. It constructs a human-in-the-loop data creation process to ensure dynamic data updates. AlignBench employs a multi-dimensional, rule-based model evaluation method (LLM-as-Judge) and combines chain-of-thought (CoT) to generate multi-dimensional analyses and final comprehensive scores for model responses, enhancing the reliability and explainability of evaluations (2023-12-01). |\n|       UltraEval       |         OpenBMB         |      [UltraEval](https:\u002F\u002Fgithub.com\u002FOpenBMB\u002FUltraEval)       | UltraEval is an open-source foundational model capability evaluation framework offering a lightweight and easy-to-use evaluation system that supports mainstream large model performance assessments. Its key features include: (1) a lightweight and user-friendly evaluation framework with intuitive design, minimal dependencies, easy deployment, and good scalability for various evaluation scenarios; (2) flexible and diverse evaluation methods with unified prompt templates and rich evaluation metrics, supporting customization; (3) efficient and rapid inference deployment supporting multiple model deployment solutions, including torch and vLLM, and enabling multi-instance deployment to accelerate the evaluation process; (4) a transparent and open leaderboard with publicly accessible, traceable, and reproducible evaluation results driven by the community to ensure transparency; and (5) official and authoritative evaluation data using widely recognized official datasets to guarantee evaluation fairness and standardization, ensuring result comparability and reproducibility (2023-11-24). |\n|        IFEval         |     google-research     | [Instruction Following Eval](https:\u002F\u002Fgithub.com\u002Fgoogle-research\u002Fgoogle-research\u002Ftree\u002Fmaster\u002Finstruction_following_eval) | Following natural language instructions is a core capability of large language models. However, the evaluation of this capability lacks standardization: human evaluation is expensive, slow, and lacks objective reproducibility, while automated evaluation based on LLMs may be biased by the evaluator LLM's capabilities or limitations. To address these issues, researchers at Google introduced Instruction Following Evaluation (IFEval), a simple and reproducible benchmark focusing on a set of \"verifiable instructions,\" such as \"write over 400 words\" and \"mention the AI keyword at least 3 times.\" IFEval identifies 25 such verifiable instructions and constructs approximately 500 prompts, each containing one or more verifiable instructions (2023-11-15). |\n|        LLMBar         |      princeton-nlp      |      [LLMBar](https:\u002F\u002Fgithub.com\u002Fprinceton-nlp\u002FLLMBar)       | LLMBar is a challenging meta-evaluation benchmark designed to test the ability of LLM evaluators to identify instruction-following outputs. It contains 419 instances, each consisting of an instruction and two outputs: one faithfully and correctly following the instruction, and the other deviating from it. Each instance also includes a gold label indicating which output is objectively better (2023-10-29). 
|\n|HalluQA | Fudan, Shanghai AI Lab | [HalluQA](https:\u002F\u002Fgithub.com\u002Fxiami2019\u002FHalluQA\u002F) | HalluQA is a Chinese LLM hallucination evaluation benchmark, featuring 450 data points including 175 misleading entries, 69 hard misleading entries, and 206 knowledge-based entries. Each question has an average of 2.8 correct and incorrect answers annotated. To enhance the usability of HalluQA, the authors designed a GPT-4-based evaluation method. Specifically, hallucination criteria and correct answers are input as instructions to GPT-4, which evaluates whether the model's response contains hallucinations.|\n|FMTI | Stanford | [FMTI](https:\u002F\u002Fcrfm.stanford.edu\u002Ffmti\u002F) | The Foundation Model Transparency Index (FMTI) evaluates the transparency of developers in model training and deployment across 100 indicators, including data, computational resources, and labor. Evaluations of flagship models from 10 companies reveal an average transparency score of only 37\u002F100, indicating significant room for improvement.|\n|ColossalEval | Colossal-AI | [ColossalEval](https:\u002F\u002Fgithub.com\u002Fhpcaitech\u002FColossalAI\u002Ftree\u002Fmain\u002Fapplications\u002FColossalEval) | A project by Colossal-AI offering a unified evaluation workflow for assessing language models on public datasets or custom datasets using traditional metrics and GPT-assisted evaluations.|\n| LLMEval²-WideDeep | Alibaba Research | [LLMEval²](https:\u002F\u002Fgithub.com\u002FAlibabaResearch\u002FDAMO-ConvAI\u002Ftree\u002Fmain\u002FWideDeep) | Constructed as the largest and most diverse English evaluation benchmark for LLM evaluators, featuring 15 tasks, 8 capabilities, and 2,553 samples. Experimental results indicate that a wider network (involving many reviewers) with two layers (one round of discussion) performs best, improving the Kappa correlation coefficient from 0.28 to 0.34. WideDeep is also utilized to assist in evaluating Chinese LLMs, accelerating the evaluation process by 4.6 times and reducing costs by 60%.|\n|Aviary | Ray Project | [Aviary](https:\u002F\u002Fgithub.com\u002Fray-project\u002Faviary) | Enables interaction with various large language models (LLMs) in one place. Direct comparison of different model outputs, ranking by quality, and obtaining cost and latency estimates are supported. It particularly supports models hosted on Hugging Face and in many cases, also supports DeepSpeed inference acceleration.|\n| Do-Not-Answer | Libr-AI | [Do-Not-Answer](https:\u002F\u002Fgithub.com\u002FLibr-AI\u002Fdo-not-answer) | An open-source dataset designed to evaluate the safety mechanisms of LLMs at a low cost. It consists of prompts that responsible language models should not respond to. 
In addition to human annotations, it implements model-based evaluation, where a fine-tuned 600M-parameter BERT-like evaluator achieves results comparable to human annotators and GPT-4.|\n| LucyEval | Besteasy | [LucyEval](http:\u002F\u002Flucyeval.besteasy.com\u002F) | Chinese LLM maturity evaluation: LucyEval can objectively test various aspects of model capabilities, identify model shortcomings, and help designers and engineers more accurately adjust and train models, aiding LLMs in advancing toward greater intelligence.|\n| Zhujiu | Institute of Automation, CAS | [Zhujiu](http:\u002F\u002Fwww.zhujiu-benchmark.com) | Covers seven capability dimensions and 51 tasks; employs three complementary evaluation methods; offers comprehensive Chinese benchmarking with English evaluation capabilities.|\n| ChatEval | THU-NLP | [ChatEval](https:\u002F\u002Fgithub.com\u002Fthunlp\u002FChatEval) | ChatEval aims to simplify the human evaluation process of generated text. Given different text fragments, roles in ChatEval (played by LLM agents) can autonomously discuss nuances and differences, providing judgments based on their designated roles.|\n|FlagEval | Zhiyuan\u002FTsinghua | [FlagEval](https:\u002F\u002Fflageval.baai.ac.cn\u002F#\u002Fhome) | Produced by Zhiyuan (BAAI), combining subjective and objective scoring to offer LLM score rankings.|\n|InfoQ Comprehensive LLM Evaluation | InfoQ | [InfoQ Evaluation](https:\u002F\u002Fmp.weixin.qq.com\u002Fs?__biz=MjM5MDE0Mjc4MA==&mid=265117067676&idx=1&sn=b98af3bd14c9f9fbb3e7f0f8f9bb3ec&scene=21#wechat_redirect) | Chinese-oriented ranking: ChatGPT > Wenxin Yiyan (ERNIE Bot) > Claude > Xinghuo.|\n|Chain-of-Thought Evaluation | Yao Fu | [COT Evaluation](https:\u002F\u002Fgithub.com\u002FFranxYao\u002Fchain-of-thought-hub) | Includes rankings for GSM8k and MATH complex problems.|\n|Z-Bench | ZhenFund | [Z-Bench](https:\u002F\u002Fgithub.com\u002Fzhenbench\u002Fz-bench) | Indicates that domestic Chinese models have relatively low programmability, with minimal performance differences between models; the two versions of ChatGLM show significant improvement.|\n| CMU Chatbot Evaluation  | CMU | [zeno-build](https:\u002F\u002Fgithub.com\u002Fzeno-ml\u002Fzeno-build) | In conversational scenarios, rankings show ChatGPT > Vicuna > others.|\n|lmsys-arena | Berkeley | [lmsys Ranking](https:\u002F\u002Flmsys.org\u002Fblog\u002F2023-05-03-arena\u002F) | Utilizes an Elo scoring mechanism (a minimal sketch of the update rule follows this table), with rankings showing GPT4 > Claude > GPT3.5 > Vicuna > others.|\n|Huggingface Open LLM Leaderboard | Huggingface | [HF Open LLM Leaderboard](https:\u002F\u002Fhuggingface.co\u002Fspaces\u002FHuggingFaceH4\u002Fopen_llm_leaderboard) | Organized by Huggingface, this leaderboard evaluates multiple mainstream open-source LLMs. Evaluations focus on four datasets: AI2 Reasoning Challenge, HellaSwag, MMLU, and TruthfulQA, primarily in English.|\n| AlpacaEval | tatsu-lab | [AlpacaEval](https:\u002F\u002Ftatsu-lab.github.io\u002Falpaca_eval\u002F) | An LLM-based automatic evaluation leaderboard on which open-source models such as Vicuna, OpenChat, and WizardLM lead.|\n|Chinese-LLM-Benchmark | jeinlee1991 | [llm-benchmark](https:\u002F\u002Fgithub.com\u002Fjeinlee1991\u002Fchinese-llm-benchmark) | Chinese LLM capability evaluation rankings covering Baidu Ernie Bot, ChatGPT, Alibaba Tongyi Qianwen, iFLYTEK Xinghuo, and open-source models like Belle and ChatGLM6B. 
It provides capability score rankings and original model output results.|\n|Stanford Question Answering Dataset (SQuAD) | Stanford NLP Group | [SQuAD](https:\u002F\u002Frajpurkar.github.io\u002FSQuAD-explorer\u002F) | Evaluates model performance on reading comprehension tasks.|\n|Multi-Genre Natural Language Inference (MultiNLI) | New York University, DeepMind, Facebook AI Research, Allen Institute for AI, Google AI Language | [MultiNLI](https:\u002F\u002Fcims.nyu.edu\u002F~sbowman\u002Fmultinli\u002F) | Evaluates the model's ability to understand sentence relationships across different text genres.|\n|LogiQA | Tsinghua University and Microsoft Research Asia | [LogiQA](https:\u002F\u002Fgithub.com\u002Flgw863\u002FLogiQA-dataset) | Evaluates the model's logical reasoning capabilities.|\n| HellaSwag | University of Washington and Allen Institute for AI | [HellaSwag](https:\u002F\u002Frowanzellers.com\u002Fhellaswag\u002F) | Evaluates the model's commonsense reasoning capabilities.|\n| The LAMBADA Dataset | University of Trento and Fondazione Bruno Kessler | [LAMBADA](https:\u002F\u002Fzenodo.org\u002Frecord\u002F2630551#.ZFUKS-zML0p) | Evaluates the model's ability to predict the last word of a paragraph, reflecting long-range understanding capabilities.|\n|CoQA | Stanford NLP Group | [CoQA](https:\u002F\u002Fstanfordnlp.github.io\u002Fcoqa\u002F) | Evaluates the model's ability to understand text paragraphs and answer a series of interrelated questions in conversational settings.|\n|ParlAI | Facebook AI Research | [ParlAI](https:\u002F\u002Fgithub.com\u002Ffacebookresearch\u002FParlAI) | Evaluates model performance in accuracy, F1 score, perplexity (the model's ability to predict the next word in a sequence), human evaluation (relevance, fluency, and coherence), speed and resource utilization, robustness (model performance under varying conditions such as noisy inputs, adversarial attacks, or changes in data quality), and generalization capabilities.|\n|Language Interpretability Tool (LIT) | Google | [LIT](https:\u002F\u002Fpair-code.github.io\u002Flit\u002F) | Provides a platform for evaluating models based on user-defined metrics, analyzing model strengths, weaknesses, and potential biases.|\n|Adversarial NLI (ANLI) | Facebook AI Research, New York University, Johns Hopkins University, University of Maryland, Allen Institute for AI | [Adversarial NLI (ANLI)](https:\u002F\u002Fgithub.com\u002Ffacebookresearch\u002Fanli) | Evaluates the model's robustness, generalization capabilities, reasoning explanation abilities, consistency, and resource efficiency (memory usage, inference time, and training time). |
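\n\nThe Elo mechanism used by lmsys-arena above reduces to a simple pairwise update. Below is a minimal sketch, assuming a made-up battle log; production arena leaderboards aggregate far more votes (and newer versions fit a Bradley-Terry model rather than sequential Elo):\n\n```python\n# Minimal sketch of the Elo update behind pairwise arena leaderboards.\n# The battle log is hypothetical; score_a is 1.0 (A wins), 0.0 (B wins), or 0.5 (tie).\n\ndef update_elo(r_a, r_b, score_a, k=32.0):\n    expected_a = 1.0 \u002F (1.0 + 10 ** ((r_b - r_a) \u002F 400.0))\n    r_a += k * (score_a - expected_a)\n    r_b += k * ((1.0 - score_a) - (1.0 - expected_a))\n    return r_a, r_b\n\nratings = {'model-a': 1000.0, 'model-b': 1000.0}\nbattles = [('model-a', 'model-b', 1.0), ('model-b', 'model-a', 1.0), ('model-a', 'model-b', 0.5)]\nfor a, b, score_a in battles:\n    ratings[a], ratings[b] = update_elo(ratings[a], ratings[b], score_a)\nprint(sorted(ratings.items(), key=lambda kv: -kv[1]))\n```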
\n\n### Domain\n\n| Name | Institution | Field | URL | Introduction |\n| :--: | :--: | :--: | :--: | :-- |\n| Seismometer | Epic | Healthcare | [Seismometer](https:\u002F\u002Fgithub.com\u002Fepic-open-source\u002Fseismometer) | Seismometer is an AI model performance evaluation tool for the healthcare field, providing standardized evaluation criteria to help make decisions based on local data and workflows. It supports continuous monitoring of model performance. Although it can be used for models in any field, it was designed with a focus on validation for healthcare AI models, where local validation requires cross-referencing data about patients (such as demographics, clinical interventions, and patient outcomes) against model performance. (2024-05-22) |\n| Medbench | OpenMEDLab | Healthcare | [medbench](https:\u002F\u002Fgithub.com\u002Fopen-compass\u002Fopencompass\u002Ftree\u002Fmain\u002Fopencompass\u002Fdatasets\u002Fmedbench\u002F) | MedBench is committed to creating a scientific, fair, and rigorous evaluation system and open platform for Chinese medical large models. Based on authoritative medical standards, it continuously updates and maintains high-quality medical datasets to comprehensively and multi-dimensionally quantify the capabilities of models across various medical dimensions. MedBench comprises 40,041 questions sourced from authentic examination exercises and medical reports of diverse branches of medicine. It is composed of four key components: the Chinese Medical Licensing Examination, the Resident Standardization Training Examination, the Doctor In-Charge Qualification Examination, and real-world clinic cases encompassing examinations, diagnoses, and treatments. (2023-12-20) |\n| Fin-Eva | Ant Group, Shanghai University of Finance and Economics | Finance | [Fin-Eva](https:\u002F\u002Fgithub.com\u002Falipay\u002Ffinancial_evaluation_dataset) | Fin-Eva Version 1.0, jointly launched by Ant Group and Shanghai University of Finance and Economics, covers multiple financial scenarios such as wealth management, insurance, and investment research, as well as financial specialty disciplines, with a total of over 13,000 evaluation questions. Ant’s data sources include data from various business fields and publicly available internet data; after data desensitization, text clustering, corpus screening, and data rewriting, the dataset is constructed in combination with reviews from financial experts. Shanghai University of Finance and Economics’ data sources are primarily real and simulated questions from authoritative exams in relevant fields, following the requirements of knowledge outlines. Ant’s section covers five major capability areas (financial cognition, financial knowledge, financial logic, content generation, and safety compliance) with 33 sub-dimensions and 8,445 evaluation questions; Shanghai University of Finance and Economics’ section covers four major areas (finance, economics, accounting, and certificates), including 4,661 questions across 34 different disciplines. Fin-Eva Version 1.0 adopts multiple-choice questions with fixed answers, accompanied by corresponding instructions to enable models to output in a standard format (2023-12-20) |\n| GenMedicalEval | SJTU | Healthcare | [GenMedicalEval](https:\u002F\u002Fgithub.com\u002FMediaBrain-SJTU\u002FGenMedicalEval) | 1. **Large-scale comprehensive performance evaluation**: GenMedicalEval constructs over 100,000 medical evaluation items covering 16 major departments, 3 stages of physician training, and 6 medical clinical application scenarios, based on over 40,000 real medical examination questions and over 55,000 patient medical records from top-tier hospitals. 
This dataset comprehensively evaluates the overall performance of large models in real, complex medical scenarios, covering medical basic knowledge, clinical application, and safety standards, and addressing the shortcoming that existing evaluation benchmarks fail to cover many practical challenges in medical practice. 2. **In-depth multi-dimensional scenario evaluation**: GenMedicalEval integrates physicians’ clinical notes and medical imaging materials, building a series of diverse and theme-rich generative evaluation questions around key medical scenarios such as examination, diagnosis, and treatment. This supplements existing question-and-answer evaluations by simulating real clinical environments for open-ended diagnostic processes. 3. **Innovative open evaluation metrics and automated evaluation models**: To address the lack of effective evaluation metrics for open generative tasks, GenMedicalEval employs advanced structured extraction and terminology alignment techniques to build an innovative generative evaluation metric system that accurately measures the medical knowledge accuracy of generated answers. Furthermore, it trains a medical automatic evaluation model based on its self-built knowledge base, which correlates highly with human evaluations and provides multi-dimensional medical scores and evaluation reasons. Its features include no data leakage and controllability, giving it unique advantages compared to other models like GPT-4 (2023-12-08) |\n| OpenFinData | Shanghai Artificial Intelligence Laboratory | Finance | [OpenFinData](https:\u002F\u002Fopencompass.org.cn) | OpenFinData, the first full-scenario financial evaluation dataset based on the \"OpenCompass\" framework, released by the Shanghai Artificial Intelligence Laboratory, comprises six modules and nineteen financial task dimensions, covering multi-level data types and diverse financial scenarios. Each piece of data originates from actual financial business scenarios (2024-01-04) |\n| LAiW | Sichuan University | Legal | [LAiW](https:\u002F\u002Fgithub.com\u002FDai-shen\u002FLAiW) | From the perspectives of legal theory and feasibility, LAiW categorizes legal NLP capabilities into three major abilities totaling 13 basic tasks: (1) Basic legal NLP abilities: evaluates legal basic tasks, NLP basic tasks, and legal information extraction through five basic tasks: legal clause recommendation, element recognition, named entity recognition, judicial summarization, and case recognition; (2) Basic legal application abilities: evaluates the basic application capabilities of large models in the legal field through five basic tasks: dispute focus mining, case matching, criminal judgment prediction, civil judgment prediction, and legal Q&A; (3) Complex legal application abilities: evaluates the complex application capabilities of large models in the legal field through three basic tasks: judicial reasoning generation, case understanding, and legal consultation (2023-10-08) |\n| LawBench | Nanjing University | Legal | [LawBench](https:\u002F\u002Fgithub.com\u002Fopen-compass\u002Flawbench) | LawBench is meticulously designed to precisely evaluate the legal capabilities of large language models. When designing test tasks, it simulates three dimensions of judicial cognition and selects 20 tasks to assess the capabilities of large models. 
Compared to some existing benchmarks that only have multiple-choice questions, LawBench includes more task types closely related to real-world applications, such as legal entity recognition, reading comprehension, crime amount calculation, and consultation. LawBench recognizes that the current safety strategies of large models may lead to models refusing to respond to certain legal inquiries or encountering difficulties in understanding instructions, resulting in a lack of responses. Therefore, LawBench has developed a separate evaluation metric, the \"abstention rate,\" to measure the frequency of models refusing to provide answers or failing to correctly understand instructions. Researchers have evaluated the performance of 51 large language models on LawBench, including 20 multilingual models, 22 Chinese models, and 9 legal-specific large language models (2023-09-28) |\n| PsyEval | SJTU | Psychological | [PsyEval](https:\u002F\u002Farxiv.org\u002Fabs\u002F2311.09189) | In mental health research, the use of large language models (LLMs) is gaining increasing attention, especially their significant capabilities in disease detection. Researchers have custom-designed the first comprehensive benchmark for the mental health field to systematically evaluate the capabilities of LLMs in this domain. This benchmark includes six sub-tasks covering three dimensions to comprehensively assess the capabilities of LLMs in mental health. Corresponding concise prompts have been designed for each sub-task, and eight advanced LLMs have been comprehensively evaluated (2023-11-15) |\n| PPTC | Microsoft, PKU | Office | [PPTC](https:\u002F\u002Fgithub.com\u002Fgydpku\u002FPPTC) | PPTC is a benchmark for testing the capabilities of large models in PPT generation, comprising 279 multi-turn conversations covering different topics and hundreds of instructions involving multi-modal operations. The research team has also proposed the PPTX-Match evaluation system, which assesses whether large language models have completed instructions based on predicted files rather than label API sequences. Therefore, it supports various LLM-generated API sequences. Currently, PPT generation faces three main challenges: error accumulation in multi-turn conversations, processing long PPT templates, and multi-modal perception issues (2023-11-04) |\n| LLMRec | Alibaba | Recommendation | [LLMRec](https:\u002F\u002Fgithub.com\u002Fwilliamliujl\u002FLLMRec) | Benchmark testing of popular LLMs (such as ChatGPT, LLaMA, ChatGLM, etc.) has been conducted on five recommendation-related tasks, including rating prediction, sequential recommendation, direct recommendation, explanation generation, and review summarization. Additionally, the effectiveness of supervised fine-tuning to enhance the instruction-following capabilities of LLMs has been studied (2023-10-08) |\n| LAiW | Dai-shen | Legal | [LAiW](https:\u002F\u002Fgithub.com\u002FDai-shen\u002FLAiW) | In response to the rapid development of legal large language models, the first Chinese legal large language model benchmark based on legal capabilities has been proposed. Legal capabilities are divided into three levels: basic legal natural language processing capabilities, basic legal application capabilities, and complex legal application capabilities. The first phase of evaluation has been completed, focusing on the assessment of basic legal natural language processing capabilities. 
The evaluation results show that while some legal large language models perform better than their base models, there is still a gap compared to ChatGPT (2023-10-25) |\n| OpsEval | Tsinghua University | AIOps | [OpsEval](https:\u002F\u002Farxiv.org\u002Fabs\u002F2310.07637) | OpsEval is a comprehensive task-oriented AIOps benchmark for large language models, assessing the proficiency of LLMs in three key scenarios: wired network operations, 5G communication operations, and database operations. These scenarios involve different capability levels, including knowledge recall, analytical thinking, and practical application. The benchmark comprises 7,200 questions in multiple-choice and Q&A formats, supporting both English and Chinese (2023-10-02) |\n| SWE-bench | princeton-nlp | Software | [SWE-bench](https:\u002F\u002Fgithub.com\u002Fprinceton-nlp\u002FSWE-bench) | SWE-bench is a benchmark for evaluating the performance of large language models on real software issues collected from GitHub. Given a code repository and an issue, the task of the language model is to generate a patch that solves the described problem |\n| BLURB | Mindrank AI | Healthcare | [BLURB](https:\u002F\u002Fmicrosoft.github.io\u002FBLURB\u002Findex.html) | BLURB includes a comprehensive benchmark for PubMed-based biomedical natural language processing applications, as well as a leaderboard for tracking community progress. BLURB comprises six diverse tasks and thirteen publicly available datasets. To avoid overemphasizing tasks with many available datasets (e.g., named entity recognition, NER), BLURB reports the macro-average across all tasks as the primary score. The BLURB leaderboard is model-agnostic; any system that can generate test predictions using the same training and development data can participate. The primary goal of BLURB is to lower the barrier to participation in biomedical natural language processing and help accelerate progress in this important field that has a positive impact on society and humanity |\n| SmartPlay | microsoft | Gaming | [SmartPlay](https:\u002F\u002Fgithub.com\u002Fmicrosoft\u002FSmartPlay) | SmartPlay is a large language model (LLM) benchmark designed for ease of use, offering a variety of games for testing |\n| FinEval | SUFE-AIFLM-Lab | Finance | [FinEval](https:\u002F\u002Fgithub.com\u002FSUFE-AIFLM-Lab\u002FFinEval) | FinEval: A collection of high-quality multiple-choice questions covering fields such as finance, economics, accounting, and certificates |\n| GSM8K | OpenAI | Mathematics | [GSM8K](https:\u002F\u002Fgithub.com\u002Fopenai\u002Fgrade-school-math) | GSM8K is a dataset of 8.5K high-quality, linguistically diverse elementary school math word problems, split into 7.5K training problems and 1K test problems. These problems require 2 to 8 steps to solve, with solutions primarily involving a series of basic arithmetic operations (+ - \u002F *) to reach the final answer; scoring is typically done by exact match on the extracted final answer (see the sketch after this table) |
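\n\nFor GSM8K-style word problems (see the GSM8K row above), scoring usually means extracting the final number from a generation and comparing it with the reference. A minimal sketch, assuming the model is prompted to end its solution with the '#### <answer>' delimiter used in the dataset's reference solutions; the helper names are illustrative, not part of the dataset's tooling:\n\n```python\n# Minimal sketch of exact-match scoring for GSM8K-style problems.\n# Assumes solutions end with '#### <number>', as in the reference data.\n\ndef extract_answer(text):\n    if '####' not in text:\n        return None\n    tail = text.rsplit('####', 1)[1].strip()\n    return tail.split()[0].replace(',', '').rstrip('.') if tail else None\n\ndef exact_match(generations, references):\n    hits = sum(\n        extract_answer(g) is not None and extract_answer(g) == extract_answer(r)\n        for g, r in zip(generations, references)\n    )\n    return hits \u002F len(references)\n\nprint(exact_match(['24 + 48 = 72, so the answer is #### 72'], ['24 + 48 = 72 #### 72']))\n```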
\n\n### RAG-Evaluation\n\n| Name | Institution | URL | Introduction |\n| :--: | :--: | :--: | :-- |\n| BERGEN | NAVER | [BERGEN](https:\u002F\u002Fgithub.com\u002Fnaver\u002Fbergen) | BERGEN (BEnchmarking Retrieval-augmented GENeration) is a benchmarking library for RAG systems focusing on question answering (QA). Inconsistent benchmarking poses a major challenge in comparing approaches and understanding the impact of each component in a RAG pipeline; BERGEN eases the reproducibility and integration of new datasets and models through HuggingFace (2024-05-31) |\n| CRAG | Meta Reality Labs | [CRAG](https:\u002F\u002Farxiv.org\u002Fabs\u002F2406.04744) | CRAG is a factual question-answering benchmark of 4,409 QA pairs plus mock APIs that simulate web and Knowledge Graph (KG) search, designed to inspire researchers to improve the reliability and accuracy of QA systems. It encapsulates a diverse array of questions across five domains and eight question categories, reflecting entity popularity from popular to long-tail and temporal dynamism ranging from years to seconds (2024-06-07) |\n| raga-llm-hub | RAGA-AI | [raga-llm-hub](https:\u002F\u002Fgithub.com\u002Fraga-ai-hub\u002Fraga-llm-hub) | raga-llm-hub is a comprehensive evaluation toolkit for large language models (LLMs). With over 100 carefully designed evaluation metrics, it allows developers and organizations to effectively evaluate and compare LLMs and to establish basic safeguards for LLM and retrieval-augmented generation (RAG) applications. These tests assess various aspects such as relevance and understanding, content quality, hallucination, safety and bias, context relevance, safeguards, and vulnerability scanning, while providing a series of metric-based tests for quantitative analysis (2024-03-10) |\n| ARES | Stanford | [ARES](https:\u002F\u002Fgithub.com\u002Fstanford-futuredata\u002FARES) | ARES is an automatic evaluation framework for retrieval-augmented generation systems, comprising three components: (1) a set of annotated query-document-answer triplets with human preference validations for evaluation criteria such as context relevance, answer faithfulness, and\u002For answer relevance (at least 50 examples, but preferably several hundred); (2) a small set of examples for scoring context relevance, answer faithfulness, and\u002For answer relevance in your system; (3) a large number of unannotated query-document-answer triplets generated by your RAG system for scoring. The ARES training process includes three steps: (1) generating synthetic queries and answers from domain-specific paragraphs; (2) fine-tuning LLM evaluators for scoring RAG systems by training on the synthetic data; (3) deploying the prepared LLM evaluators to assess the performance of your RAG system on key metrics (2023-09-27) |\n| RGB | CAS | [RGB](https:\u002F\u002Fgithub.com\u002Fchen700564\u002FRGB) | RGB is a new benchmark corpus for evaluating RAG in English and Chinese. It analyzes the performance of different large language models on the four basic capabilities required for RAG: noise robustness, negative rejection, information integration, and counterfactual robustness. RGB divides the instances in the benchmark into four independent test sets based on these basic capabilities. Six representative LLMs were then evaluated on RGB to diagnose the challenges current LLMs face when applying RAG. The evaluation shows that while LLMs demonstrate a certain level of noise robustness, they still face significant difficulties in negative rejection, information integration, and handling false information. 
The above evaluation results indicate that there is still a long way to go in effectively applying RAG to LLMs (2023-09-04) |\n| tvalmetrics | TonicAI | [tvalmetrics](https:\u002F\u002Fgithub.com\u002FTonicAI\u002Ftvalmetrics) | The metrics in Tonic Validate Metrics use LLM-assisted evaluation, meaning they use an LLM (e.g., gpt-4) to score different aspects of RAG application outputs and answer questions about a RAG application: (1) Answer similarity score: how well does the RAG answer match the reference answer? (2) Retrieval precision: is the retrieved context relevant to the question? (3) Augmentation precision: does the answer contain retrieved context relevant to the question? (4) Augmentation accuracy: what proportion of the retrieved context appears in the answer? (5) Answer consistency (binary): does the answer contain any information outside the retrieved context? (6) Retrieval k-recall: for the top k context vectors, is the retrieved context a subset of the top k context vectors, and are all relevant contexts among the top k context vectors part of the retrieved context? (2023-11-11) |\n\n### Agent-Capabilities\n\n| Name | Institution | URL | Introduction |\n| :--: | :--: | :--: | :-- |\n| SuperCLUE-Agent | CLUE | [SuperCLUE-Agent](https:\u002F\u002Fgithub.com\u002FCLUEbenchmark\u002FSuperCLUE-Agent) | SuperCLUE-Agent is a multi-dimensional benchmark focusing on Agent capabilities, covering three core capabilities and ten basic tasks. It can be used to evaluate the performance of large language models in core Agent capabilities, including tool usage, task planning, and long- and short-term memory (a toy tool-invocation check is sketched after this table). Evaluation of 16 Chinese-supporting large language models found that GPT-4 leads significantly in core Agent capabilities for Chinese tasks, while representative domestic models, both open-source and closed-source, are approaching the level of GPT-3.5 (2023-10-20) |\n| AgentBench | Tsinghua University | [AgentBench](https:\u002F\u002Fgithub.com\u002FTHUDM\u002FAgentBench) | AgentBench is a systematic benchmark evaluation tool for assessing LLMs as intelligent agents, highlighting the performance gap between commercial LLMs and open-source competitors (2023-08-01) |\n| AgentBench Reasoning and Decision-making Evaluation Leaderboard | THUDM | [AgentBench](https:\u002F\u002Fgithub.com\u002FTHUDM\u002FAgentBench) | Jointly launched by Tsinghua and multiple universities, it covers the reasoning and decision-making capabilities of models in different task environments, such as shopping, home, and operating systems |\n| ToolBench Tool Invocation Evaluation | Zhiyuan\u002FTsinghua | [ToolBench](https:\u002F\u002Fgithub.com\u002FOpenBMB\u002FToolBench) | Compares tool-fine-tuned models with ChatGPT and provides evaluation scripts |
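\n\nAgent benchmarks such as SuperCLUE-Agent and ToolBench above typically check whether a model invokes the right tool with the right arguments. The toy check below assumes the model emits a JSON tool call that can be compared against a reference call; the format and helper names are hypothetical, not any benchmark's actual harness:\n\n```python\nimport json\n\n# Toy tool-invocation check: parse a model's JSON tool call and compare it\n# with a reference call. Malformed output counts as a failed invocation.\n\ndef call_matches(predicted, reference):\n    try:\n        call = json.loads(predicted)\n    except json.JSONDecodeError:\n        return False\n    return call.get('tool') == reference['tool'] and call.get('args') == reference['args']\n\nreference = {'tool': 'search', 'args': {'query': 'weather in Beijing'}}\nprediction = '{\"tool\": \"search\", \"args\": {\"query\": \"weather in Beijing\"}}'\nprint(call_matches(prediction, reference))\n```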
\n\n### Code-Capabilities\n\n| Name | Institution | URL | Introduction |\n| :--: | :--: | :--: | :-- |\n| McEval | Beihang | [McEval](https:\u002F\u002Fgithub.com\u002FMCEVAL\u002FMcEval) | To more comprehensively explore the code capabilities of large language models, this work proposes McEval, a large-scale multi-lingual, multi-task code evaluation benchmark covering 40 programming languages with 16,000 test samples, substantially pushing the limits of code LLMs in multilingual scenarios. The benchmark includes challenging code completion, understanding, and generation tasks, together with the finely curated, massively multilingual instruction corpus McEval-Instruct. Evaluation results show that open-source models still have significant gaps compared to GPT-4 in multi-lingual programming capabilities, with most open-source models unable to surpass even GPT-3.5; still, open-source models such as Codestral, DeepSeek-Coder, CodeQwen, and some of their derivatives exhibit excellent multi-lingual capabilities. The McEval leaderboard can be found [here](https:\u002F\u002Fmceval.github.io\u002F) (2024-06-11) |\n| HumanEval-XL | FloatAI | [HumanEval-XL](https:\u002F\u002Fgithub.com\u002FFloatAI\u002FHumanEval-XL) | Existing benchmarks primarily focus on translating English prompts into multi-lingual code or are limited to very restricted natural languages. These benchmarks overlook the broad field of large-scale multi-lingual NL to multi-lingual code generation, leaving an important gap in evaluating multi-lingual LLMs. To address this challenge, the authors propose HumanEval-XL, a large-scale multi-lingual code generation benchmark aimed at filling this gap. HumanEval-XL establishes connections between 23 natural languages and 12 programming languages, comprising 22,080 prompts with an average of 8.33 test cases per prompt. By ensuring parallel data across multiple NLs and PLs, HumanEval-XL provides a comprehensive evaluation platform for multi-lingual LLMs, enabling the assessment of understanding of different NLs. This work represents a pioneering step in addressing the gap in NL generalization evaluation for multi-lingual code generation (2024-02-26) |\n| DebugBench | Tsinghua University | [DebugBench](https:\u002F\u002Fgithub.com\u002Fthunlp\u002FDebugBench) | DebugBench is an LLM debugging benchmark comprising 4,253 instances, covering four major bug categories and 18 minor categories in C++, Java, and Python. To construct DebugBench, the authors collected code snippets from the LeetCode community, implanted bugs into the source data using GPT-4, and enforced strict quality checks. Benchmarks in this family are typically scored with pass@k over executed test cases; a minimal estimator is sketched after this table (2024-01-09) |
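\n\nCode benchmarks in this family are usually scored with pass@k over executed unit tests. Below is a minimal sketch of the unbiased pass@k estimator introduced with HumanEval (n samples per problem, of which c pass); determining c still requires running each benchmark's own tests in a sandbox:\n\n```python\nimport math\n\n# Unbiased pass@k estimator: pass@k = 1 - C(n - c, k) \u002F C(n, k),\n# where n generations were sampled per problem and c passed all tests.\n\ndef pass_at_k(n, c, k):\n    if n - c < k:\n        return 1.0  # every size-k subset contains a passing sample\n    return 1.0 - math.comb(n - c, k) \u002F math.comb(n, k)\n\n# Hypothetical example: 200 samples per problem, 37 passing\nprint(pass_at_k(200, 37, 1), pass_at_k(200, 37, 10))\n```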
\n\n### Multi-modal\u002FCross-modal\n\n| Name | Institution | URL | Introduction |\n| :--: | :--: | :--: | :-- |\n| ChartVLM | Shanghai AI Lab | [ChartVLM](https:\u002F\u002Fgithub.com\u002FUniModal4Reasoning\u002FChartVLM) | ChartX is a multi-modal evaluation set comprising 18 chart types, 7 chart tasks, 22 subject themes, and high-quality chart data. The authors have also developed ChartVLM, offering a new perspective for handling multi-modal tasks that depend on explainable patterns, such as reasoning tasks over charts or geometric images (2024-02-19) |\n| ReForm-Eval | FudanDISC | [ReForm-Eval](https:\u002F\u002Fgithub.com\u002FFudanDISC\u002FReForm-Eval) | ReForm-Eval is a benchmark dataset for comprehensively evaluating large visual language models. By reconstructing existing multi-modal benchmark datasets into different task formats, ReForm-Eval constructs a benchmark dataset with a unified format suitable for large model evaluation. The constructed ReForm-Eval has the following features: it spans eight evaluation dimensions, providing sufficient evaluation data for each dimension (averaging over 4,000 entries per dimension); it has a unified evaluation question format (including multiple-choice and text generation questions); it is convenient and easy to use, with reliable and efficient evaluation methods that do not rely on external services like ChatGPT; and it efficiently utilizes existing data resources without requiring additional manual annotation and can be further expanded to more datasets (2023-10-24) |\n| LVLM-eHub | OpenGVLab | [LVLM-eHub](https:\u002F\u002Fgithub.com\u002FOpenGVLab\u002FMulti-Modality-Arena) | \"Multi-Modality Arena\" is an evaluation platform for large multi-modal models. Following FastChat, two anonymous models are compared side-by-side on visual question answering tasks. The arena allows side-by-side benchmarking of visual-language models while providing image input, and supports models such as MiniGPT-4, LLaMA-Adapter V2, LLaVA, and BLIP-2 |\n\n### Long Context\n\n| Name | Institution | URL | Introduction |\n| :--: | :--: | :--: | :-- |\n| InfiniteBench | OpenBMB | [InfiniteBench](https:\u002F\u002Fgithub.com\u002FOpenBMB\u002FInfiniteBench) | Understanding and processing long text is an essential capability for large models to advance to a deeper level of understanding and interaction. While some large models claim to handle sequences of 100k+ tokens, there is a lack of standardized benchmark datasets. InfiniteBench addresses this by constructing a benchmark for sequences exceeding 100k tokens, focusing on five key capabilities of large models in handling long text: retrieval, mathematics, coding, question answering, and summarization (a toy retrieval-style probe is sketched after this table). (1) Long Context: the average context length in InfiniteBench test data is 195k, far exceeding existing benchmarks. (2) Multi-domain and Multi-language: the benchmark includes 12 tasks in both Chinese and English, covering the five domains mentioned above. (3) Forward-looking and Challenging: the tasks in InfiniteBench are designed to match the capabilities of the strongest current models such as GPT-4 and Claude 2. (4) Realistic and Synthetic Scenarios: InfiniteBench incorporates both real-world data, to test the model’s ability to handle practical problems, and synthetic data, to facilitate expanding context windows for testing. InfiniteBench is the first LLM benchmark featuring an average data length surpassing 100K tokens; its tasks require a thorough understanding of long dependencies in context, so simply retrieving a limited number of passages is insufficient. (2024-03-19) |
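\n\nLong-context suites like InfiniteBench above include retrieval-style probes. The toy sketch below plants a 'needle' fact in synthetic filler text and checks recall; ask_model stands in for any LLM call, and the real benchmark uses curated tasks rather than filler like this:\n\n```python\nimport random\n\n# Toy needle-in-a-haystack probe for long-context retrieval.\n# ask_model is a placeholder for an arbitrary LLM call.\n\ndef build_prompt(needle, n_filler=5000, seed=0):\n    random.seed(seed)\n    sentences = ['The harbor was quiet and nothing of note happened.'] * n_filler\n    sentences.insert(random.randrange(n_filler), needle)\n    return ' '.join(sentences) + ' What is the secret number mentioned above? Answer:'\n\ndef needle_recall(ask_model, secret='73912'):\n    prompt = build_prompt('The secret number is ' + secret + '.')\n    return secret in ask_model(prompt)\n\n# Dummy model that just searches its own prompt:\nprint(needle_recall(lambda prompt: '73912' if '73912' in prompt else 'unknown'))\n```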
\n\n### Reasoning Speed\n\n| Name | Institution | URL | Introduction |\n| :--: | :--: | :--: | :-- |\n| llmperf | Ray | [llmperf](https:\u002F\u002Fgithub.com\u002Fray-project\u002Fllmperf) | A library for inspecting and benchmarking LLM performance. It measures metrics such as Time to First Token (TTFT), Inter-Token Latency (ITL), and the number of requests with no data returned within 3 seconds (a minimal measurement sketch follows this table). It also validates the correctness of LLM outputs, primarily checking for cross-request mix-ups (e.g., Request A receiving the response of Request B). Variations in input and output token lengths are considered in the design to better reflect real-world scenarios. Currently supported endpoints include OpenAI-compatible endpoints (e.g., Anyscale endpoints, private endpoints, OpenAI, Fireworks, etc.), Together, Vertex AI, and SageMaker. (2023-11-03) |\n| llm-analysis | Databricks | [llm-analysis](https:\u002F\u002Fgithub.com\u002Fcli99\u002Fllm-analysis) | Latency and memory analysis of Transformer models for training and inference. |\n| llm-inference-benchmark | Nankai University | [llm-inference-benchmark](https:\u002F\u002Fgithub.com\u002Fninehills\u002Fllm-inference-benchmark) | LLM inference framework benchmark. |\n| llm-inference-bench | CentML | [llm-inference-bench](https:\u002F\u002Fgithub.com\u002FCentML\u002Fllm-inference-bench) | This benchmark operates entirely externally to any serving framework and can be easily extended and modified. It provides a variety of statistics and profiling modes. Designed as a standalone tool, it enables precise benchmarking with statistically significant results for specific input\u002Foutput distributions. Each request consists of a single prompt and a single decoding step. |\n| GPU-Benchmarks-on-LLM-Inference | UIUC | [GPU-Benchmarks-on-LLM-Inference](https:\u002F\u002Fgithub.com\u002FXiongjieDai\u002FGPU-Benchmarks-on-LLM-Inference) | Uses llama.cpp to test the inference speed of LLaMA models on different GPUs and Apple hardware, including RunPod, a 16-inch M1 Max MacBook Pro, an M2 Ultra Mac Studio, a 14-inch M3 MacBook Pro, and a 16-inch M3 Max MacBook Pro. |
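\n\nThe TTFT and ITL metrics reported by llmperf above reduce to timestamping a streaming response. A minimal sketch, assuming a hypothetical stream_tokens generator that yields tokens as the server streams them; llmperf's real harness adds concurrency, percentiles, and correctness checks:\n\n```python\nimport time\n\n# Minimal TTFT \u002F inter-token-latency measurement over a streaming endpoint.\n# stream_tokens is a hypothetical generator yielding tokens as they arrive.\n\ndef measure_stream(stream_tokens, prompt):\n    start = time.perf_counter()\n    arrivals = [time.perf_counter() for _ in stream_tokens(prompt)]\n    if not arrivals:\n        return {'ttft_s': None, 'itl_s': None, 'tokens': 0}\n    gaps = [b - a for a, b in zip(arrivals, arrivals[1:])]\n    return {\n        'ttft_s': arrivals[0] - start,\n        'itl_s': sum(gaps) \u002F len(gaps) if gaps else 0.0,\n        'tokens': len(arrivals),\n    }\n\ndef fake_stream(prompt):\n    for token in ['Hello', ',', ' ', 'world', '!']:\n        time.sleep(0.01)\n        yield token\n\nprint(measure_stream(fake_stream, 'hi'))\n```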
\n\n### Quantization-and-Compression\n\n| Name | Institution | URL | Introduction |\n| :--: | :--: | :--: | :-- |\n| LLM-QBench | Beihang\u002FSenseTime | [LLM-QBench](https:\u002F\u002Fgithub.com\u002FModelTC\u002Fllmc) | LLM-QBench is a benchmark for post-training quantization of large language models and serves as an efficient LLM compression tool offering various advanced compression methods. It supports multiple inference backends. (2024-05-09) |\n\n\n## Demos\n\n- [Chat Arena: anonymous models side-by-side and vote for which one is better](https:\u002F\u002Fchat.lmsys.org\u002F?arena) - An open-source AI LLM \"anonymous\" arena! Here, you can become a judge, score two model responses without knowing their identities, and after scoring, the true identities of the models will be revealed. Participants include Vicuna, Koala, OpenAssistant (oasst), Dolly, ChatGLM, StableLM, Alpaca, LLaMA, and more.\n\n![image](https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Fonejune2018_Awesome-LLM-Eval_readme_fd1bbf5cb523.png)\n\n## Leaderboards\n\n| Platform | Access |\n| :---: | --- |\n| ACLUE | [[Source](https:\u002F\u002Fgithub.com\u002Fisen-zhang\u002FACLUE)] |\n| AgentBench | [[Source](https:\u002F\u002Fllmbench.ai\u002Fagent)] |\n| AlpacaEval | [[Source](https:\u002F\u002Ftatsu-lab.github.io\u002Falpaca_eval\u002F)] |\n| ANGO | [[Source](https:\u002F\u002Fhuggingface.co\u002Fspaces\u002FAngoHF\u002FANGO-Leaderboard)] |\n| BeHonest | [[Source](https:\u002F\u002Fgair-nlp.github.io\u002FBeHonest\u002F#leaderboard)] |\n| Big Code Models Leaderboard | [[Source](https:\u002F\u002Fhuggingface.co\u002Fspaces\u002Fbigcode\u002Fbigcode-models-leaderboard)] |\n| Chatbot Arena | [[Source](https:\u002F\u002Flmarena.ai\u002F?leaderboard)] |\n| Chinese Large Model Leaderboard | [[Source](https:\u002F\u002Fgithub.com\u002Fjeinlee1991\u002Fchinese-llm-benchmark)] |\n| CLEVA | [[Source](http:\u002F\u002Fwww.lavicleva.com\u002F)] |\n| CompassRank | [[Source](https:\u002F\u002Frank.opencompass.org.cn\u002F)] |\n| CompMix | [[Source](https:\u002F\u002Fqa.mpi-inf.mpg.de\u002Fcompmix)] |\n| C-Eval | [[Source](https:\u002F\u002Fcevalbenchmark.com\u002F)] |\n| DreamBench++ | [[Source](https:\u002F\u002Fdreambenchplus.github.io\u002F#leaderboard)] |\n| FELM | [[Source](https:\u002F\u002Fhkust-nlp.github.io\u002Ffelm)] |\n| FlagEval | [[Source](https:\u002F\u002Fflageval.baai.ac.cn\u002F)] |\n| Hallucination Leaderboard | [[Source](https:\u002F\u002Fgithub.com\u002Fvectara\u002Fhallucination-leaderboard)] |\n| HELM | [[Source](https:\u002F\u002Fcrfm.stanford.edu\u002Fhelm\u002F)] |\n| Huggingface Open LLM Leaderboard | [[Source](https:\u002F\u002Fhuggingface.co\u002Fspaces\u002Fopen-llm-leaderboard\u002Fopen_llm_leaderboard)] |\n| Huggingface LLM Perf Leaderboard | [[Source](https:\u002F\u002Fhuggingface.co\u002Fspaces\u002Foptimum\u002Fllm-perf-leaderboard)] |\n| Indico LLM Leaderboard | [[Source](https:\u002F\u002Findicodata.ai\u002Fllm)] |\n| InfiBench | [[Source](https:\u002F\u002Finfi-coder.github.io\u002Finfibench)] |\n| InterCode | [[Source](https:\u002F\u002Fintercode-benchmark.github.io\u002F)] |\n| LawBench | [[Source](https:\u002F\u002Flawbench.opencompass.org.cn\u002Fleaderboard)] |\n| LLMEval | [[Source](http:\u002F\u002Fllmeval.com)] |\n| LLM Rankings | [[Source](https:\u002F\u002Fopenrouter.ai\u002Frankings)] |
\n| LLM Use Case Leaderboard | [[Source](https:\u002F\u002Fllmleaderboard.goml.io)] |\n| LucyEval | [[Source](http:\u002F\u002Flucyeval.besteasy.com\u002F)] |\n| M3CoT | [[Source](https:\u002F\u002Flightchen233.github.io\u002Fm3cot.github.io\u002Fleaderboard.html)] |\n| MMLU by Task Leaderboard | [[Source](https:\u002F\u002Fhuggingface.co\u002Fspaces\u002FCoreyMorris\u002FMMLU-by-task-Leaderboard)] |\n| MMToM-QA | [[Source](https:\u002F\u002Fchuanyangjin.com\u002Fmmtom-qa-leaderboard)] |\n| MathEval | [[Source](https:\u002F\u002Fmatheval.ai\u002F)] |\n| OlympicArena | [[Source](https:\u002F\u002Fgair-nlp.github.io\u002FOlympicArena\u002F#leaderboard)] |\n| OpenEval | [[Source](http:\u002F\u002Fopeneval.org.cn\u002F#\u002Frank)] |\n| Open Multilingual LLM Eval | [[Source](https:\u002F\u002Fhuggingface.co\u002Fspaces\u002Fuonlp\u002Fopen_multilingual_llm_leaderboard)] |\n| PubMedQA | [[Source](https:\u002F\u002Fpubmedqa.github.io\u002F)] |\n| SafetyBench | [[Source](https:\u002F\u002Fllmbench.ai\u002Fsafety)] |\n| SciBench | [[Source](https:\u002F\u002Fscibench-ucla.github.io\u002F#leaderboard)] |\n| SciKnowEval | [[Source](https:\u002F\u002Fgithub.com\u002FHICAI-ZJU\u002FSciKnowEval)] |\n| SEED-Bench | [[Source](https:\u002F\u002Fhuggingface.co\u002Fspaces\u002FAILab-CVC\u002FSEED-Bench_Leaderboard)] |\n| SuperBench | [[Source](https:\u002F\u002Ffm.ai.tsinghua.edu.cn\u002Fsuperbench\u002F#\u002Fleaderboard)] |\n| SuperCLUE | [[Source](https:\u002F\u002Fwww.superclueai.com\u002F)] |\n| SuperGLUE | [[Source](https:\u002F\u002Fsuper.gluebenchmark.com\u002F)] |\n| SuperLim | [[Source](https:\u002F\u002Flab.kb.se\u002Fleaderboard\u002Fresults)] |\n| TAT-DQA | [[Source](https:\u002F\u002Fnextplusplus.github.io\u002FTAT-DQA)] |\n| TAT-QA | [[Source](https:\u002F\u002Fnextplusplus.github.io\u002FTAT-QA)] |\n| TheoremOne LLM Benchmarking Metrics | [[Source](https:\u002F\u002Fllm-evals.formula-labs.com\u002F)] |\n| Toloka | [[Source](https:\u002F\u002Ftoloka.ai\u002Fllm-leaderboard\u002F)] |\n| Toolbench | [[Source](https:\u002F\u002Fhuggingface.co\u002Fspaces\u002Fqiantong-xu\u002Ftoolbench-leaderboard)] |\n| VisualWebArena | [[Source](https:\u002F\u002Fjykoh.com\u002Fvwa)] |\n| We-Math | [[Source](https:\u002F\u002Fwe-math.github.io\u002F#leaderboard)] |\n| WHOOPS! 
| [[Source](https:\u002F\u002Fwhoops-benchmark.github.io)]               |\n\n\n\n### Leaderboards for popular Provider (performance and cost, 2024-05-14)\n\n| Provider (link to pricing)                                   | [OpenAI](https:\u002F\u002Fopenai.com\u002Fpricing) | [OpenAI](https:\u002F\u002Fopenai.com\u002Fpricing) | [Anthropic](https:\u002F\u002Fwww.anthropic.com\u002Fapi) | [Google](https:\u002F\u002Fcloud.google.com\u002Fvertex-ai\u002Fgenerative-ai\u002Fpricing) | [Replicate](https:\u002F\u002Freplicate.com\u002Fpricing)                   | [DeepSeek](https:\u002F\u002Fwww.deepseek.com\u002F)                        | [Mistral](https:\u002F\u002Fdocs.mistral.ai\u002Fplatform\u002Fpricing\u002F)         | [Anthropic](https:\u002F\u002Fwww.anthropic.com\u002Fapi) | [Google](https:\u002F\u002Fcloud.google.com\u002Fvertex-ai\u002Fgenerative-ai\u002Fpricing) | [Mistral](https:\u002F\u002Fdocs.mistral.ai\u002Fplatform\u002Fpricing\u002F) | [Cohere](https:\u002F\u002Fcohere.com\u002Fcommand)                         | [Anthropic](https:\u002F\u002Fwww.anthropic.com\u002Fapi) | [Mistral](https:\u002F\u002Fdocs.mistral.ai\u002Fplatform\u002Fpricing\u002F) | [Replicate](https:\u002F\u002Freplicate.com\u002Fpricing)                   | [Mistral](https:\u002F\u002Fdocs.mistral.ai\u002Fplatform\u002Fpricing\u002F)         | [OpenAI](https:\u002F\u002Fopenai.com\u002Fpricing) |      | [Groq](https:\u002F\u002Fwow.groq.com\u002F) | [OpenAI](https:\u002F\u002Fopenai.com\u002Fpricing) | [Mistral](https:\u002F\u002Fdocs.mistral.ai\u002Fplatform\u002Fpricing\u002F) | [Anthropic](https:\u002F\u002Fwww.anthropic.com\u002Fapi) | [Groq](https:\u002F\u002Fwow.groq.com\u002F) | [Anthropic](https:\u002F\u002Fwww.anthropic.com\u002Fapi) | [Anthropic](https:\u002F\u002Fwww.anthropic.com\u002Fapi) | Microsoft                                                    | Microsoft                                                    | [Mistral](https:\u002F\u002Fdocs.mistral.ai\u002Fplatform\u002Fpricing\u002F) |\n| ------------------------------------------------------------ | ------------------------------------ | ------------------------------------ | ------------------------------------------ | ------------------------------------------------------------ | ------------------------------------------------------------ | ------------------------------------------------------------ | ------------------------------------------------------------ | ------------------------------------------ | ------------------------------------------------------------ | ---------------------------------------------------- | ------------------------------------------------------------ | ------------------------------------------ | ---------------------------------------------------- | ------------------------------------------------------------ | ------------------------------------------------------------ | ------------------------------------ | ---- | ----------------------------- | ------------------------------------ | ---------------------------------------------------- | ------------------------------------------ | ----------------------------- | ------------------------------------------ | ------------------------------------------ | ------------------------------------------------------------ | ------------------------------------------------------------ | ---------------------------------------------------- |\n| Model name                                                   | GPT-4o                               | GPT-4 Turbo    
| Claude 3 Opus | Gemini 1.5 Pro | [Llama 3 70B](https://huggingface.co/meta-llama/Meta-Llama-3-70B-Instruct) | [DeepSeek-V2](https://huggingface.co/deepseek-ai/DeepSeek-V2-Chat) | [Mixtral 8x22B](https://huggingface.co/mistralai/Mixtral-8x22B-Instruct-v0.1) | Claude 3 Sonnet | Gemini 1.5 Flash | Mistral Large | [Command R+](https://huggingface.co/CohereForAI/c4ai-command-r-plus) | Claude 3 Haiku | Mistral Small | [Llama 3 8B](https://huggingface.co/meta-llama/Meta-Llama-3-8B-Instruct) | [Mixtral 8x7B](https://huggingface.co/mistralai/Mixtral-8x7B-Instruct-v0.1) | GPT-3.5 Turbo | | Llama 3 70B (Groq) | GPT-4 | Mistral Medium | Claude 2.0 | Mixtral 8x7B (Groq) | Claude 2.1 | Claude Instant | [Phi-3-Medium 4k](https://huggingface.co/microsoft/Phi-3-medium-4k-instruct) | [Phi-3-Small 8k](https://huggingface.co/microsoft/Phi-3-small-8k-instruct) | Mistral 7B |
| --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- |
| Column Last Updated | 5/14/2024 | 5/14/2024 | 5/14/2024 | 5/14/2024 | 5/20/2024 | 5/20/2024 | 5/20/2024 | 5/14/2024 | 5/14/2024 | 5/20/2024 | 5/20/2024 | 5/14/2024 | 5/20/2024 | 5/21/2024 | 5/14/2024 | 5/14/2024 | | 5/14/2024 | 5/14/2024 | 5/20/2024 | 5/14/2024 | 5/14/2024 | 5/14/2024 | 5/14/2024 | 5/21/2024 | 5/21/2024 | 5/22/2024 |
| **CAPABILITY** |
| Artificial Analysis Index | 100 | 94 | 94 | 88 | 88 | 82 | 81 | 78 | 76 | 75 | 74 | 72 | 71 | 65 | 65 | 65 | | 88 | 83 | 73 | 69 | 65 | 63 | 63 | | | 39 |
| LMSys Chatbot Arena ELO | 1310 | 1257 | 1256 | 1249 | 1208 | | | 1204 | | 1158 | 1193 | 1182 | | 1154 | 1114 | 1102 | | 1208 | 1189 | 1148 | 1126 | 1114 | 1115 | 1104 | | | 1006 |
| **General knowledge:** |
| MMLU | 88.70 | 86.40 | 86.80 | 81.90 | 82.00 | 78.50 | 77.75 | 79.00 | 78.90 | 81.20 | 75.70 | 75.20 | 72.20 | 68.40 | 70.60 | 70.00 | | 82.00 | 86.40 | 75.30 | 78.50 | 70.60 | | 73.40 | 78.00 | 75.70 | 62.50 |
| **Math:** |
| MATH | 76.60 | 73.40 | 60.10 | 58.50 | 50.40 | | | 43.10 | 54.90 | 45.00 | | 38.90 | | 30.00 | | 34.10 | | 50.40 | 52.90 | | | | | | | | |
| MGSM / GSM8K | 90.50 | 88.60 | 95.00 | | 93.00 | | | 92.30 | | | | 88.90 | | 79.60 | | 57.10 | | 93.00 | 92.00 | | | | | | 91.00 | 89.60 | |
| **Reasoning:** |
| GPQA | 53.60 | 49.10 | 50.40 | 41.50 | 39.50 | | | 40.40 | 39.50 | | | 33.30 | | 34.20 | | 28.10 | | 39.50 | 35.70 | | | | | | | | |
| BIG-BENCH-HARD | | | 86.80 | 84.00 | | | | 82.90 | 85.50 | | | 73.70 | | | | 66.60 | | | 83.10 | | | | | | | | |
| DROP, F1 Score | 83.40 | 85.40 | 83.10 | | | | | 78.90 | | | | 78.40 | | | | 64.10 | | | 80.90 | | | | | | | | |
| HellaSwag | | | 95.40 | | | | | 89.00 | | 89.20 | | 85.90 | 86.90 | | 86.70 | 85.50 | | | 95.30 | 88.00 | | 86.70 | | | 82.40 | 77.00 | |
| **Code:** |
| HumanEval | 90.20 | 87.60 | 84.90 | 71.90 | 81.70 | | | 73.00 | | | | 75.90 | | 62.20 | | 48.10 | | 81.70 | 67.00 | | | | | | 62.20 | 61.00 | |
| Natural2Code | | | | 77.70 | | | | | 77.20 | | | | | | | | | | | | | | | | | | |
| **Conversational:** |
| MT Bench | | 93.20 | | | | | | | | | | | | | 83.00 | 83.90 | | | | 86.10 | 80.60 | 83.00 | 81.80 | 78.50 | | | 68.40 |
| Benchmark Avg (not useful: selection bias has a significant impact) | 80.50 | 80.53 | 80.31 | 69.25 | 69.32 | 78.50 | 77.75 | 72.33 | 67.20 | 71.80 | 75.70 | 68.78 | 79.55 | 54.88 | 80.10 | 59.72 | | 69.32 | 74.16 | 83.13 | 79.55 | 80.10 | 81.80 | 75.95 | 78.40 | 75.83 | 65.45 |
| **THROUGHPUT** |
| Throughput (median tokens/sec) | 90.40 | 21.10 | 25.60 | 46.20 | 26.30 | 15.60 | 76.50 | 61.30 | 161.70 | 32.30 | 42.20 | 116.70 | 80.70 | 75.40 | 60.00 | 58.60 | | 305.30 | 28.30 | 18.30 | 37.20 | 477.10 | 42.10 | 85.70 | | | 62.20 |
| Throughput (median seconds per 1K tokens) | 11.06 | 47.39 | 39.06 | 21.65 | 38.02 | 64.10 | 13.07 | 16.31 | 6.18 | 30.96 | 23.70 | 8.57 | 12.39 | 13.26 | 16.67 | 17.06 | | 3.28 | 35.34 | 54.64 | 26.88 | 2.10 | 23.75 | 11.67 | | | 16.08 |
| **COST** |
| Cost Input (per 1M tokens, i.e. "context window" tokens) | $5.00 | $10.00 | $15.00 | $7.00 | $0.65 | $0.14 | $2.00 | $3.00 | $0.35 | $4.00 | $3.00 | $0.25 | $1.00 | $0.05 | $0.70 | $0.50 | | $0.59 | $30.00 | $2.70 | $8.00 | $0.27 | $8.00 | $0.80 | | | $0.25 |
| Cost Output (per 1M tokens) | $15.00 | $30.00 | $75.00 | $21.00 | $2.75 | $0.28 | $6.00 | $15.00 | $0.53 | $12.00 | $15.00 | $1.25 | $3.00 | $0.25 | $0.70 | $1.50 | | $0.79 | $60.00 | $8.10 | $24.00 | $0.27 | $24.00 | $2.40 | | | $0.25 |
| Cost 1M Input + 1M Output tokens | $20.00 | $40.00 | $90.00 | $28.00 | $3.40 | $0.42 | $8.00 | $18.00 | $0.88 | $16.00 | $18.00 | $1.50 | $4.00 | $0.30 | $1.40 | $2.00 | | $1.38 | $90.00 | $10.80 | $32.00 | $0.54 | $32.00 | $3.20 | | | $0.50 |
| **COST VS PERFORMANCE** |
| Cost 1M+1M IO tokens per AA Index point | $0.20 | $0.43 | $0.96 | $0.32 | $0.04 | $0.01 | $0.10 | $0.23 | $0.01 | $0.21 | $0.24 | $0.02 | $0.06 | $0.00 | $0.02 | $0.03 | | $0.02 | $1.08 | $0.15 | $0.46 | $0.01 | $0.51 | $0.05 | | | $0.01 |
| Cost 1M+1M IO tokens per Chatbot ELO point | $0.02 | $0.03 | $0.07 | $0.02 | $0.00 | n/a | n/a | $0.01 | n/a | $0.01 | $0.02 | $0.00 | n/a | $0.00 | $0.00 | $0.00 | | $0.00 | $0.08 | $0.01 | $0.03 | $0.00 | $0.03 | $0.00 | | | $0.00 |
| Cost 1M+1M IO tokens per Throughput (tokens/sec) | $0.22 | $1.90 | $3.52 | $0.61 | $0.13 | $0.03 | $0.10 | $0.29 | $0.01 | $0.50 | $0.43 | $0.01 | $0.05 | $0.00 | $0.02 | $0.03 | | $0.00 | $3.18 | $0.59 | $0.86 | $0.00 | $0.76 | $0.04 | | | $0.01 |
| **SPECS** |
| Context Window (k) | 128 | 128 | 200 | 1,000 | 8 | 32 | 65 | 200 | 1,000 | 32 | 128 | 200 | 32 | 8 | 32 | 16 | | | 8 | 32 | 100 | 32 | 200 | 100 | 4 | 8 | 33 |
| Max Output Tokens (k) | 4 | 4 | 4 | 8 | | | | 4 | 8 | | | 4 | | | | | | | 4 | | 4 | | 4 | 4 | | | |
| Rate Limit (requests / minute) | tiered | tiered | tiered | 5 | 600 | | | tiered | 360 | | | tiered | | 600 | | tiered | | 30 | tiered | | tiered | 30 | tiered | tiered | | | |
| Rate Limit (requests / day) | tiered | tiered | tiered | 2,000 | | | | tiered | 10,000 | | | tiered | | | | tiered | | 14,400 | tiered | | tiered | 14,400 | tiered | tiered | | | |
| Rate Limit (tokens / minute) | tiered | tiered | tiered | 10,000,000 | | | | tiered | 10,000,000 | | | tiered | | | | tiered | | 6,000 | tiered | | tiered | 5,000 | tiered | tiered | | | |
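
The THROUGHPUT, COST, and COST VS PERFORMANCE blocks are straightforward derivations from the base rows: "seconds per 1K tokens" is the reciprocal of median tokens/sec, each cost-vs-performance figure is the combined 1M-input + 1M-output cost divided by the corresponding score, and the n/a cells mark models with no Arena ELO to divide by (the source spreadsheet exported these as #DIV/0!). A minimal sketch of that arithmetic, with three columns transcribed from the table; the `Model` dataclass is illustrative, not part of this repo:

```python
from dataclasses import dataclass

@dataclass
class Model:
    name: str
    cost_in: float      # $ per 1M input tokens
    cost_out: float     # $ per 1M output tokens
    aa_index: float     # Artificial Analysis Index
    elo: float | None   # Arena ELO; None where the table cell is blank
    tok_per_sec: float  # median throughput, tokens/sec

# Three columns transcribed from the table above.
MODELS = [
    Model("Claude 3 Opus", 15.00, 75.00, aa_index=94, elo=1256, tok_per_sec=25.6),
    Model("DeepSeek-V2",    0.14,  0.28, aa_index=82, elo=None, tok_per_sec=15.6),
    Model("Mistral 7B",     0.25,  0.25, aa_index=39, elo=1006, tok_per_sec=62.2),
]

for m in MODELS:
    io_cost = m.cost_in + m.cost_out   # "Cost 1M Input + 1M Output tokens"
    per_aa = io_cost / m.aa_index      # "Cost ... per AA Index point"
    # A missing ELO makes the ratio undefined; the source spreadsheet
    # exported those cells as #DIV/0! (shown as n/a in the table above).
    per_elo = None if m.elo is None else io_cost / m.elo
    sec_per_1k = 1000 / m.tok_per_sec  # reciprocal of median tokens/sec
    print(f"{m.name:13s}  IO=${io_cost:6.2f}"
          f"  ${per_aa:.2f}/AA pt"
          f"  {'n/a' if per_elo is None else f'${per_elo:.2f}'}/ELO pt"
          f"  {sec_per_1k:6.2f} s/1K tok")
```

Running it reproduces the rounded figures above, e.g. $90.00 / 94 ≈ $0.96 per AA Index point and 1000 / 25.6 ≈ 39.06 seconds per 1K tokens for Claude 3 Opus.
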
tiered                               | tiered                                     | 2,000                                                        |                                                              |                                                              |                                                              | tiered                                     | 10,000                                                       |                                                      |                                                              | tiered                                     |                                                      |                                                              |                                                              | tiered                               |      | 14,400                        | tiered                               |                                                      | tiered                                     | 14,400                        | tiered                                     | tiered                                     |                                                              |                                                              |                                                      |\n| Rate Limit (tokens \u002F minute)                                 | tiered                               | tiered                               | tiered                                     | 10,000,000                                                   |                                                              |                                                              |                                                              | tiered                                     | 10,000,000                                                   |                                                      |                                                              | tiered                                     |                                                      |                                                              |                                                              | tiered                               |      | 6,000                         | tiered                               |                                                      | tiered                                     | 5,000                         | tiered                                     | tiered                                     |                                                              |                                                              |                                                      |\n\n\u003Cbr>\u003Cbr>\n## Papers\n\n- [\u003Cimg src=https:\u002F\u002Fimg.shields.io\u002Fbadge\u002FEMNLP-2023-blue alt=\"img\" style=\"zoom:100%; vertical-align: middle\" \u002F>](https:\u002F\u002Farxiv.org\u002Fabs\u002F2310.07289) [**Beyond Factuality: A Comprehensive Evaluation of Large Language Models as Knowledge Generators**](https:\u002F\u002Farxiv.org\u002Fabs\u002F2310.07289),\u003Cbr> by *Liang Chen, Yang Deng, Yatao Bian et al.*\n  \u003Cbr>\u003Cbr>\n- [\u003Cimg src=https:\u002F\u002Fimg.shields.io\u002Fbadge\u002FCoRR-2023-blue alt=\"img\" style=\"zoom:100%; vertical-align: middle\" \u002F>](https:\u002F\u002Farxiv.org\u002Fabs\u002F2307.03109) [**A Closer Look into Automatic Evaluation Using Large Language Models**](https:\u002F\u002Fbrowse.arxiv.org\u002Fpdf\u002F2310.05657.pdf),\u003Cbr> by 
*Cheng-Han Chiang and Hung-yi Lee*\n  \u003Cbr>\u003Cbr>\n- [\u003Cimg src=https:\u002F\u002Fimg.shields.io\u002Fbadge\u002FCoRR-2023-blue alt=\"img\" style=\"zoom:100%; vertical-align: middle\" \u002F>](https:\u002F\u002Farxiv.org\u002Fabs\u002F2307.03109) [**A Survey on Evaluation of Large Language Models**](https:\u002F\u002Farxiv.org\u002Fabs\u002F2307.03109),\u003Cbr> by *Yupeng Chang, Xu Wang, Jindong Wang, Yuan Wu, Linyi Yang, Kaijie Zhu, Hao Chen, Xiaoyuan Yi et al.*\n  \u003Cbr>\u003Cbr>\n- [\u003Cimg src=https:\u002F\u002Fimg.shields.io\u002Fbadge\u002FCoRR-2023-blue alt=\"img\" style=\"zoom:100%; vertical-align: middle\" \u002F>](https:\u002F\u002Fwww.microsoft.com\u002Fen-us\u002Fresearch\u002Fpublication\u002Fgpteval-nlg-evaluation-using-gpt-4-with-better-human-alignment\u002F) [**G-Eval: NLG Evaluation using GPT-4 with Better Human Alignment**](https:\u002F\u002Fwww.microsoft.com\u002Fen-us\u002Fresearch\u002Fpublication\u002Fgpteval-nlg-evaluation-using-gpt-4-with-better-human-alignment\u002F),\u003Cbr> by *Yang Liu, Dan Iter, Yichong Xu, Shuohang Wang, Ruochen Xu, Chenguang Zhu*\n  \u003Cbr>\u003Cbr>\n- [\u003Cimg src=https:\u002F\u002Fimg.shields.io\u002Fbadge\u002FCoRR-2023-blue alt=\"img\" style=\"zoom:100%; vertical-align: middle\" \u002F>](https:\u002F\u002Fdoi.org\u002F10.48550\u002FarXiv.2302.04023) [**A Multitask, Multilingual, Multimodal Evaluation of ChatGPT on Reasoning,\n  Hallucination, and Interactivity**](https:\u002F\u002Fdoi.org\u002F10.48550\u002FarXiv.2302.04023),\u003Cbr> by *Yejin Bang, Samuel Cahyawijaya, Nayeon Lee, Wenliang Dai, Dan Su, Bryan Wilie, Holy Lovenia, Ziwei Ji et al.*\n  \u003Cbr>\u003Cbr>\n- [\u003Cimg src=https:\u002F\u002Fimg.shields.io\u002Fbadge\u002FCoRR-2023-blue alt=\"img\" style=\"zoom:100%; vertical-align: middle\" \u002F>](https:\u002F\u002Farxiv.org\u002Fabs\u002F2302.06476) [**Is ChatGPT a General-Purpose Natural Language Processing Task Solver?**](https:\u002F\u002Farxiv.org\u002Fabs\u002F2302.06476),\u003Cbr> by *Chengwei Qin, Aston Zhang, Zhuosheng Zhang, Jiaao Chen, Michihiro Yasunaga and Diyi Yang*\n  \u003Cbr>\u003Cbr>\n- [\u003Cimg src=https:\u002F\u002Fimg.shields.io\u002Fbadge\u002FCoRR-2023-blue alt=\"img\" style=\"zoom:100%; vertical-align: middle\" \u002F>](https:\u002F\u002Fdoi.org\u002F10.48550\u002FarXiv.2302.06466) [**ChatGPT versus Traditional Question Answering for Knowledge Graphs:\n  Current Status and Future Directions Towards Knowledge Graph Chatbots**](https:\u002F\u002Fdoi.org\u002F10.48550\u002FarXiv.2302.06466),\u003Cbr> by *Reham Omar, Omij Mangukiya, Panos Kalnis and Essam Mansour*\n  \u003Cbr>\u003Cbr>\n- [\u003Cimg src=https:\u002F\u002Fimg.shields.io\u002Fbadge\u002FCoRR-2023-blue alt=\"img\" style=\"zoom:100%; vertical-align: middle\" \u002F>](https:\u002F\u002Fdoi.org\u002F10.48550\u002FarXiv.2301.13867) [**Mathematical Capabilities of ChatGPT**](https:\u002F\u002Fdoi.org\u002F10.48550\u002FarXiv.2301.13867),\u003Cbr> by *Simon Frieder, Luca Pinchetti, Ryan-Rhys Griffiths, Tommaso Salvatori, Thomas Lukasiewicz, 
Philipp Christian Petersen, Alexis Chevalier and Julius Berner*\n  \u003Cbr>\u003Cbr>\n- [\u003Cimg src=https:\u002F\u002Fimg.shields.io\u002Fbadge\u002FCoRR-2023-blue alt=\"img\" style=\"zoom:100%; vertical-align: middle\" \u002F>](https:\u002F\u002Fdoi.org\u002F10.48550\u002FarXiv.2302.08081) [**Exploring the Limits of ChatGPT for Query or Aspect-based Text Summarization**](https:\u002F\u002Fdoi.org\u002F10.48550\u002FarXiv.2302.08081),\u003Cbr> by *Xianjun Yang, Yan Li, Xinlu Zhang, Haifeng Chen and Wei Cheng*\n  \u003Cbr>\u003Cbr>\n- [\u003Cimg src=https:\u002F\u002Fimg.shields.io\u002Fbadge\u002FCoRR-2023-blue alt=\"img\" style=\"zoom:100%; vertical-align: middle\" \u002F>](https:\u002F\u002Fdoi.org\u002F10.48550\u002FarXiv.2302.12095) [**On the Robustness of ChatGPT: An Adversarial and Out-of-distribution\n  Perspective**](https:\u002F\u002Fdoi.org\u002F10.48550\u002FarXiv.2302.12095),\u003Cbr> by *Jindong Wang, Xixu Hu, Wenxin Hou, Hao Chen, Runkai Zheng, Yidong Wang, Linyi Yang, Haojun Huang et al.*\n  \u003Cbr>\u003Cbr>\n- [\u003Cimg src=https:\u002F\u002Fimg.shields.io\u002Fbadge\u002FCoRR-2023-blue alt=\"img\" style=\"zoom:100%; vertical-align: middle\" \u002F>](https:\u002F\u002Fdoi.org\u002F10.48550\u002FarXiv.2301.04655) [**ChatGPT is not all you need. A State of the Art Review of large\n  Generative AI models**](https:\u002F\u002Fdoi.org\u002F10.48550\u002FarXiv.2301.04655),\u003Cbr> by *Roberto Gozalo-Brizuela and Eduardo C. Garrido-Merchán*\n  \u003Cbr>\u003Cbr>\n- [\u003Cimg src=https:\u002F\u002Fimg.shields.io\u002Fbadge\u002FCoRR-2023-blue alt=\"img\" style=\"zoom:100%; vertical-align: middle\" \u002F>](https:\u002F\u002Farxiv.org\u002Fabs\u002F2302.10198) [**Can ChatGPT Understand Too? A Comparative Study on ChatGPT and Fine-tuned\n  BERT**](https:\u002F\u002Farxiv.org\u002Fabs\u002F2302.10198),\u003Cbr> by *Qihuang Zhong, Liang Ding, Juhua Liu, Bo Du and Dacheng Tao*\n  \u003Cbr>\u003Cbr>\n- [\u003Cimg src=https:\u002F\u002Fimg.shields.io\u002Fbadge\u002FCoRR-2023-blue alt=\"img\" style=\"zoom:100%; vertical-align: middle\" \u002F>](https:\u002F\u002Fdoi.org\u002F10.48550\u002FarXiv.2303.07992) [**Evaluation of ChatGPT as a Question Answering System for Answering\n  Complex Questions**](https:\u002F\u002Fdoi.org\u002F10.48550\u002FarXiv.2303.07992),\u003Cbr> by *Yiming Tan, Dehai Min, Yu Li, Wenbo Li, Nan Hu, Yongrui Chen and Guilin Qi*\n  \u003Cbr>\u003Cbr>\n- [\u003Cimg src=https:\u002F\u002Fimg.shields.io\u002Fbadge\u002FCoRR-2023-blue alt=\"img\" style=\"zoom:100%; vertical-align: middle\" \u002F>](https:\u002F\u002Farxiv.org\u002Fabs\u002F2303.16421) [**ChatGPT is a Knowledgeable but Inexperienced Solver: An Investigation of Commonsense Problem in Large Language Models**](https:\u002F\u002Farxiv.org\u002Fabs\u002F2303.16421),\u003Cbr> by *Ning Bian, Xianpei Han, Le Sun, Hongyu Lin, Yaojie Lu and Ben He*\n  \u003Cbr>\u003Cbr>\n- [\u003Cimg src=https:\u002F\u002Fimg.shields.io\u002Fbadge\u002FCoRR-2022-blue alt=\"img\" style=\"zoom:100%; vertical-align: middle\" \u002F>](https:\u002F\u002Fdoi.org\u002F10.48550\u002FarXiv.2211.09110) [**Holistic Evaluation of Language Models**](https:\u002F\u002Fdoi.org\u002F10.48550\u002FarXiv.2211.09110),\u003Cbr> by *Percy Liang, Rishi Bommasani, Tony Lee, Dimitris Tsipras, Dilara Soylu, Michihiro Yasunaga, Yian Zhang, Deepak Narayanan et al.*\n  \u003Cbr>\u003Cbr>\n- [\u003Cimg src=https:\u002F\u002Fimg.shields.io\u002Fbadge\u002FCoRR-2022-blue alt=\"img\" style=\"zoom:100%; vertical-align: middle\" 
\u002F>](https:\u002F\u002Fdoi.org\u002F10.48550\u002FarXiv.2204.00498) [**Evaluating the Text-to-SQL Capabilities of Large Language Models**](https:\u002F\u002Fdoi.org\u002F10.48550\u002FarXiv.2204.00498),\u003Cbr> by *Nitarshan Rajkumar, Raymond Li and Dzmitry Bahdanau*\n  \u003Cbr>\u003Cbr>\n- [\u003Cimg src=https:\u002F\u002Fimg.shields.io\u002Fbadge\u002FCOLING-2022-blue alt=\"img\" style=\"zoom:100%; vertical-align: middle\" \u002F>](https:\u002F\u002Faclanthology.org\u002F2022.coling-1.491) [**Are Visual-Linguistic Models Commonsense Knowledge Bases?**](https:\u002F\u002Faclanthology.org\u002F2022.coling-1.491),\u003Cbr> by *Hsiu-Yu Yang and Carina Silberer*\n  \u003Cbr>\u003Cbr>\n- [\u003Cimg src=https:\u002F\u002Fimg.shields.io\u002Fbadge\u002FCoRR-2022-blue alt=\"img\" style=\"zoom:100%; vertical-align: middle\" \u002F>](https:\u002F\u002Fdoi.org\u002F10.48550\u002FarXiv.2212.10529) [**Is GPT-3 a Psychopath? Evaluating Large Language Models from a Psychological\n  Perspective**](https:\u002F\u002Fdoi.org\u002F10.48550\u002FarXiv.2212.10529),\u003Cbr> by *Xingxuan Li, Yutong Li, Linlin Liu, Lidong Bing and Shafiq R. Joty*\n  \u003Cbr>\u003Cbr>\n- [\u003Cimg src=https:\u002F\u002Fimg.shields.io\u002Fbadge\u002FEMNLP-2022-blue alt=\"img\" style=\"zoom:100%; vertical-align: middle\" \u002F>](https:\u002F\u002Faclanthology.org\u002F2022.emnlp-main.132) [**GeoMLAMA: Geo-Diverse Commonsense Probing on Multilingual Pre-Trained\n  Language Models**](https:\u002F\u002Faclanthology.org\u002F2022.emnlp-main.132),\u003Cbr> by *Da Yin, Hritik Bansal, Masoud Monajatipoor, Liunian Harold Li and Kai-Wei Chang*\n  \u003Cbr>\u003Cbr>\n- [\u003Cimg src=https:\u002F\u002Fimg.shields.io\u002Fbadge\u002FEMNLP-2022-blue alt=\"img\" style=\"zoom:100%; vertical-align: middle\" \u002F>](https:\u002F\u002Faclanthology.org\u002F2022.emnlp-main.653) [**RobustLR: A Diagnostic Benchmark for Evaluating Logical Robustness\n  of Deductive Reasoners**](https:\u002F\u002Faclanthology.org\u002F2022.emnlp-main.653),\u003Cbr> by *Soumya Sanyal, Zeyi Liao and Xiang Ren*\n  \u003Cbr>\u003Cbr>\n- [\u003Cimg src=https:\u002F\u002Fimg.shields.io\u002Fbadge\u002FCoRR-2022-blue alt=\"img\" style=\"zoom:100%; vertical-align: middle\" \u002F>](https:\u002F\u002Farxiv.org\u002Fabs\u002F2202.13169) [**A Systematic Evaluation of Large Language Models of Code**](https:\u002F\u002Farxiv.org\u002Fabs\u002F2202.13169),\u003Cbr> by *Frank F. Xu, Uri Alon, Graham Neubig and Vincent J. 
Hellendoorn*\n  \u003Cbr>\u003Cbr>\n- [\u003Cimg src=https:\u002F\u002Fimg.shields.io\u002Fbadge\u002FCoRR-2021-blue alt=\"img\" style=\"zoom:100%; vertical-align: middle\" \u002F>](https:\u002F\u002Farxiv.org\u002Fabs\u002F2107.03374) [**Evaluating Large Language Models Trained on Code**](https:\u002F\u002Farxiv.org\u002Fabs\u002F2107.03374),\u003Cbr> by *Mark Chen, Jerry Tworek, Heewoo Jun, Qiming Yuan, Henrique Pondé de Oliveira Pinto, Jared Kaplan, Harrison Edwards, Yuri Burda et al.*\n  \u003Cbr>\u003Cbr>\n- [\u003Cimg src=https:\u002F\u002Fimg.shields.io\u002Fbadge\u002FACL-2021-blue alt=\"img\" style=\"zoom:100%; vertical-align: middle\" \u002F>](https:\u002F\u002Fdoi.org\u002F10.18653\u002Fv1\u002F2021.findings-acl.36) [**GLGE: A New General Language Generation Evaluation Benchmark**](https:\u002F\u002Fdoi.org\u002F10.18653\u002Fv1\u002F2021.findings-acl.36),\u003Cbr> by *Dayiheng Liu, Yu Yan, Yeyun Gong, Weizhen Qi, Hang Zhang, Jian Jiao, Weizhu Chen, Jie Fu et al.*\n  \u003Cbr>\u003Cbr>\n- [\u003Cimg src=https:\u002F\u002Fimg.shields.io\u002Fbadge\u002FCoRR-2021-blue alt=\"img\" style=\"zoom:100%; vertical-align: middle\" \u002F>](https:\u002F\u002Farxiv.org\u002Fabs\u002F2104.05861) [**Evaluating Pre-Trained Models for User Feedback Analysis in Software\n  Engineering: A Study on Classification of App-Reviews**](https:\u002F\u002Farxiv.org\u002Fabs\u002F2104.05861),\u003Cbr> by *Mohammad Abdul Hadi and Fatemeh H. Fard*\n  \u003Cbr>\u003Cbr>\n- [\u003Cimg src=https:\u002F\u002Fimg.shields.io\u002Fbadge\u002FACL_Findings-2021-blue alt=\"img\" style=\"zoom:100%; vertical-align: middle\" \u002F>](https:\u002F\u002Fdoi.org\u002F10.18653\u002Fv1\u002F2021.findings-acl.322) [**Do Language Models Perform Generalizable Commonsense Inference?**](https:\u002F\u002Fdoi.org\u002F10.18653\u002Fv1\u002F2021.findings-acl.322), [\u003Cimg src=https:\u002F\u002Fimg.shields.io\u002Fbadge\u002FCode-skyblue alt=\"img\" style=\"zoom:100%; vertical-align: middle\" \u002F>](https:\u002F\u002Fgithub.com\u002Fwangpf3\u002FLM-for-CommonsenseInference)\u003Cbr> by *Peifeng Wang, Filip Ilievski, Muhao Chen and Xiang Ren*\n  \u003Cbr>\u003Cbr>\n- [\u003Cimg src=https:\u002F\u002Fimg.shields.io\u002Fbadge\u002FEMNLP-2021-blue alt=\"img\" style=\"zoom:100%; vertical-align: middle\" \u002F>](https:\u002F\u002Fdoi.org\u002F10.18653\u002Fv1\u002F2021.emnlp-main.598) [**RICA: Evaluating Robust Inference Capabilities Based on Commonsense\n  Axioms**](https:\u002F\u002Fdoi.org\u002F10.18653\u002Fv1\u002F2021.emnlp-main.598),\u003Cbr> by *Pei Zhou, Rahul Khanna, Seyeon Lee, Bill Yuchen Lin, Daniel Ho, Jay Pujara and Xiang Ren*\n  \u003Cbr>\u003Cbr>\n- [\u003Cimg src=https:\u002F\u002Fimg.shields.io\u002Fbadge\u002FCoRR-2020-blue alt=\"img\" style=\"zoom:100%; vertical-align: middle\" \u002F>](https:\u002F\u002Farxiv.org\u002Fabs\u002F2006.14799) [**Evaluation of Text Generation: A Survey**](https:\u002F\u002Farxiv.org\u002Fabs\u002F2006.14799),\u003Cbr> by *Asli Celikyilmaz, Elizabeth Clark and Jianfeng Gao*\n  \u003Cbr>\u003Cbr>\n- [\u003Cimg src=https:\u002F\u002Fimg.shields.io\u002Fbadge\u002FCoRR-2020-blue alt=\"img\" style=\"zoom:100%; vertical-align: middle\" \u002F>](https:\u002F\u002Farxiv.org\u002Fabs\u002F2007.15780) [**Neural Language Generation: Formulation, Methods, and Evaluation**](https:\u002F\u002Farxiv.org\u002Fabs\u002F2007.15780),\u003Cbr> by *Cristina Garbacea and Qiaozhu Mei*\n  \u003Cbr>\u003Cbr>\n- [\u003Cimg src=https:\u002F\u002Fimg.shields.io\u002Fbadge\u002FICLR-2020-blue alt=\"img\" 
style=\"zoom:100%; vertical-align: middle\" \u002F>](https:\u002F\u002Fopenreview.net\u002Fforum?id=SkeHuCVFDr) [**BERTScore: Evaluating Text Generation with BERT**](https:\u002F\u002Fopenreview.net\u002Fforum?id=SkeHuCVFDr),\u003Cbr> by *Tianyi Zhang, Varsha Kishore, Felix Wu, Kilian Q. Weinberger and Yoav Artzi*\n  \u003Cbr>\u003Cbr>\n\n\u003Cbr>\u003Cbr>\n## LLM-List\n\n### Typical LLM details\n\n| 模型       | Parameter | Layers | Atten  heads | Dimension | Learning rate | batch  size | train tokens |\n| ---------- | --------- | ------ | ------------ | --------- | ------------- | ----------- | ------------ |\n| LLaMA2     | 6.7B      | 32     | 32           | 4096      | 3.00E-04      | 400万       | 1.0万亿      |\n| LLaMA2     | 13.0B     | 40     | 40           | 5120      | 3.00E-04      | 400万       | 1.0万亿      |\n| LLaMA2     | 32.5B     | 60     | 52           | 6656      | 1.50E-04      | 400万       | 1.4万亿      |\n| LLaMA2     | 65.2B     | 80     | 64           | 8192      | 1.50E-04      | 400万       | 1.4万亿      |\n| nano-GPT   | 85,584    | 3      | 3            | 768       | 3.00E-04      |             |              |\n| GPT2-small | 0.12B     | 12     | 12           | 768       | 2.50E-04      |             |              |\n| GPT2-XL    | 1.5B      | 48     | 25           | 1600      | 1.50E-04      |             |              |\n| GPT3       | 175B      | 96     | 96           | 12288     | 1.50E-04      |             | 0.5万亿      |\n\n### Pre-trained-LLM\n\n|       Model        | Size |  Architecture   |                            Access                            |  Date   | Origin                                                       |\n| :----------------: | :--: | :-------------: | :----------------------------------------------------------: | :-----: | ------------------------------------------------------------ |\n| Switch Transformer | 1.6T |  Decoder(MOE)   |                              -                               | 2021-01 | [Paper](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2101.03961.pdf)                |\n|        GLaM        | 1.2T |  Decoder(MOE)   |                              -                               | 2021-12 | [Paper](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2112.06905.pdf)                |\n|        PaLM        | 540B |     Decoder     |                              -                               | 2022-04 | [Paper](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2204.02311.pdf)                |\n|       MT-NLG       | 530B |     Decoder     |                              -                               | 2022-01 | [Paper](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2201.11990.pdf)                |\n|      J1-Jumbo      | 178B |     Decoder     |        [api](https:\u002F\u002Fdocs.ai21.com\u002Fdocs\u002Fcomplete-api)        | 2021-08 | [Paper](https:\u002F\u002Fuploads-ssl.webflow.com\u002F60fd4503684b466578c0d307\u002F61138924626a6981ee09caf6_jurassic_tech_paper.pdf) |\n|        OPT         | 175B |     Decoder     | [api](https:\u002F\u002Fopt.alpa.ai) \\| [ckpt](https:\u002F\u002Fgithub.com\u002Ffacebookresearch\u002Fmetaseq\u002Ftree\u002Fmain\u002Fprojects\u002FOPT) | 2022-05 | [Paper](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2205.01068.pdf)                |\n|       BLOOM        | 176B |     Decoder     | [api](https:\u002F\u002Fhuggingface.co\u002Fbigscience\u002Fbloom) \\| [ckpt](https:\u002F\u002Fhuggingface.co\u002Fbigscience\u002Fbloom) | 2022-11 | [Paper](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2211.05100.pdf)                
\n### Pre-trained-LLM\n\n| Model | Size | Architecture | Access | Date | Origin |\n| :----: | :--: | :----: | :----: | :-----: | ------ |\n| Switch Transformer | 1.6T | Decoder(MOE) | - | 2021-01 | [Paper](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2101.03961.pdf) |\n| GLaM | 1.2T | Decoder(MOE) | - | 2021-12 | [Paper](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2112.06905.pdf) |\n| PaLM | 540B | Decoder | - | 2022-04 | [Paper](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2204.02311.pdf) |\n| MT-NLG | 530B | Decoder | - | 2022-01 | [Paper](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2201.11990.pdf) |\n| J1-Jumbo | 178B | Decoder | [api](https:\u002F\u002Fdocs.ai21.com\u002Fdocs\u002Fcomplete-api) | 2021-08 | [Paper](https:\u002F\u002Fuploads-ssl.webflow.com\u002F60fd4503684b466578c0d307\u002F61138924626a6981ee09caf6_jurassic_tech_paper.pdf) |\n| OPT | 175B | Decoder | [api](https:\u002F\u002Fopt.alpa.ai) \\| [ckpt](https:\u002F\u002Fgithub.com\u002Ffacebookresearch\u002Fmetaseq\u002Ftree\u002Fmain\u002Fprojects\u002FOPT) | 2022-05 | [Paper](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2205.01068.pdf) |\n| BLOOM | 176B | Decoder | [api](https:\u002F\u002Fhuggingface.co\u002Fbigscience\u002Fbloom) \\| [ckpt](https:\u002F\u002Fhuggingface.co\u002Fbigscience\u002Fbloom) | 2022-11 | [Paper](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2211.05100.pdf) |\n| GPT 3.0 | 175B | Decoder | [api](https:\u002F\u002Fopenai.com\u002Fapi\u002F) | 2020-05 | [Paper](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2005.14165.pdf) |\n| LaMDA | 137B | Decoder | - | 2022-01 | [Paper](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2201.08239.pdf) |\n| GLM | 130B | Decoder | [ckpt](https:\u002F\u002Fgithub.com\u002FTHUDM\u002FGLM-130B) | 2022-10 | [Paper](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2210.02414.pdf) |\n| YaLM | 100B | Decoder | [ckpt](https:\u002F\u002Fgithub.com\u002Fyandex\u002FYaLM-100B) | 2022-06 | [Blog](https:\u002F\u002Fmedium.com\u002Fyandex\u002Fyandex-publishes-yalm-100b-its-the-largest-gpt-like-neural-network-in-open-source-d1df53d0e9a6) |\n| LLaMA | 65B | Decoder | [ckpt](https:\u002F\u002Fgithub.com\u002Ffacebookresearch\u002Fllama) | 2023-02 | [Paper](https:\u002F\u002Fresearch.facebook.com\u002Fpublications\u002Fllama-open-and-efficient-foundation-language-models\u002F) |\n| GPT-NeoX | 20B | Decoder | [ckpt](https:\u002F\u002Fgithub.com\u002FEleutherAI\u002Fgpt-neox) | 2022-04 | [Paper](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2204.06745.pdf) |\n| UL2 | 20B | agnostic | [ckpt](https:\u002F\u002Fhuggingface.co\u002Fgoogle\u002Ful2#:~:text=UL2%20is%20a%20unified%20framework%20for%20pretraining%20models,downstream%20fine-tuning%20is%20associated%20with%20specific%20pre-training%20schemes.) | 2022-05 | [Paper](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2205.05131v1.pdf) |\n| T5 | 11B | Encoder-Decoder | [ckpt](https:\u002F\u002Fhuggingface.co\u002Ft5-11b) | 2019-10 | [Paper](https:\u002F\u002Fjmlr.org\u002Fpapers\u002Fv21\u002F20-074.html) |\n| CPM-Bee | 10B | Decoder | [api](https:\u002F\u002Flive.openbmb.org\u002Fmodels\u002Fbee) | 2022-10 | [Paper](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2012.00413.pdf) |\n| rwkv-4 | 7B | RWKV | [ckpt](https:\u002F\u002Fhuggingface.co\u002FBlinkDL\u002Frwkv-4-pile-7b) | 2022-09 | [Github](https:\u002F\u002Fgithub.com\u002FBlinkDL\u002FRWKV-LM) |\n| GPT-J | 6B | Decoder | [ckpt](https:\u002F\u002Fhuggingface.co\u002FEleutherAI\u002Fgpt-j-6B) | 2021-06 | [Github](https:\u002F\u002Fgithub.com\u002Fkingoflolz\u002Fmesh-transformer-jax) |\n| GPT-Neo | 2.7B | Decoder | [ckpt](https:\u002F\u002Fgithub.com\u002FEleutherAI\u002Fgpt-neo) | 2021-03 | [Github](https:\u002F\u002Fgithub.com\u002FEleutherAI\u002Fgpt-neo) |\n| GPT-Neo | 1.3B | Decoder | [ckpt](https:\u002F\u002Fgithub.com\u002FEleutherAI\u002Fgpt-neo) | 2021-03 | [Github](https:\u002F\u002Fgithub.com\u002FEleutherAI\u002Fgpt-neo) |\n\n\u003Cbr>\u003Cbr>\n### Instruction-finetuned-LLM\n| Model | Size | Architecture | Access | Date | Origin |\n| :---------: | :--: | :-------------: | :----------------------------------------------------------: | :-----: | 
------ |\n| Flan-PaLM | 540B | Decoder | - | 2022-10 | [Paper](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2210.11416.pdf) |\n| BLOOMZ | 176B | Decoder | [ckpt](https:\u002F\u002Fhuggingface.co\u002Fbigscience\u002Fbloomz) | 2022-11 | [Paper](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2211.01786.pdf) |\n| InstructGPT | 175B | Decoder | [api](https:\u002F\u002Fplatform.openai.com\u002Foverview) | 2022-03 | [Paper](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2203.02155.pdf) |\n| Galactica | 120B | Decoder | [ckpt](https:\u002F\u002Fhuggingface.co\u002Ffacebook\u002Fgalactica-120b) | 2022-11 | [Paper](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2211.09085.pdf) |\n| OpenChatKit | 20B | - | [ckpt](https:\u002F\u002Fgithub.com\u002Ftogethercomputer\u002FOpenChatKit) | 2023-03 | - |\n| Flan-UL2 | 20B | Encoder-Decoder | [ckpt](https:\u002F\u002Fgithub.com\u002Fgoogle-research\u002Fgoogle-research\u002Ftree\u002Fmaster\u002Ful2) | 2023-03 | [Blog](https:\u002F\u002Fwww.yitay.net\u002Fblog\u002Fflan-ul2-20b) |\n| Gopher | - | - | - | - | - |\n| Chinchilla | - | - | - | - | - |\n| Flan-T5 | 11B | Encoder-Decoder | [ckpt](https:\u002F\u002Fgithub.com\u002Fgoogle-research\u002Ft5x\u002Fblob\u002Fmain\u002Fdocs\u002Fmodels.md#flan-t5-checkpoints) | 2022-10 | [Paper](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2210.11416.pdf) |\n| T0 | 11B | Encoder-Decoder | [ckpt](https:\u002F\u002Fhuggingface.co\u002Fbigscience\u002FT0) | 2021-10 | [Paper](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2110.08207.pdf) |\n| Alpaca | 7B | Decoder | [demo](https:\u002F\u002Fcrfm.stanford.edu\u002Falpaca\u002F) | 2023-03 | [Github](https:\u002F\u002Fgithub.com\u002Ftatsu-lab\u002Fstanford_alpaca) |\n\n\u003Cbr>\u003Cbr>\n### Aligned-LLM\n| Model | Size | Architecture | Access | Date | Origin |\n| :-----: | :--: | :----------: | :----: | :-----: | ------ |\n| GPT 4 | - | - | - | 2023-03 | [Blog](https:\u002F\u002Fopenai.com\u002Fresearch\u002Fgpt-4) |\n| ChatGPT | - | Decoder | [demo](https:\u002F\u002Fopenai.com\u002Fblog\u002Fchatgpt\u002F)\\|[api](https:\u002F\u002Fshare.hsforms.com\u002F1u4goaXwDRKC9-x9IvKno0A4sk30) | 2022-11 | [Blog](https:\u002F\u002Fopenai.com\u002Fblog\u002Fchatgpt\u002F) |\n| Sparrow | 70B | - | - | 2022-09 | [Paper](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2209.14375.pdf) |\n| Claude | - | - | [demo](https:\u002F\u002Fpoe.com\u002Fclaude)\\|[api](https:\u002F\u002Fwww.anthropic.com\u002Fearlyaccess) | 2023-03 | 
[Blog](https:\u002F\u002Fwww.anthropic.com\u002Findex\u002Fintroducing-claude) |\n\n\u003Cbr>\u003Cbr>\n### Open-LLM\n\n- [LLaMA](https:\u002F\u002Fai.facebook.com\u002Fblog\u002Flarge-language-model-llama-meta-ai\u002F) - A foundational, 65-billion-parameter large language model. [LLaMA.cpp](https:\u002F\u002Fgithub.com\u002Fggerganov\u002Fllama.cpp) [Lit-LLaMA](https:\u002F\u002Fgithub.com\u002FLightning-AI\u002Flit-llama)\n  - [Alpaca](https:\u002F\u002Fcrfm.stanford.edu\u002F2023\u002F03\u002F13\u002Falpaca.html) - A model fine-tuned from the LLaMA 7B model on 52K instruction-following demonstrations. [Alpaca.cpp](https:\u002F\u002Fgithub.com\u002Fantimatter15\u002Falpaca.cpp) [Alpaca-LoRA](https:\u002F\u002Fgithub.com\u002Ftloen\u002Falpaca-lora)\n  - [Flan-Alpaca](https:\u002F\u002Fgithub.com\u002Fdeclare-lab\u002Fflan-alpaca) - Instruction Tuning from Humans and Machines.\n  - [Baize](https:\u002F\u002Fgithub.com\u002Fproject-baize\u002Fbaize-chatbot) - Baize is an open-source chat model trained with [LoRA](https:\u002F\u002Fgithub.com\u002Fmicrosoft\u002FLoRA). It uses 100k dialogs generated by letting ChatGPT chat with itself.\n  - [Cabrita](https:\u002F\u002Fgithub.com\u002F22-hours\u002Fcabrita) - A Portuguese instruction-finetuned LLaMA.\n  - [Vicuna](https:\u002F\u002Fgithub.com\u002Flm-sys\u002FFastChat) - An Open-Source Chatbot Impressing GPT-4 with 90% ChatGPT Quality.\n  - [Llama-X](https:\u002F\u002Fgithub.com\u002FAetherCortex\u002FLlama-X) - Open Academic Research on Improving LLaMA to SOTA LLM.\n  - [Chinese-Vicuna](https:\u002F\u002Fgithub.com\u002FFacico\u002FChinese-Vicuna) - A Chinese Instruction-following LLaMA-based Model.\n  - [GPTQ-for-LLaMA](https:\u002F\u002Fgithub.com\u002Fqwopqwop200\u002FGPTQ-for-LLaMa) - 4-bit quantization of [LLaMA](https:\u002F\u002Farxiv.org\u002Fabs\u002F2302.13971) using [GPTQ](https:\u002F\u002Farxiv.org\u002Fabs\u002F2210.17323).\n  - [GPT4All](https:\u002F\u002Fgithub.com\u002Fnomic-ai\u002Fgpt4all) - Demo, data, and code to train an open-source assistant-style large language model based on GPT-J and LLaMA.\n  - [Koala](https:\u002F\u002Fbair.berkeley.edu\u002Fblog\u002F2023\u002F04\u002F03\u002Fkoala\u002F) - A Dialogue Model for Academic Research\n  - [BELLE](https:\u002F\u002Fgithub.com\u002FLianjiaTech\u002FBELLE) - Be Everyone's Large Language model Engine\n  - [StackLLaMA](https:\u002F\u002Fhuggingface.co\u002Fblog\u002Fstackllama) - A hands-on guide to train LLaMA with RLHF.\n  - [RedPajama](https:\u002F\u002Fgithub.com\u002Ftogethercomputer\u002FRedPajama-Data) - An Open Source Recipe to Reproduce the LLaMA training dataset.\n  - [Chimera](https:\u002F\u002Fgithub.com\u002FFreedomIntelligence\u002FLLMZoo) - Latin Phoenix.\n- [BLOOM](https:\u002F\u002Fhuggingface.co\u002Fbigscience\u002Fbloom) - BigScience Large Open-science Open-access Multilingual Language Model [BLOOM-LoRA](https:\u002F\u002Fgithub.com\u002Flinhduongtuan\u002FBLOOM-LORA)\n  - [BLOOMZ&mT0](https:\u002F\u002Fhuggingface.co\u002Fbigscience\u002Fbloomz) - a family of models capable of following human instructions in dozens of languages zero-shot.\n  - [Phoenix](https:\u002F\u002Fgithub.com\u002FFreedomIntelligence\u002FLLMZoo)\n\n- [T5](https:\u002F\u002Farxiv.org\u002Fabs\u002F1910.10683) - Text-to-Text Transfer Transformer\n  - [T0](https:\u002F\u002Farxiv.org\u002Fabs\u002F2110.08207) - Multitask Prompted Training Enables Zero-Shot Task Generalization\n\n- [OPT](https:\u002F\u002Farxiv.org\u002Fabs\u002F2205.01068) - Open Pre-trained Transformer Language 
Models.\n- [UL2](https:\u002F\u002Farxiv.org\u002Fabs\u002F2205.05131v1) - a unified framework for pretraining models that are universally effective across datasets and setups.\n- [GLM](https:\u002F\u002Fgithub.com\u002FTHUDM\u002FGLM) - GLM is a General Language Model pretrained with an autoregressive blank-filling objective and can be finetuned on various natural language understanding and generation tasks.\n- [RWKV](https:\u002F\u002Fgithub.com\u002FBlinkDL\u002FRWKV-LM) - Parallelizable RNN with Transformer-level LLM Performance.\n  - [ChatRWKV](https:\u002F\u002Fgithub.com\u002FBlinkDL\u002FChatRWKV) - ChatRWKV is like ChatGPT but powered by the RWKV (100% RNN) language model.\n- [StableLM](https:\u002F\u002Fstability.ai\u002Fblog\u002Fstability-ai-launches-the-first-of-its-stablelm-suite-of-language-models) - Stability AI Language Models.\n- [YaLM](https:\u002F\u002Fmedium.com\u002Fyandex\u002Fyandex-publishes-yalm-100b-its-the-largest-gpt-like-neural-network-in-open-source-d1df53d0e9a6) - a GPT-like neural network for generating and processing text. It can be used freely by developers and researchers from all over the world.\n- [GPT-Neo](https:\u002F\u002Fgithub.com\u002FEleutherAI\u002Fgpt-neo) - An implementation of model & data parallel [GPT3](https:\u002F\u002Farxiv.org\u002Fabs\u002F2005.14165)-like models using the [mesh-tensorflow](https:\u002F\u002Fgithub.com\u002Ftensorflow\u002Fmesh) library.\n- [GPT-J](https:\u002F\u002Fgithub.com\u002Fkingoflolz\u002Fmesh-transformer-jax\u002F#gpt-j-6b) - A 6 billion parameter, autoregressive text generation model trained on [The Pile](https:\u002F\u002Fpile.eleuther.ai\u002F).\n  - [Dolly](https:\u002F\u002Fwww.databricks.com\u002Fblog\u002F2023\u002F03\u002F24\u002Fhello-dolly-democratizing-magic-chatgpt-open-models.html) - a cheap-to-build LLM that displays a surprising degree of the instruction-following capability of ChatGPT.\n\n- [Pythia](https:\u002F\u002Fgithub.com\u002FEleutherAI\u002Fpythia) - Interpreting Autoregressive Transformers Across Time and Scale\n  - [Dolly 2.0](https:\u002F\u002Fwww.databricks.com\u002Fblog\u002F2023\u002F04\u002F12\u002Fdolly-first-open-commercially-viable-instruction-tuned-llm) - the first open-source, instruction-following LLM, fine-tuned on a human-generated instruction dataset licensed for research and commercial use.\n- [OpenFlamingo](https:\u002F\u002Fgithub.com\u002Fmlfoundations\u002Fopen_flamingo) - an open-source reproduction of DeepMind's Flamingo model.\n- [Cerebras-GPT](https:\u002F\u002Fwww.cerebras.net\u002Fblog\u002Fcerebras-gpt-a-family-of-open-compute-efficient-large-language-models\u002F) - A Family of Open, Compute-efficient, Large Language Models.\n- [GALACTICA](https:\u002F\u002Fgithub.com\u002Fpaperswithcode\u002Fgalai\u002Fblob\u002Fmain\u002Fdocs\u002Fmodel_card.md) - The GALACTICA models are trained on a large-scale scientific corpus.\n  - [GALPACA](https:\u002F\u002Fhuggingface.co\u002FGeorgiaTechResearchInstitute\u002Fgalpaca-30b) - GALACTICA 30B fine-tuned on the Alpaca dataset.\n\n- [Palmyra](https:\u002F\u002Fhuggingface.co\u002FWriter\u002Fpalmyra-base) - Palmyra Base was primarily pre-trained with English text.\n- [Camel](https:\u002F\u002Fhuggingface.co\u002FWriter\u002Fcamel-5b-hf) - a state-of-the-art instruction-following large language model designed to deliver exceptional performance and versatility.\n- [h2oGPT](https:\u002F\u002Fgithub.com\u002Fh2oai\u002Fh2ogpt)\n- [PanGu-α](https:\u002F\u002Fopeni.org.cn\u002Fpangu\u002F) - PanGu-α is a 200B 
parameter autoregressive pretrained Chinese language model developed by Huawei Noah's Ark Lab, MindSpore Team and Peng Cheng Laboratory.\n- [Open-Assistant](https:\u002F\u002Fgithub.com\u002FLAION-AI\u002FOpen-Assistant) - a project meant to give everyone access to a great chat-based large language model.\n- [HuggingChat](https:\u002F\u002Fhuggingface.co\u002Fchat\u002F) - Powered by Open Assistant's latest model, the best open-source chat model right now, and the @huggingface Inference API.\n- [Baichuan](https:\u002F\u002Fgithub.com\u002Fbaichuan-inc\u002FBaichuan-13B) - An open-source, commercially available large-scale language model developed by Baichuan Intelligent Technology following Baichuan-7B, containing 13 billion parameters. (20230715)\n- [Qwen](https:\u002F\u002Fgithub.com\u002FQwenLM\u002FQwen-7B) - Qwen-7B is the 7B-parameter version of the large language model series, Qwen (abbr. Tongyi Qianwen), proposed by Alibaba Cloud. Qwen-7B is a Transformer-based large language model, which is pretrained on a large volume of data, including web texts, books, codes, etc. (20230803)\n\n\u003Cbr>\u003Cbr>\n### Popular-LLM\n\n| **Model** | **\\#Author** | **\\#Link** | **\\#Parameter** | **Base Model** | **\\#Layer** | **\\#Encoder** | **\\#Decoder** | **\\#Pretrain Tokens** | **\\#IFT Sample** | **RLHF** |\n| :---: | :---: | :---: | :---: | :---: | :---: | :---: | :---: | :---: | :---: | :---: |\n| GPT3-Ada | brown2020language | https:\u002F\u002Fplatform.openai.com\u002Fdocs\u002Fmodels\u002Fgpt-3 | 0.35B | - | 24 | - | 24 | - | - | - |\n| Pythia-1B | biderman2023pythia | https:\u002F\u002Fhuggingface.co\u002FEleutherAI\u002Fpythia-1b | 1B | - | 16 | - | 16 | 300B tokens | - | - |\n| GPT3-Babbage | brown2020language | https:\u002F\u002Fplatform.openai.com\u002Fdocs\u002Fmodels\u002Fgpt-3 | 1.3B | - | 24 | - | 24 | - | - | - |\n| GPT2-XL | radford2019language | https:\u002F\u002Fhuggingface.co\u002Fgpt2-xl | 1.5B | - | 48 | - | 48 | 40B tokens | - | - |\n| BLOOM-1b7 | scao2022bloom | https:\u002F\u002Fhuggingface.co\u002Fbigscience\u002Fbloom-1b7 | 1.7B | - | 24 | - | 24 | 350B tokens | - | - |\n| BLOOMZ-1b7 | muennighoff2022crosslingual | https:\u002F\u002Fhuggingface.co\u002Fbigscience\u002Fbloomz-1b7 | 1.7B | BLOOM-1b7 | 24 | - | 24 | - | 8.39B tokens | - |\n| Dolly-v2-3b | 2023dolly | 
https:\u002F\u002Fhuggingface.co\u002Fdatabricks\u002Fdolly-v2-3b | 2.8B | Pythia-2.8B | 32 | - | 32 | - | 15K | - |\n| Pythia-2.8B | biderman2023pythia | https:\u002F\u002Fhuggingface.co\u002FEleutherAI\u002Fpythia-2.8b | 2.8B | - | 32 | - | 32 | 300B tokens | - | - |\n| BLOOM-3b | scao2022bloom | https:\u002F\u002Fhuggingface.co\u002Fbigscience\u002Fbloom-3b | 3B | - | 30 | - | 30 | 350B tokens | - | - |\n| BLOOMZ-3b | muennighoff2022crosslingual | https:\u002F\u002Fhuggingface.co\u002Fbigscience\u002Fbloomz-3b | 3B | BLOOM-3b | 30 | - | 30 | - | 8.39B tokens | - |\n| StableLM-Base-Alpha-3B | 2023StableLM | https:\u002F\u002Fhuggingface.co\u002Fstabilityai\u002Fstablelm-base-alpha-3b | 3B | - | 16 | - | 16 | 800B tokens | - | - |\n| StableLM-Tuned-Alpha-3B | 2023StableLM | https:\u002F\u002Fhuggingface.co\u002Fstabilityai\u002Fstablelm-tuned-alpha-3b | 3B | StableLM-Base-Alpha-3B | 16 | - | 16 | - | 632K | - |\n| ChatGLM-6B | zeng2023glm-130b,du2022glm | https:\u002F\u002Fhuggingface.co\u002FTHUDM\u002Fchatglm-6b | 6B | - | 28 | 28 | 28 | 1T tokens | ✓ | ✓ |\n| DoctorGLM | xiong2023doctorglm | https:\u002F\u002Fgithub.com\u002Fxionghonglin\u002FDoctorGLM | 6B | ChatGLM-6B | 28 | 28 | 28 | - | 6.38M | - |\n| ChatGLM-Med | ChatGLM-Med | https:\u002F\u002Fgithub.com\u002FSCIR-HI\u002FMed-ChatGLM | 6B | ChatGLM-6B | 28 | 28 | 28 | - | 8K | - |\n| GPT3-Curie | brown2020language | https:\u002F\u002Fplatform.openai.com\u002Fdocs\u002Fmodels\u002Fgpt-3 | 6.7B | - | 32 | - | 32 | - | - | - |\n| MPT-7B-Chat | MosaicML2023Introducing | https:\u002F\u002Fhuggingface.co\u002Fmosaicml\u002Fmpt-7b-chat | 6.7B | MPT-7B | 32 | - | 32 | - | 360K | - |\n| MPT-7B-Instruct | MosaicML2023Introducing | https:\u002F\u002Fhuggingface.co\u002Fmosaicml\u002Fmpt-7b-instruct | 6.7B | MPT-7B | 32 | - | 32 | - | 59.3K | - |\n| MPT-7B-StoryWriter-65k+ | MosaicML2023Introducing | 
https:\u002F\u002Fhuggingface.co\u002Fmosaicml\u002Fmpt-7b-storywriter | 6.7B | MPT-7B | 32 | - | 32 | - | ✓ | - |\n| Dolly-v2-7b | 2023dolly | https:\u002F\u002Fhuggingface.co\u002Fdatabricks\u002Fdolly-v2-7b | 6.9B | Pythia-6.9B | 32 | - | 32 | - | 15K | - |\n| h2ogpt-oig-oasst1-512-6.9b | 2023h2ogpt | https:\u002F\u002Fhuggingface.co\u002Fh2oai\u002Fh2ogpt-oig-oasst1-512-6.9b | 6.9B | Pythia-6.9B | 32 | - | 32 | - | 398K | - |\n| Pythia-6.9B | biderman2023pythia | https:\u002F\u002Fhuggingface.co\u002FEleutherAI\u002Fpythia-6.9b | 6.9B | - | 32 | - | 32 | 300B tokens | - | - |\n| Alpaca-7B | alpaca | https:\u002F\u002Fhuggingface.co\u002Ftatsu-lab\u002Falpaca-7b-wdiff | 7B | LLaMA-7B | 32 | - | 32 | - | 52K | - |\n| Alpaca-LoRA-7B | 2023alpacalora | https:\u002F\u002Fhuggingface.co\u002Ftloen\u002Falpaca-lora-7b | 7B | LLaMA-7B | 32 | - | 32 | - | 52K | - |\n| Baize-7B | xu2023baize | https:\u002F\u002Fhuggingface.co\u002Fproject-baize\u002Fbaize-lora-7B | 7B | LLaMA-7B | 32 | - | 32 | - | 263K | - |\n| Baize Healthcare-7B | xu2023baize | https:\u002F\u002Fhuggingface.co\u002Fproject-baize\u002Fbaize-healthcare-lora-7B | 7B | LLaMA-7B | 32 | - | 32 | - | 201K | - |\n| ChatDoctor | yunxiang2023chatdoctor | https:\u002F\u002Fgithub.com\u002FKent0n-Li\u002FChatDoctor | 7B | LLaMA-7B | 32 | - | 32 | - | 167K | - |\n| HuaTuo | wang2023huatuo | https:\u002F\u002Fgithub.com\u002Fscir-hi\u002Fhuatuo-llama-med-chinese | 7B | LLaMA-7B | 32 | - | 32 | - | 8K | - |\n| Koala-7B | koala_blogpost_2023 | https:\u002F\u002Fhuggingface.co\u002Fyoung-geng\u002Fkoala | 7B | LLaMA-7B | 32 | - | 32 | - | 472K | - |\n| LLaMA-7B | touvron2023llama | https:\u002F\u002Fhuggingface.co\u002Fdecapoda-research\u002Fllama-7b-hf | 7B | - | 32 | - | 32 | 1T tokens | - | - |\n| Luotuo-lora-7b-0.3 | luotuo | 
https:\u002F\u002Fhuggingface.co\u002Fsilk-road\u002Fluotuo-lora-7b-0.3 | 7B | LLaMA-7B | 32 | - | 32 | - | 152K | - |\n| StableLM-Base-Alpha-7B | 2023StableLM | https:\u002F\u002Fhuggingface.co\u002Fstabilityai\u002Fstablelm-base-alpha-7b | 7B | - | 16 | - | 16 | 800B tokens | - | - |\n| StableLM-Tuned-Alpha-7B | 2023StableLM | https:\u002F\u002Fhuggingface.co\u002Fstabilityai\u002Fstablelm-tuned-alpha-7b | 7B | StableLM-Base-Alpha-7B | 16 | - | 16 | - | 632K | - |\n| Vicuna-7b-delta-v1.1 | vicuna2023 | https:\u002F\u002Fgithub.com\u002Flm-sys\u002FFastChat#vicuna-weights | 7B | LLaMA-7B | 32 | - | 32 | - | 70K | - |\n| BELLE-7B-0.2M \u002F0.6M \u002F1M \u002F2M | belle2023exploring | https:\u002F\u002Fhuggingface.co\u002FBelleGroup\u002FBELLE-7B-2M | 7.1B | Bloomz-7b1-mt | 30 | - | 30 | - | 0.2M\u002F0.6M\u002F1M\u002F2M | - |\n| BLOOM-7b1 | scao2022bloom | https:\u002F\u002Fhuggingface.co\u002Fbigscience\u002Fbloom-7b1 | 7.1B | - | 30 | - | 30 | 350B tokens | - | - |\n| BLOOMZ-7b1 \u002Fmt \u002Fp3 | muennighoff2022crosslingual | https:\u002F\u002Fhuggingface.co\u002Fbigscience\u002Fbloomz-7b1-p3 | 7.1B | BLOOM-7b1 | 30 | - | 30 | - | 4.19B tokens | - |\n| Dolly-v2-12b | 2023dolly | https:\u002F\u002Fhuggingface.co\u002Fdatabricks\u002Fdolly-v2-12b | 12B | Pythia-12B | 36 | - | 36 | - | 15K | - |\n| h2ogpt-oasst1-512-12b | 2023h2ogpt | https:\u002F\u002Fhuggingface.co\u002Fh2oai\u002Fh2ogpt-oasst1-512-12b | 12B | Pythia-12B | 36 | - | 36 | - | 94.6K | - |\n| Open-Assistant-SFT-4-12B | 2023openassistant | https:\u002F\u002Fhuggingface.co\u002FOpenAssistant\u002Foasst-sft-4-pythia-12b-epoch-3.5 | 12B | Pythia-12B-deduped | 36 | - | 36 | - | 161K | - |\n| Pythia-12B | biderman2023pythia | https:\u002F\u002Fhuggingface.co\u002FEleutherAI\u002Fpythia-12b | 12B | - | 36 | - | 36 | 300B tokens | - | - |\n| Baize-13B | xu2023baize | https:\u002F\u002Fhuggingface.co\u002Fproject-baize\u002Fbaize-lora-13B | 13B | LLaMA-13B | 40 | - | 40 | - | 263K | - |\n| Koala-13B | koala_blogpost_2023 | 
https:\u002F\u002Fhuggingface.co\u002Fyoung-geng\u002Fkoala | 13B | LLaMA-13B | 40 | - | 40 | - | 472K | - |\n| LLaMA-13B | touvron2023llama | https:\u002F\u002Fhuggingface.co\u002Fdecapoda-research\u002Fllama-13b-hf | 13B | - | 40 | - | 40 | 1T tokens | - | - |\n| StableVicuna-13B | 2023StableLM | https:\u002F\u002Fhuggingface.co\u002FCarperAI\u002Fstable-vicuna-13b-delta | 13B | Vicuna-13B v0 | 40 | - | 40 | - | 613K | ✓ |\n| Vicuna-13b-delta-v1.1 | vicuna2023 | https:\u002F\u002Fgithub.com\u002Flm-sys\u002FFastChat#vicuna-weights | 13B | LLaMA-13B | 40 | - | 40 | - | 70K | - |\n| moss-moon-003-sft | 2023moss | https:\u002F\u002Fhuggingface.co\u002Ffnlp\u002Fmoss-moon-003-sft | 16B | moss-moon-003-base | 34 | - | 34 | - | 1.1M | - |\n| moss-moon-003-sft-plugin | 2023moss | https:\u002F\u002Fhuggingface.co\u002Ffnlp\u002Fmoss-moon-003-sft-plugin | 16B | moss-moon-003-base | 34 | - | 34 | - | 1.4M | - |\n| GPT-NeoX-20B | gptneox | https:\u002F\u002Fhuggingface.co\u002FEleutherAI\u002Fgpt-neox-20b | 20B | - | 44 | - | 44 | 825GB | - | - |\n| h2ogpt-oasst1-512-20b | 2023h2ogpt | https:\u002F\u002Fhuggingface.co\u002Fh2oai\u002Fh2ogpt-oasst1-512-20b | 20B | GPT-NeoX-20B | 44 | - | 44 | - | 94.6K | - |\n| Baize-30B | xu2023baize | https:\u002F\u002Fhuggingface.co\u002Fproject-baize\u002Fbaize-lora-30B | 33B | LLaMA-30B | 60 | - | 60 | - | 263K | - |\n| LLaMA-30B | touvron2023llama | https:\u002F\u002Fhuggingface.co\u002Fdecapoda-research\u002Fllama-30b-hf | 33B | - | 60 | - | 60 | 1.4T tokens | - | - |\n| LLaMA-65B | touvron2023llama | https:\u002F\u002Fhuggingface.co\u002Fdecapoda-research\u002Fllama-65b-hf | 65B | - | 80 | - | 80 | 1.4T tokens | - | - |\n| GPT3-Davinci | brown2020language | https:\u002F\u002Fplatform.openai.com\u002Fdocs\u002Fmodels\u002Fgpt-3 | 175B | - | 96 | - | 96 | 300B tokens | - | - |\n| BLOOM | scao2022bloom | 
https:\u002F\u002Fhuggingface.co\u002Fbigscience\u002Fbloom | 176B | - | 70 | - | 70 | 366B tokens | - | - |\n| BLOOMZ \u002Fmt \u002Fp3 | muennighoff2022crosslingual | https:\u002F\u002Fhuggingface.co\u002Fbigscience\u002Fbloomz-p3 | 176B | BLOOM | 70 | - | 70 | - | 2.09B tokens | - |\n| ChatGPT (2023.05.01) | openaichatgpt | https:\u002F\u002Fplatform.openai.com\u002Fdocs\u002Fmodels\u002Fgpt-3-5 | - | GPT-3.5 | - | - | - | - | ✓ | ✓ |\n| GPT-4 (2023.05.01) | openai2023gpt4 | https:\u002F\u002Fplatform.openai.com\u002Fdocs\u002Fmodels\u002Fgpt-4 | - | - | - | - | - | - | ✓ | ✓ |\n
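\nMost rows above link to Hugging Face checkpoints, so the open models in this table can be loaded through one uniform interface. A minimal sketch, assuming the `transformers` library, with Pythia-1B chosen purely as an example:\n\n```python\n# Minimal loading sketch (ours, not the repository's): any row above whose\n# link points at huggingface.co can be pulled the same way.\nfrom transformers import AutoModelForCausalLM, AutoTokenizer\n\nmodel_id = \"EleutherAI\u002Fpythia-1b\"  # swap in any Hugging Face id from the table\ntokenizer = AutoTokenizer.from_pretrained(model_id)\nmodel = AutoModelForCausalLM.from_pretrained(model_id)\n\ninputs = tokenizer(\"Evaluating LLMs is\", return_tensors=\"pt\")\noutput_ids = model.generate(**inputs, max_new_tokens=20)\nprint(tokenizer.decode(output_ids[0], skip_special_tokens=True))\n```\n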
\n\u003Cbr>\u003Cbr>\n## Frameworks-for-Training\n\n- [Accelerate](https:\u002F\u002Fgithub.com\u002Fhuggingface\u002Faccelerate) ![](https:\u002F\u002Fimg.shields.io\u002Fgithub\u002Fstars\u002Fhuggingface\u002Faccelerate.svg?style=social) - 🚀 A simple way to train and use PyTorch models with multi-GPU, TPU, mixed-precision.\n- [Apache MXNet](https:\u002F\u002Fgithub.com\u002Fapache\u002Fmxnet) ![](https:\u002F\u002Fimg.shields.io\u002Fgithub\u002Fstars\u002Fapache\u002Fmxnet.svg?style=social) - Lightweight, Portable, Flexible Distributed\u002FMobile Deep Learning with Dynamic, Mutation-aware Dataflow Dep Scheduler.\n- [Caffe](https:\u002F\u002Fgithub.com\u002FBVLC\u002Fcaffe) ![](https:\u002F\u002Fimg.shields.io\u002Fgithub\u002Fstars\u002FBVLC\u002Fcaffe.svg?style=social) - A fast open framework for deep learning.\n- [ColossalAI](https:\u002F\u002Fgithub.com\u002Fhpcaitech\u002FColossalAI) ![](https:\u002F\u002Fimg.shields.io\u002Fgithub\u002Fstars\u002Fhpcaitech\u002FColossalAI.svg?style=social) - An integrated large-scale model training system with efficient parallelization techniques.\n- [DeepSpeed](https:\u002F\u002Fgithub.com\u002Fmicrosoft\u002FDeepSpeed) ![](https:\u002F\u002Fimg.shields.io\u002Fgithub\u002Fstars\u002Fmicrosoft\u002FDeepSpeed.svg?style=social) - DeepSpeed is a deep learning optimization library that makes distributed training and inference easy, efficient, and effective.\n- [Horovod](https:\u002F\u002Fgithub.com\u002Fhorovod\u002Fhorovod) ![](https:\u002F\u002Fimg.shields.io\u002Fgithub\u002Fstars\u002Fhorovod\u002Fhorovod.svg?style=social) - Distributed training framework for TensorFlow, Keras, PyTorch, and Apache MXNet.\n- [Jax](https:\u002F\u002Fgithub.com\u002Fgoogle\u002Fjax) ![](https:\u002F\u002Fimg.shields.io\u002Fgithub\u002Fstars\u002Fgoogle\u002Fjax.svg?style=social) - Autograd and XLA for high-performance machine learning research.\n- [Kedro](https:\u002F\u002Fgithub.com\u002Fkedro-org\u002Fkedro) ![](https:\u002F\u002Fimg.shields.io\u002Fgithub\u002Fstars\u002Fkedro-org\u002Fkedro.svg?style=social) - Kedro is an open-source Python framework for creating reproducible, maintainable and modular data science code.\n- [Keras](https:\u002F\u002Fgithub.com\u002Fkeras-team\u002Fkeras) ![](https:\u002F\u002Fimg.shields.io\u002Fgithub\u002Fstars\u002Fkeras-team\u002Fkeras.svg?style=social) - Keras is a deep learning API written in Python, running on top of the machine learning platform TensorFlow.\n- [LightGBM](https:\u002F\u002Fgithub.com\u002Fmicrosoft\u002FLightGBM) ![](https:\u002F\u002Fimg.shields.io\u002Fgithub\u002Fstars\u002Fmicrosoft\u002FLightGBM.svg?style=social) - A fast, distributed, high performance gradient boosting (GBT, GBDT, GBRT, GBM or MART) framework based on decision tree algorithms, used for ranking, classification and many other machine learning tasks.\n- [MegEngine](https:\u002F\u002Fgithub.com\u002FMegEngine\u002FMegEngine) ![](https:\u002F\u002Fimg.shields.io\u002Fgithub\u002Fstars\u002FMegEngine\u002FMegEngine.svg?style=social) - MegEngine is a fast, scalable and easy-to-use deep learning framework, with auto-differentiation.\n- [metric-learn](https:\u002F\u002Fgithub.com\u002Fscikit-learn-contrib\u002Fmetric-learn) ![](https:\u002F\u002Fimg.shields.io\u002Fgithub\u002Fstars\u002Fscikit-learn-contrib\u002Fmetric-learn.svg?style=social) - Metric Learning Algorithms in Python.\n- [MindSpore](https:\u002F\u002Fgithub.com\u002Fmindspore-ai\u002Fmindspore) ![](https:\u002F\u002Fimg.shields.io\u002Fgithub\u002Fstars\u002Fmindspore-ai\u002Fmindspore.svg?style=social) - MindSpore is a new open source deep learning training\u002Finference framework that could be used for mobile, edge and cloud scenarios.\n- [Oneflow](https:\u002F\u002Fgithub.com\u002FOneflow-Inc\u002Foneflow) ![](https:\u002F\u002Fimg.shields.io\u002Fgithub\u002Fstars\u002FOneflow-Inc\u002Foneflow.svg?style=social) - OneFlow is a performance-centered and open-source deep learning framework.\n- [PaddlePaddle](https:\u002F\u002Fgithub.com\u002FPaddlePaddle\u002FPaddle) ![](https:\u002F\u002Fimg.shields.io\u002Fgithub\u002Fstars\u002FPaddlePaddle\u002FPaddle.svg?style=social) - Machine Learning Framework from Industrial Practice.\n- [PyTorch](https:\u002F\u002Fgithub.com\u002Fpytorch\u002Fpytorch) ![](https:\u002F\u002Fimg.shields.io\u002Fgithub\u002Fstars\u002Fpytorch\u002Fpytorch.svg?style=social) - Tensors and Dynamic neural networks in Python with strong GPU acceleration.\n- [PyTorch Lightning](https:\u002F\u002Fgithub.com\u002Flightning-AI\u002Flightning) ![](https:\u002F\u002Fimg.shields.io\u002Fgithub\u002Fstars\u002Flightning-AI\u002Flightning.svg?style=social) - Deep learning framework to train, deploy, and ship AI products Lightning fast.\n- [XGBoost](https:\u002F\u002Fgithub.com\u002Fdmlc\u002Fxgboost) ![](https:\u002F\u002Fimg.shields.io\u002Fgithub\u002Fstars\u002Fdmlc\u002Fxgboost.svg?style=social) - Scalable, Portable and Distributed Gradient Boosting (GBDT, GBRT or GBM) Library.\n- [scikit-learn](https:\u002F\u002Fgithub.com\u002Fscikit-learn\u002Fscikit-learn) ![](https:\u002F\u002Fimg.shields.io\u002Fgithub\u002Fstars\u002Fscikit-learn\u002Fscikit-learn.svg?style=social) - Machine Learning in Python.\n- [TensorFlow](https:\u002F\u002Fgithub.com\u002Ftensorflow\u002Ftensorflow) ![](https:\u002F\u002Fimg.shields.io\u002Fgithub\u002Fstars\u002Ftensorflow\u002Ftensorflow.svg?style=social) - An Open Source Machine Learning Framework for Everyone.\n- [VectorFlow](https:\u002F\u002Fgithub.com\u002FNetflix\u002Fvectorflow) ![](https:\u002F\u002Fimg.shields.io\u002Fgithub\u002Fstars\u002FNetflix\u002Fvectorflow.svg?style=social) - A minimalist neural network library optimized for sparse data and single machine environments.\n
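\nSeveral of the frameworks above share the same usage pattern: wrap the model, optimizer and dataloader once, then train as usual. A minimal sketch of that pattern with Accelerate (the toy model and data are placeholders of ours, not anything from this list):\n\n```python\n# Toy training loop showing the Accelerate pattern: prepare() adapts model,\n# optimizer and dataloader to whatever device setup the launcher provides,\n# and accelerator.backward() stands in for loss.backward().\nimport torch\nfrom torch.utils.data import DataLoader, TensorDataset\nfrom accelerate import Accelerator\n\naccelerator = Accelerator()\nmodel = torch.nn.Linear(8, 2)  # placeholder model\noptimizer = torch.optim.SGD(model.parameters(), lr=1e-3)\ndataset = TensorDataset(torch.randn(64, 8), torch.randint(0, 2, (64,)))\nloader = DataLoader(dataset, batch_size=4)\n\nmodel, optimizer, loader = accelerator.prepare(model, optimizer, loader)\nloss_fn = torch.nn.CrossEntropyLoss()\nfor x, y in loader:\n    optimizer.zero_grad()\n    loss = loss_fn(model(x), y)\n    accelerator.backward(loss)\n    optimizer.step()\n```\n\nThe same script then runs unchanged on a single GPU, multiple GPUs, or TPUs via the accelerate launcher.\n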
\n\u003Cbr>\u003Cbr>\n## LLMOps\n\n| Name | Stars | Description |\n| --- | --- | --- |\n| [Byzer-LLM](https:\u002F\u002Fgithub.com\u002Fallwefantasy\u002Fbyzer-llm) | ![](https:\u002F\u002Fimg.shields.io\u002Fgithub\u002Fstars\u002Fallwefantasy\u002Fbyzer-llm.svg?style=social) | Byzer-LLM is a comprehensive large model infrastructure that supports capabilities related to large models, such as pre-training, fine-tuning, deployment, and serving. Byzer-Retrieval is a storage infrastructure specifically developed for large models, supporting batch import of various data sources, real-time single-item updates, and full-text, vector, and hybrid searches to facilitate data usage for Byzer-LLM. Byzer-SQL\u002FPython offers user-friendly interactive APIs with a low barrier to entry for utilizing the aforementioned products. |\n| [agenta](https:\u002F\u002Fgithub.com\u002FAgenta-AI\u002Fagenta) | ![](https:\u002F\u002Fimg.shields.io\u002Fgithub\u002Fstars\u002FAgenta-AI\u002Fagenta.svg?style=social) | An LLMOps platform for building powerful LLM applications. It allows for easy experimentation and evaluation of different prompts, models, and workflows to construct robust applications. |\n| [Arize-Phoenix](https:\u002F\u002Fgithub.com\u002FArize-ai\u002Fphoenix) | ![](https:\u002F\u002Fimg.shields.io\u002Fgithub\u002Fstars\u002FArize-ai\u002Fphoenix.svg?style=social) | ML observability for LLMs, vision, language, and tabular models. |\n| [BudgetML](https:\u002F\u002Fgithub.com\u002Febhy\u002Fbudgetml) | ![](https:\u002F\u002Fimg.shields.io\u002Fgithub\u002Fstars\u002Febhy\u002Fbudgetml.svg?style=social) | Deploy ML inference services on a limited budget with less than 10 lines of code. |\n| [CometLLM](https:\u002F\u002Fgithub.com\u002Fcomet-ml\u002Fcomet-llm) | ![](https:\u002F\u002Fimg.shields.io\u002Fgithub\u002Fstars\u002Fcomet-ml\u002Fcomet-llm.svg?style=social) | An open-source LLMOps platform for logging, managing, and visualizing LLM prompts and chains. It tracks prompt templates, variables, duration, token usage, and other metadata. It also scores prompt outputs and visualizes chat history in a single UI. |\n| [deeplake](https:\u002F\u002Fgithub.com\u002Factiveloopai\u002Fdeeplake) | ![](https:\u002F\u002Fimg.shields.io\u002Fgithub\u002Fstars\u002Factiveloopai\u002FHub.svg?style=social) | Stream large multimodal datasets to achieve near 100% GPU utilization. Query, visualize, and version control data. Access data without recalculating embeddings for model fine-tuning. |\n| [Dify](https:\u002F\u002Fgithub.com\u002Flanggenius\u002Fdify) | ![](https:\u002F\u002Fimg.shields.io\u002Fgithub\u002Fstars\u002Flanggenius\u002Fdify.svg?style=social) | An open-source framework that enables developers (even non-developers) to quickly build useful applications based on large language models, ensuring they are visible, actionable, and improvable. |\n| [Dstack](https:\u002F\u002Fgithub.com\u002Fdstackai\u002Fdstack) | ![](https:\u002F\u002Fimg.shields.io\u002Fgithub\u002Fstars\u002Fdstackai\u002Fdstack.svg?style=social) | Cost-effective LLM development in any cloud (AWS, GCP, Azure, Lambda, etc.). |\n| [Embedchain](https:\u002F\u002Fgithub.com\u002Fembedchain\u002Fembedchain) | ![](https:\u002F\u002Fimg.shields.io\u002Fgithub\u002Fstars\u002Fembedchain\u002Fembedchain.svg?style=social) | A framework for creating ChatGPT-like bots over your datasets. 
|\n| [GPTCache](https:\u002F\u002Fgithub.com\u002Fzilliztech\u002FGPTCache ) | ![](https:\u002F\u002Fimg.shields.io\u002Fgithub\u002Fstars\u002Fzilliztech\u002FGPTCache.svg?style=social ) | Create semantic caches to store responses to LLM queries. |\n| [Haystack](https:\u002F\u002Fgithub.com\u002Fdeepset-ai\u002Fhaystack ) | ![](https:\u002F\u002Fimg.shields.io\u002Fgithub\u002Fstars\u002Fdeepset-ai\u002Fhaystack.svg?style=social ) | Quickly build applications with LLM agents, semantic search, question answering, and more. |\n| [langchain](https:\u002F\u002Fgithub.com\u002Fhwchase17\u002Flangchain ) | ![](https:\u002F\u002Fimg.shields.io\u002Fgithub\u002Fstars\u002Fhwchase17\u002Flangchain.svg?style=social ) | Build LLM applications through composability. |\n| [LangFlow](https:\u002F\u002Fgithub.com\u002Flogspace-ai\u002Flangflow ) | ![](https:\u002F\u002Fimg.shields.io\u002Fgithub\u002Fstars\u002Flogspace-ai\u002Flangflow.svg?style=social ) | A hassle-free way to experiment with and prototype LangChain processes using drag-and-drop components and a chat interface. |\n| [LangKit](https:\u002F\u002Fgithub.com\u002Fwhylabs\u002Flangkit ) | ![](https:\u002F\u002Fimg.shields.io\u002Fgithub\u002Fstars\u002Fwhylabs\u002Flangkit.svg?style=social ) | A ready-to-use LLM telemetry collection library that extracts profiles of LLM performance over time, as well as prompts, responses, and metadata, to identify issues at scale. |\n| [LiteLLM 🚅](https:\u002F\u002Fgithub.com\u002FBerriAI\u002Flitellm\u002F ) | ![](https:\u002F\u002Fimg.shields.io\u002Fgithub\u002Fstars\u002FBerriAI\u002Flitellm.svg?style=social ) | A simple and lightweight 100-line package for standardizing LLM API calls across OpenAI, Azure, Cohere, Anthropic, Replicate, and other API endpoints. |\n| [LlamaIndex](https:\u002F\u002Fgithub.com\u002Fjerryjliu\u002Fllama_index ) | ![](https:\u002F\u002Fimg.shields.io\u002Fgithub\u002Fstars\u002Fjerryjliu\u002Fllama_index.svg?style=social ) | Provides a central interface to connect your LLMs with external data. |\n| [LLMApp](https:\u002F\u002Fgithub.com\u002Fpathwaycom\u002Fllm-app ) | ![](https:\u002F\u002Fimg.shields.io\u002Fgithub\u002Fstars\u002Fpathwaycom\u002Fllm-app.svg?style=social ) | LLM App is a Python library that helps you build real-time LLM-enabled data pipelines with just a few lines of code. |\n| [LLMFlows](https:\u002F\u002Fgithub.com\u002Fstoyan-stoyanov\u002Fllmflows ) | ![](https:\u002F\u002Fimg.shields.io\u002Fgithub\u002Fstars\u002Fstoyan-stoyanov\u002Fllmflows.svg?style=social ) | LLMFlows is a framework for building simple, clear, and transparent LLM applications, such as chatbots, question-answering systems, and agents. |\n| [LLMonitor](https:\u002F\u002Fgithub.com\u002Fllmonitor\u002Fllmonitor ) | ![](https:\u002F\u002Fimg.shields.io\u002Fgithub\u002Fstars\u002Fllmonitor\u002Fllmonitor.svg?style=social ) | Observability and monitoring for AI applications and agents. Debug agents with robust tracking and logging. Use analytical tools to delve into request history. Developer-friendly modules that can be easily integrated into LangChain. |\n| [magentic](https:\u002F\u002Fgithub.com\u002Fjackmpcollins\u002Fmagentic ) | ![](https:\u002F\u002Fimg.shields.io\u002Fgithub\u002Fstars\u002Fjackmpcollins\u002Fmagentic.svg?style=social ) | Seamlessly integrate LLMs as Python functions. Use type annotations to specify structured outputs. Combine LLM queries and function calls with regular Python code to create complex LLM-driven functionalities. 
|\n| [Pezzo 🕹️](https:\u002F\u002Fgithub.com\u002Fpezzolabs\u002Fpezzo ) | ![](https:\u002F\u002Fimg.shields.io\u002Fgithub\u002Fstars\u002Fpezzolabs\u002Fpezzo.svg?style=social ) | Pezzo is an open-source LLMOps platform built for developers and teams. With just two lines of code, you can easily troubleshoot AI operations, collaborate on and manage your prompts, and deploy changes instantly from one place. |\n| [promptfoo](https:\u002F\u002Fgithub.com\u002Ftyppo\u002Fpromptfoo ) | ![](https:\u002F\u002Fimg.shields.io\u002Fgithub\u002Fstars\u002Ftyppo\u002Fpromptfoo.svg?style=social ) | An open-source tool for testing and evaluating prompt quality. Create test cases, automatically check output quality, and catch regressions to reduce evaluation costs. |\n| [prompttools](https:\u002F\u002Fgithub.com\u002Fhegelai\u002Fprompttools ) | ![](https:\u002F\u002Fimg.shields.io\u002Fgithub\u002Fstars\u002Fhegelai\u002Fprompttools.svg?style=social ) | An open-source tool for testing and trying out prompts. The core idea is to enable developers to evaluate prompts using familiar interfaces such as code and notebooks. With just a few lines of code, you can test prompts and parameters across different models (whether you're using OpenAI, Anthropic, or LLaMA models). You can even evaluate the accuracy of vector database retrievals. |\n| [TrueFoundry](https:\u002F\u002Fwww.truefoundry.com\u002F ) | No GitHub link | Deploy LLMOps tools on your own Kubernetes (EKS, AKS, GKE, on-prem) infrastructure, including vector DBs, embedding servers, etc. It also covers deploying and fine-tuning open-source LLMs, prompt tracking, complete data security, and optimal GPU management, using software engineering best practices to train and launch LLM applications at production scale. |\n| [ReliableGPT 💪](https:\u002F\u002Fgithub.com\u002FBerriAI\u002FreliableGPT\u002F ) | ![](https:\u002F\u002Fimg.shields.io\u002Fgithub\u002Fstars\u002FBerriAI\u002FreliableGPT.svg?style=social ) | Handle OpenAI errors for your production LLM applications (overloaded OpenAI servers, rotated keys, or context window errors). |\n| [Weights & Biases (Prompts)](https:\u002F\u002Fdocs.wandb.ai\u002Fguides\u002Fprompts ) | No GitHub link | A set of LLMOps tools in the developer-focused W&B MLOps platform. Use W&B Prompts to visualize and inspect LLM execution flows, track inputs and outputs, view intermediate results, and manage prompts and LLM chain configurations. |\n| [xTuring](https:\u002F\u002Fgithub.com\u002Fstochasticai\u002Fxturing ) | ![](https:\u002F\u002Fimg.shields.io\u002Fgithub\u002Fstars\u002Fstochasticai\u002Fxturing.svg?style=social ) | Build and control your personal LLMs using fast and efficient fine-tuning. |\n| [ZenML](https:\u002F\u002Fgithub.com\u002Fzenml-io\u002Fzenml ) | ![](https:\u002F\u002Fimg.shields.io\u002Fgithub\u002Fstars\u002Fzenml-io\u002Fzenml.svg?style=social ) | An open-source framework for orchestrating, experimenting, and deploying production-grade ML solutions, with built-in `langchain` and `llama_index` integration. |
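\n\nSeveral of the tools above (LiteLLM, magentic, GPTCache) wrap heterogeneous provider APIs behind a single call signature. As a minimal sketch of that pattern, assuming LiteLLM's `completion` entry point, illustrative model names, and provider keys already set in the environment:\n\n```python\n# pip install litellm -- a sketch only; the API surface may differ across versions.\nfrom litellm import completion\n\nmessages = [{'role': 'user', 'content': 'Name one LLM evaluation benchmark.'}]\n\n# Same call shape for different providers: LiteLLM routes on the model string\n# and normalizes each response to an OpenAI-style object.\nfor model in ('gpt-4o-mini', 'claude-3-haiku-20240307'):\n    response = completion(model=model, messages=messages)\n    print(model, response.choices[0].message.content)\n```\n\nKeeping the call shape uniform is what makes side-by-side evaluation of providers cheap: the harness changes only the model string, not the plumbing.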
|\n\n\n\u003Cbr>\u003Cbr>\n## Courses\n\n- [大语言模型课程notebooks集-Large Language Model Course](https:\u002F\u002Fgithub.com\u002Fmlabonne\u002Fllm-course) - Course with a roadmap and notebooks to get into Large Language Models (LLMs).\n- [Full+Stack+LLM+Bootcamp](https:\u002F\u002Fihower.tw\u002Fnotes\u002F技術筆記-AI\u002FFull+Stack+LLM+Bootcamp) - LLM相关学习\u002F应用资源集.\n\n\n\n\n\u003Cbr>\u003Cbr>\n## Other-Awesome-Lists\n\n- [Awesome LLM](https:\u002F\u002Fgithub.com\u002FHannibal046\u002FAwesome-LLM\u002F) -  A curated list of papers about large language models.\n- [Awesome-Efficient-LLM](https:\u002F\u002Fgithub.com\u002Fhorseee\u002FAwesome-Efficient-LLM) - A curated list for Efficient Large Language Models.\n- [Awesome-production-machine-learning](https:\u002F\u002Fgithub.com\u002FEthicalML\u002Fawesome-production-machine-learning) - A curated list of awesome open source libraries to deploy, monitor, version and scale your machine learning.\n- [Awesome-marketing-datascience](https:\u002F\u002Fgithub.com\u002Funderlines\u002Fawesome-marketing-datascience) - Curated list of useful LLM \u002F Analytics \u002F Datascience resources.\n- [Awesome-llm-tools](https:\u002F\u002Fgithub.com\u002Funderlines\u002Fawesome-marketing-datascience\u002Fblob\u002Fmaster\u002Fllm-tools.md) - Curated list of useful LLM tool.\n- [Awesome-LLM-Compression](https:\u002F\u002Fgithub.com\u002FHuangOwen\u002FAwesome-LLM-Compression) - A curated list for Efficient LLM Compression.\n- [Awesome-Multimodal-Large-Language-Models](https:\u002F\u002Fgithub.com\u002FBradyFU\u002FAwesome-Multimodal-Large-Language-Models) -  A curated list of  Multimodal Large Language Models.\n- [Awesome-LLMOps](https:\u002F\u002Fgithub.com\u002Ftensorchord\u002FAwesome-LLMOps) - An awesome & curated list of the best LLMOps tools for developers.\n- [Awesome-MLops](https:\u002F\u002Fgithub.com\u002Fvisenger\u002Fawesome-mlops) - An awesome list of references for MLOps - Machine Learning Operations.\n- [Awesome ChatGPT Prompts](https:\u002F\u002Fgithub.com\u002Ff\u002Fawesome-chatgpt-prompts) - A collection of prompt examples to be used with the ChatGPT model.\n- [awesome-chatgpt-prompts-zh](https:\u002F\u002Fgithub.com\u002FPlexPt\u002Fawesome-chatgpt-prompts-zh) - A Chinese collection of prompt examples to be used with the ChatGPT model.\n- [Awesome ChatGPT](https:\u002F\u002Fgithub.com\u002Fhumanloop\u002Fawesome-chatgpt) - Curated list of resources for ChatGPT and GPT-3 from OpenAI.\n- [Chain-of-Thoughts Papers](https:\u002F\u002Fgithub.com\u002FTimothyxxx\u002FChain-of-ThoughtsPapers) -  A trend starts from \"Chain of Thought Prompting Elicits Reasoning in Large Language Models.\n- [Instruction-Tuning-Papers](https:\u002F\u002Fgithub.com\u002FSinclairCoder\u002FInstruction-Tuning-Papers) - A trend starts from `Natrural-Instruction` (ACL 2022), `FLAN` (ICLR 2022) and `T0` (ICLR 2022).\n- [LLM Reading List](https:\u002F\u002Fgithub.com\u002Fcrazyofapple\u002FReading_groups\u002F) - A paper & resource list of large language models.\n- [Reasoning using Language Models](https:\u002F\u002Fgithub.com\u002Fatfortes\u002FLM-Reasoning-Papers) - Collection of papers and resources on Reasoning using Language Models.\n- [Chain-of-Thought Hub](https:\u002F\u002Fgithub.com\u002FFranxYao\u002Fchain-of-thought-hub) - Measuring LLMs' Reasoning Performance\n- [Awesome GPT](https:\u002F\u002Fgithub.com\u002Fformulahendry\u002Fawesome-gpt) - A curated list of awesome projects and resources related to GPT, ChatGPT, OpenAI, LLM, and more.\n- [Awesome 
\n\n\n\u003Cbr>\u003Cbr>\n## Licenses\n\n[![MIT license](https:\u002F\u002Fimg.shields.io\u002Fbadge\u002FLicense-MIT-blue.svg)](https:\u002F\u002Flbesson.mit-license.org\u002F)\n\n[MIT License](https:\u002F\u002Flbesson.mit-license.org\u002F).\n\n[![CC BY-NC-SA 4.0](https:\u002F\u002Fimg.shields.io\u002Fbadge\u002FLicense-CC%20BY--NC--SA%204.0-lightgrey.svg)](http:\u002F\u002Fcreativecommons.org\u002Flicenses\u002Fby-nc-sa\u002F4.0\u002F)\n\n[Creative Commons Attribution-NonCommercial-ShareAlike 4.0 International License](http:\u002F\u002Fcreativecommons.org\u002Flicenses\u002Fby-nc-sa\u002F4.0\u002F).\n\n\u003Cbr>\u003Cbr>\n\n## Citation\n\n```bibtex\n@misc{llm-eval-anthropomorphic,\n      title={Beyond Benchmark: LLMs Evaluation with an Anthropomorphic and Value-oriented Roadmap}, \n      author={Jun Wang and Ninglun Gu and Kailai Zhang and Zijiao Zhang and Yelun Bao and Jin Yang and Xu Yin and Liwei Liu and Yihuan Liu and Pengyong Li and Gary G. Yen and Junchi Yan},\n      year={2025},\n      eprint={2508.18646},\n      archivePrefix={arXiv},\n      primaryClass={cs.AI},\n      url={https:\u002F\u002Farxiv.org\u002Fabs\u002F2508.18646}, \n}\n```\n","\u003Cdiv align=\"center\">\n    \u003Ch1>Awesome LLM Eval\u003C\u002Fh1>\n    \u003Ca href=\"https:\u002F\u002Fawesome.re\">\u003Cimg src=\"https:\u002F\u002Fawesome.re\u002Fbadge.svg\"\u002F>\u003C\u002Fa>\n\u003C\u002Fdiv>\n\n[English](README_EN.md) | [中文](README_CN.md)\n\n\nAwesome-LLM-Eval：一个精心整理的工具、数据集\u002F基准测试、演示、排行榜、论文、文档和模型列表，主要用于对大型语言模型进行评估，并探索生成式人工智能的边界与局限。\n\n这是我们的综述文章的官方项目：《超越基准：以拟人化和价值导向的路线图评估大语言模型》（arxiv.org\u002Fabs\u002F2508.18646）。\n\n**注意：** 由于我们无法实时更新 arXiv 论文，请参考此仓库获取最新信息，论文可能会在稍后更新。我们也欢迎任何拉取请求或问题，帮助我们改进这项工作。您的贡献将在\u003Ca href=\"#acknowledgements\">致谢\u003C\u002Fa>部分被提及。\n\n如果您觉得我们的综述有用，请引用我们的论文：\n\n```bibtex\n@misc{wang2025llmevalroadmap,\n      title={Beyond Benchmark: LLMs Evaluation with an Anthropomorphic and Value-oriented Roadmap}, \n      author={Jun Wang and Ninglun Gu and Kailai Zhang and Zijiao Zhang and Yelun Bao and Jin Yang and Xu Yin and Liwei Liu and Yihuan Liu and Pengyong Li and Gary G. 
Yen and Junchi Yan},\n      year={2025},\n      eprint={2508.18646},\n      archivePrefix={arXiv},\n      primaryClass={cs.AI},\n      url={https:\u002F\u002Farxiv.org\u002Fabs\u002F2508.18646}, \n}\n```\n\n\n![](https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Fonejune2018_Awesome-LLM-Eval_readme_31e6bafb0a3c.png)\n\n## 目录\n\n- [新闻](#News)\n- [工具](#Tools)\n- [数据集 \u002F 基准测试](#Datasets-or-Benchmark)\n  - [通用](#General)\n  - [领域](#Domain)\n  - [RAG-评估](#RAG-Evaluation)\n  - [智能体能力](#Agent-Capabilities)\n  - [编程能力](#Coding-Capabilities)\n  - [多模态\u002F跨模态](#Multimodal-Cross-modal)\n  - [长上下文](#Long-Context)\n  - [推理速度](#Inference-Speed)\n  - [量化与压缩](#Quantization-and-Compression)\n- [演示](#Demos)\n- [排行榜](#Leaderboards)\n- [论文](#Papers)\n- [大语言模型列表](#LLM-List)\n  - [预训练大语言模型](#Pre-trained-LLM)\n  - [指令微调的大语言模型](#Instruction-finetuned-LLM)\n  - [对齐的大语言模型](#Aligned-LLM)\n  - [开源大语言模型](#Open-LLM)\n  - [热门大语言模型](#Popular-LLM)\n- [LLMOps](#LLMOps)\n- [训练框架](#Frameworks-for-Training)\n- [课程](#Courses)\n- [其他](#Others)\n- [其他优秀列表](#Other-Awesome-Lists)\n- [许可证](#Licenses)\n- [引用](#Citation)\n\n\n![](https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Fonejune2018_Awesome-LLM-Eval_readme_a285cd76374e.gif)\n\n\n## 新闻\n\n- [2025年8月20日] 我们新增了[拟人化分类法](#Anthropomorphic-Taxonomy)部分。\n- [2024年4月26日] 我们新增了[推理速度](#Inference-Speed)部分。\n- [2024年2月26日] 我们新增了[编程评估](#Coding-Capabilities)部分。\n- [2024年2月8日] 我们加入了来自 Hugging Face 的 lighteval 工具。\n- [2024年1月15日] 我们新增了 CRUXEval（arxiv.org\u002Fabs\u002F2401.03065）、DebugBench（github.com\u002Fthunlp\u002FDebugBench）、OpenFinData（opencompass.org.cn）以及 LAiW（github.com\u002FDai-shen\u002FLAiW）。\n- [2023年12月20日] 我们新增了[RAG-评估](#RAG-Evaluation)部分。\n- [2023年11月15日] 我们新增了用于评估大语言模型指令遵循能力的 Instruction-Following-Evaluation（google-research\u002Fgoogle-research\u002Ftree\u002Fmaster\u002Finstruction_following_eval）和 LLMBar（github.com\u002Fprinceton-nlp\u002FLLMBar）。\n- [2023年10月20日] 我们新增了用于大语言模型智能体评估的 SuperCLUE-Agent（github.com\u002FCLUEbenchmark\u002FSuperCLUE-Agent）。\n- [2023年9月25日] 我们加入了来自 Colossal-AI 的 ColossalEval（github.com\u002Fhpcaitech\u002FColossalAI\u002Ftree\u002Fmain\u002Fapplications\u002FColossalEval）。\n- [2023年9月22日] 我们新增了[排行榜查找器](#Leaderboards)章节。\n- [2023年9月20日] 我们新增了来自 CLUEbenchmark 的 DeepEval（github.com\u002Fmr-gpt\u002Fdeepeval）、FinEval（github.com\u002FSUFE-AIFLM-Lab\u002FFinEval）以及 SuperCLUE-Safety（github.com\u002FCLUEbenchmark\u002FSuperCLUE-Safety）。\n- [2023年9月18日] 我们新增了来自上海人工智能实验室的 OpenCompass（github.com\u002FInternLM\u002Fopencompass\u002Ftree\u002Fmain）。\n- [2023年8月3日] 我们新增了两款中国大语言模型：Baichuan（github.com\u002Fbaichuan-inc\u002FBaichuan-13B）和 Qwen（github.com\u002FQwenLM\u002FQwen-7B）。\n- [2023年6月28日] 我们新增了 AlpacaEval（github.com\u002Ftatsu-lab\u002Falpaca_eval）以及多个工具。\n- [2023年4月26日] 我们发布了包含多个基准测试的 V0.1 版本评估列表。\n\n\n\n## 拟人化分类法\n\n### 典型智商（IQ）——通用智能评估基准\n\n| 名称                  | 年份 | 任务类型              | 机构         | 评估重点                              | 数据集       | 链接                                                          |\n| --------------------- | ---- | ---------------------- | ------------ | ------------------------------------- | ------------ | ------------------------------------------------------------ |\n| MMLU-Pro              | 2024 | 多选题知识测试        | TIGER-AI-Lab | 细致推理，减少噪声                 | MMLU-Pro     | [链接](https:\u002F\u002Fgithub.com\u002FTIGER-AI-Lab\u002FMMLU-Pro )            |\n| DyVal                 | 2024 | 动态评估              | Microsoft    | 数据污染，复杂度控制               | DyVal        | 
[链接](https:\u002F\u002Fgithub.com\u002Fmicrosoft\u002Fpromptbench )            |\n| PertEval              | 2024 | 通用                  | USTC         | 知识容量                           | PertEval     | [链接](https:\u002F\u002Fgithub.com\u002Faigc-apps\u002FPertEval )               |\n| LV-Eval               | 2024 | 长文本问答            | Infinigence-AI | 长度多样性，事实准确性             | 11个子集     | [链接](https:\u002F\u002Fgithub.com\u002Finfinigence\u002FLVEval )               |\n| LLM-Uncertainty-Bench | 2024 | NLP任务              | Tencent      | 不确定性量化                       | 5个NLP任务   | [链接](https:\u002F\u002Fgithub.com\u002Fsmartyfh\u002FLLM-Uncertainty-Bench )   |\n| CommonGen-Eval        | 2024 | 生成                  | AI2          | 常识                                 | CommonGen-lite | [链接](https:\u002F\u002Fgithub.com\u002Fallenai\u002FCommonGen-Eval )           |\n| MathBench             | 2024 | 数学                  | 上海AI实验室 | 理论与实践相结合问题解决           | 各种         | [链接](https:\u002F\u002Fgithub.com\u002Fopen-compass\u002FMathBench )           |\n| AIME                  | 2024 | 数学                  | MAA          | 美国邀请数学竞赛                   | 各种         | [链接](https:\u002F\u002Fwww.kaggle.com\u002Fdatasets\u002Fhemishveeraboina\u002Faime-problem-set-1983-2024 ) |\n| FrontierMath          | 2024 | 数学                  | Epoch AI     | 原创、具有挑战性的数学问题           | 各种         | [链接](https:\u002F\u002Fepochai.org\u002Ffiles\u002Fsample_question_transcripts.zip ) |\n| FELM                  | 2023 | 事实性                | HKUST        | 事实性                               | 847个问题    | [链接](https:\u002F\u002Fgithub.com\u002Fhkust-nlp\u002Ffelm )                   |\n| Just-Eval-Instruct    | 2023 | 通用                  | AI2 Mosaic   | 有用性，可解释性                   | 各种         | [链接](https:\u002F\u002Fgithub.com\u002FRe-Align\u002Fjust-eval )               |\n| MLAgentBench          | 2023 | 机器学习研究          | snap-stanford | 端到端机器学习任务                 | 15个任务     | [链接](https:\u002F\u002Fgithub.com\u002Fsnap-stanford\u002FMLAgentBench )       |\n| UltraEval             | 2023 | 通用                  | OpenBMB      | 轻量、灵活、快速                     | 各种         | [链接](https:\u002F\u002Fgithub.com\u002FOpenBMB\u002FUltraEval )                |\n| FMTI                  | 2023 | 透明性                | 斯坦福大学   | 模型透明性                           | 100项指标    | [链接](https:\u002F\u002Fcrfm.stanford.edu\u002Ffmti\u002F )                     |\n| BAMBOO                | 2023 | 长文本                | RUCAIBox     | 长文本建模                           | 10个数据集   | [链接](https:\u002F\u002Fgithub.com\u002FRUCAIBox\u002FBAMBOO )                  |\n| TRACE                 | 2023 | 持续学习              | 复旦大学     | 持续学习                             | 8个数据集    | [链接](https:\u002F\u002Farxiv.org\u002Fabs\u002F2310.06762 )                    |\n| ColossalEval          | 2023 | 通用                  | Colossal-AI  | 统一评估                             | 各种         | [链接](https:\u002F\u002Fgithub.com\u002Fhpcaitech\u002FColossalAI\u002Ftree\u002Fmain\u002Fapplications\u002FColossalEval ) |\n| LLMEval²              | 2023 | 通用                  | AlibabaResearch | 广泛而深入的评估                   | 2,553个样本  | [链接](https:\u002F\u002Fgithub.com\u002FAlibabaResearch\u002FDAMO-ConvAI\u002Ftree\u002Fmain\u002FWideDeep ) |\n| BigBench              | 2023 | 通用                  | Google       | 知识、语言、推理                   | 各种         | [链接](https:\u002F\u002Fgithub.com\u002Fgoogle\u002FBIG-bench )                 |\n| LucyEval              | 2023 | 通用                  | 
Oracle       | 成熟度评估                           | 各种         | [链接](http:\u002F\u002Flucyeval.besteasy.com\u002F )                       |\n| Zhujiu                | 2023 | 通用                  | IACAS        | 综合评估                             | 51个任务     | [链接](http:\u002F\u002Fwww.zhujiu-benchmark.com )                     |\n| ChatEval              | 2023 | 聊天                  | THU-NLP      | 类人评估                             | 各种         | [链接](https:\u002F\u002Fgithub.com\u002Fthunlp\u002FChatEval )                  |\n| FlagEval              | 2023 | 通用                  | BAAI         | 主观与客观评分                     | 各种         | [链接](https:\u002F\u002Fflageval.baai.ac.cn\u002F )                        |\n| AlpacaEval            | 2023 | 通用                  | tatsu-lab    | 自动评估                             | 各种         | [链接](https:\u002F\u002Ftatsu-lab.github.io\u002Falpaca_eval\u002F )            |\n| GPQA                  | 2023 | 通用                  | NYU          | 研究生级别的防搜索问答（Google-proof QA） | 各种         | [链接](https:\u002F\u002Fgithub.com\u002Fidavidrein\u002Fgpqa )                  |\n| MuSR                  | 2023 | 推理                  | Zayne Sprague | 基于叙事的推理                       | 756个        | [链接](https:\u002F\u002Fgithub.com\u002FZayne-sprague\u002FMuSR )               |\n| FreshQA               | 2023 | 知识                  | FreshLLMs    | 当今世界知识                         | 599个        | [链接](https:\u002F\u002Fgithub.com\u002Ffreshllms\u002Ffreshqa )                |\n| AGIEval               | 2023 | 通用                  | Microsoft    | 以人为中心的推理                     | NA           | [链接](https:\u002F\u002Fgithub.com\u002Fruixiangcui\u002FAGIEval )              |\n| SummEdits             | 2023 | 通用                  | Salesforce   | 不一致性检测                         | 6,348个      | [链接](https:\u002F\u002Fgithub.com\u002Fsalesforce\u002FfactualNLG )            |\n| ScienceQA             | 2022 | 推理                  | UCLA         | 科学推理                             | 21,208个     | [链接](https:\u002F\u002Fgithub.com\u002Flupantech\u002FScienceQA )              |\n| e-CARE                | 2022 | 推理                  | HIT          | 可解释的因果关系                     | 21,000个     | [链接](https:\u002F\u002Fgithub.com\u002FWaste-Wood\u002Fe-CARE )                |\n| BigBench Hard         | 2022 | 推理                  | BigBench     | 具有挑战性的子任务                   | 6,500个      | [链接](https:\u002F\u002Fgithub.com\u002Fsuzgunmirac\u002FBIG-Bench-Hard )       |\n| PlanBench             | 2022 | 推理                  | ASU          | 行动规划                             | 11,113个     | [链接](https:\u002F\u002Fgithub.com\u002Fkarthikv792\u002FLLMs-Planning )        |\n| MGSM                  | 2022 | 数学                  | Google       | 小学数学问题，涵盖10种语言         | 各种         | [链接](https:\u002F\u002Fgithub.com\u002Fgoogle-research\u002Furl-nlp\u002Ftree\u002Fmain\u002Fmgsm ) |\n| MATH                  | 2021 | 数学                  | UC Berkeley  | 数学问题解决                         | 各种         | [链接](https:\u002F\u002Fgithub.com\u002Fhendrycks\u002Fmath\u002F )                  |\n| GSM8K                 | 2021 | 数学                  | OpenAI       | 多样化的小学数学应用题               | 各种         | [链接](https:\u002F\u002Fgithub.com\u002Fopenai\u002Fgrade-school-math )         |\n| SVAMP                 | 2021 | 数学                  | Microsoft    | 算术推理                             | 1,000个      | [链接](https:\u002F\u002Fgithub.com\u002Farkilpatel\u002FSVAMP )                 |\n| SpartQA               | 2021 | 推理              
    | MSU          | 文本空间问答                         | 510个        | [链接](https:\u002F\u002Fgithub.com\u002FHLR\u002FSpartQA-baselines )            |\n| MLSUM                 | 2020 | 通用                  | Thomas Scialom | 新闻摘要                             | 535,062个    | [链接](https:\u002F\u002Fgithub.com\u002FThomasScialom\u002FMLSUM )              |\n| Natural Questions     | 2019 | 语言、推理            | Google       | 基于搜索的问答                       | 300,000个    | [链接](https:\u002F\u002Fgithub.com\u002Fgoogle-research-datasets\u002Fnatural-questions ) |\n| ANLI                  | 2019 | 语言、推理            | Facebook AI  | 对抗性推理                           | 169,265个    | [链接](https:\u002F\u002Fgithub.com\u002Ffacebookresearch\u002Fanli )            |\n| BoolQ                 | 2019 | 语言、推理            | Google       | 二元问答                             | 16,000个     | [链接](https:\u002F\u002Fgithub.com\u002Fgoogle-research-datasets\u002Fboolean-questions ) |\n| SuperGLUE             | 2019 | 语言、推理            | NYU          | 高级GLUE任务                         | NA           | [链接](https:\u002F\u002Fgithub.com\u002Fnyu-mll\u002Fjiant )                    |\n| DROP                  | 2019 | 语言、推理            | UCI NLP      | 段落级推理                           | 96,000个     | [链接](https:\u002F\u002Fgithub.com\u002FEleutherAI\u002Flm-evaluation-harness ) |\n| HellaSwag             | 2019 | 语言、推理            | AI2          | 常识推理                             | 59,950个     | [链接](https:\u002F\u002Fgithub.com\u002Frowanz\u002Fhellaswag )                 |\n| Winogrande            | 2019 | 语言、推理            | AI2          | 代词消歧义                           | 44,000个     | [链接](https:\u002F\u002Fgithub.com\u002Fallenai\u002Fwinogrande )               |\n| PIQA                  | 2019 | 语言、推理            | AI2          | 物理交互问答                         | 18,000个     | [链接](https:\u002F\u002Fgithub.com\u002Fybisk\u002Fybisk.github.io\u002Ftree\u002Fmaster\u002Fpiqa ) |\n| HotpotQA              | 2018 | 语言、推理            | HotpotQA     | 可解释的问答                         | 113,000个    | [链接](https:\u002F\u002Fgithub.com\u002Fhotpotqa\u002Fhotpot )                  |\n| GLUE                  | 2018 | 语言、推理            | NYU          | 基础NLU任务                          | NA           | [链接](https:\u002F\u002Fgithub.com\u002Fnyu-mll\u002FGLUE-baselines )           |\n| OpenBookQA            | 2018 | 语言、推理            | AI2          | 开放书本考试                         | 12,000个     | [链接](https:\u002F\u002Fgithub.com\u002Fallenai\u002FOpenBookQA )               |\n| SQuAD2.0              | 2018 | 语言、推理            | 斯坦福大学   | 无法回答的问题                       | 150,000个    | [链接](https:\u002F\u002Frajpurkar.github.io\u002FSQuAD-explorer\u002F )         |\n| ARC                   | 2018 | 语言、推理            | AI2          | AI2推理挑战                          | 7,787个      | [链接](https:\u002F\u002Fgithub.com\u002Fallenai\u002Faristo-leaderboard )       |\n| SWAG                  | 2018 | 语言、推理            | AI2          | 对抗性的常识推理                     | 113,000个    | [链接](https:\u002F\u002Fgithub.com\u002Frowanz\u002Fswagaf )                    |\n| CommonsenseQA         | 2018 | 语言、推理            | AI2          | 常识推理                             | 12,102个     | [链接](https:\u002F\u002Fgithub.com\u002Fjonathanherzig\u002Fcommonsenseqa )     |\n| RACE                  | 2017 | 语言、推理            | CMU          | 考试形式的问答                       | 100,000个    | [链接](https:\u002F\u002Fwww.cs.cmu.edu\u002F~glai1\u002Fdata\u002Frace\u002F )            |\n| SciQ  
                 | 2017 | 语言、推理            | AI2          | 众包的科学知识                       | 13,700个     | [链接](https:\u002F\u002Fhuggingface.co\u002Fdatasets\u002Fallenai\u002Fsciq )        |\n| TriviaQA              | 2017 | 语言、推理            | AI2          | 远程监督                             | 650,000个    | [链接](https:\u002F\u002Fgithub.com\u002Fmandarjoshi90\u002Ftriviaqa )           |\n| MultiNLI              | 2017 | 语言、推理            | NYU          | 跨文体蕴含关系                       | 433,000个    | [链接](https:\u002F\u002Fgithub.com\u002Fnyu-mll\u002FmultiNLI )                 |\n| SQuAD                 | 2016 | 语言、推理            | 斯坦福大学   | 基于维基百科的问答                   | 100,000个    | [链接](https:\u002F\u002Frajpurkar.github.io\u002FSQuAD-explorer\u002F )         |\n| LAMBADA               | 2016 | 语言、推理            | CIMEC        | 话语上下文                           | 12,684个     | [链接](https:\u002F\u002Fhuggingface.co\u002Fdatasets\u002Fcimec\u002Flambada )       |\n| MS MARCO              | 2016 | 语言、推理            | Microsoft    | 基于搜索的问答                       | 1,112,939个  | [链接](https:\u002F\u002Fmicrosoft.github.io\u002Fmsmarco\u002F )                |\n\n### 典型职业商数（PQ）——职业专业能力评估基准\n\n| 领域     | 名称                  | 机构              | 任务范围                                         | 独特贡献                                         | 链接                                                          |\n| ---------- | --------------------- | ------------------------ | ------------------------------------------------------ | ------------------------------------------------------------ | ------------------------------------------------------------ |\n|            | BLURB                 | 微软                     | 六种多样化的NLP任务，十三个数据集               | 所有任务上的宏平均得分                       | [链接](https:\u002F\u002Fmicrosoft.github.io\u002FBLURB\u002Findex.html )        |\n|            | Seismometer           | Epic                     | 使用本地数据和工作流程                         | 患者人口统计学特征、临床干预及治疗结果       | [链接](https:\u002F\u002Fgithub.com\u002Fepic-open-source\u002Fseismometer )     |\n| 医疗保健 | MedBench              | OpenMEDLab               | 强调科学严谨性和公平性                           | 来自医学考试和报告的40,041个问题              | [链接](https:\u002F\u002Fgithub.com\u002Fopen-compass\u002Fopencompass\u002Ftree\u002Fmain\u002Fopencompass\u002Fdatasets\u002Fmedbench\u002F ) |\n|            | GenMedicalEval        | 上海交通大学             | 16个专业领域，3个训练阶段，6种临床场景           | 开放式指标和自动化评估模型                   | [链接](https:\u002F\u002Fgithub.com\u002FMediaBrain-SJTU\u002FGenMedicalEval )   |\n|            | PsyEval               | SJTU                     | 六个子任务，涵盖三个维度                       | 面向心理健康大语言模型的定制化基准测试       | [链接](https:\u002F\u002Farxiv.org\u002Fabs\u002F2311.09189 )                    |\n|            | Fin-Eva               | 蚂蚁集团                | 财富管理、保险、投资研究                         | 同时包含工业界和学术界的金融评估             | [链接](https:\u002F\u002Fgithub.com\u002Falipay\u002Ffinancial_evaluation_dataset ) |\n| 金融     | FinEval               | SUFE-AIFLM-Lab           | 关于金融、经济学和会计的选择题问答               | 专注于高质量的评估题目                         | [链接](https:\u002F\u002Fgithub.com\u002FSUFE-AIFLM-Lab\u002FFinEval )           |\n|            | OpenFinData           | 上海人工智能实验室      | 多场景金融任务                                   | 首个全面的金融评估数据集                       | [链接](https:\u002F\u002Fopencompass.org.cn )                          |\n|            | FinBen                | FinAI                    | 23项金融任务中的35个数据集                       | 
归纳推理、定量推理                             | [链接](https:\u002F\u002Fgithub.com\u002FThe-FinAI\u002FPIXIU )                  |\n|            | LAiW                  | 四川大学               | 13项基础法律NLP任务                              | 将法律NLP能力划分为三大核心能力                 | [链接](https:\u002F\u002Fgithub.com\u002FDai-shen\u002FLAiW )                    |\n| 法律     | LawBench              | 南京大学               | 法律实体识别、阅读理解                           | 真实世界任务，“弃权率”指标                   | [链接](https:\u002F\u002Fgithub.com\u002Fopen-compass\u002Flawbench )            |\n|            | LegalBench            | 斯坦福大学             | 162项任务，涵盖六种法律推理类型                 | 促进跨学科对话                                 | [链接](https:\u002F\u002Fgithub.com\u002FHazyResearch\u002Flegalbench\u002F )         |\n|            | LexEval               | 清华大学               | 组织不同任务的法律认知能力                       | 更大的法律评估数据集，探讨伦理问题           | [链接](https:\u002F\u002Fgithub.com\u002FCSHaitao\u002FLexEval )                 |\n|            | SPEC5G                | 普渡大学               | 安全相关的文本分类与摘要生成                     | 5G协议分析自动化                               | [链接](https:\u002F\u002Fgithub.com\u002FImtiazkarimik23\u002FSPEC5G )           |\n| 电信     | TeleQnA               | 华为（巴黎）            | 通用电信咨询                                     | 精通电信相关问题                               | [链接](https:\u002F\u002Fgithub.com\u002Fnetop-team\u002FTeleQnA )               |\n|            | OpsEval               | 清华大学               | 有线网络运维、5G、数据库运维                     | 专注于AIOps，评估熟练程度                     | [链接](https:\u002F\u002Farxiv.org\u002Fabs\u002F2310.07637 )                    |\n|            | TelBench              | SK电信                 | 数学建模、开放式问答、代码生成                   | 电信领域的整体评估                             | [链接](https:\u002F\u002Farxiv.org\u002Fabs\u002F2407.09424v1 )                  |\n|            | TelecomGPT            | 阿联酋                 | 电信数学建模、开放式问答和代码任务               | 电信领域的整体评估                             | [链接](https:\u002F\u002Farxiv.org\u002Fabs\u002F2407.09424v1 )                  |\n|            | Linguistic            | 皇后大学               | 多项以语言为中心的任务                           | 零样本评估                                     | [链接](https:\u002F\u002Farxiv.org\u002Fabs\u002F2402.15818 )                    |\n|            | TelcoLM               | Orange                   | 多项选择题问卷                                   | 领域特定数据（8亿token，8万条指令）             | [链接](https:\u002F\u002Farxiv.org\u002Fabs\u002F2412.15891 )                    |\n|            | ORAN-Bench-13K        | GMU                      | 多项选择题                                       | 开放式无线接入网（O-RAN）                      | [链接](https:\u002F\u002Fgithub.com\u002Fprnshv\u002FORAN-Bench-13K )            |\n|            | Open-Telco Benchmarks | GSMA                     | 多项以语言为中心的任务                           | 零样本评估                                     | [链接](https:\u002F\u002Fwww.gsma.com\u002Fget-involved\u002Fgsma-foundry\u002Fgsma-open-telco-llm-benchmarks\u002F ) |\n|            | FullStackBench        | 字节跳动               | 代码编写、调试、代码审查                         | 汇集了最新的Stack Overflow问答                | [链接](https:\u002F\u002Fgithub.com\u002Fbytedance\u002FFullStackBench )         |\n| 编程     | StackEval             | Prosus AI                | 11个真实场景，16种编程语言                       | 在多样化且实用的编程环境中进行评估           | [链接](https:\u002F\u002Fgithub.com\u002FProsusAI\u002Fstack-eval )              |\n|            | CodeBenchGen          | 多家机构               | 基于执行的代码生成任务  
                         | 基准可随规模与复杂度扩展                   | [链接](https:\u002F\u002Farxiv.org\u002Fabs\u002F2404.00566 )                    |\n|            | HumanEval             | OpenAI                 | 严格的测试                                       | 对生成代码正确性的评估采用更严格的协议       | [链接](https:\u002F\u002Farxiv.org\u002Fabs\u002F2107.03374 )                    |\n|            | APPS                  | 加州大学               | 来自竞技平台的编码挑战                           | 检查生成代码在测试用例上的解题能力             | [链接](https:\u002F\u002Fgithub.com\u002Fhendrycks\u002Fapps )                   |\n|            | MBPP                  | 谷歌研究院             | 来自不同来源的编程问题                           | 多样化的编程任务                               | [链接](https:\u002F\u002Fgithub.com\u002Fgoogle-research\u002Fgoogle-research\u002Ftree\u002Fmaster\u002Fmbpp ) |\n|            | ClassEval             | 复旦大学               | 类级别的代码生成                                 | 手工制作，面向对象编程概念                     | [链接](https:\u002F\u002Fgithub.com\u002FFudanSELab\u002FClassEval )             |\n|            | CoderEval             | 北京大学               | 实用的代码生成                                   | 能够针对描述的问题生成有效的代码补丁         | [链接](https:\u002F\u002Fgithub.com\u002FCoderEval\u002FCoderEval )              |\n|            | MultiPL-E             | 美国东北大学           | 神经网络代码生成                                 | 用于基准测试神经网络代码生成模型               | [链接](https:\u002F\u002Fgithub.com\u002Fnuprl\u002FMultiPL-E )                  |\n|            | CodeXGLUE             | 微软                   | 代码智能                                           | 涵盖广泛的任务：代码-代码、文本-代码、代码-文本以及文本-文本 | [链接](https:\u002F\u002Fgithub.com\u002Fmicrosoft\u002FCodeXGLUE )              |\n|            | EvoCodeBench          | 北京大学               | 进化型代码生成基准                               | 与真实世界的代码库对齐，并随时间不断进化     | [链接](https:\u002F\u002Fgithub.com\u002Fseketeam\u002FEvoCodeBench )            |\n\n### 典型情商（EQ）——对齐能力评估基准\n\n| 名称 | 年份 | 任务类型 | 机构 | 类别 | 数据集 | 链接 |\n| --- | --- | --- | --- | --- | --- | --- |\n| DiffAware | 2025 | 偏差 | 斯坦福大学 | 通用偏差 | 8个数据集 | [链接](https:\u002F\u002Fgithub.com\u002FAngelina-Wang\u002Fdifference_awareness ) |\n| CASE-Bench | 2025 | 安全性 | 剑桥大学 | 上下文感知安全性 | CASE-Bench | [链接](https:\u002F\u002Fgithub.com\u002FBriansIDP\u002FCASEBench ) |\n| Fairness | 2025 | 公平性 | 宾夕法尼亚州立大学 | 分配公平性 | - | - |\n| HarmBench | 2024 | 安全性 | 伊利诺伊大学厄巴纳-香槟分校 | 对抗性行为 | 510 | [链接](https:\u002F\u002Fgithub.com\u002Fcenterforaisafety\u002FHarmBench ) |\n| SimpleQA | 2024 | 安全性 | OpenAI | 事实准确性 | 4,326 | [链接](https:\u002F\u002Fgithub.com\u002Fopenai\u002Fsimple-evals ) |\n| AgentHarm | 2024 | 安全性 | 英国AI安全研究所（UK AISI） | 恶意代理任务 | 110 | [链接](https:\u002F\u002Fgithub.com\u002FUKGovernmentBEIS\u002Finspect_evals ) |\n| StrongReject | 2024 | 安全性 | dsbowen | 抗攻击性 | 不适用 | [链接](https:\u002F\u002Fgithub.com\u002Fdsbowen\u002Fstrong_reject ) |\n| LLMBar | 2024 | 指令遵循 | 普林斯顿大学 | 指令遵循 | 419个实例 | [链接](https:\u002F\u002Fgithub.com\u002Fprinceton-nlp\u002FLLMBar ) |\n| AIR-Bench | 2024 | 安全性 | 斯坦福大学 | 监管对齐 | 5,694 | [链接](https:\u002F\u002Fgithub.com\u002Fstanford-crfm\u002Fair-bench-2024 ) |\n| TrustLLM | 2024 | 通用 | TrustLLM | 可信度 | 30+ | [链接](https:\u002F\u002Ftrustllmbenchmark.github.io\u002FTrustLLM-Website\u002F ) |\n| RewardBench | 2024 | 对齐 | AI2 | 人类偏好 | RewardBench | [链接](https:\u002F\u002Fgithub.com\u002Fallenai\u002Freward-bench ) |\n| EQ-Bench | 2024 | 情感 | Paech | 情商 | 171个问题 | [链接](https:\u002F\u002Fgithub.com\u002FEQ-bench\u002FEQ-Bench ) |\n| Forbidden | 2023 | 安全性 | CISPA | 越狱检测 | 15,140 | 
[链接](https:\u002F\u002Fgithub.com\u002Fverazuo\u002Fjailbreak_llms ) |\n| MaliciousInstruct | 2023 | 安全性 | 普林斯顿大学 | 恶意意图 | 100 | [链接](https:\u002F\u002Fgithub.com\u002FPrinceton-SysML\u002FJailbreak_LLM ) |\n| SycophancyEval | 2023 | 安全性 | Anthropic | 观点对齐 | 不适用 | [链接](https:\u002F\u002Fgithub.com\u002Fmeg-tong\u002Fsycophancy-eval ) |\n| DecodingTrust | 2023 | 安全性 | 伊利诺伊大学厄巴纳-香槟分校 | 可信度 | 243,877 | [链接](https:\u002F\u002Fgithub.com\u002FAI-secure\u002FDecodingTrust ) |\n| AdvBench | 2023 | 安全性 | 卡内基梅隆大学 | 对抗性攻击 | 1,000 | [链接](https:\u002F\u002Fgithub.com\u002Fllm-attacks\u002Fllm-attacks ) |\n| XSTest | 2023 | 安全性 | 博科尼大学 | 安全性过度 | 450 | [链接](https:\u002F\u002Fgithub.com\u002Fpaul-rottger\u002Fexaggerated-safety ) |\n| OpinionQA | 2023 | 安全性 | tatsu-lab | 人口统计学对齐 | 1,498 | [链接](https:\u002F\u002Fgithub.com\u002Ftatsu-lab\u002Fopinions_qa ) |\n| SafetyBench | 2023 | 安全性 | 清华大学 | 内容安全 | 11,435 | [链接](https:\u002F\u002Fgithub.com\u002Fthu-coai\u002FSafetyBench ) |\n| HarmfulQA | 2023 | 安全性 | declare-lab | 有害主题 | 1,960 | [链接](https:\u002F\u002Fgithub.com\u002Fdeclare-lab\u002Fred-instruct ) |\n| QHarm | 2023 | 安全性 | vinid | 安全抽样 | 100 | [链接](https:\u002F\u002Fgithub.com\u002Fvinid\u002Fsafety-tuned-llamas ) |\n| BeaverTails | 2023 | 安全性 | 北京大学 | 红队测试 | 334,000 | [链接](https:\u002F\u002Fgithub.com\u002FPKU-Alignment\u002Fbeavertails ) |\n| DoNotAnswer | 2023 | 安全性 | Libr-AI | 安全机制 | 939 | [链接](https:\u002F\u002Fgithub.com\u002FLibr-AI\u002Fdo-not-answer ) |\n| AlignBench | 2023 | 对齐 | THUDM | 对齐、可靠性 | 各种 | [链接](https:\u002F\u002Fgithub.com\u002FTHUDM\u002FAlignBench ) |\n| IFEval | 2023 | 指令遵循 | 谷歌 | 指令遵循 | 500个提示 | [链接](https:\u002F\u002Fgithub.com\u002Fgoogle-research\u002Fgoogle-research\u002Ftree\u002Fmaster\u002Finstruction_following_eval ) |\n| ToxiGen | 2022 | 安全性 | 微软 | 毒性检测 | 274,000 | [链接](https:\u002F\u002Fgithub.com\u002Fmicrosoft\u002FTOXIGEN ) |\n| HHH | 2022 | 安全性 | Anthropic | 人类偏好 | 44,849 | [链接](https:\u002F\u002Fgithub.com\u002Fanthropics\u002Fhh-rlhf ) |\n| RedTeam | 2022 | 安全性 | Anthropic | 红队测试 | 38,921 | [链接](https:\u002F\u002Fgithub.com\u002Fanthropics\u002Fhh-rlhf ) |\n| BOLD | 2021 | 偏差 | 亚马逊 | 生成中的偏差 | 23,679 | [链接](https:\u002F\u002Fgithub.com\u002Famazon-science\u002Fbold ) |\n| BBQ | 2021 | 偏差 | 纽约大学 | 社会偏见 | 58,492 | [链接](https:\u002F\u002Fgithub.com\u002Fnyu-mll\u002FBBQ ) |\n| StereoSet | 2020 | 偏差 | 麦吉尔大学 | 刻板印象检测 | 4,229 | [链接](https:\u002F\u002Fgithub.com\u002Fmoinnadeem\u002FStereoSet ) |\n| ETHICS | 2020 | 伦理 | 伯克利大学 | 道德判断 | 134,400 | [链接](https:\u002F\u002Fgithub.com\u002Fhendrycks\u002Fethics ) |\n| ToxicityPrompt | 2020 | 安全性 | AllenAI | 毒性评估 | 99,442 | [链接](https:\u002F\u002Fgithub.com\u002Fallenai\u002Freal-toxicity-prompts ) |\n| CrowS-Pairs | 2020 | 偏差 | 纽约大学 | 刻板印象测量 | 1,508 | [链接](https:\u002F\u002Fgithub.com\u002Fnyu-mll\u002Fcrows-pairs ) |\n| SEAT | 2019 | 偏差 | 普林斯顿大学 | 编码器偏见 | 不适用 | [链接](https:\u002F\u002Fgithub.com\u002FW4ngatang\u002Fsent-bias ) |\n| WinoGender | 2018 | 偏差 | 马萨诸塞大学 | 性别偏见 | 720 | [链接](https:\u002F\u002Fgithub.com\u002Frudinger\u002Fwinogender-schemas ) |\n\n## 工具\n\n|       名称        |  组织机构   |                           网站                            |                         描述                          |\n| :---------------: | :-------------: | :----------------------------------------------------------: | :----------------------------------------------------------: |\n|  prometheus-eval  | prometheus-eval | [prometheus-eval](https:\u002F\u002Fgithub.com\u002Fprometheus-eval\u002Fprometheus-eval) | 
PROMETHEUS开源评估专用语言模型，性能较其前代版本更强大。它能够高度模拟人类及GPT-4的判断结果。此外，该模型支持直接评分和成对排序两种评估格式，并可结合用户自定义的评估标准使用。在四项直接评分基准和四项成对排序基准上，PROMETHEUS 2与人类评估者及专有语言模型的相关性和一致性均位居所有已测试的开源评估语言模型之首（2024年5月4日）。 |\n|   athina-evals    |    athina-ai    |    [athina-ai](https:\u002F\u002Fgithub.com\u002Fathina-ai\u002Fathina-evals)    | Athina-ai是一个开源库，提供即插即用的预设评估工具以及模块化、可扩展的框架，用于编写和运行评估任务。它帮助工程师通过评估驱动的开发方法，系统性地提升大型语言模型的可靠性和性能。Athina-ai提供了一套评估驱动的开发体系，克服了传统工作流程的局限性，支持快速实验，并为用户提供具有统一指标的可定制评估器。 |\n| LeaderboardFinder |   Huggingface   | [LeaderboardFinder](https:\u002F\u002Fhuggingface.co\u002Fspaces\u002Fleaderboards\u002FLeaderboardFinder) | LeaderboardFinder可以帮助您为特定场景找到合适的排行榜，堪称“排行榜中的排行榜”（2024年4月2日）。 |\n|     LightEval     |   Huggingface   |    [lighteval](https:\u002F\u002Fgithub.com\u002Fhuggingface\u002Flighteval)     | LightEval是Hugging Face开发的一个轻量级大型语言模型（LLM）评估框架。最初作为内部工具，用于评估Hugging Face新发布的LLM数据处理库datatrove和LLM训练库nanotron，现已开源供社区使用和改进。LightEval的主要特点包括：(1) 轻量化设计，易于使用和集成；(2) 支持多任务和多模型的评估套件；(3) 兼容CPU或GPU上的评估，并可与Hugging Face的加速库Accelerate以及Nanotron等框架集成；(4) 支持分布式评估，尤其适用于大型模型的评估；(5) 可应用于Open LLM Leaderboard上的所有基准；(6) 可定制性，允许用户添加新的指标和任务以满足特定的评估需求（2024年2月8日）。 |\n|  LLM Comparator   |     Google      |    [LLM Comparator](https:\u002F\u002Farxiv.org\u002Fhtml\u002F2402.10524v1)     | 一种用于比较和评估大型语言模型（LLM）的可视化分析工具。相较于传统的手工评估方法，该工具提供了一种可扩展的自动化比较评估方案。它利用另一款LLM作为评估者，展示不同模型之间的质量差异，并解释这些差异的原因。通过交互式表格和摘要可视化，LLM Comparator帮助用户理解模型在特定情境下表现优异或不佳的原因，以及模型响应之间的定性差异。该工具由Google的研究人员和工程师合作开发，在Google内部得到广泛应用，三个月内吸引了超过400名用户，评估了超过1,000个实验（2024年2月16日）。 |\n|   Arthur Bench    |    Arthur-AI    |      [Arthur Bench](https:\u002F\u002Fgithub.com\u002Farthur-ai\u002Fbench)      | Arthur Bench是一款开源评估工具，旨在比较和分析大型语言模型（LLM）的性能。它支持多种评估任务，包括问答、摘要、翻译和代码生成，并提供关于LLM在这些任务中表现的详细报告。Arthur Bench的关键特性与优势包括：(1) 模型比较功能，可评估不同供应商、不同版本以及不同训练数据集的LLM；(2) 提示词和超参数评估，考察不同提示词对LLM性能的影响，并测试通过各种超参数设置来控制模型行为的效果；(3) 任务定义与模型选择，允许用户自定义评估任务，并从一系列支持的LLM模型中选择评估对象；(4) 参数配置功能，使用户能够调整提示词和超参数以精细控制LLM的行为；(5) 自动化评估流程，简化评估任务的执行；(6) 应用场景涵盖模型选择与验证、预算与隐私优化，以及将学术基准转化为实际性能评估。此外，它还提供全面的评分指标，支持本地和云端版本，并鼓励社区协作与项目发展（2023年10月6日）。 |\n| llm-benchmarker-suite | FormulaMonks | [llm-benchmarker-suite](https:\u002F\u002Fgithub.com\u002FFormulaMonks\u002Fllm-benchmarker-suite) | 这项开源计划旨在解决LLM基准测试领域的碎片化和模糊性问题。该套件提供结构化的评估方法、多样化的基准集合以及工具包，以简化LLM性能的评估过程。通过提供一个通用平台，该项目致力于促进自然语言处理领域的协作、透明度和高质量研究。 |\n| autoevals | braintrust | [autoevals](https:\u002F\u002Fgithub.com\u002Fbraintrustdata\u002Fautoevals) | AutoEvals是一款AI模型输出评估工具，采用最佳实践快速简便地评估AI模型的输出。它集成了多种自动评估方法，支持自定义评估提示和自定义评分器，简化了模型输出的评估流程。Autoevals包含针对各类主观任务的模型评分评估，如事实核查、安全性等。其中许多评估基于OpenAI优秀的evals项目，但以灵活的方式实现，允许用户调整提示并调试输出。 |\n| EVAL | OPENAI | [EVAL](https:\u002F\u002Fgithub.com\u002Fopenai\u002Fevals) | EVAL是OpenAI开发的一款用于评估大型语言模型（LLM）的工具。它可以测试模型在不同任务和数据集上的性能及泛化能力。 |\n| lm-evaluation-harness | EleutherAI | [lm-evaluation-harness](https:\u002F\u002Fgithub.com\u002FEleutherAI\u002Flm-evaluation-harness) | lm-evaluation-harness是EleutherAI开发的一款用于评估大型语言模型（LLM）的工具。它可以测试模型在不同任务和数据集上的性能及泛化能力。 |\n| lm-evaluation | AI21Labs | [lm-evaluation](https:\u002F\u002Fgithub.com\u002FAI21Labs\u002Flm-evaluation) | 对Jurassic-1技术论文[原文](https:\u002F\u002Fwww.ai21.com\u002Fblog\u002Fannouncing-ai21-studio-and-jurassic-1)中的结果进行评估和复现，目前支持通过AI21 Studio API和OpenAI的GPT-3 API同时运行任务。 |\n| OpenCompass | 上海人工智能实验室 | [OpenCompass](https:\u002F\u002Fgithub.com\u002FInternLM\u002Fopencompass\u002Ftree\u002Fmain) | OpenCompass是一个用于评估大型模型的一站式平台。其主要特点包括：开源且可复现的评估方案；覆盖五大领域的综合能力维度，拥有超过50个数据集和约30万道题目，用于评估模型能力；支持超过20种Hugging 
Face和API模型；支持分布式高效评估，可通过一行命令完成任务拆分并进行分布式评估，从而在数小时内完成万亿参数模型的全面评估；提供多样化的评估范式，支持零样本、少样本和思维链式评估，并配备标准或对话式提示模板，以轻松激发模型的最佳性能。 |\n| Phase AI提供的大型语言模型评估与工作流框架 | wgryc | [phasellm](https:\u002F\u002Fgithub.com\u002Fwgryc\u002Fphasellm) | Phase AI提供的一套用于评估和管理LLM的框架，帮助用户选择合适的模型、数据集和指标，并对结果进行可视化和分析。 |\n| LLM评估基准 | FreedomIntelligence | [LLMZoo](https:\u002F\u002Fgithub.com\u002FFreedomIntelligence\u002FLLMZoo) | LLMZoo是由FreedomIntelligence开发的LLM评估基准，包含多个领域和任务的数据集、指标以及带有结果的预训练模型。 |\n| 语言模型整体评估（HELM） | 斯坦福大学 | [HELM](https:\u002F\u002Fgithub.com\u002Fstanford-crfm\u002Fhelm) | HELM是由斯坦福大学研究团队提出的一种综合性LLM评估方法，综合考虑模型的语言能力、知识、推理、公平性和安全性等多个方面。 |\n| 用于问答的轻量级评估工具 | Langchain | [auto-evaluator](https:\u002F\u002Fgithub.com\u002Frlancemartin\u002Fauto-evaluator) | auto-evaluator是Langchain开发的一款用于评估问答系统的轻量级工具。它可以自动生成问答题目，并计算模型的准确率、召回率和F1分数等指标。 |\n| PandaLM | WeOpenML | [PandaLM](https:\u002F\u002Fgithub.com\u002FWeOpenML\u002FPandaLM) | PandaLM是WeOpenML开发的一款LLM评估工具，用于自动化和可重复的评估。它允许用户根据自身需求和偏好选择合适的数据集、指标和模型，并生成报告和图表。 |\n| FlagEval | 智源研究院（BAAI） | [FlagEval](https:\u002F\u002Fgithub.com\u002FFlagOpen\u002FFlagEval) | FlagEval是智源研究院开发的LLM评估平台，提供多项任务和数据集，以及在线测试、排行榜和分析功能。 |\n| AlpacaEval | tatsu-lab | [alpaca_eval](https:\u002F\u002Fgithub.com\u002Ftatsu-lab\u002Falpaca_eval) | AlpacaEval是tatsu-lab开发的一款LLM评估工具，能够测试模型在多种语言、领域和任务中的表现，并提供可解释性、鲁棒性和可信度等指标。 |\n| Prompt flow | 微软 | [promptflow](https:\u002F\u002Fgithub.com\u002Fmicrosoft\u002Fpromptflow) | 这是一套由微软设计的开发工具，旨在简化基于LLM的AI应用的端到端开发周期，从构思、原型制作、测试和评估，到生产部署和监控。它使提示工程更加容易，并支持开发产品级的LLM应用程序。 |\n| DeepEval | mr-gpt | [DeepEval](https:\u002F\u002Fgithub.com\u002Fconfident-ai\u002Fdeepeval) | DeepEval是一个简单易用的开源LLM评估框架。它类似于Pytest，但专门用于LLM输出的单元测试，结合最新研究成果，依据G-Eval、幻觉、答案相关性、RAGAS等指标，利用本地运行的LLM及其他NLP模型对LLM输出进行评估。 |\n| CONNER | 腾讯AI实验室 | [CONNER](https:\u002F\u002Fgithub.com\u002FChanLiang\u002FCONNER) | CONNER是一个全面的大模型知识评估框架，旨在从六个关键视角——真实性、相关性、连贯性、信息量、实用性和有效性——系统地自动评估生成的信息。 |
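\n\n上表中的 DeepEval、promptfoo 等工具都把 LLM 输出当作可断言的被测对象，按单元测试的方式组织评估。下面是一个与具体工具无关的极简示意（纯标准库、Pytest 风格；`ask_llm` 为假设的被测接口，关键词与长度阈值仅作演示）：\n\n```python\n# 以单元测试方式检验 LLM 输出的示意代码（非任何上述工具的官方 API）。\n\ndef ask_llm(prompt: str) -> str:\n    # 演示用桩实现；实际使用时应替换为对被评测模型的调用\n    return 'RAG 先检索外部知识再生成回答，从而减少幻觉。'\n\ndef test_answer_covers_key_points():\n    answer = ask_llm('用一句话解释 RAG 为什么能减少幻觉')\n    # 可验证断言：演示用的要点关键词必须全部出现\n    for kw in ('检索', '幻觉'):\n        assert kw in answer, '缺少要点：' + kw\n\ndef test_answer_is_concise():\n    answer = ask_llm('用一句话解释 RAG 为什么能减少幻觉')\n    assert len(answer) \u003C= 60  # 长度上限仅为演示值\n```\n\n这种“评估即测试”的写法可以直接接入 CI：提示词或模型版本变更时，回归会像普通测试失败一样被自动暴露出来。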
\n\n## 数据集或基准\n\n### 通用\n\n|         名称          |      组织       |                           网站                            |                         描述                          |\n| :-------------------: | :---------------------: | :----------------------------------------------------------: | :----------------------------------------------------------: |\n|       MMLU-Pro        |      TIGER-AI-Lab       |     [MMLU-Pro](https:\u002F\u002Fgithub.com\u002FTIGER-AI-Lab\u002FMMLU-Pro)     | MMLU-Pro 是 MMLU 数据集的改进版本。长期以来，MMLU 一直是多项选择知识测试的参考标准。然而，近期研究表明，该数据集存在噪声（部分问题无法解答）且难度偏低（由于模型能力的进步及污染增加所致）。MMLU-Pro 将选项数量从四个增至十个，在更多题目中要求推理，并经过专家评审以减少噪声。其质量和挑战性均高于原始版本。MMLU-Pro 还降低了提示变化对模型性能的影响，而这一问题在前代基准 MMLU 中较为常见。研究显示，采用“思维链”推理的模型在此新基准上的表现更佳，表明 MMLU-Pro 更适合评估 AI 的细微推理能力。（2024-05-20） |\n|  TrustLLM 基准   |        TrustLLM         | [TrustLLM](https:\u002F\u002Ftrustllmbenchmark.github.io\u002FTrustLLM-Website\u002F) | TrustLLM 是一个用于评估大型语言模型可信度的基准。它涵盖六个可信度维度，包含超过30个数据集，全面评估 LLM 的功能能力，从简单的分类任务到复杂的生成任务。每个数据集都具有独特的挑战，并已对16种主流 LLM（包括商业和开源模型）进行了基准测试。|\n|         DyVal         |        Microsoft        |      [DyVal](https:\u002F\u002Fgithub.com\u002Fmicrosoft\u002Fpromptbench)       | 人们一直担忧 LLM 庞大的训练语料库可能存在数据污染问题。此外，当前基准测试的静态特性和固定复杂度可能无法充分衡量 LLM 不断发展的能力。DyVal 是一种通用且灵活的动态评估 LLM 的协议。利用有向无环图的优势，DyVal 可动态生成复杂度可控的评估样本。它为数学、逻辑推理和算法问题等推理任务创建了具有挑战性的评估集。从 Flan-T5-large 到 GPT-3.5-Turbo 和 GPT-4，多种 LLM 都接受了评估。实验表明，LLM 在 DyVal 生成的不同复杂度样本上表现更差，凸显了动态评估的重要性。作者还分析了不同提示方法的失败案例和结果。此外，DyVal 生成的样本不仅可用作评估集，还能帮助微调模型，从而提升其在现有基准上的表现。（2024-04-20） |\n|      RewardBench      |          AI2           |    [RewardBench](https:\u002F\u002Fgithub.com\u002Fallenai\u002Freward-bench)    | RewardBench 是一个用于评估语言模型奖励模型的基准，旨在考察各类模型的优势与不足。它揭示了现有模型在推理和指令遵循方面仍存在显著缺陷。该基准包括一个[排行榜](https:\u002F\u002Fhf.co\u002Fspaces\u002Fallenai\u002Freward-bench)、[代码](https:\u002F\u002Fgithub.com\u002Fallenai\u002Freward-bench)和[数据集](https:\u002F\u002Fhf.co\u002Fdatasets\u002Fallenai\u002Freward-bench)（2024-03-20）。|\n|        LV-Eval        |     Infinigence-AI      |       [LVEval](https:\u002F\u002Fgithub.com\u002Finfinigence\u002FLVEval)        | LV-Eval 是一个长文本评估基准，设有五个长度等级（16k、32k、64k、128k 和 256k），最长文本测试长度可达 256k。LV-Eval 的平均文本长度为 102,380 字符，最小\u002F最大文本长度分别为 11,896\u002F387,406 字符。LV-Eval 主要包含两类评估任务：单跳问答和多跳问答，涵盖中文和英文的 11 个子数据集。在设计过程中，LV-Eval 引入了三项关键技术：混淆事实插入（CFI）以增强挑战性；关键词和短语替换（KPR）以减少信息泄露；以及基于答案关键词的评估指标（结合答案关键词和黑名单词汇），以提高评估结果的客观性。（2024-02-06） |\n| LLM-Uncertainty-Bench |         Tencent         | [LLM-Uncertainty-Bench](https:\u002F\u002Fgithub.com\u002Fsmartyfh\u002FLLM-Uncertainty-Bench) | 一种新的 LLM 基准方法被引入，将不确定性量化纳入其中。基于五项代表性 NLP 任务中测试的九种 LLM，研究发现：I）更准确的 LLM 可能表现出较低的确定性；II）规模更大的 LLM 可能比小型模型表现出更高的不确定性；III）指令微调往往会增加 LLM 的不确定性。这些发现强调了在 LLM 评估中纳入不确定性的重要性。（2024-01-22） |\n|  心理测量评估   | Microsoft Research Asia |    [心理测量评估](https:\u002F\u002Farxiv.org\u002Fabs\u002F2310.16379)    | 微软亚洲研究院提出了一种基于心理测量学的 AI 通用评估方法，旨在解决传统评估方法在预测能力、信息量和测试工具质量方面的局限性。该方法借鉴心理测量理论，识别 AI 的关键心理构念，设计针对性测试，并应用项目反应理论进行精确评分。同时引入信度和效度的概念，以确保评估的可靠性和准确性。这一框架将心理测量方法扩展至评估 AI 处理未知复杂任务的能力，但也面临一些开放性问题，例如区分 AI 的“个体”与“群体”、应对提示敏感性以及评估人类与 AI 构念之间的差异。（2023-10-19） |\n|    CommonGen-Eval     |         AllenAI         | [CommonGen-Eval](https:\u002F\u002Fgithub.com\u002Fallenai\u002FCommonGen-Eval)  | 一项使用 CommonGen-lite 数据集评估 LLM 的研究，采用 GPT-4 进行评估并比较不同模型的表现，结果列于排行榜上。（2024-01-04） |\n|         FELM          |          HKUST          |          [felm](https:\u002F\u002Fgithub.com\u002Fhkust-nlp\u002Ffelm)           | FELM 是一个用于评估大型语言模型事实判断能力的元基准。该基准包含 847 个问题，覆盖五个不同领域：世界知识、科学\u002F技术、写作\u002F推荐、推理和数学。各领域的提示来自多种来源，包括 TruthfulQA 等标准数据集、GitHub 仓库等在线平台、ChatGPT 生成的提示，或由作者自行拟定。对于每份回答，均采用细粒度的分段标注，包括参考链接、标注者识别出的错误类型及其原因。（2023-10-03） |\n|       just-eval       |       AI2 Mosaic        |      [just-eval](https:\u002F\u002Fgithub.com\u002FRe-Align\u002Fjust-eval)      | 一款基于 GPT 的多维度、可解释的 LLM 评估工具，能够评估帮助性、清晰度、事实性、深度和参与度等方面。（2023-12-05） |\n|       EQ-Bench        |        EQ-Bench         |       [EQ-Bench](https:\u002F\u002Fgithub.com\u002FEQ-bench\u002FEQ-Bench)       | 一个用于评估语言模型情商的基准，包含 171 个问题（相比 v1 的 60 个有所增加），并采用了新的评分体系，能更好地区分不同模型的表现差异。（2023-12-20） |\n|       CRUXEval        |        MIT CSAIL        |         [CRUXEval](https:\u002F\u002Farxiv.org\u002Fabs\u002F2401.03065)         | CRUXEval 是一个用于评估代码推理、理解和执行能力的基准。它包含 800 个 Python 函数及其输入输出对，测试输入预测和输出预测任务。许多在 HumanEval 上表现优异的模型，在 CRUXEval 上却表现不佳，这凸显了提升代码推理能力的必要性。最佳模型 GPT-4 结合思维链（CoT）分别在输入预测和输出预测任务中取得了 75% 和 81% 的通过率。该基准暴露了开源与闭源模型之间的差距。GPT-4 并未完全通过 CRUXEval，这为其局限性及改进方向提供了洞见。（2024-01-05） |\n|     MLAgentBench      |      snap-stanford      | [MLAgentBench](https:\u002F\u002Fgithub.com\u002Fsnap-stanford\u002FMLAgentBench) | MLAgentBench 是一套端到端机器学习（ML）研究任务，用于评估 AI 研究代理。这些代理旨在根据给定的数据集和 ML 任务描述，自主开发或改进 ML 模型。每个任务都代表一个交互式环境，直接反映人类研究人员所面临的场景。代理可以读取可用文件、在计算集群上运行多次实验，并分析结果以实现既定的研究目标。具体而言，它包括 15 种不同的 ML 工程任务，可通过尝试不同的 ML 方法、数据处理、架构和训练过程来完成。（2023-10-05） |\n|      AlignBench       |          THUDM          |      [AlignBench](https:\u002F\u002Fgithub.com\u002FTHUDM\u002FAlignBench)       | AlignBench 是一个全面且多维度的基准，用于评估中文大型语言模型的对齐性能。它构建了人机协作的数据生成流程，以确保数据的动态更新。AlignBench 
采用多维度、基于规则的模型评估方法（LLM-as-Judge），并结合思维链（CoT）生成多维度分析及最终综合评分，从而提升评估的可靠性和可解释性。（2023-12-01） |\n|       UltraEval       |         OpenBMB         |      [UltraEval](https:\u002F\u002Fgithub.com\u002FOpenBMB\u002FUltraEval)       | UltraEval 是一个开源的基础模型能力评估框架，提供轻量级且易于使用的评估系统，支持主流大模型性能评估。其主要特点包括：(1) 轻量级、用户友好的评估框架，设计直观、依赖少、部署简单且可扩展，适用于各种评估场景；(2) 灵活多样的评估方法，统一的提示模板和丰富的评估指标，支持自定义；(3) 高效快速的推理部署，支持多种模型部署方案，包括 torch 和 vLLM，并可进行多实例部署以加速评估进程；(4) 透明开放的排行榜，评估结果公开可查、可追溯且可重复，由社区驱动以确保透明度；(5) 官方权威的评估数据，采用广泛认可的官方数据集，保证评估的公平性和标准化，确保结果的可比性和可重复性。（2023-11-24） |\n|        IFEval         |     google-research     | [Instruction Following Eval](https:\u002F\u002Fgithub.com\u002Fgoogle-research\u002Fgoogle-research\u002Ftree\u002Fmaster\u002Finstruction_following_eval) | 按照自然语言指令行事是大型语言模型的核心能力。然而，对此能力的评估缺乏标准化：人工评估成本高、速度慢且缺乏客观可重复性，而基于 LLM 的自动化评估则可能受到评估用 LLM 自身能力或局限性的偏倚影响。为解决这些问题，谷歌的研究人员推出了指令遵循评估（IFEval），这是一个简单且可重复的基准，专注于一组“可验证指令”，例如“撰写超过400字”和“至少提及 AI 关键词3次”。IFEval 确定了25条此类可验证指令，并构建了约500个提示，每个提示都包含一条或多条可验证指令。（2023-11-15） |\n|        LLMBar         |      princeton-nlp      |      [LLMBar](https:\u002F\u002Fgithub.com\u002Fprinceton-nlp\u002FLLMBar)       | LLMBar 是一个具有挑战性的元评估基准，旨在测试 LLM 评估者识别指令遵循输出的能力。它包含419个实例，每个实例由一条指令和两个输出组成：一个忠实且正确地遵循指令，另一个则偏离指令。每个实例还附有黄金标签，标明哪份输出在客观上更好。（2023-10-29） |\n|HalluQA |复旦大学、上海人工智能实验室 | [HalluQA](https:\u002F\u002Fgithub.com\u002Fxiami2019\u002FHalluQA\u002F) | HalluQA 是一个中文 LLM 幻觉评估基准，包含450个数据点，其中包括175个误导性条目、69个难以辨别误导性的条目以及206个基于知识的条目。每个问题平均标注了2.8个正确和错误的答案。为提升 HalluQA 的实用性，作者设计了一套基于 GPT-4 的评估方法。具体来说，将幻觉判定标准和正确答案作为指令输入 GPT-4，由其评估模型的回答是否包含幻觉。|\n|FMTI |斯坦福大学 | [FMTI](https:\u002F\u002Fcrfm.stanford.edu\u002Ffmti\u002F) | 基础模型透明度指数（FMTI）从数据、计算资源和人力等100个指标评估开发者在模型训练和部署中的透明度。对10家公司旗舰模型的评估显示，平均透明度得分仅为37\u002F100，表明仍有较大改进空间。|\n|ColossalEval | Colossal-AI | [ColossalEval](https:\u002F\u002Fgithub.com\u002Fhpcaitech\u002FColossalAI\u002Ftree\u002Fmain\u002Fapplications\u002FColossalEval) | Colossal-AI 推出的一个项目，提供统一的评估工作流，可用于在公共数据集或自定义数据集上使用传统指标和 GPT 辅助评估来评估语言模型。|\n| LLMEval²-WideDeep |阿里巴巴研究 | [LLMEval²](https:\u002F\u002Fgithub.com\u002FAlibabaResearch\u002FDAMO-ConvAI\u002Ftree\u002Fmain\u002FWideDeep) | 作为面向 LLM 评估者的最大、最多样化的英语评估基准，包含15个任务、8种能力以及2,553个样本。实验结果表明，网络越宽（涉及更多评审员）、层次越深（增加一轮讨论），效果越好，Kappa 相关系数可从0.28提升至0.34。WideDeep 也被用于辅助评估中文 LLM，使评估效率提升4.6倍，成本降低60%。|\n|Aviary | Ray Project | [Aviary](https:\u002F\u002Fgithub.com\u002Fray-project\u002Faviary) | 允许在一个平台上与多种大型语言模型（LLMs）互动。支持直接比较不同模型的输出、按质量排名，以及获取成本和延迟估算。尤其支持托管在 Hugging Face 上的模型，在许多情况下也支持 DeepSpeed 推理加速。|\n| Do-Not-Answer | Libr-AI | [Do-Not-Answer](https:\u002F\u002Fgithub.com\u002FLibr-AI\u002Fdo-not-answer) | 一个开源数据集，旨在以低成本评估 LLM 的安全机制。它由负责任的语言模型不应回应的提示组成。除人工标注外，还实施了基于模型的评估：一个经过微调、约6亿参数的类 BERT 评估器，其结果与人工标注和 GPT-4 相当。|\n| LucyEval | Oracle | [LucyEval](http:\u002F\u002Flucyeval.besteasy.com\u002F) | 中国 LLM 成熟度评估——LucyEval 可客观地测试模型能力的各个方面，识别模型的不足之处，帮助设计师和工程师更精准地调整和训练模型，助力 LLM 向更高智能迈进。|\n| Zhujiu | 中国科学院自动化研究所 | [Zhujiu](http:\u002F\u002Fwww.zhujiu-benchmark.com) | 涵盖七种能力维度和51项任务；采用三种互补的评估方法；提供全面的中文基准测试，并具备英文评估能力。|\n| ChatEval | THU-NLP | [ChatEval](https:\u002F\u002Fgithub.com\u002Fthunlp\u002FChatEval) | ChatEval 旨在简化生成文本的人工评估流程。针对不同的文本片段，ChatEval 中的角色（由大语言模型扮演）可以自主讨论细微差别和差异，根据各自指定的角色作出判断。|\n|FlagEval | 智源研究院\u002F清华大学 | [FlagEval](https:\u002F\u002Fflageval.baai.ac.cn\u002F#\u002Fhome) | 由智源研究院出品，结合主观和客观评分，提供 LLM 分数排名。|\n|InfoQ 综合 LLM 评估 | InfoQ | [InfoQ 评估](https:\u002F\u002Fmp.weixin.qq.com\u002Fs?__biz=MjM5MDE0Mjc4MA==&mid=265117067676&idx=1&sn=b98af3bd14c9f9fbb3e7f0f8f9bb3ec&scene=21#wechat_redirect) | 面向中国的排名：ChatGPT > 文心一言 > Claude > 
星火。|\n|Chain-of-Thought 评估 | Yao Fu | [COT 评估](https:\u002F\u002Fgithub.com\u002FFranxYao\u002Fchain-of-thought-hub) | 包括 GSM8k 和 MATH 复杂问题的排名。|\n|Z-Bench | 真格基金 | [Z-Bench](https:\u002F\u002Fgithub.com\u002Fzhenbench\u002Fz-bench) | 表明国内中文模型的可编程性相对较低，各模型之间性能差异较小。两款 ChatGLM 版本则显示出显著进步。|\n| CMU 聊天机器人评估  | CMU | [zeno-build](https:\u002F\u002Fgithub.com\u002Fzeno-ml\u002Fzeno-build) | 在对话训练场景中，排名显示 ChatGPT > Vicuna > 其他。|\n|lmsys-arena |伯克利 | [lmsys 排名](https:\u002F\u002Flmsys.org\u002Fblog\u002F2023-05-03-arena\u002F) | 采用 Elo 评分机制，排名显示 GPT4 > Claude > GPT3.5 > Vicuna > 其他。|\n|Huggingface 开放 LLM 排行榜 | Huggingface | [HF 开放 LLM 排行榜](https:\u002F\u002Fhuggingface.co\u002Fspaces\u002FHuggingFaceH4\u002Fopen_llm_leaderboard) | 由 Huggingface 组织，评估多个主流开源 LLM。评估主要集中在四个数据集上：AI2 推理挑战（ARC）、HellaSwag、MMLU 和 TruthfulQA，内容以英语为主。|\n| AlpacaEval | tatsu-lab | [AlpacaEval](https:\u002F\u002Ftatsu-lab.github.io\u002Falpaca_eval\u002F) | 开源模型领导者，Vicuna、OpenChat 和 WizardLM 在基于 LLM 的自动评估中处于领先地位。|\n| Chinese-LLM-Benchmark | jeinlee1991 | [llm-benchmark](https:\u002F\u002Fgithub.com\u002Fjeinlee1991\u002Fchinese-llm-benchmark) | 中国 LLM 能力评估排名，涵盖百度文心一言、ChatGPT、阿里巴巴通义千问、科大讯飞星火，以及 Belle 和 ChatGLM6B 等开源模型。提供能力评分排名和原始模型输出结果。|\n|斯坦福问答数据集（SQuAD） | 斯坦福 NLP 团队 | [SQuAD](https:\u002F\u002Frajpurkar.github.io\u002FSQuAD-explorer\u002F) | 评估模型在阅读理解任务上的表现。|\n|多文体自然语言推理（MultiNLI） | 纽约大学、DeepMind、Facebook AI 研究、艾伦人工智能研究所、谷歌 AI 语言 | [MultiNLI](https:\u002F\u002Fcims.nyu.edu\u002F~sbowman\u002Fmultinli\u002F) | 评估模型在不同文本体裁间理解句子关系的能力。|\n|LogiQA | 清华大学和微软亚洲研究院 | [LogiQA](https:\u002F\u002Fgithub.com\u002Flgw863\u002FLogiQA-dataset) | 评估模型的逻辑推理能力。|\n| HellaSwag | 华盛顿大学和艾伦人工智能研究所 | [HellaSwag](https:\u002F\u002Frowanzellers.com\u002Fhellaswag\u002F) | 评估模型的推理能力。|\n| LAMBADA 数据集 | 特伦托大学和布鲁诺·凯斯勒基金会 | [LAMBADA](https:\u002F\u002Fzenodo.org\u002Frecord\u002F2630551#.ZFUKS-zML0p) | 评估模型预测段落最后一词的能力，反映其长期理解能力。|\n|CoQA | 斯坦福 NLP 团队 | [CoQA](https:\u002F\u002Fstanfordnlp.github.io\u002Fcoqa\u002F) | 评估模型在对话情境下理解文本段落并回答一系列相关问题的能力。|\n|ParlAI | Facebook AI 研究 | [ParlAI](https:\u002F\u002Fgithub.com\u002Ffacebookresearch\u002FParlAI) | 评估模型在准确性、F1 分数、困惑度（模型预测序列中下一个词的能力）、人工评估（相关性、流畅性和连贯性）、速度和资源利用率、鲁棒性（模型在嘈杂输入、对抗性攻击或数据质量变化等不同条件下表现）以及泛化能力等方面的表现。|\n|语言可解释性工具（LIT） | Google | [LIT](https:\u002F\u002Fpair-code.github.io\u002Flit\u002F) | 提供一个平台，允许根据用户自定义的指标评估模型，分析模型的优势、劣势和潜在偏见。|\n|对抗性 NLI（ANLI） | Facebook AI 研究、纽约大学、约翰霍普金斯大学、马里兰大学、艾伦人工智能研究所 | [对抗性 NLI（ANLI)](https:\u002F\u002Fgithub.com\u002Ffacebookresearch\u002Fanli) | 评估模型的鲁棒性、泛化能力、推理解释能力、一致性以及资源效率（内存使用、推理时间和训练时间）。|
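\n\n上表中 IFEval 的核心思想是“可验证指令”：像“撰写超过400字”“至少提及 AI 关键词3次”这类约束可以由程序直接判定，无需人工或模型评审。下面是一个极简示意（自拟的检查函数，阈值与关键词仅作演示）：\n\n```python\n# IFEval 式“可验证指令”检查的极简示意（自拟实现，非官方评测代码）。\n\ndef check_min_words(text: str, n: int) -> bool:\n    # 指令形如“撰写超过 n 个词”：按空白切分近似统计词数\n    return len(text.split()) >= n\n\ndef check_min_mentions(text: str, keyword: str, n: int) -> bool:\n    # 指令形如“至少提及 keyword n 次”\n    return text.count(keyword) >= n\n\nresponse = 'AI evaluation needs verifiable rules. AI systems should be checked by programs, not only by AI judges.'\nchecks = [\n    check_min_words(response, 400),         # 演示阈值\n    check_min_mentions(response, 'AI', 3),  # 演示阈值\n]\nprint('通过的指令数:', sum(checks), '指令总数:', len(checks))\n```\n\n由于每条指令的判定是确定性的，这类评估天然具备可重复性，也避免了“用 LLM 评 LLM”带来的偏倚。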
\n### 垂直领域\n\n| 名称 | 机构 | 领域 | URL | 简介 |\n| :--: | :--: | :--: | :--: | :-- |\n| Seismometer | Epic | 医疗健康 | [seismometer](https:\u002F\u002Fgithub.com\u002Fepic-open-source\u002Fseismometer) | Seismometer 是一款面向医疗领域的 AI 模型性能评估工具，提供标准化的评估标准，帮助基于本地数据和工作流程做出决策，并支持对模型性能的持续监控。尽管可用于任何领域的模型，其设计重点在于医疗 AI 模型的验证，因为本地验证需要交叉参考患者相关数据（如人口统计学特征、临床干预措施和患者预后）以及模型性能。（2024-05-22） |\n| MedBench | OpenMEDLab | 医疗健康 | [medbench](https:\u002F\u002Fgithub.com\u002Fopen-compass\u002Fopencompass\u002Ftree\u002Fmain\u002Fopencompass\u002Fdatasets\u002Fmedbench\u002F) | MedBench 致力于为中国医学大模型创建科学、公平且严谨的评估体系和开放平台。基于权威的医学标准，它不断更新和维护高质量的医疗数据集，以全面、多维度地量化模型在各个医学维度上的能力。MedBench 包含来自真实考试题和各医学分支临床报告的40,041道题目，由四个关键部分组成：中国医师资格考试、住院医师规范化培训考试、主治医师资格考试，以及涵盖检查、诊断和治疗的真实临床病例。（2023-12-20） |\n| Fin-Eva | 蚂蚁集团、上海财经大学 | 金融 | [Fin-Eva](https:\u002F\u002Fgithub.com\u002Falipay\u002Ffinancial_evaluation_dataset) | 由蚂蚁集团和上海财经大学联合推出的 Fin-Eva 1.0 版本，覆盖财富管理、保险、投研等多个金融场景及金融专业学科，共包含超过13,000道评估题目。蚂蚁集团的数据来源包括各业务领域数据及公开互联网数据，经过数据脱敏、文本聚类、语料筛选和数据重写等处理，并结合金融专家评审构建数据集；上海财经大学的数据主要基于相关领域权威考试中的真题和模拟题，遵循知识大纲要求。蚂蚁集团部分涵盖金融认知、金融知识、金融逻辑、内容生成和安全合规五大能力，细分为33个子维度，包含8,445道评估题目；上海财经大学部分则覆盖金融、经济、会计和证书四大领域，包含34个不同学科的4,661道题目。Fin-Eva 1.0 采用固定答案的选择题形式，并配有相应指令，使模型能够以标准格式输出。（2023-12-20） |\n| GenMedicalEval | 上海交通大学 | 医疗健康 | [GenMedicalEval](https:\u002F\u002Fgithub.com\u002FMediaBrain-SJTU\u002FGenMedicalEval) | 1. **大规模综合性能评估**：GenMedicalEval 基于四万余道医学考试真题和五万五千余份三甲医院患者病历，构建了涵盖16个主要科室、3个医生培训阶段、6个医疗临床应用场景的共计十万余条医疗评估数据。该数据集从医学基础知识、临床应用和安全标准等方面全面评估大模型在真实复杂医疗场景下的整体表现，弥补了现有评估基准未能覆盖诸多医疗实践挑战的不足。2. **深入多维度场景评估**：GenMedicalEval 整合了医生的临床记录和医学影像资料，围绕检查、诊断和治疗等关键医疗场景构建了一系列主题丰富、形式多样的生成式评估题目，为现有的问答式评估提供了有力补充，使其更贴近真实的开放式诊断环境。3. **创新的开放性评估指标与自动化评估模型**：针对开放式生成任务缺乏有效评估指标的难题，GenMedicalEval 采用先进的结构化提取和术语对齐技术，构建了一套创新的生成式评估指标体系，能够准确衡量生成答案的医学知识准确性。此外，基于自建知识库训练了一款医学自动评估模型，其评估结果与人工评估高度相关。该模型可提供多维度的医学评分及评估理由，具有无数据泄露、可控性强等特点，相较于 GPT-4 等其他模型具有独特优势。（2023-12-08） |\n| OpenFinData | 上海人工智能实验室 | 金融 | [OpenFinData](https:\u002F\u002Fopencompass.org.cn) | 由上海人工智能实验室发布的 OpenCompass 框架下首个全场景金融评估数据集——OpenFinData，包含六个模块和十九个金融任务维度，覆盖多层次数据类型和多样化的金融场景，每一条数据均源自真实的金融业务场景。（2024-01-04） |\n| LAiW | 四川大学 | 法律 | [LAiW](https:\u002F\u002Fgithub.com\u002FDai-shen\u002FLAiW) | 从法律视角和可行性出发，LAiW 将法律 NLP 的能力划分为三大能力，共包含13项基础任务：(1) 法律 NLP 基础能力：评估法律基础任务、NLP 基础任务以及法律信息抽取能力，包括法律条款推荐、要素识别、命名实体识别、裁判要点归纳和案件识别等五项基础任务；(2) 法律应用基础能力：评估大模型在法律领域的基础应用能力，包括争议焦点挖掘、案件匹配、刑事判决预测、民事判决预测和法律问答等五项基础任务；(3) 法律应用复杂能力：评估大模型在法律领域的复杂应用能力，包括司法推理生成、案件理解和法律咨询等三项基础任务。（2023-10-08） |\n| LawBench | 南京大学 | 法律 | [LawBench](https:\u002F\u002Fgithub.com\u002Fopen-compass\u002Flawbench) | LawBench 精心设计，旨在精准评估大型语言模型的法律能力。在设计测试任务时，它模拟了司法认知的三个维度，并选取20项任务来评估大模型的能力。与一些仅包含选择题的现有基准相比，LawBench 增加了更多与实际应用密切相关的任务类型，如法律实体识别、阅读理解、犯罪金额计算和法律咨询等。LawBench 认识到，当前大模型的安全策略可能导致其拒绝回答某些法律问题，或难以理解指令而无法作答，因此专门开发了“回避率”评估指标，用于衡量模型拒绝回答或未能正确理解指令的频率。研究人员已在 LawBench 上评估了51种大型语言模型的表现，其中包括20种多语言模型、22种中文模型和9种法律专用大型语言模型。（2023-09-28） |\n| PsyEval | 上海交通大学 | 心理 | [PsyEval](https:\u002F\u002Farxiv.org\u002Fabs\u002F2311.09189) | 在心理健康研究中，大型语言模型（LLM）的应用日益受到关注，尤其是在疾病检测方面的显著能力。研究人员专门设计了首个心理健康领域的综合性基准，以系统地评估 LLM 在该领域的能力。该基准包含六个子任务，覆盖三个维度；每个子任务都设计了简洁的提示词，并对八种先进 LLM 进行了全面评估。（2023-11-15） |\n| PPTC | 微软、北京大学 | 办公 | [PPTC](https:\u002F\u002Fgithub.com\u002Fgydpku\u002FPPTC) | PPTC 是一个用于测试大模型 PPT 生成能力的基准，包含279轮多轮对话，覆盖不同主题和数百条涉及多模态操作的指令。研究团队还提出了 PPTX-Match 评估体系，该体系根据预测文件而非标签 API 序列来评估大语言模型是否完成了指令，因此支持各种 LLM 生成的 API 序列。目前，PPT 生成面临三大挑战：多轮对话中的误差累积、长 PPT 模板的处理以及多模态感知问题。（2023-11-04） |\n| LLMRec | 阿里巴巴 | 推荐 | [LLMRec](https:\u002F\u002Fgithub.com\u002Fwilliamliujl\u002FLLMRec) | 对流行的 LLM（如 ChatGPT、LLaMA、ChatGLM 等）进行了五个推荐相关任务的基准测试，包括评分预测、序列推荐、直接推荐、解释生成和评论摘要。此外，还研究了监督微调对提升 LLM 指令跟随能力的效果。（2023-10-08） |\n| LAiW | Dai-shen | 法律 | [LAiW](https:\u002F\u002Fgithub.com\u002FDai-shen\u002FLAiW) | 面对法律大语言模型的快速发展，首个基于法律能力的中国法律大语言模型基准被提出。法律能力被划分为三个层次：基础法律自然语言处理能力、基础法律应用能力和复杂法律应用能力。第一阶段的评估已完成，重点是基础法律自然语言处理能力。评估结果显示，虽然部分法律大语言模型的表现优于其基础模型，但仍与 ChatGPT 存在差距。（2023-10-25） |\n| OpsEval | 清华大学 | AIOps | [OpsEval](https:\u002F\u002Farxiv.org\u002Fabs\u002F2310.07637) | OpsEval 是面向大型语言模型的综合性任务导向 AIOps 基准测试，评估 LLM 在有线网络运维、5G 通信运维和数据库运维三个关键场景中的熟练程度。这些场景涉及不同的能力层级，包括知识回忆、分析思维和实际应用。该基准包含7,200道选择题和问答题，支持英语和中文两种语言。（2023-10-02） |\n| SWE-bench | princeton-nlp | 软件 | [SWE-bench](https:\u002F\u002Fgithub.com\u002Fprinceton-nlp\u002FSWE-bench) | SWE-bench 是一个用于评估大型语言模型在 GitHub 上收集的真实软件问题上表现的基准。给定一个代码仓库和一个问题，语言模型的任务是生成一个能够解决所述问题的补丁。 |\n| BLURB | Mindrank AI | 医疗健康 | [BLURB](https:\u002F\u002Fmicrosoft.github.io\u002FBLURB\u002Findex.html) | BLURB 包含基于 PubMed 的生物医学自然语言处理应用的综合性基准测试，以及用于跟踪社区进展的排行榜。BLURB 由六项多样化任务和十三个公开可用的数据集组成。为避免过度强调拥有大量数据集的任务（如命名实体识别 NER），BLURB 以所有任务的宏观平均值作为主要得分。BLURB 排行榜不依赖于具体模型，任何能够使用相同训练和开发数据生成测试预测的系统均可参与。BLURB 的主要目标是降低参与生物医学自然语言处理的门槛，助力这一对社会和人类具有积极影响的重要领域加速发展。 |\n| SmartPlay | 微软 | 游戏 | [SmartPlay](https:\u002F\u002Fgithub.com\u002Fmicrosoft\u002FSmartPlay) | SmartPlay 是一款专为易用性设计的大语言模型（LLM）基准，提供多种游戏用于测试。 |\n| FinEval | SUFE-AIFLM-Lab | 金融 | [FinEval](https:\u002F\u002Fgithub.com\u002FSUFE-AIFLM-Lab\u002FFinEval) | FinEval：一系列高质量的选择题集合，涵盖金融、经济、会计和证书等领域。 |\n| GSM8K | OpenAI | 数学 | [GSM8K](https:\u002F\u002Fgithub.com\u002Fopenai\u002Fgrade-school-math) | GSM8K 是一个包含8,500道高质量、语言多样化的小学数学文字应用题的数据集，分为7,500道训练题和1,000道测试题。这些题目通常需要2至8步才能解答，解题过程主要涉及一系列基本算术运算（+ - \u002F *）以得出最终答案（此类基准的自动判分方式可参见本表之后的示意代码）。 |
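\n\nGSM8K 这类数学应用题基准的常见判分方式，是从模型生成的解答中抽取最终数值答案并与标准答案精确匹配。下面是一段极简的 Python 示意（抽取与归一化规则是本文假设的简化版本，并非 GSM8K 官方脚本）：\n\n```python\n# 极简示意：抽取生成文本中最后一个数字并与标准答案精确匹配（简化规则，非官方实现）\ndef extract_final_number(text: str):\n    cleaned = [tok.strip("$.,;:()（）。") for tok in text.split()]\n    nums = [tok.replace(",", "") for tok in cleaned\n            if tok.replace(",", "").replace(".", "", 1).lstrip("-").isdigit()]\n    return nums[-1] if nums else None\n\ndef exact_match_accuracy(predictions, golds):\n    hits = sum(extract_final_number(p) == g for p, g in zip(predictions, golds))\n    return hits \u002F len(golds)\n\npreds = ["... so the total is 18 + 24 = 42.", "She pays $17 in total."]\ngolds = ["42", "17"]\nprint(exact_match_accuracy(preds, golds))  # 1.0\n```\n\n实际评测脚本通常还会处理分数、单位、千分位等更多书写变体，这里仅保留核心思路。\n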
\n### RAG 评估\n\n| 名称 | 机构 | 网址 | 简介 |\n| :--: | :--: | :--: | :-- |\n| BERGEN | NAVER | [BERGEN](https:\u002F\u002Fgithub.com\u002Fnaver\u002Fbergen) | BERGEN（BEnchmarking Retrieval-augmented GENeration）是一个用于基准测试 RAG 系统的库，特别关注问答（QA）任务。不一致的基准测试是比较不同方法、理解 RAG 管道中各组件影响的主要挑战；BERGEN 旨在借助 HuggingFace 简化新数据集和模型的可复现性与整合工作。（2024-05-31） |\n| CRAG | Meta Reality Labs | [CRAG](https:\u002F\u002Farxiv.org\u002Fabs\u002F2406.04744) | CRAG 是一个由4,409组问答对及模拟 API 组成的事实型问答基准，用以模拟网络和知识图谱（KG）搜索。它涵盖五个领域和八种问题类别，反映从热门到长尾的不同实体受欢迎程度，以及从数年到数秒不等的时间动态变化，旨在激励研究人员提升问答系统的可靠性和精确度。（2024-06-07） |\n| raga-llm-hub | RAGA-AI | [raga-llm-hub](https:\u002F\u002Fgithub.com\u002Fraga-ai-hub\u002Fraga-llm-hub) | raga-llm-hub 是一个全面的大语言模型（LLM）评估工具包。它拥有超过100个精心设计的评估指标，使开发者和组织能够有效评估和比较 LLM，并为 LLM 及检索增强生成（RAG）应用建立基础安全保障。这些测试涵盖相关性与理解力、内容质量、幻觉现象、安全性与偏见、上下文相关性、安全机制及漏洞扫描等多个方面，同时提供一系列基于指标的定量分析测试。（2024-03-10） |\n| ARES | 斯坦福大学 | [ARES](https:\u002F\u002Fgithub.com\u002Fstanford-futuredata\u002FARES) | ARES 是一个用于检索增强生成系统的自动评估框架，包含三个组件：(1) 一组带有人工偏好验证的“查询-文档-答案”三元组，用于评估上下文相关性、答案忠实度和\u002F或答案相关性等标准，至少应有50个示例，最好有数百个；(2) 一小组用于为系统的上下文相关性、答案忠实度和\u002F或答案相关性打分的示例；(3) 由 RAG 系统生成的大量未标注“查询-文档-答案”三元组，用于打分。ARES 的训练过程包括三个步骤：(1) 根据特定领域的段落生成合成查询和答案；(2) 用合成数据训练 LLM 评估员，使其能够为 RAG 系统打分；(3) 将训练好的 LLM 评估员部署到 RAG 系统上，以关键指标评估其性能。（2023-09-27） |\n| RGB | 中科院 | [RGB](https:\u002F\u002Fgithub.com\u002Fchen700564\u002FRGB) | RGB 是一个用于评估英语和中文 RAG 的新语料库\u002F基准测试。它分析了不同大型语言模型在 RAG 所需的四项基本能力方面的表现，包括抗噪能力、否定拒绝能力、信息整合能力和反事实鲁棒性，并据此将基准中的实例划分为四个独立的测试集，以针对不同场景进行测试。随后，RGB 对六种具有代表性的 LLM 进行了评估，以诊断当前 LLM 在应用 RAG 时面临的挑战。评估结果显示，虽然 LLM 具备一定的抗噪能力，但在否定拒绝、信息整合以及处理虚假信息等方面仍面临重大困难，表明将 RAG 有效应用于 LLM 仍有很长的路要走。（2023-09-04） |\n| tvalmetrics | TonicAI | [tvalmetrics](https:\u002F\u002Fgithub.com\u002FTonicAI\u002Ftvalmetrics) | Tonic Validate Metrics 中的指标采用 LLM 辅助评估方式，即利用 LLM（例如 GPT-4）对 RAG 应用输出的不同方面进行打分。这些指标结合具体对象与 LLM 辅助评估，回答有关 RAG 应用的问题：(1) 答案相似度分数：RAG 答案与正确答案有多接近？(2) 检索精确度：检索到的上下文是否与问题相关？(3) 增强精确度：答案中是否包含与问题相关的检索上下文？(4) 增强准确度：答案中检索上下文的比例是多少？(5) 答案一致性（二元）：答案中是否存在检索上下文之外的信息？(6) 检索 top-k 召回率：检索到的上下文是否为前 k 个上下文向量的子集，且所有相关上下文是否都包含在检索结果之中？（检索侧指标的计算方式可参见本表之后的示意代码）（2023-11-11） |
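\n\n上表中反复出现“检索精确度”“检索召回率”一类检索侧指标。下面是一个与任何具体框架无关的最小 Python 示意（指标定义取常见约定，retrieval_precision、retrieval_recall_at_k 等函数名为本文假设），说明在有相关性标注时这类指标如何计算：\n\n```python\n# 极简示意：RAG 检索环节的精确度与 top-k 召回率（常见定义，非特定框架实现）\ndef retrieval_precision(retrieved: list, relevant: set) -> float:\n    # 检索结果中与问题相关的比例\n    if not retrieved:\n        return 0.0\n    return sum(doc in relevant for doc in retrieved) \u002F len(retrieved)\n\ndef retrieval_recall_at_k(retrieved: list, relevant: set, k: int) -> float:\n    # 前 k 条检索结果覆盖的相关上下文比例\n    if not relevant:\n        return 0.0\n    return len(set(retrieved[:k]) & relevant) \u002F len(relevant)\n\nretrieved = ["doc1", "doc3", "doc4"]\nrelevant = {"doc1", "doc2"}\nprint(retrieval_precision(retrieved, relevant))       # 0.333...\nprint(retrieval_recall_at_k(retrieved, relevant, 3))  # 0.5\n```\n\n答案忠实度、上下文相关性等生成侧指标则通常需要 LLM 辅助评分，如 ARES 和 tvalmetrics 的做法。\n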
\n### 代理能力\n\n| 名称 | 机构 | 网址 | 简介 |\n| :--: | :--: | :--: | :-- |\n| SuperCLUE-Agent | CLUE | [SuperCLUE-Agent](https:\u002F\u002Fgithub.com\u002FCLUEbenchmark\u002FSuperCLUE-Agent) | SuperCLUE-Agent 是一个多维度的基准测试，专注于代理能力，涵盖三大核心能力和十项基本任务，可用于评估大型语言模型在工具使用、任务规划以及长短期记忆等代理核心能力方面的表现。对16种支持中文的大型语言模型进行评估后发现，GPT-4 在中文任务的代理核心能力方面领先显著，而国内代表性模型（包括开源和闭源模型）正在接近 GPT-3.5 的水平。（2023-10-20） |\n| AgentBench | 清华大学 | [AgentBench](https:\u002F\u002Fgithub.com\u002FTHUDM\u002FAgentBench) | AgentBench 是一套系统化的基准测试工具，用于评估 LLM 作为智能代理的表现，突显商业 LLM 与开源竞争对手之间的性能差距。（2023-08-01） |\n| AgentBench 推理与决策能力评估排行榜 | THUDM | [AgentBench](https:\u002F\u002Fgithub.com\u002FTHUDM\u002FAgentBench) | 由清华大学与多所大学联合推出，涵盖不同任务环境下模型的推理与决策能力，例如购物、居家和操作系统等场景。 |\n| ToolBench 工具调用评估 | 智源\u002F清华 | [ToolBench](https:\u002F\u002Fgithub.com\u002FOpenBMB\u002FToolBench) | 与工具微调模型及 ChatGPT 进行对比，提供评估脚本。 |\n\n### 代码能力\n\n| 名称 | 机构 | 网址 | 简介 |\n| :--: | :--: | :--: | :-- |\n| McEval | 北航 | [McEval](https:\u002F\u002Fgithub.com\u002FMCEVAL\u002FMcEval) | 为了更全面地探索大型语言模型的代码能力，McEval 提出了一项大规模多语言多任务代码评估基准，覆盖40种编程语言，包含16,000个测试样本，包括具有挑战性的代码补全、理解和生成评估任务，并配备精心策划的大规模多语言指令语料库 McEval-Instruct。评估结果表明，开源模型在多语言编程能力方面与 GPT-4 相比仍存在显著差距，大多数开源模型甚至无法超越 GPT-3.5；同时，Codestral、DeepSeek-Coder、CodeQwen 等开源模型及其部分衍生模型也表现出较强的多语言能力。McEval 排行榜见[这里](https:\u002F\u002Fmceval.github.io\u002F)。（2024-06-11） |\n| HumanEval-XL | FloatAI | [HumanEval-XL](https:\u002F\u002Fgithub.com\u002FFloatAI\u002FHumanEval-XL) | 现有基准主要集中在将英文提示翻译成多语言代码，或仅限于非常有限的自然语言，忽略了大规模多语言 NL 到多语言代码生成这一广阔领域，从而在评估多语言 LLM 方面留下了重要空白。为应对这一挑战，作者提出了 HumanEval-XL，一个旨在填补这一空白的大规模多语言代码生成基准。HumanEval-XL 建立了23种自然语言与12种编程语言之间的联系，共包含22,080个提示，每个提示平均有8.33个测试用例。通过确保跨多种 NL 和 PL 的平行数据，HumanEval-XL 为多语言 LLM 提供了一个全面的评估平台，能够评估其对不同 NL 的理解能力。（2024-02-26） |\n| DebugBench | 清华大学 | [DebugBench](https:\u002F\u002Fgithub.com\u002Fthunlp\u002FDebugBench) | DebugBench 是一个 LLM 调试基准，包含4,253个实例，覆盖 C++、Java 和 Python 中的四大类漏洞和18个小类。为了构建 DebugBench，作者从 LeetCode 社区收集了代码片段，利用 GPT-4 向源数据中植入漏洞，并进行了严格的质量检查（这类基于单元测试的基准常用的 pass@k 指标见本表之后的示意代码）。（2024-01-09） |
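\n\n上述代码基准通常按单元测试通过情况汇总成绩，常用的聚合指标是 pass@k。下面给出其无偏估计的计算示意（公式出自 HumanEval 论文 Chen et al., 2021；代码本身是本文的示例实现）：\n\n```python\n# pass@k 无偏估计：n 个采样中有 c 个通过单元测试时，任取 k 个至少一个通过的概率\nfrom math import comb\n\ndef pass_at_k(n: int, c: int, k: int) -> float:\n    if n - c < k:  # 失败样本不足 k 个时，任意 k 个采样中必然包含通过样本\n        return 1.0\n    return 1.0 - comb(n - c, k) \u002F comb(n, k)\n\nprint(pass_at_k(n=20, c=3, k=1))   # 0.15\nprint(pass_at_k(n=20, c=3, k=10))  # 随 k 增大而上升\n```\n\n相比直接取通过率，这种组合式估计在每题采样数 n 大于 k 时方差更小。\n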
\n### 多模态\u002F跨模态\n\n| 名称 | 机构 | 网址 | 简介 |\n| :--: | :--: | :--: | :-- |\n| ChartVLM | 上海人工智能实验室 | [ChartVLM](https:\u002F\u002Fgithub.com\u002FUniModal4Reasoning\u002FChartVLM) | ChartX 是一个多模态评估集，包含18种图表类型、7个图表任务、22个主题领域以及高质量的图表数据。此外，作者还开发了 ChartVLM，为处理依赖于可解释模式的多模态任务（例如图表或几何图像领域的推理任务）提供了新的视角。（2024-02-19） |\n| ReForm-Eval | 复旦DISC | [ReForm-Eval](https:\u002F\u002Fgithub.com\u002FFudanDISC\u002FReForm-Eval) | ReForm-Eval 是一个用于全面评估大型视觉语言模型的基准数据集。通过以不同任务格式重构现有的多模态基准数据集，ReForm-Eval 构建了一个统一格式的基准数据集，适用于大型模型的评估。ReForm-Eval 具有以下特点：涵盖八个评估维度，每个维度都提供充足的评估数据（平均每维超过4,000条）；采用统一的评估问题格式（包括选择题和文本生成题）；使用方便且高效，评估方法可靠，不依赖 ChatGPT 等外部服务；能够高效利用现有数据资源，无需额外的人工标注，并可进一步扩展到更多数据集。（2023-10-24） |\n| LVLM-eHub | OpenGVLab | [LVLM-eHub](https:\u002F\u002Fgithub.com\u002FOpenGVLab\u002FMulti-Modality-Arena) | “多模态竞技场（Multi-Modality Arena）”是一个大型多模态模型评估平台。仿照 FastChat 的做法，两个匿名模型在视觉问答任务上被并排比较：它允许在提供图像输入的同时对视觉语言模型进行并列基准测试，支持 MiniGPT-4、LLaMA-Adapter V2、LLaVA 和 BLIP-2 等多种模型。 |\n\n### 长上下文\n\n| 名称 | 机构 | 网址 | 简介 |\n| :--: | :--: | :--: | :-- |\n| InfiniteBench | OpenBMB | [InfiniteBench](https:\u002F\u002Fgithub.com\u002FOpenBMB\u002FInfiniteBench) | 理解和处理长文本是大型模型迈向更深层次理解和交互的重要能力。尽管一些大型模型声称可以处理10万+ token 的序列，但此前缺乏相应的标准化基准数据集。InfiniteBench 通过构建一个针对10万+ token 序列的基准来解决这一问题，重点关注大型模型处理长文本的五项关键能力：检索、数学、编码、问答和摘要。（1）长上下文：InfiniteBench 测试数据的平均上下文长度为19.5万 token，远超现有基准。（2）多领域、多语言：该基准包含中英文双语的12项任务，覆盖上述五个领域。（3）前瞻性和挑战性：InfiniteBench 的任务设计旨在匹配当前最强模型（如 GPT-4 和 Claude 2）的能力。（4）现实与合成场景：InfiniteBench 既包含真实世界数据以测试模型处理实际问题的能力，也包含合成数据以便扩展上下文窗口进行测试。InfiniteBench 是首个平均数据长度超过10万 token 的 LLM 基准，由中英文双语的合成与现实任务组成，涵盖多个领域；其任务要求对上下文中的长距离依赖关系有深入理解，仅从上下文中检索少量段落已不足以完成这些任务。（2024-03-19） |\n\n### 推理速度\n\n| 名称 | 机构 | 网址 | 简介 |\n| :--: | :--: | :--: | :-- |\n| llmperf | Ray | [llmperf](https:\u002F\u002Fgithub.com\u002Fray-project\u002Fllmperf) | 一个用于检查和基准测试大语言模型性能的库。它测量诸如首个 token 生成时间（TTFT）、token 间延迟（ITL）以及在3秒内无数据返回的请求数等指标，还会验证 LLM 输出的正确性，主要检查是否存在跨请求串扰（例如请求 A 收到了请求 B 的响应）。设计时考虑了输入和输出 token 长度的变化，以更好地反映真实场景。目前支持的端点包括与 OpenAI 兼容的端点（如 Anyscale 端点、私有端点、OpenAI、Fireworks 等）、Together、Vertex AI 和 SageMaker。（2023-11-03） |\n| llm-analysis | Databricks | [llm-analysis](https:\u002F\u002Fgithub.com\u002Fcli99\u002Fllm-analysis) | Transformer 模型训练与推理的延迟及内存分析。 |\n| llm-inference-benchmark | 南开大学 | [llm-inference-benchmark](https:\u002F\u002Fgithub.com\u002Fninehills\u002Fllm-inference-benchmark) | 大语言模型推理框架基准测试。 |\n| llm-inference-bench | CentML | [llm-inference-bench](https:\u002F\u002Fgithub.com\u002FCentML\u002Fllm-inference-bench) | 该基准测试完全独立于任何服务框架运行，易于扩展和修改，提供多种统计信息和剖析模式。作为一款独立工具，它可以针对特定的输入\u002F输出分布进行精确的基准测试，并得出具有统计意义的结果。每个请求仅包含一个提示和一次解码步骤。 |\n| GPU-Benchmarks-on-LLM-Inference | UIUC | [GPU-Benchmarks-on-LLM-Inference](https:\u002F\u002Fgithub.com\u002FXiongjieDai\u002FGPU-Benchmarks-on-LLM-Inference) | 使用 llama.cpp 测试 LLaMA 模型在不同 GPU 上的推理速度，包括 RunPod、16 英寸 M1 Max MacBook Pro、M2 Ultra Mac Studio、14 英寸 M3 MacBook Pro 和 16 英寸 M3 Max MacBook Pro。 |\n\n### 量化与压缩\n\n| 名称 | 机构 | 网址 | 简介 |\n| :--: | :--: | :--: | :-- |\n| LLM-QBench | 北航\u002F商汤科技 | [LLM-QBench](https:\u002F\u002Fgithub.com\u002FModelTC\u002Fllmc) | LLM-QBench 是一个用于大语言模型训练后量化的基准测试工具，同时也是一种高效的 LLM 压缩工具，提供了多种先进的压缩方法，支持多种推理后端（训练后量化的基本思路可参见本表之后的示意代码）。（2024-05-09） |
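\n\n为说明训练后量化的基本思路，下面给出一个与 LLM-QBench\u002Fllmc 无关的最小示意：对一组权重做逐张量（per-tensor）的 int8 对称量化并反量化，直观展示量化误差的来源（纯教学性假设实现，真实工具链远比这复杂）：\n\n```python\n# 极简示意：int8 对称量化（per-tensor），仅用于说明误差来源，非任何工具的实现\ndef quantize_int8(weights):\n    scale = max(abs(w) for w in weights) \u002F 127.0  # 用最大绝对值确定缩放因子\n    q = [max(-128, min(127, round(w \u002F scale))) for w in weights]\n    return q, scale\n\ndef dequantize(q, scale):\n    return [x * scale for x in q]\n\nw = [0.12, -0.52, 0.33, 0.91, -0.07]\nq, scale = quantize_int8(w)\nw_hat = dequantize(q, scale)\nprint(q)                                          # 量化后的整数表示\nprint(max(abs(a - b) for a, b in zip(w, w_hat)))  # 最大重构误差\n```\n\n训练后量化基准所比较的，正是各类更精细的缩放与取整策略在这类误差与下游任务精度之间的取舍。\n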
\n\n## 演示\n\n- [Chat Arena：匿名模型同台竞技，投票选出更优者](https:\u002F\u002Fchat.lmsys.org\u002F?arena) - 一个开源的 AI 大语言模型“匿名”竞技场！在这里，你可以担任评委，在不知晓模型身份的情况下对两组模型的回答进行评分，评分结束后会揭晓模型的真实身份。参与的模型包括 Vicuna、Koala、OpenAssistant (oasst)、Dolly、ChatGLM、StableLM、Alpaca、LLaMA 等（这类竞技场所用 Elo 评分的更新方式，见下方排行榜表格之后的示意代码）。\n\n![image](https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Fonejune2018_Awesome-LLM-Eval_readme_fd1bbf5cb523.png)\n\n## 排行榜\n\n| 平台 | 访问 |\n| :--: | :-- |\n| ACLUE | [[源码](https:\u002F\u002Fgithub.com\u002Fisen-zhang\u002FACLUE)] |\n| AgentBench | [[源码](https:\u002F\u002Fllmbench.ai\u002Fagent)] |\n| AlpacaEval | [[源码](https:\u002F\u002Ftatsu-lab.github.io\u002Falpaca_eval\u002F)] |\n| ANGO | [[源码](https:\u002F\u002Fhuggingface.co\u002Fspaces\u002FAngoHF\u002FANGO-Leaderboard)] |\n| BeHonest | [[源码](https:\u002F\u002Fgair-nlp.github.io\u002FBeHonest\u002F#leaderboard)] |\n| Big Code Models Leaderboard | [[源码](https:\u002F\u002Fhuggingface.co\u002Fspaces\u002Fbigcode\u002Fbigcode-models-leaderboard)] |\n| Chatbot Arena | [[源码](https:\u002F\u002Flmarena.ai\u002F?leaderboard)] |\n| Chinese Large Model Leaderboard | [[源码](https:\u002F\u002Fgithub.com\u002Fjeinlee1991\u002Fchinese-llm-benchmark)] |\n| CLEVA | [[源码](http:\u002F\u002Fwww.lavicleva.com\u002F)] |\n| CompassRank | [[源码](https:\u002F\u002Frank.opencompass.org.cn\u002F)] |\n| CompMix | [[源码](https:\u002F\u002Fqa.mpi-inf.mpg.de\u002Fcompmix)] |\n| C-Eval | [[源码](https:\u002F\u002Fcevalbenchmark.com\u002F)] |\n| DreamBench++ | [[源码](https:\u002F\u002Fdreambenchplus.github.io\u002F#leaderboard)] |\n| FELM | [[源码](https:\u002F\u002Fhkust-nlp.github.io\u002Ffelm)] |\n| FlagEval | [[源码](https:\u002F\u002Fflageval.baai.ac.cn\u002F)] |\n| Hallucination Leaderboard | [[源码](https:\u002F\u002Fgithub.com\u002Fvectara\u002Fhallucination-leaderboard)] |\n| HELM | [[源码](https:\u002F\u002Fcrfm.stanford.edu\u002Fhelm\u002F)] |\n| Huggingface Open LLM Leaderboard | [[源码](https:\u002F\u002Fhuggingface.co\u002Fspaces\u002Fopen-llm-leaderboard\u002Fopen_llm_leaderboard)] |\n| Huggingface LLM Perf Leaderboard | [[源码](https:\u002F\u002Fhuggingface.co\u002Fspaces\u002Foptimum\u002Fllm-perf-leaderboard)] |\n| Indico LLM Leaderboard | [[源码](https:\u002F\u002Findicodata.ai\u002Fllm)] |\n| InfiBench | [[源码](https:\u002F\u002Finfi-coder.github.io\u002Finfibench)] |\n| InterCode | [[源码](https:\u002F\u002Fintercode-benchmark.github.io\u002F)] |\n| LawBench | [[源码](https:\u002F\u002Flawbench.opencompass.org.cn\u002Fleaderboard)] |\n| LLMEval | [[源码](http:\u002F\u002Fllmeval.com)] |\n| LLM Rankings | [[源码](https:\u002F\u002Fopenrouter.ai\u002Frankings)] |\n| LLM Use Case Leaderboard | [[源码](https:\u002F\u002Fllmleaderboard.goml.io)] |\n| LucyEval | [[源码](http:\u002F\u002Flucyeval.besteasy.com\u002F)] |\n| M3CoT | [[源码](https:\u002F\u002Flightchen233.github.io\u002Fm3cot.github.io\u002Fleaderboard.html)] |\n| MMLU by Task Leaderboard | [[源码](https:\u002F\u002Fhuggingface.co\u002Fspaces\u002FCoreyMorris\u002FMMLU-by-task-Leaderboard)] |\n| MMToM-QA | [[源码](https:\u002F\u002Fchuanyangjin.com\u002Fmmtom-qa-leaderboard)] |\n| MathEval | [[源码](https:\u002F\u002Fmatheval.ai\u002F)] |\n| OlympicArena | [[源码](https:\u002F\u002Fgair-nlp.github.io\u002FOlympicArena\u002F#leaderboard)] |\n| OpenEval | [[源码](http:\u002F\u002Fopeneval.org.cn\u002F#\u002Frank)] |\n| Open Multilingual LLM Eval | [[源码](https:\u002F\u002Fhuggingface.co\u002Fspaces\u002Fuonlp\u002Fopen_multilingual_llm_leaderboard)] |\n| PubMedQA | [[源码](https:\u002F\u002Fpubmedqa.github.io\u002F)] |\n| SafetyBench | [[源码](https:\u002F\u002Fllmbench.ai\u002Fsafety)] |\n| SciBench | [[源码](https:\u002F\u002Fscibench-ucla.github.io\u002F#leaderboard)] |\n| SciKnowEval | [[源码](https:\u002F\u002Fgithub.com\u002FHICAI-ZJU\u002FSciKnowEval)] |\n| SEED-Bench | [[源码](https:\u002F\u002Fhuggingface.co\u002Fspaces\u002FAILab-CVC\u002FSEED-Bench_Leaderboard)] |\n| SuperBench | [[源码](https:\u002F\u002Ffm.ai.tsinghua.edu.cn\u002Fsuperbench\u002F#\u002Fleaderboard)] |\n| SuperCLUE | [[源码](https:\u002F\u002Fwww.superclueai.com\u002F)] |\n| SuperGLUE | [[源码](https:\u002F\u002Fsuper.gluebenchmark.com\u002F)] |\n| SuperLim | [[源码](https:\u002F\u002Flab.kb.se\u002Fleaderboard\u002Fresults)] |\n| TAT-DQA | [[源码](https:\u002F\u002Fnextplusplus.github.io\u002FTAT-DQA)] |\n| TAT-QA | [[源码](https:\u002F\u002Fnextplusplus.github.io\u002FTAT-QA)] |\n| TheoremOne LLM Benchmarking Metrics | [[源码](https:\u002F\u002Fllm-evals.formula-labs.com\u002F)] |\n| Toloka | [[源码](https:\u002F\u002Ftoloka.ai\u002Fllm-leaderboard\u002F)] |\n| Toolbench | [[源码](https:\u002F\u002Fhuggingface.co\u002Fspaces\u002Fqiantong-xu\u002Ftoolbench-leaderboard)] |\n| VisualWebArena | [[源码](https:\u002F\u002Fjykoh.com\u002Fvwa)] |\n| We-Math | [[源码](https:\u002F\u002Fwe-math.github.io\u002F#leaderboard)] |\n| WHOOPS! | [[源码](https:\u002F\u002Fwhoops-benchmark.github.io)] |
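\n\nChatbot Arena 一类竞技场排行榜依赖 Elo 评分聚合两两对战结果。下面是一场对战后 Elo 更新的最小 Python 示意（K 系数取 32 只是常见默认值，并非 lmsys 的实际配置）：\n\n```python\n# 极简示意：一场两两对战后的 Elo 更新（K=32 为假设值，非竞技场实际参数）\ndef elo_update(r_a: float, r_b: float, score_a: float, k: float = 32.0):\n    # score_a：A 胜 = 1，平 = 0.5，负 = 0\n    expected_a = 1.0 \u002F (1.0 + 10 ** ((r_b - r_a) \u002F 400.0))\n    r_a_new = r_a + k * (score_a - expected_a)\n    r_b_new = r_b + k * ((1.0 - score_a) - (1.0 - expected_a))\n    return r_a_new, r_b_new\n\nprint(elo_update(1200.0, 1000.0, 1.0))  # 高分者战胜低分者，评分只小幅上升\n```\n\n由于更新量取决于“实际结果与预期胜率之差”，爆冷获胜带来的分数变动远大于强胜弱的常规结果。\n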
\n### 流行提供商排行榜（性能与成本，2024-05-14）\n\n| 模型 | 提供商（定价链接） | 列最后更新 | LMSys 竞技场 ELO | MMLU | MATH | MGSM\u002FGSM8K | GPQA | BIG-BENCH-HARD | DROP（F1） | HellaSwag | HumanEval | Natural2Code |\n| :-- | :-- | :--: | :--: | :--: | :--: | :--: | :--: | :--: | :--: | :--: | :--: | :--: |\n| GPT-4o | [OpenAI](https:\u002F\u002Fopenai.com\u002Fpricing) | 2024-05-14 | 1310 | 88.70 | 76.60 | 90.50 | 53.60 | | 83.40 | | 90.20 | |\n| GPT-4 Turbo | [OpenAI](https:\u002F\u002Fopenai.com\u002Fpricing) | 2024-05-14 | 1257 | 86.40 | 73.40 | 88.60 | 49.10 | | 85.40 | | 87.60 | |\n| Claude 3 Opus | [Anthropic](https:\u002F\u002Fwww.anthropic.com\u002Fapi) | 2024-05-14 | 1256 | 86.80 | 60.10 | 95.00 | 50.40 | 86.80 | 83.10 | 95.40 | 84.90 | |\n| Gemini 1.5 Pro | [Google](https:\u002F\u002Fcloud.google.com\u002Fvertex-ai\u002Fgenerative-ai\u002Fpricing) | 2024-05-14 | 1249 | 81.90 | 58.50 | | 41.50 | 84.00 | | | 71.90 | 77.70 |\n| [Llama 3 70B](https:\u002F\u002Fhuggingface.co\u002Fmeta-llama\u002FMeta-Llama-3-70B-Instruct) | [Replicate](https:\u002F\u002Freplicate.com\u002Fpricing) | 2024-05-20 | 1208 | 82.00 | 50.40 | 93.00 | 39.50 | | | | 81.70 | |\n| [DeepSeek-V2](https:\u002F\u002Fhuggingface.co\u002Fdeepseek-ai\u002FDeepSeek-V2-Chat) | [DeepSeek](https:\u002F\u002Fwww.deepseek.com\u002F) | 2024-05-20 | | 78.50 | | | | | | | | |\n| [Mixtral 8x22B](https:\u002F\u002Fhuggingface.co\u002Fmistralai\u002FMixtral-8x22B-Instruct-v0.1) | [Mistral](https:\u002F\u002Fdocs.mistral.ai\u002Fplatform\u002Fpricing\u002F) | 2024-05-20 | | 77.75 | | | | | | | | |\n| Claude 3 Sonnet | [Anthropic](https:\u002F\u002Fwww.anthropic.com\u002Fapi) | 2024-05-14 | 1204 | 79.00 | 43.10 | 92.30 | 40.40 | 82.90 | 78.90 | 89.00 | 73.00 | |\n| Gemini 1.5 Flash | [Google](https:\u002F\u002Fcloud.google.com\u002Fvertex-ai\u002Fgenerative-ai\u002Fpricing) | 2024-05-14 | | 78.90 | 54.90 | | 39.50 | 85.50 | | | | 77.20 |\n| Mistral Large | [Mistral](https:\u002F\u002Fdocs.mistral.ai\u002Fplatform\u002Fpricing\u002F) | 2024-05-20 | 1158 | 81.20 | 45.00 | | | | | 89.20 | | |\n| [Command R+](https:\u002F\u002Fhuggingface.co\u002FCohereForAI\u002Fc4ai-command-r-plus) | [Cohere](https:\u002F\u002Fcohere.com\u002Fcommand) | 2024-05-20 | 1193 | 75.70 | | | | | | | | |\n| Claude 3 Haiku | [Anthropic](https:\u002F\u002Fwww.anthropic.com\u002Fapi) | 2024-05-14 | 1182 | 75.20 | 38.90 | 88.90 | 33.30 | 73.70 | 78.40 | 85.90 | 75.90 | |\n| Mistral Small | [Mistral](https:\u002F\u002Fdocs.mistral.ai\u002Fplatform\u002Fpricing\u002F) | 2024-05-20 | | 72.20 | | | | | | 86.90 | | |\n| [Llama 3 8B](https:\u002F\u002Fhuggingface.co\u002Fmeta-llama\u002FMeta-Llama-3-8B-Instruct) | [Replicate](https:\u002F\u002Freplicate.com\u002Fpricing) | 2024-05-21 | 1154 | 68.40 | 30.00 | 79.60 | 34.20 | | | | 62.20 | |\n| [Mixtral 8x7B](https:\u002F\u002Fhuggingface.co\u002Fmistralai\u002FMixtral-8x7B-Instruct-v0.1) | [Mistral](https:\u002F\u002Fdocs.mistral.ai\u002Fplatform\u002Fpricing\u002F) | 2024-05-14 | 1114 | 70.60 | | | | | | 86.70 | | |\n| GPT-3.5 Turbo | [OpenAI](https:\u002F\u002Fopenai.com\u002Fpricing) | 2024-05-14 | 1102 | 70.00 | 34.10 | 57.10 | 28.10 | 66.60 | 64.10 | 85.50 | 48.10 | |\n| Llama 3 70B (Groq) | [Groq](https:\u002F\u002Fwow.groq.com\u002F) | 2024-05-14 | 1208 | 82.00 | 50.40 | 93.00 | 39.50 | | | | 81.70 | |\n| GPT-4 | [OpenAI](https:\u002F\u002Fopenai.com\u002Fpricing) | 2024-05-14 | 1189 | 86.40 | 52.90 | 92.00 | 35.70 | 83.10 | 80.90 | 95.30 | 67.00 | |\n| Mistral Medium | [Mistral](https:\u002F\u002Fdocs.mistral.ai\u002Fplatform\u002Fpricing\u002F) | 2024-05-20 | 1148 | 75.30 | | | | | | 88.00 | | |\n| Claude 2.0 | [Anthropic](https:\u002F\u002Fwww.anthropic.com\u002Fapi) | 2024-05-14 | 1126 | 78.50 | | | | | | | | |\n| Mixtral 8x7B (Groq) | [Groq](https:\u002F\u002Fwow.groq.com\u002F) | 2024-05-14 | 1114 | 70.60 | | | | | | | | |\n| Claude 2.1 | [Anthropic](https:\u002F\u002Fwww.anthropic.com\u002Fapi) | 2024-05-14 | 1115 | | | | | | | | | |\n| Claude Instant | [Anthropic](https:\u002F\u002Fwww.anthropic.com\u002Fapi) | 2024-05-14 | 1104 | 73.40 | | | | | | | | |\n| [Phi-3-Medium 4k](https:\u002F\u002Fhuggingface.co\u002Fmicrosoft\u002FPhi-3-medium-4k-instruct) | Microsoft | 2024-05-21 | | 78.00 | | | | | | | | |\n| [Phi-3-Small 8k](https:\u002F\u002Fhuggingface.co\u002Fmicrosoft\u002FPhi-3-small-8k-instruct) | Microsoft | 2024-05-21 | | 75.70 | | | | | | | | |\n| Mistral 7B | [Mistral](https:\u002F\u002Fdocs.mistral.ai\u002Fplatform\u002Fpricing\u002F) | 2024-05-22 | 1006 | 62.50 | | | | | | | | |
             | 88.60                                | 95.00                                      |                                                              | 93.00                                                        |                                                              |                                                              | 92.30                                      |                                                              |                                                      |                                                              | 88.90                                      |                                                      | 79.60                                                        |                                                              | 57.10                                |      | 93.00                         | 92.00                                | 62.20                                      | 61.00                                                        | 60.00                                                        | 59.72                                                      | 68.40                                                        | 77.00                                                        | 65.45                                                        |\n| **推理：**                                                  |                                      |                                      |                                            |                                                              |                                                              |                                                              |                                                              |                                            |                                                              |                                                      |                                                              |                                            |                                                      |                                                              |                                                              |                                      |      |                               |                                      |                                                      |                                            |                               |                                            |                                            |                                                              |                                                              |                                                      |\n| GPQA                                                         | 53.60                                | 49.10                                | 50.40                                      | 41.50                                                        | 39.50                                                        |                                                              |                                                              | 40.40                                      | 39.50                                                        |                                                      |                                                              | 33.30                                      |                                                      | 34.20                                 
                       |                                                              | 28.10                                |      | 39.50                         | 35.70                                | 62.20                                      | 61.00                                                        | 60.00                                                        | 59.72                                                      | 68.40                                                        | 77.00                                                        | 65.45                                                        |\n| BIG-BENCH-HARD                                               |                                      |                                      | 86.80                                      | 84.00                                                        |                                                              |                                                              |                                                              | 82.90                                      | 85.50                                                        |                                                      |                                                              | 73.70                                      |                                                      |                                                              |                                                              | 66.60                                |      |                               | 83.10                                | 62.20                                      | 61.00                                                        | 60.00                                                        | 59.72                                                      | 68.40                                                        | 77.00                                                        | 65.45                                                        |\n| DROP, F1分数                                               | 83.40                                | 85.40                                | 83.10                                      |                                                              |                                                              |                                                              |                                                              | 78.90                                      |                                                              |                                                      |                                                              | 78.40                                      |                                                      |                                                              |                                                              | 64.10                                |      |                               | 80.90                                | 62.20                                      | 61.00                                                        | 60.00                                                        | 59.72                                                      | 68.40                                                        | 77.00                                                        | 65.45                                                        |\n| HellaSwag                                                    |      
                                |                                      | 95.40                                      |                                                              |                                                              |                                                              |                                                              | 89.00                                      |                                                              | 89.20                                                |                                                              | 85.90                                      | 86.90                                                |                                                              | 86.70                                                        | 85.50                                |      |                               | 95.30                                | 88.00                                                | 86.70                         | 85.50                                      | 78.40                                                        | 77.00                                                        | 65.45                                                        |\n| **代码：**                                                  |                                      |                                      |                                            |                                                              |                                                              |                                                              |                                                              |                                            |                                                              |                                                      |                                                              |                                            |                                                      |                                                              |                                                              |                                      |      |                               |                                      |                                                      |                                            |                               |                                            |                                            |                                                              |                                                              |                                                      |\n| HumanEval                                                    | 90.20                                | 87.60                                | 84.90                                      | 71.90                                                        | 81.70                                                        |                                                              |                                                              | 73.00                                      |                                                              |                                                      |                                                              | 75.90                                      |                                                      | 62.20                                                        |                                                         
     | 48.10                                |      | 81.70                         | 67.00                                | 62.20                                      | 61.00                                                        | 60.00                                                        | 59.72                                                      | 68.40                                                        | 77.00                                                        | 65.45                                                        |\n| Natural2Code                                                 |                                      |                                      |                                            | 77.70                                                        |                                                              |                                                              |                                                              |                                            | 77.20                                                        | 62.20                                      | 61.00                                                        | 59.72                                                      | 68.40                                                        | 77.00                                                        | 65.45                                                        | 62.20                                      | 61.00                                                        | 60.00                                                        | 59.72                                                      | 68.40                                                        | 77.00                                                        | 65.45                                                        |\n| **对话：**                                                  |                                      |                                      |                                            |                                                              |                                                              |                                                              |                                                              |                                            |                                                              |                                                      |                                                              |                                            |                                                      |                                                              |                                                              |                                      |      |                               |                                      |                                                      |                                            |                               |                                            |                                            |                                                              |                                                              |                                                      |\n| MT Bench                                                     |                                      | 93.20                                |                                            |                                                              |                      
                                        |                                                              |                                                              |                                            |                                                              | 62.20                                      | 61.00                                                        | 59.72                                                      | 58.40                                                        | 57.72                                                      | 56.72                                                      | 55.72                                                      | 54.72                                                      | 53.72                                                      | 52.72                                                      | 51.72                                                      | 50.72                                                      | 49.72                                                      | 48.72                                                      | 47.72                                                      | 46.72                                                      | 45.72                                                      | 44.72                                                      | 43.72                                                      | 42.72                                                      | 41.72                                                      | 40.72                                                      | 39.72                                                      | 38.72                                                      | 37.72                                                      | 36.72                                                      | 35.72                                                      | 34.72                                                      | 33.72                                                      | 32.72                                                      | 31.72                                                      | 30.72                                                      | 29.72                                                      | 28.72                                                      | 27.72                                                      | 26.72                                                      | 25.72                                                      | 24.72                                                      | 23.72                                                      | 22.72                                                      | 21.72                                                      | 20.72                                                      | 19.72                                                      | 18.72                                                      | 17.72                                                      | 16.72                                                      | 15.72                                                      | 14.72                                                      | 13.72                                                      | 12.72                                                      | 11.72                                                      | 10.72                                                      | 9.72                                                      | 8.72                                                      
                | -615.72                                                      | -616.72                                                      | -617.72                                                      | -618.72                                                      | -619.72                                                      | -620.72                                                      | -621.72                                                      | -622.72                                                      | -623.72                                                      | -624.72                                                      | -625.72                                                      | -626.72                                                      | -627.72                                                      | -628.72                                                      | -629.72                                                      | -630.72                                                      | -631.72                                                      | -632.72                                                      | -633.72                                                      | -634.72                                                      | -635.72                                                      | -636.72                                                      | -637.72                                                      | -638.72                                                      | -639.72                                                      | -640.72                                                      | -641.72                                                      | -642.72                                                      | -643.72                                                      | -644.72                                                      | -645.72                                                      | -646.72                                                      | -647.72                                                      | -648.72                                                      | -649.72                                                      | -650.72                                                      | -651.72                                                      | -652.72                                                      | -653.72                                                      | -654.72                                                      | -655.72                                                      | -656.72                                                      | -657.72                                                      | -658.72                                                      | -659.72                                                      | -660.72                                                      | -661.72                                                      | -662.72                                                      | -663.72                                                      | -664.72                                                      | -665.72                                                      | -666.72                                                      | -667.72                                                      | -668.72                                                      | -669.72                                                      | -670.72                                                      | -671.72  
                                                    | -672.72                                                      | -673.72                                                      | -674.72                                                      | -675.72                                                      | -676.72                                                      | -677.72                                                      | -678.72                                                      | -679.72                                                      | -680.72                                                      | -681.72                                                      | -682.72                                                      | -683.72                                                      | -684.72                                                      | -685.72                                                      | -686.72                                                      | -687.72                                                      | -688.72                                                      | -689.72                                                      | -690.72                                                      | -691.72                                                      | -692.72                                                      | -693.72                                                      | -694.72                                                      | -695.72                                                      | -696.72                                                      | -697.72                                                      | -698.72                                                      | -699.72                                                      | -700.72                                                      | -701.72                                                      | -702.72                                                      | -703.72                                                      | -704.72                                                      | -705.72                                                      | -706.72                                                      | -707.72                                                      | -708.72                                                      | -709.72                                                      | -710.72                                                      | -711.72                                                      | -712.72                                                      | -713.72                                                      | -714.72                                                      | -715.72                                                      | -716.72                                                      | -717.72                                                      | -718.72                                                      | -719.72                                                      | -720.72                                                      | -721.72                                                      | -722.72                                                      | -723.72                                                      | -724.72                                                      | -725.72                                                      | -726.72                                                      | -727.72                             
                         | -728.72                                                      | -729.72                                                      | -730.72                                                      | -731.72                                                      | -732.72                                                      | -733.72                                                      | -734.72                                                      | -735.72                                                      | -736.72                                                      | -737.72                                                      | -738.72                                                      | -739.72                                                      | -740.72                                                      | -741.72                                                      | -742.72                                                      | -743.72                                                      | -744.72                                                      | -745.72                                                      | -746.72                                                      | -747.72                                                      | -748.72                                                      | -749.72                                                      | -750.72                                                      | -751.72                                                      | -752.72                                                      | -753.72                                                      | -754.72                                                      | -755.72                                                      | -756.72                                                      | -757.72                                                      | -758.72                                                      | -759.72                                                      | -760.72                                                      | -761.72                                                      | -762.72                                                      | -763.72                                                      | -764.72                                                      | -765.72                                                      | -766.72                                                      | -767.72                                                      | -768.72                                                      | -769.72                                                      | -770.72                                                      | -771.72                                                      | -772.72                                                      | -773.72                                                      | -774.72                                                      | -775.72                                                      | -776.72                                                      | -777.72                                                      | -778.72                                                      | -779.72                                                      | -780.72                                                      | -781.72                                                      | -782.72                                                      | -783.72                                                      | 
-784.72                                                      | -785.72                                                      | -786.72                                                      | -787.72                                                      | -788.72                                                      | -789.72                                                      | -790.72                                                      | -791.72                                                      | -792.72                                                      | -793.72                                                      | -794.72                                                      | -795.72                                                      | -796.72                                                      | -797.72                                                      | -798.72                                                      | -799.72                                                      | -800.72                                                      | -801.72                                                      | -802.72                                                      | -803.72                                                      | -804.72                                                      | -805.72                                                      | -806.72                                                      | -807.72                                                      | -808.72                                                      | -809.72                                                      | -810.72                                                      | -811.72                                                      | -812.72                                                      | -813.72                                                      | -814.72                                                      | -815.72                                                      | -816.72                                                      | -817.72                                                      | -818.72                                                      | -819.72                                                      | -820.72                                                      | -821.72                                                      | -822.72                                                      | -823.72                                                      | -824.72                                                      | -825.72                                                      | -826.72                                                      | -827.72                                                      | -828.72                                                      | -829.72                                                      | -830.72                                                      | -831.72                                                      | -832.72                                                      | -833.72                                                      | -834.72                                                      | -835.72                                                      | -836.72                                                      | -837.72                                                      | -838.72                                                      | -839.72                                                      | -840.72                    
                                  | -841.72                                                      | -842.72                                                      | -843.72                                                      | -844.72                                                      | -845.72                                                      | -846.72                                                      | -847.72                                                      | -848.72                                                      | -849.72                                                      | -850.72                                                      | -851.72                                                      | -852.72                                                      | -853.72                                                      | -854.72                                                      | -855.72                                                      | -856.72                                                      | -857.72                                                      | -858.72                                                      | -859.72                                                      | -860.72                                                      | -861.72                                                      | -862.72                                                      | -863.72                                                      | -864.72                                                      | -865.72                                                      | -866.72                                                      | -867.72                                                      | -868.72                                                      | -869.72                                                      | -870.72                                                      | -871.72                                                      | -872.72                                                      | -873.72                                                      | -874.72                                                      | -875.72                                                      | -876.72                                                      | -877.72                                                      | -878.72                                                      | -879.72                                                      | -880.72                                                      | -881.72                                                      | -882.72                                                      | -883.72                                                      | -884.72                                                      | -885.72                                                      | -886.72                                                      | -887.72                                                      | -888.72                                                      | -889.72                                                      | -890.72                                                      | -891.72                                                      | -892.72                                                      | -893.72                                                      | -894.72                                                      | -895.72                                                      | -896.72                                               
       | -897.72                                                      | -898.72                                                      | -899.72                                                      | -900.72                                                      | -901.72                                                      | -902.72                                                      | -903.72                                                      | -904.72                                                      | -905.72                                                      | -906.72                                                      | -907.72                                                      | -908.72                                                      | -909.72                                                      | -910.72                                                      | -911.72                                                      | -912.72                                                      | -913.72                                                      | -914.72                                                      | -915.72                                                      | -916.72                                                      | -917.72                                                      | -918.72                                                      | -919.72                                                      | -920.72                                                      | -921.72                                                      | -922.72                                                      | -923.72                                                      | -924.72                                                      | -925.72                                                      | -926.72                                                      | -927.72                                                      | -928.72                                                      | -929.72                                                      | -930.72                                                      | -931.72                                                      | -932.72                                                      | -933.72                                                      | -934.72                                                      | -935.72                                                      | -936.72                                                      | -937.72                                                      | -938.72                                                      | -939.72                                                      | -940.72                                                      | -941.72                                                      | -942.72                                                      | -943.72                                                      | -944.72                                                      | -945.72                                                      | -946.72                                                      | -947.72                                                      | -948.72                                                      | -949.72                                                      | -950.72                                                      | -951.72                                                      | -952.72                                                      | -953.72           
                                           | -954.72                                                      | -955.72                                                      | -956.72                                                      | -957.72                                                      | -958.72                                                      | -959.72                                                      | -960.72                                                      | -961.72                                                      | -962.72                                                      | -963.72                                                      | -964.72                                                      | -965.72                                                      | -966.72                                                      | -967.72                                                      | -968.72                                                      | -969.72                                                      | -970.72                                                      | -971.72                                                      | -972.72                                                      | -973.72                                                      | -974.72                                                      | -975.72                                                      | -976.72                                                      | -977.72                                                      | -978.72                                                      | -979.72                                                      | -980.72                                                      | -981.72                                                      | -982.72                                                      | -983.72                                                      | -984.72                                                      | -985.72                                                      | -986.72                                                      | -987.72                                                      | -988.72                                                      | -989.72                                                      | -990.72                                                      | -991.72                                                      | -992.72                                                      | -993.72                                                      | -994.72                                                      | -995.72                                                      | -996.72                                                      | -997.72                                                      | -998.72                                                      | -999.72                                                      | -1000.72                                                      | -1001.72                                                      | -1002.72                                                      | -1003.72                                                      | -1004.72                                                      | -1005.72                                                      | -1006.72                                                      | -1007.72                                                      | -1008.72                                                      | -1009.72                            
\n\n\u003Cbr>\u003Cbr>\n\n\n## Papers\n\n
- [\u003Cimg src=https:\u002F\u002Fimg.shields.io\u002Fbadge\u002FEMNLP-2023-blue alt=\"img\" style=\"zoom:100%; vertical-align: middle\" \u002F>](https:\u002F\u002Farxiv.org\u002Fabs\u002F2310.07289) [**Beyond Factuality: A Comprehensive Evaluation of Large Language Models as Knowledge Generators**](https:\u002F\u002Farxiv.org\u002Fabs\u002F2310.07289), \u003Cbr> by *Liang Chen, Yang Deng, Yatao Bian, Zeyu Qin, Bingzhe Wu, Tat-Seng Chua and Kam-Fai Wong*\n  \u003Cbr>\u003Cbr>\n
- [\u003Cimg src=https:\u002F\u002Fimg.shields.io\u002Fbadge\u002FCoRR-2023-blue alt=\"img\" style=\"zoom:100%; vertical-align: middle\" \u002F>](https:\u002F\u002Farxiv.org\u002Fabs\u002F2310.05657) [**A Closer Look into Automatic Evaluation Using Large Language Models**](https:\u002F\u002Farxiv.org\u002Fabs\u002F2310.05657), \u003Cbr> by *Cheng-Han Chiang and Hung-yi Lee*\n  \u003Cbr>\u003Cbr>\n
- [\u003Cimg src=https:\u002F\u002Fimg.shields.io\u002Fbadge\u002FCoRR-2023-blue alt=\"img\" style=\"zoom:100%; vertical-align: middle\" \u002F>](https:\u002F\u002Farxiv.org\u002Fabs\u002F2307.03109) [**A Survey on Evaluation of Large Language Models**](https:\u002F\u002Farxiv.org\u002Fabs\u002F2307.03109), \u003Cbr> by *Yupeng Chang, Xu Wang, Jindong Wang, Yuan Wu, Linyi Yang, Kaijie Zhu, Hao Chen, Xiaoyuan Yi et al.*\n  \u003Cbr>\u003Cbr>\n
- [\u003Cimg src=https:\u002F\u002Fimg.shields.io\u002Fbadge\u002FCoRR-2023-blue alt=\"img\" style=\"zoom:100%; vertical-align: middle\" \u002F>](https:\u002F\u002Fwww.microsoft.com\u002Fen-us\u002Fresearch\u002Fpublication\u002Fgpteval-nlg-evaluation-using-gpt-4-with-better-human-alignment\u002F) [**G-Eval: NLG Evaluation using GPT-4 with Better Human Alignment**](https:\u002F\u002Fwww.microsoft.com\u002Fen-us\u002Fresearch\u002Fpublication\u002Fgpteval-nlg-evaluation-using-gpt-4-with-better-human-alignment\u002F), \u003Cbr> by *Yang Liu, Dan Iter, Yichong Xu, Shuohang Wang, Ruochen Xu and Chenguang Zhu*\n  \u003Cbr>\u003Cbr>\n
- [\u003Cimg src=https:\u002F\u002Fimg.shields.io\u002Fbadge\u002FCoRR-2023-blue alt=\"img\" style=\"zoom:100%; vertical-align: middle\" \u002F>](https:\u002F\u002Fdoi.org\u002F10.48550\u002FarXiv.2302.04023) [**A Multitask, Multilingual, Multimodal Evaluation of ChatGPT on Reasoning, Hallucination, and Interactivity**](https:\u002F\u002Fdoi.org\u002F10.48550\u002FarXiv.2302.04023), \u003Cbr> by *Yejin Bang, Samuel Cahyawijaya, Nayeon Lee, Wenliang Dai, Dan Su, Bryan Wilie, Holy Lovenia, Ziwei Ji et al.*\n  \u003Cbr>\u003Cbr>\n
- [\u003Cimg src=https:\u002F\u002Fimg.shields.io\u002Fbadge\u002FCoRR-2023-blue alt=\"img\" style=\"zoom:100%; vertical-align: middle\" \u002F>](https:\u002F\u002Farxiv.org\u002Fabs\u002F2302.06476) [**Is ChatGPT a General-Purpose Natural Language Processing Task Solver?**](https:\u002F\u002Farxiv.org\u002Fabs\u002F2302.06476), \u003Cbr> by *Chengwei Qin, Aston Zhang, Zhuosheng Zhang, Jiaao Chen, Michihiro Yasunaga and Diyi Yang*\n  \u003Cbr>\u003Cbr>\n
- [\u003Cimg src=https:\u002F\u002Fimg.shields.io\u002Fbadge\u002FCoRR-2023-blue alt=\"img\" style=\"zoom:100%; vertical-align: middle\" \u002F>](https:\u002F\u002Fdoi.org\u002F10.48550\u002FarXiv.2302.06466) [**ChatGPT versus Traditional Question Answering for Knowledge Graphs: Current Status and Future Directions Towards Knowledge Graph Chatbots**](https:\u002F\u002Fdoi.org\u002F10.48550\u002FarXiv.2302.06466), \u003Cbr> by *Reham Omar, Omij Mangukiya, Panos Kalnis and Essam Mansour*\n  \u003Cbr>\u003Cbr>\n
- [\u003Cimg src=https:\u002F\u002Fimg.shields.io\u002Fbadge\u002FCoRR-2023-blue alt=\"img\" style=\"zoom:100%; vertical-align: middle\" \u002F>](https:\u002F\u002Fdoi.org\u002F10.48550\u002FarXiv.2301.13867) [**Mathematical Capabilities of ChatGPT**](https:\u002F\u002Fdoi.org\u002F10.48550\u002FarXiv.2301.13867), \u003Cbr> by *Simon Frieder, Luca Pinchetti, Ryan-Rhys Griffiths, Tommaso Salvatori, Thomas Lukasiewicz, Philipp Christian Petersen, Alexis Chevalier and Julius Berner*\n  \u003Cbr>\u003Cbr>\n
- [\u003Cimg src=https:\u002F\u002Fimg.shields.io\u002Fbadge\u002FCoRR-2023-blue alt=\"img\" style=\"zoom:100%; vertical-align: middle\" \u002F>](https:\u002F\u002Fdoi.org\u002F10.48550\u002FarXiv.2302.08081) [**Exploring the Limits of ChatGPT for Query or Aspect-based Text Summarization**](https:\u002F\u002Fdoi.org\u002F10.48550\u002FarXiv.2302.08081), \u003Cbr> by *Xianjun Yang, Yan Li, Xinlu Zhang, Haifeng Chen and Wei Cheng*\n  \u003Cbr>\u003Cbr>\n
- [\u003Cimg src=https:\u002F\u002Fimg.shields.io\u002Fbadge\u002FCoRR-2023-blue alt=\"img\" style=\"zoom:100%; vertical-align: middle\" \u002F>](https:\u002F\u002Fdoi.org\u002F10.48550\u002FarXiv.2302.12095) [**On the Robustness of ChatGPT: An Adversarial and Out-of-distribution Perspective**](https:\u002F\u002Fdoi.org\u002F10.48550\u002FarXiv.2302.12095), \u003Cbr> by *Jindong Wang, Xixu Hu, Wenxin Hou, Hao Chen, Runkai Zheng, Yidong Wang, Linyi Yang, Haojun Huang et al.*\n  \u003Cbr>\u003Cbr>\n
- [\u003Cimg src=https:\u002F\u002Fimg.shields.io\u002Fbadge\u002FCoRR-2023-blue alt=\"img\" style=\"zoom:100%; vertical-align: middle\" \u002F>](https:\u002F\u002Fdoi.org\u002F10.48550\u002FarXiv.2301.04655) [**ChatGPT is not all you need. A State of the Art Review of large Generative AI models**](https:\u002F\u002Fdoi.org\u002F10.48550\u002FarXiv.2301.04655), \u003Cbr> by *Roberto Gozalo-Brizuela and Eduardo C. Garrido-Merchán*\n  \u003Cbr>\u003Cbr>\n
- [\u003Cimg src=https:\u002F\u002Fimg.shields.io\u002Fbadge\u002FCoRR-2023-blue alt=\"img\" style=\"zoom:100%; vertical-align: middle\" \u002F>](https:\u002F\u002Farxiv.org\u002Fabs\u002F2302.10198) [**Can ChatGPT Understand Too? A Comparative Study on ChatGPT and Fine-tuned BERT**](https:\u002F\u002Farxiv.org\u002Fabs\u002F2302.10198), \u003Cbr> by *Qihuang Zhong, Liang Ding, Juhua Liu, Bo Du and Dacheng Tao*\n  \u003Cbr>\u003Cbr>\n
- [\u003Cimg src=https:\u002F\u002Fimg.shields.io\u002Fbadge\u002FCoRR-2023-blue alt=\"img\" style=\"zoom:100%; vertical-align: middle\" \u002F>](https:\u002F\u002Fdoi.org\u002F10.48550\u002FarXiv.2303.07992) [**Evaluation of ChatGPT as a Question Answering System for Answering Complex Questions**](https:\u002F\u002Fdoi.org\u002F10.48550\u002FarXiv.2303.07992), \u003Cbr> by *Yiming Tan, Dehai Min, Yu Li, Wenbo Li, Nan Hu, Yongrui Chen and Guilin Qi*\n  \u003Cbr>\u003Cbr>\n
- [\u003Cimg src=https:\u002F\u002Fimg.shields.io\u002Fbadge\u002FCoRR-2023-blue alt=\"img\" style=\"zoom:100%; vertical-align: middle\" \u002F>](https:\u002F\u002Farxiv.org\u002Fabs\u002F2303.16421) [**ChatGPT is a Knowledgeable but Inexperienced Solver: An Investigation of Commonsense Problem in Large Language Models**](https:\u002F\u002Farxiv.org\u002Fabs\u002F2303.16421), \u003Cbr> by *Ning Bian, Xianpei Han, Le Sun, Hongyu Lin, Yaojie Lu and Ben He*\n  \u003Cbr>\u003Cbr>\n
- [\u003Cimg src=https:\u002F\u002Fimg.shields.io\u002Fbadge\u002FCoRR-2022-blue alt=\"img\" style=\"zoom:100%; vertical-align: middle\" \u002F>](https:\u002F\u002Fdoi.org\u002F10.48550\u002FarXiv.2211.09110) [**Holistic Evaluation of Language Models**](https:\u002F\u002Fdoi.org\u002F10.48550\u002FarXiv.2211.09110), \u003Cbr> by *Percy Liang, Rishi Bommasani, Tony Lee, Dimitris Tsipras, Dilara Soylu, Michihiro Yasunaga, Yian Zhang, Deepak Narayanan et al.*\n  \u003Cbr>\u003Cbr>\n
- [\u003Cimg src=https:\u002F\u002Fimg.shields.io\u002Fbadge\u002FCoRR-2022-blue alt=\"img\" style=\"zoom:100%; vertical-align: middle\" \u002F>](https:\u002F\u002Fdoi.org\u002F10.48550\u002FarXiv.2204.00498) [**Evaluating the Text-to-SQL Capabilities of Large Language Models**](https:\u002F\u002Fdoi.org\u002F10.48550\u002FarXiv.2204.00498), \u003Cbr> by *Nitarshan Rajkumar, Raymond Li and Dzmitry Bahdanau*\n  \u003Cbr>\u003Cbr>\n
- [\u003Cimg src=https:\u002F\u002Fimg.shields.io\u002Fbadge\u002FCOLING-2022-blue alt=\"img\" style=\"zoom:100%; vertical-align: middle\" \u002F>](https:\u002F\u002Faclanthology.org\u002F2022.coling-1.491) [**Are Visual-Linguistic Models Commonsense Knowledge Bases?**](https:\u002F\u002Faclanthology.org\u002F2022.coling-1.491), \u003Cbr> by *Hsiu-Yu Yang and Carina Silberer*\n  \u003Cbr>\u003Cbr>\n
- [\u003Cimg src=https:\u002F\u002Fimg.shields.io\u002Fbadge\u002FCoRR-2022-blue alt=\"img\" style=\"zoom:100%; vertical-align: middle\" \u002F>](https:\u002F\u002Fdoi.org\u002F10.48550\u002FarXiv.2212.10529) [**Is GPT-3 a Psychopath? Evaluating Large Language Models from a Psychological Perspective**](https:\u002F\u002Fdoi.org\u002F10.48550\u002FarXiv.2212.10529), \u003Cbr> by *Xingxuan Li, Yutong Li, Linlin Liu, Lidong Bing and Shafiq R. Joty*\n  \u003Cbr>\u003Cbr>\n
- [\u003Cimg src=https:\u002F\u002Fimg.shields.io\u002Fbadge\u002FEMNLP-2022-blue alt=\"img\" style=\"zoom:100%; vertical-align: middle\" \u002F>](https:\u002F\u002Faclanthology.org\u002F2022.emnlp-main.132) [**GeoMLAMA: Geo-Diverse Commonsense Probing on Multilingual Pre-Trained Language Models**](https:\u002F\u002Faclanthology.org\u002F2022.emnlp-main.132), \u003Cbr> by *Da Yin, Hritik Bansal, Masoud Monajatipoor, Liunian Harold Li and Kai-Wei Chang*\n  \u003Cbr>\u003Cbr>\n
- [\u003Cimg src=https:\u002F\u002Fimg.shields.io\u002Fbadge\u002FEMNLP-2022-blue alt=\"img\" style=\"zoom:100%; vertical-align: middle\" \u002F>](https:\u002F\u002Faclanthology.org\u002F2022.emnlp-main.653) [**RobustLR: A Diagnostic Benchmark for Evaluating Logical Robustness of Deductive Reasoners**](https:\u002F\u002Faclanthology.org\u002F2022.emnlp-main.653), \u003Cbr> by *Soumya Sanyal, Zeyi Liao and Xiang Ren*\n  \u003Cbr>\u003Cbr>\n
- [\u003Cimg src=https:\u002F\u002Fimg.shields.io\u002Fbadge\u002FCoRR-2022-blue alt=\"img\" style=\"zoom:100%; vertical-align: middle\" \u002F>](https:\u002F\u002Farxiv.org\u002Fabs\u002F2202.13169) [**A Systematic Evaluation of Large Language Models of Code**](https:\u002F\u002Farxiv.org\u002Fabs\u002F2202.13169), \u003Cbr> by *Frank F. Xu, Uri Alon, Graham Neubig and Vincent J. Hellendoorn*\n  \u003Cbr>\u003Cbr>\n
- [\u003Cimg src=https:\u002F\u002Fimg.shields.io\u002Fbadge\u002FCoRR-2021-blue alt=\"img\" style=\"zoom:100%; vertical-align: middle\" \u002F>](https:\u002F\u002Farxiv.org\u002Fabs\u002F2107.03374) [**Evaluating Large Language Models Trained on Code**](https:\u002F\u002Farxiv.org\u002Fabs\u002F2107.03374), \u003Cbr> by *Mark Chen, Jerry Tworek, Heewoo Jun, Qiming Yuan, Henrique Pondé de Oliveira Pinto, Jared Kaplan, Harrison Edwards, Yuri Burda et al.*\n  \u003Cbr>\u003Cbr>\n
- [\u003Cimg src=https:\u002F\u002Fimg.shields.io\u002Fbadge\u002FACL-2021-blue alt=\"img\" style=\"zoom:100%; vertical-align: middle\" \u002F>](https:\u002F\u002Fdoi.org\u002F10.18653\u002Fv1\u002F2021.findings-acl.36) [**GLGE: A New General Language Generation Evaluation Benchmark**](https:\u002F\u002Fdoi.org\u002F10.18653\u002Fv1\u002F2021.findings-acl.36), \u003Cbr> by *Dayiheng Liu, Yu Yan, Yeyun Gong, Weizhen Qi, Hang Zhang, Jian Jiao, Weizhu Chen, Jie Fu et al.*\n  \u003Cbr>\u003Cbr>\n
- [\u003Cimg src=https:\u002F\u002Fimg.shields.io\u002Fbadge\u002FCoRR-2021-blue alt=\"img\" style=\"zoom:100%; vertical-align: middle\" \u002F>](https:\u002F\u002Farxiv.org\u002Fabs\u002F2104.05861) [**Evaluating Pre-Trained Models for User Feedback Analysis in Software Engineering: A Study on Classification of App-Reviews**](https:\u002F\u002Farxiv.org\u002Fabs\u002F2104.05861), \u003Cbr> by *Mohammad Abdul Hadi and Fatemeh H. Fard*\n  \u003Cbr>\u003Cbr>\n
- [\u003Cimg src=https:\u002F\u002Fimg.shields.io\u002Fbadge\u002FACL_Findings-2021-blue alt=\"img\" style=\"zoom:100%; vertical-align: middle\" \u002F>](https:\u002F\u002Fdoi.org\u002F10.18653\u002Fv1\u002F2021.findings-acl.322) [**Do Language Models Perform Generalizable Commonsense Inference?**](https:\u002F\u002Fdoi.org\u002F10.18653\u002Fv1\u002F2021.findings-acl.322), [\u003Cimg src=https:\u002F\u002Fimg.shields.io\u002Fbadge\u002FCode-skyblue alt=\"img\" style=\"zoom:100%; vertical-align: middle\" \u002F>](https:\u002F\u002Fgithub.com\u002Fwangpf3\u002FLM-for-CommonsenseInference)\u003Cbr> by *Peifeng Wang, Filip Ilievski, Muhao Chen and Xiang Ren*\n  \u003Cbr>\u003Cbr>\n
- [\u003Cimg src=https:\u002F\u002Fimg.shields.io\u002Fbadge\u002FEMNLP-2021-blue alt=\"img\" style=\"zoom:100%; vertical-align: middle\" \u002F>](https:\u002F\u002Fdoi.org\u002F10.18653\u002Fv1\u002F2021.emnlp-main.598) [**RICA: Evaluating Robust Inference Capabilities Based on Commonsense Axioms**](https:\u002F\u002Fdoi.org\u002F10.18653\u002Fv1\u002F2021.emnlp-main.598), \u003Cbr> by *Pei Zhou, Rahul Khanna, Seyeon Lee, Bill Yuchen Lin, Daniel Ho, Jay Pujara and Xiang Ren*\n  \u003Cbr>\u003Cbr>\n
- [\u003Cimg src=https:\u002F\u002Fimg.shields.io\u002Fbadge\u002FCoRR-2020-blue alt=\"img\" style=\"zoom:100%; vertical-align: middle\" \u002F>](https:\u002F\u002Farxiv.org\u002Fabs\u002F2006.14799) [**Evaluation of Text Generation: A Survey**](https:\u002F\u002Farxiv.org\u002Fabs\u002F2006.14799), \u003Cbr> by *Asli Celikyilmaz, Elizabeth Clark and Jianfeng Gao*\n  \u003Cbr>\u003Cbr>\n
- [\u003Cimg src=https:\u002F\u002Fimg.shields.io\u002Fbadge\u002FCoRR-2020-blue alt=\"img\" style=\"zoom:100%; vertical-align: middle\" \u002F>](https:\u002F\u002Farxiv.org\u002Fabs\u002F2007.15780) [**Neural Language Generation: Formulation, Methods, and Evaluation**](https:\u002F\u002Farxiv.org\u002Fabs\u002F2007.15780), \u003Cbr> by *Cristina Garbacea and Qiaozhu Mei*\n  \u003Cbr>\u003Cbr>\n
- [\u003Cimg src=https:\u002F\u002Fimg.shields.io\u002Fbadge\u002FICLR-2020-blue alt=\"img\" style=\"zoom:100%; vertical-align: middle\" \u002F>](https:\u002F\u002Fopenreview.net\u002Fforum?id=SkeHuCVFDr) [**BERTScore: Evaluating Text Generation with BERT**](https:\u002F\u002Fopenreview.net\u002Fforum?id=SkeHuCVFDr) (usage sketch below), \u003Cbr> by *Tianyi Zhang, Varsha Kishore, Felix Wu, Kilian Q. Weinberger and Yoav Artzi*\n  \u003Cbr>\u003Cbr>\n
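\nSeveral of the entries above are themselves automatic metrics, so a quick usage sketch may help. The following is a minimal example of BERTScore, the last paper in the list, assuming the third-party \`bert-score\` package (\`pip install bert-score\`) and PyTorch are installed; the sentences are placeholders.\n\n
```python\n# Minimal BERTScore sketch: score candidate generations against references.\nfrom bert_score import score\n\ncandidates = [\"The cat sat on the mat.\"]        # system outputs\nreferences = [\"A cat was sitting on the mat.\"]  # gold references\n\n# P, R, F1 are torch tensors with one entry per candidate\u002Freference pair.\nP, R, F1 = score(candidates, references, lang=\"en\", verbose=False)\nprint(f\"BERTScore F1: {F1.mean().item():.4f}\")\n```\n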
\n\u003Cbr>\u003Cbr>\n\n\n## LLM List\n\n### Typical LLM Details\n\n
| Model      | #Params | #Layers | #Heads | Dimension | Learning Rate | Batch Size | Trained Tokens |\n
| ---------- | ------- | ------- | ------ | --------- | ------------- | ---------- | -------------- |\n
| LLaMA      | 6.7B    | 32      | 32     | 4096      | 3.00E-04      | 4M         | 1.0T           |\n
| LLaMA      | 13.0B   | 40      | 40     | 5120      | 3.00E-04      | 4M         | 1.0T           |\n
| LLaMA      | 32.5B   | 60      | 52     | 6656      | 1.50E-04      | 4M         | 1.4T           |\n
| LLaMA      | 65.2B   | 80      | 64     | 8192      | 1.50E-04      | 4M         | 1.4T           |\n
| nano-GPT   | 85,584  | 3       | 3      | 768       | 3.00E-04      |            |                |\n
| GPT2-small | 120M    | 12      | 12     | 768       | 2.50E-04      |            |                |\n
| GPT2-XL    | 1.5B    | 48      | 25     | 1600      | 1.50E-04      |            |                |\n
| GPT3       | 175B    | 96      | 96     | 12288     | 1.50E-04      |            | 0.5T           |\n
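\nThe parameter counts above can be roughly reproduced from the architecture columns alone: a standard decoder block holds about 4·d² attention weights plus 8·d² feed-forward weights, so a model with L layers and hidden size d has roughly 12·L·d² parameters, plus V·d for the embeddings. A minimal back-of-the-envelope sketch of this check; the vocabulary size of 32,000 is an assumption matching LLaMA's tokenizer, not a value from the table:\n\n
```python\n# Rough decoder-only parameter estimate: 12 * L * d^2 + V * d.\n# 12*L*d^2 covers attention (~4*d^2) and the MLP (~8*d^2) per block;\n# vocab_size=32000 is an assumption (LLaMA's tokenizer), not from the table.\ndef approx_params(layers: int, d_model: int, vocab_size: int = 32000) -> int:\n    return 12 * layers * d_model**2 + vocab_size * d_model\n\nprint(f\"{approx_params(32, 4096) \u002F 1e9:.2f}B\")  # ~6.6B vs the listed 6.7B\nprint(f\"{approx_params(80, 8192) \u002F 1e9:.2f}B\")  # ~64.7B vs the listed 65.2B\n```\n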
\n### Pre-trained LLMs\n\n
| Model | Size | Architecture | Access | Date | Source |\n
| :---: | :---: | :---: | :---: | :---: | :--- |\n
| Switch Transformer | 1.6T | Decoder (MoE) | - | 2021-01 | [Paper](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2101.03961.pdf) |\n
| GLaM | 1.2T | Decoder (MoE) | - | 2021-12 | [Paper](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2112.06905.pdf) |\n
| PaLM | 540B | Decoder | - | 2022-04 | [Paper](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2204.02311.pdf) |\n
| MT-NLG | 530B | Decoder | - | 2022-01 | [Paper](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2201.11990.pdf) |\n
| J1-Jumbo | 178B | Decoder | [API](https:\u002F\u002Fdocs.ai21.com\u002Fdocs\u002Fcomplete-api) | 2021-08 | [Paper](https:\u002F\u002Fuploads-ssl.webflow.com\u002F60fd4503684b466578c0d307\u002F61138924626a6981ee09caf6_jurassic_tech_paper.pdf) |\n
| OPT | 175B | Decoder | [API](https:\u002F\u002Fopt.alpa.ai) \\| [Checkpoint](https:\u002F\u002Fgithub.com\u002Ffacebookresearch\u002Fmetaseq\u002Ftree\u002Fmain\u002Fprojects\u002FOPT) | 2022-05 | [Paper](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2205.01068.pdf) |\n
| BLOOM | 176B | Decoder | [API](https:\u002F\u002Fhuggingface.co\u002Fbigscience\u002Fbloom) \\| [Checkpoint](https:\u002F\u002Fhuggingface.co\u002Fbigscience\u002Fbloom) | 2022-11 | [Paper](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2211.05100.pdf) |\n
| GPT 3.0 | 175B | Decoder | [API](https:\u002F\u002Fopenai.com\u002Fapi\u002F) | 2020-05 | [Paper](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2005.14165.pdf) |\n
| LaMDA | 137B | Decoder | - | 2022-01 | [Paper](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2201.08239.pdf) |\n
| GLM | 130B | Decoder | [Checkpoint](https:\u002F\u002Fgithub.com\u002FTHUDM\u002FGLM-130B) | 2022-10 | [Paper](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2210.02414.pdf) |\n
| YaLM | 100B | Decoder | [Checkpoint](https:\u002F\u002Fgithub.com\u002Fyandex\u002FYaLM-100B) | 2022-06 | [Blog](https:\u002F\u002Fmedium.com\u002Fyandex\u002Fyandex-publishes-yalm-100b-its-the-largest-gpt-like-neural-network-in-open-source-d1df53d0e9a6) |\n
| LLaMA | 65B | Decoder | [Checkpoint](https:\u002F\u002Fgithub.com\u002Ffacebookresearch\u002Fllama) | 2023-02 | [Paper](https:\u002F\u002Fresearch.facebook.com\u002Fpublications\u002Fllama-open-and-efficient-foundation-language-models\u002F) |\n
| GPT-NeoX | 20B | Decoder | [Checkpoint](https:\u002F\u002Fgithub.com\u002FEleutherAI\u002Fgpt-neox) | 2022-04 | [Paper](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2204.06745.pdf) |\n
| UL2 | 20B | Agnostic | [Checkpoint](https:\u002F\u002Fhuggingface.co\u002Fgoogle\u002Ful2#:~:text=UL2%20is%20a%20unified%20framework%20for%20pretraining%20models,downstream%20fine-tuning%20is%20associated%20with%20specific%20pre-training%20schemes.) | 2022-05 | [Paper](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2205.05131v1.pdf) |\n
| T5 | 11B | Encoder-Decoder | [Checkpoint](https:\u002F\u002Fhuggingface.co\u002Ft5-11b) | 2019-10 | [Paper](https:\u002F\u002Fjmlr.org\u002Fpapers\u002Fv21\u002F20-074.html) |\n
| CPM-Bee | 10B | Decoder | [API](https:\u002F\u002Flive.openbmb.org\u002Fmodels\u002Fbee) | 2022-10 | [Paper](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2012.00413.pdf) |\n
| rwkv-4 | 7B | RWKV | [Checkpoint](https:\u002F\u002Fhuggingface.co\u002FBlinkDL\u002Frwkv-4-pile-7b) | 2022-09 | [GitHub](https:\u002F\u002Fgithub.com\u002FBlinkDL\u002FRWKV-LM) |\n
| GPT-J | 6B | Decoder | [Checkpoint](https:\u002F\u002Fhuggingface.co\u002FEleutherAI\u002Fgpt-j-6B) | 2021-06 | [GitHub](https:\u002F\u002Fgithub.com\u002Fkingoflolz\u002Fmesh-transformer-jax) |\n
| GPT-Neo | 2.7B | Decoder | [Checkpoint](https:\u002F\u002Fgithub.com\u002FEleutherAI\u002Fgpt-neo) | 2021-03 | [GitHub](https:\u002F\u002Fgithub.com\u002FEleutherAI\u002Fgpt-neo) |\n
| GPT-Neo | 1.3B | Decoder | [Checkpoint](https:\u002F\u002Fgithub.com\u002FEleutherAI\u002Fgpt-neo) | 2021-03 | [GitHub](https:\u002F\u002Fgithub.com\u002FEleutherAI\u002Fgpt-neo) |\n
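\nMost of the checkpoint links above resolve to Hugging Face or GitHub repositories, so the smaller entries can be tried locally. A minimal sketch, assuming the \`transformers\` library is installed and enough memory is available for GPT-J-6B (roughly 24 GB in fp32); the prompt and generation settings are illustrative only:\n\n
```python\n# Minimal sketch: load one of the open checkpoints listed above.\nfrom transformers import AutoModelForCausalLM, AutoTokenizer\n\nmodel_id = \"EleutherAI\u002Fgpt-j-6B\"\ntokenizer = AutoTokenizer.from_pretrained(model_id)\nmodel = AutoModelForCausalLM.from_pretrained(model_id)\n\ninputs = tokenizer(\"Large language model evaluation is\", return_tensors=\"pt\")\noutputs = model.generate(**inputs, max_new_tokens=30, do_sample=False)\nprint(tokenizer.decode(outputs[0], skip_special_tokens=True))\n```\n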
\n\u003Cbr>\u003Cbr>\n\n### Instruction-tuned LLMs\n\n
| Model | Size | Architecture | Access | Date | Source |\n
| :---: | :---: | :---: | :---: | :---: | :--- |\n
| Flan-PaLM | 540B | Decoder | - | 2022-10 | [Paper](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2210.11416.pdf) |\n
| BLOOMZ | 176B | Decoder | [Checkpoint](https:\u002F\u002Fhuggingface.co\u002Fbigscience\u002Fbloomz) | 2022-11 | [Paper](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2211.01786.pdf) |\n
| InstructGPT | 175B | Decoder | [API](https:\u002F\u002Fplatform.openai.com\u002Foverview) | 2022-03 | [Paper](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2203.02155.pdf) |\n
| Galactica | 120B | Decoder | [Checkpoint](https:\u002F\u002Fhuggingface.co\u002Ffacebook\u002Fgalactica-120b) | 2022-11 | [Paper](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2211.09085.pdf) |\n
| OpenChatKit | 20B | - | [Checkpoint](https:\u002F\u002Fgithub.com\u002Ftogethercomputer\u002FOpenChatKit) | 2023-03 | - |\n
| Flan-UL2 | 20B | Decoder | [Checkpoint](https:\u002F\u002Fgithub.com\u002Fgoogle-research\u002Fgoogle-research\u002Ftree\u002Fmaster\u002Ful2) | 2023-03 | [Blog](https:\u002F\u002Fwww.yitay.net\u002Fblog\u002Fflan-ul2-20b) |\n
| Gopher | - | - | - | - | - |\n
| Chinchilla | - | - | - | - | - |\n
| Flan-T5 | 11B | Encoder-Decoder | [Checkpoint](https:\u002F\u002Fgithub.com\u002Fgoogle-research\u002Ft5x\u002Fblob\u002Fmain\u002Fdocs\u002Fmodels.md#flan-t5-checkpoints) | 2022-10 | [Paper](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2210.11416.pdf) |\n
| T0 | 11B | Encoder-Decoder | [Checkpoint](https:\u002F\u002Fhuggingface.co\u002Fbigscience\u002FT0) | 2021-10 | [Paper](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2110.08207.pdf) |\n
| Alpaca | 7B | Decoder | [Demo](https:\u002F\u002Fcrfm.stanford.edu\u002Falpaca\u002F) | 2023-03 | [GitHub](https:\u002F\u002Fgithub.com\u002Ftatsu-lab\u002Fstanford_alpaca) |\n\n
\u003Cbr>\u003Cbr>\n\n### Aligned LLMs\n\n
| Model | Size | Architecture | Access | Date | Source |\n
| :---: | :---: | :---: | :---: | :---: | :--- |\n
| GPT-4 | - | - | - | 2023-03 | [Blog](https:\u002F\u002Fopenai.com\u002Fresearch\u002Fgpt-4) |\n
| ChatGPT | - | Decoder | [Demo](https:\u002F\u002Fopenai.com\u002Fblog\u002Fchatgpt\u002F) \\| [API](https:\u002F\u002Fshare.hsforms.com\u002F1u4goaXwDRKC9-x9IvKno0A4sk30) | 2022-11 | [Blog](https:\u002F\u002Fopenai.com\u002Fblog\u002Fchatgpt\u002F) |\n
| Sparrow | 70B | - | - | 2022-09 | [Paper](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2209.14375.pdf) |\n
| Claude | - | - | [Demo](https:\u002F\u002Fpoe.com\u002Fclaude) \\| [API](https:\u002F\u002Fwww.anthropic.com\u002Fearlyaccess) | 2023-03 | [Blog](https:\u002F\u002Fwww.anthropic.com\u002Findex\u002Fintroducing-claude) |\n\n
\u003Cbr>\u003Cbr>\n\n### Open-source LLMs\n\n
- [LLaMA](https:\u002F\u002Fai.facebook.com\u002Fblog\u002Flarge-language-model-llama-meta-ai\u002F) - A foundational, 65-billion-parameter large language model. [LLaMA.cpp](https:\u002F\u002Fgithub.com\u002Fggerganov\u002Fllama.cpp) [Lit-LLaMA](https:\u002F\u002Fgithub.com\u002FLightning-AI\u002Flit-llama)\n
  - [Alpaca](https:\u002F\u002Fcrfm.stanford.edu\u002F2023\u002F03\u002F13\u002Falpaca.html) - Fine-tuned from the LLaMA 7B model on 52K instruction-following examples. [Alpaca.cpp](https:\u002F\u002Fgithub.com\u002Fantimatter15\u002Falpaca.cpp) [Alpaca-LoRA](https:\u002F\u002Fgithub.com\u002Ftloen\u002Falpaca-lora)\n
  - [Flan-Alpaca](https:\u002F\u002Fgithub.com\u002Fdeclare-lab\u002Fflan-alpaca) - Instruction tuning from humans and machines.\n
  - [Baize](https:\u002F\u002Fgithub.com\u002Fproject-baize\u002Fbaize-chatbot) - An open-source chat model trained with [LoRA](https:\u002F\u002Fgithub.com\u002Fmicrosoft\u002FLoRA) on 100K dialogs generated by letting ChatGPT chat with itself (a minimal LoRA sketch follows this list).\n
  - [Cabrita](https:\u002F\u002Fgithub.com\u002F22-hours\u002Fcabrita) - A Portuguese instruction-tuned LLaMA model.\n
  - [Vicuna](https:\u002F\u002Fgithub.com\u002Flm-sys\u002FFastChat) - An open-source chatbot whose quality is reported to reach 90% of ChatGPT's.\n
  - [Llama-X](https:\u002F\u002Fgithub.com\u002FAetherCortex\u002FLlama-X) - Open academic research on improving LLaMA to the state of the art among large language models.\n
  - [Chinese-Vicuna](https:\u002F\u002Fgithub.com\u002FFacico\u002FChinese-Vicuna) - A Chinese instruction-following model based on LLaMA.\n
  - [GPTQ-for-LLaMA](https:\u002F\u002Fgithub.com\u002Fqwopqwop200\u002FGPTQ-for-LLaMa) - 4-bit quantization of [LLaMA](https:\u002F\u002Farxiv.org\u002Fabs\u002F2302.13971) using [GPTQ](https:\u002F\u002Farxiv.org\u002Fabs\u002F2210.17323).\n
  - [GPT4All](https:\u002F\u002Fgithub.com\u002Fnomic-ai\u002Fgpt4all) - Demos, data, and code for training open-source assistant-style LLMs based on GPT-J and LLaMA.\n
  - [Koala](https:\u002F\u002Fbair.berkeley.edu\u002Fblog\u002F2023\u002F04\u002F03\u002Fkoala\u002F) - A dialogue model for academic research.\n
  - [BELLE](https:\u002F\u002Fgithub.com\u002FLianjiaTech\u002FBELLE) - Be Everyone's Large Language model Engine.\n
  - [StackLLaMA](https:\u002F\u002Fhuggingface.co\u002Fblog\u002Fstackllama) - A hands-on guide to training LLaMA with RLHF.\n
  - [RedPajama](https:\u002F\u002Fgithub.com\u002Ftogethercomputer\u002FRedPajama-Data) - An open-source recipe for reproducing the LLaMA training dataset.\n
  - [Chimera](https:\u002F\u002Fgithub.com\u002FFreedomIntelligence\u002FLLMZoo) - Latin Phoenix.\n
- [BLOOM](https:\u002F\u002Fhuggingface.co\u002Fbigscience\u002Fbloom) - BigScience's large open-science, open-access multilingual language model. [BLOOM-LoRA](https:\u002F\u002Fgithub.com\u002Flinhduongtuan\u002FBLOOM-LORA)\n
  - [BLOOMZ&mT0](https:\u002F\u002Fhuggingface.co\u002Fbigscience\u002Fbloomz) - A family of models capable of following human instructions in dozens of languages zero-shot.\n
  - [Phoenix](https:\u002F\u002Fgithub.com\u002FFreedomIntelligence\u002FLLMZoo)\n\n
- [T5](https:\u002F\u002Farxiv.org\u002Fabs\u002F1910.10683) - Text-to-Text Transfer Transformer.\n
  - [T0](https:\u002F\u002Farxiv.org\u002Fabs\u002F2110.08207) - Multitask prompted training enables zero-shot task generalization.\n\n
- [OPT](https:\u002F\u002Farxiv.org\u002Fabs\u002F2205.01068) - Open Pre-trained Transformer language models.\n
- [UL2](https:\u002F\u002Farxiv.org\u002Fabs\u002F2205.05131v1) - A unified framework for pretraining models that are universally effective across datasets and setups.\n
- [GLM](https:\u002F\u002Fgithub.com\u002FTHUDM\u002FGLM) - A general language model pretrained with an autoregressive blank-filling objective, which can be fine-tuned for a variety of natural language understanding and generation tasks.\n
- [RWKV](https:\u002F\u002Fgithub.com\u002FBlinkDL\u002FRWKV-LM) - A parallelizable RNN with transformer-level LLM performance.\n
  - [ChatRWKV](https:\u002F\u002Fgithub.com\u002FBlinkDL\u002FChatRWKV) - A ChatGPT-like assistant powered by the RWKV (100% RNN) language model.\n
- [StableLM](https:\u002F\u002Fstability.ai\u002Fblog\u002Fstability-ai-launches-the-first-of-its-stablelm-suite-of-language-models) - Stability AI language models.\n
- [YaLM](https:\u002F\u002Fmedium.com\u002Fyandex\u002Fyandex-publishes-yalm-100b-its-the-largest-gpt-like-neural-network-in-open-source-d1df53d0e9a6) - A GPT-like neural network for generating and processing text, free for developers and researchers worldwide.\n
- [GPT-Neo](https:\u002F\u002Fgithub.com\u002FEleutherAI\u002Fgpt-neo) - An implementation of model- and data-parallel [GPT3](https:\u002F\u002Farxiv.org\u002Fabs\u002F2005.14165)-like models using the [mesh-tensorflow](https:\u002F\u002Fgithub.com\u002Ftensorflow\u002Fmesh) library.\n
- [GPT-J](https:\u002F\u002Fgithub.com\u002Fkingoflolz\u002Fmesh-transformer-jax\u002F#gpt-j-6b) - A 6-billion-parameter autoregressive text generation model trained on [The Pile](https:\u002F\u002Fpile.eleuther.ai\u002F).\n
  - [Dolly](https:\u002F\u002Fwww.databricks.com\u002Fblog\u002F2023\u002F03\u002F24\u002Fhello-dolly-democratizing-magic-chatgpt-open-models.html) - A cheap-to-build LLM that exhibits a surprising degree of ChatGPT-like instruction-following ability.\n\n
解读跨时间和规模的自回归Transformer模型\n  - [Dolly 2.0](https:\u002F\u002Fwww.databricks.com\u002Fblog\u002F2023\u002F04\u002F12\u002Fdolly-first-open-commercially-viable-instruction-tuned-llm) - 第一个开源、指令微调的LLM，基于获准用于科研和商业用途的人工指令数据集进行微调。\n- [OpenFlamingo](https:\u002F\u002Fgithub.com\u002Fmlfoundations\u002Fopen_flamingo) - DeepMind Flamingo模型的开源复现版本。\n- [Cerebras-GPT](https:\u002F\u002Fwww.cerebras.net\u002Fblog\u002Fcerebras-gpt-a-family-of-open-compute-efficient-large-language-models\u002F) - 一系列开源、计算高效的大型语言模型。\n- [GALACTICA](https:\u002F\u002Fgithub.com\u002Fpaperswithcode\u002Fgalai\u002Fblob\u002Fmain\u002Fdocs\u002Fmodel_card.md) - GALACTICA系列模型是在大规模科学语料库上训练而成。\n  - [GALPACA](https:\u002F\u002Fhuggingface.co\u002FGeorgiaTechResearchInstitute\u002Fgalpaca-30b) - 在Alpaca数据集上微调的GALACTICA 30B模型。\n\n- [Palmyra](https:\u002F\u002Fhuggingface.co\u002FWriter\u002Fpalmyra-base) - Palmyra Base主要以英文文本进行预训练。\n- [Camel](https:\u002F\u002Fhuggingface.co\u002FWriter\u002Fcamel-5b-hf) - 一款最先进的指令遵循大型语言模型，旨在提供卓越的性能和多功能性。\n- [h2oGPT](https:\u002F\u002Fgithub.com\u002Fh2oai\u002Fh2ogpt)\n- [PanGu-α](https:\u002F\u002Fopeni.org.cn\u002Fpangu\u002F) - PanGu-α是华为诺亚方舟实验室、MindSpore团队和鹏城实验室联合开发的2000亿参数自回归中文语言模型。\n- [Open-Assistant](https:\u002F\u002Fgithub.com\u002FLAION-AI\u002FOpen-Assistant) - 一个旨在让每个人都能使用优秀的基于聊天的大语言模型的项目。\n- [HuggingChat](https:\u002F\u002Fhuggingface.co\u002Fchat\u002F) - 由Open Assistant最新模型驱动——目前最好的开源聊天模型，并通过@huggingface推理API提供服务。\n- [Baichuan](https:\u002F\u002Fgithub.com\u002Fbaichuan-inc\u002FBaichuan-13B) - 由百川智能科技公司在Baichuan-7B的基础上开发的开源、商用大型语言模型，包含130亿个参数。（2023年7月15日）\n- [Qwen](https:\u002F\u002Fgithub.com\u002FQwenLM\u002FQwen-7B) - Qwen-7B是阿里云提出的Qwen（通义千问）系列大语言模型中的70亿参数版本。Qwen-7B是一个基于Transformer的大语言模型，已在大量数据上进行预训练，包括网页文本、书籍、代码等。（2023年8月3日）\n\n\u003Cbr>\u003Cbr>\n### 流行大模型\n\n|          **模型**          |        **文献引用**         |                          **链接**                          | **参数量** |     **基础模型**     | **层数** | **编码器** | **解码器** | **预训练 token 数** | **微调样本数** |  **RLHF**  |\n| :-------------------------: | :-------------------------: | :----------------------------------------------------------: | :-------------: | :--------------------: | :---------: | :-----------: | :-----------: | :-------------------: | :--------------: | :--------: |\n|          GPT3-Ada           |      brown2020language      |        https:\u002F\u002Fplatform.openai.com\u002Fdocs\u002Fmodels\u002Fgpt-3         |      0.35B      |           -            |     24      |       -       |      24       |           -           |        -         |     -      |\n|          Pythia-1B          |     biderman2023pythia      |         https:\u002F\u002Fhuggingface.co\u002FEleutherAI\u002Fpythia-1b          |       1B        |           -            |     16      |       -       |      16       |      300B tokens      |        -         |     -      |\n|        GPT3-Babbage         |      brown2020language      |        https:\u002F\u002Fplatform.openai.com\u002Fdocs\u002Fmodels\u002Fgpt-3         |      1.3B       |           -            |     24      |       -       |      24       |           -           |        -         |     -      |\n|           GPT2-XL           |     radford2019language     |                https:\u002F\u002Fhuggingface.co\u002Fgpt2-xl                |      1.5B       |           -            |     48      |       -       |      48       |      40B tokens       |        -         |     -      |\n|          BLOOM-1b7          |        scao2022bloom        |         
https:\u002F\u002Fhuggingface.co\u002Fbigscience\u002Fbloom-1b7          |      1.7B       |           -            |     24      |       -       |      24       |      350B tokens      |        -         |     -      |\n|         BLOOMZ-1b7          | muennighoff2022crosslingual |         https:\u002F\u002Fhuggingface.co\u002Fbigscience\u002Fbloomz-1b7         |      1.7B       |       BLOOM-1b7        |     24      |       -       |      24       |           -           |   8.39B tokens   |     -      |\n|         Dolly-v2-3b         |          2023dolly          |        https:\u002F\u002Fhuggingface.co\u002Fdatabricks\u002Fdolly-v2-3b         |      2.8B       |      Pythia-2.8B       |     32      |       -       |      32       |           -           |       15K        |     -      |\n|         Pythia-2.8B         |     biderman2023pythia      |        https:\u002F\u002Fhuggingface.co\u002FEleutherAI\u002Fpythia-2.8b         |      2.8B       |           -            |     32      |       -       |      32       |      300B tokens      |        -         |     -      |\n|          BLOOM-3b           |        scao2022bloom        |          https:\u002F\u002Fhuggingface.co\u002Fbigscience\u002Fbloom-3b          |       3B        |           -            |     30      |       -       |      30       |      350B tokens      |        -         |     -      |\n|          BLOOMZ-3b          | muennighoff2022crosslingual |         https:\u002F\u002Fhuggingface.co\u002Fbigscience\u002Fbloomz-3b          |       3B        |        BLOOM-3b        |     30      |       -       |      30       |           -           |   8.39B tokens   |     -      |\n|   StableLM-Base-Alpha-3B    |        2023StableLM         |  https:\u002F\u002Fhuggingface.co\u002Fstabilityai\u002Fstablelm-base-alpha-3b   |       3B        |           -            |     16      |       -       |      16       |      800B tokens      |        -         |     -      |\n|   StableLM-Tuned-Alpha-3B   |        2023StableLM         |  https:\u002F\u002Fhuggingface.co\u002Fstabilityai\u002Fstablelm-tuned-alpha-3b  |       3B        | StableLM-Base-Alpha-3B |     16      |       -       |      16       |           -           |       632K       |     -      |\n|         ChatGLM-6B          | zeng2023glm-130b,du2022glm  |           https:\u002F\u002Fhuggingface.co\u002FTHUDM\u002Fchatglm-6b            |       6B        |           -            |     28      |      28       |      28       |       1T tokens       |    \\checkmark    | \\checkmark |\n|          DoctorGLM          |     xiong2023doctorglm      |          https:\u002F\u002Fgithub.com\u002Fxionghonglin\u002FDoctorGLM           |       6B        |       ChatGLM-6B       |     28      |      28       |      28       |           -           |      6.38M       |     -      |\n|         ChatGLM-Med         |         ChatGLM-Med         |            https:\u002F\u002Fgithub.com\u002FSCIR-HI\u002FMed-ChatGLM            |       6B        |       ChatGLM-6B       |     28      |      28       |      28       |           -           |        8K        |     -      |\n|         GPT3-Curie          |      brown2020language      |        https:\u002F\u002Fplatform.openai.com\u002Fdocs\u002Fmodels\u002Fgpt-3         |      6.7B       |           -            |     32      |       -       |      32       |           -           |        -         |     -      |\n|         MPT-7B-Chat         |   MosaicML2023Introducing   |         
https:\u002F\u002Fhuggingface.co\u002Fmosaicml\u002Fmpt-7b-chat          |      6.7B       |         MPT-7B         |     32      |       -       |      32       |           -           |       360K       |     -      |\n|       MPT-7B-Instruct       |   MosaicML2023Introducing   |       https:\u002F\u002Fhuggingface.co\u002Fmosaicml\u002Fmpt-7b-instruct        |      6.7B       |         MPT-7B         |     32      |       -       |      32       |           -           |      59.3K       |     -      |\n|   MPT-7B-StoryWriter-65k+   |   MosaicML2023Introducing   |      https:\u002F\u002Fhuggingface.co\u002Fmosaicml\u002Fmpt-7b-storywriter      |      6.7B       |         MPT-7B         |     32      |       -       |      32       |           -           |    \\checkmark    |     -      |\n|         Dolly-v2-7b         |          2023dolly          |        https:\u002F\u002Fhuggingface.co\u002Fdatabricks\u002Fdolly-v2-7b         |      6.9B       |      Pythia-6.9B       |     32      |       -       |      32       |           -           |       15K        |     -      |\n| h2ogpt-oig-oasst1-512-6.9b  |         2023h2ogpt          |   https:\u002F\u002Fhuggingface.co\u002Fh2oai\u002Fh2ogpt-oig-oasst1-512-6.9b    |      6.9B       |      Pythia-6.9B       |     32      |       -       |      32       |           -           |       398K       |     -      |\n|         Pythia-6.9B         |     biderman2023pythia      |        https:\u002F\u002Fhuggingface.co\u002FEleutherAI\u002Fpythia-6.9b         |      6.9B       |           -            |     32      |       -       |      32       |      300B tokens      |        -         |     -      |\n|          Alpaca-7B          |           alpaca            |       https:\u002F\u002Fhuggingface.co\u002Ftatsu-lab\u002Falpaca-7b-wdiff       |       7B        |        LLaMA-7B        |     32      |       -       |      32       |           -           |       52K        |     -      |\n|       Alpaca-LoRA-7B        |       2023alpacalora        |         https:\u002F\u002Fhuggingface.co\u002Ftloen\u002Falpaca-lora-7b          |       7B        |        LLaMA-7B        |     32      |       -       |      32       |           -           |       52K        |     -      |\n|          Baize-7B           |         xu2023baize         |      https:\u002F\u002Fhuggingface.co\u002Fproject-baize\u002Fbaize-lora-7B      |       7B        |        LLaMA-7B        |     32      |       -       |      32       |           -           |       263K       |     -      |\n|     Baize Healthcare-7B     |         xu2023baize         | https:\u002F\u002Fhuggingface.co\u002Fproject-baize\u002Fbaize-healthcare-lora-7B |       7B        |        LLaMA-7B        |     32      |       -       |      32       |           -           |       201K       |     -      |\n|         ChatDoctor          |   yunxiang2023chatdoctor    |           https:\u002F\u002Fgithub.com\u002FKent0n-Li\u002FChatDoctor            |       7B        |        LLaMA-7B        |     32      |       -       |      32       |           -           |       167K       |     -      |\n|           HuaTuo            |       wang2023huatuo        |     https:\u002F\u002Fgithub.com\u002Fscir-hi\u002Fhuatuo-llama-med-chinese      |       7B        |        LLaMA-7B        |     32      |       -       |      32       |           -           |        8K        |     -      |\n|          Koala-7B           |     koala_blogpost_2023     |           
https:\u002F\u002Fhuggingface.co\u002Fyoung-geng\u002Fkoala            |       7B        |        LLaMA-7B        |     32      |       -       |      32       |           -           |       472K       |     -      |\n|          LLaMA-7B           |      touvron2023llama       |     https:\u002F\u002Fhuggingface.co\u002Fdecapoda-research\u002Fllama-7b-hf     |       7B        |           -            |     32      |       -       |      32       |       1T tokens       |        -         |     -      |\n|     Luotuo-lora-7b-0.3      |           luotuo            |     https:\u002F\u002Fhuggingface.co\u002Fsilk-road\u002Fluotuo-lora-7b-0.3      |       7B        |        LLaMA-7B        |     32      |       -       |      32       |           -           |       152K       |     -      |\n|   StableLM-Base-Alpha-7B    |        2023StableLM         |  https:\u002F\u002Fhuggingface.co\u002Fstabilityai\u002Fstablelm-base-alpha-7b   |       7B        |           -            |     16      |       -       |      16       |      800B tokens      |        -         |     -      |\n|   StableLM-Tuned-Alpha-7B   |        2023StableLM         |  https:\u002F\u002Fhuggingface.co\u002Fstabilityai\u002Fstablelm-tuned-alpha-7b  |       7B        | StableLM-Base-Alpha-7B |     16      |       -       |      16       |           -           |       632K       |     -      |\n|    Vicuna-7b-delta-v1.1     |         vicuna2023          |      https:\u002F\u002Fgithub.com\u002Flm-sys\u002FFastChat\\#vicuna-weights      |       7B        |        LLaMA-7B        |     32      |       -       |      32       |           -           |       70K        |     -      |\n| BELLE-7B-0.2M \u002F0.6M \u002F1M \u002F2M |     belle2023exploring      |        https:\u002F\u002Fhuggingface.co\u002FBelleGroup\u002FBELLE-7B-2M         |      7.1B       |     Bloomz-7b1-mt      |     30      |       -       |      30       |           -           | 0.2M\u002F0.6M\u002F1M\u002F2M  |     -      |\n|          BLOOM-7b1          |        scao2022bloom        |         https:\u002F\u002Fhuggingface.co\u002Fbigscience\u002Fbloom-7b1          |      7.1B       |           -            |     30      |       -       |      30       |      350B tokens      |        -         |     -      |\n|     BLOOMZ-7b1 \u002Fmt \u002Fp3      | muennighoff2022crosslingual |       https:\u002F\u002Fhuggingface.co\u002Fbigscience\u002Fbloomz-7b1-p3        |      7.1B       |       BLOOM-7b1        |     30      |       -       |      30       |           -           |   4.19B tokens   |     -      |\n|        Dolly-v2-12b         |          2023dolly          |        https:\u002F\u002Fhuggingface.co\u002Fdatabricks\u002Fdolly-v2-12b        |       12B       |       Pythia-12B       |     36      |       -       |      36       |           -           |       15K        |     -      |\n|    h2ogpt-oasst1-512-12b    |         2023h2ogpt          |      https:\u002F\u002Fhuggingface.co\u002Fh2oai\u002Fh2ogpt-oasst1-512-12b      |       12B       |       Pythia-12B       |     36      |       -       |      36       |           -           |      94.6K       |     -      |\n|  Open-Assistant-SFT-4-12B   |      2023openassistant      | https:\u002F\u002Fhuggingface.co\u002FOpenAssistant\u002Foasst-sft-4-pythia-12b-epoch-3.5 |       12B       |   Pythia-12B-deduped   |     36      |       -       |      36       |           -           |       161K       |     -      |\n|         Pythia-12B          |     biderman2023pythia      |         
https:\u002F\u002Fhuggingface.co\u002FEleutherAI\u002Fpythia-12b         |       12B       |           -            |     36      |       -       |      36       |      300B tokens      |        -         |     -      |\n|          Baize-13B          |         xu2023baize         |     https:\u002F\u002Fhuggingface.co\u002Fproject-baize\u002Fbaize-lora-13B      |       13B       |       LLaMA-13B        |     40      |       -       |      40       |           -           |       263K       |     -      |\n|          Koala-13B          |     koala_blogpost_2023     |           https:\u002F\u002Fhuggingface.co\u002Fyoung-geng\u002Fkoala            |       13B       |       LLaMA-13B        |     40      |       -       |      40       |           -           |       472K       |     -      |\n|          LLaMA-13B          |      touvron2023llama       |    https:\u002F\u002Fhuggingface.co\u002Fdecapoda-research\u002Fllama-13b-hf     |       13B       |           -            |     40      |       -       |      40       |       1T tokens       |        -         |     -      |\n|      StableVicuna-13B       |        2023StableLM         |   https:\u002F\u002Fhuggingface.co\u002FCarperAI\u002Fstable-vicuna-13b-delta    |       13B       |     Vicuna-13B v0      |     40      |       -       |      40       |           -           |       613K       | \\checkmark |\n|    Vicuna-13b-delta-v1.1    |         vicuna2023          |      https:\u002F\u002Fgithub.com\u002Flm-sys\u002FFastChat\\#vicuna-weights      |       13B       |       LLaMA-13B        |     40      |       -       |      40       |           -           |       70K        |     -      |\n|      moss-moon-003-sft      |          2023moss           |        https:\u002F\u002Fhuggingface.co\u002Ffnlp\u002Fmoss-moon-003-sft         |       16B       |   moss-moon-003-base   |     34      |       -       |      34       |           -           |       1.1M       |     -      |\n|  moss-moon-003-sft-plugin   |          2023moss           |     https:\u002F\u002Fhuggingface.co\u002Ffnlp\u002Fmoss-moon-003-sft-plugin     |       16B       |   moss-moon-003-base   |     34      |       -       |      34       |           -           |       1.4M       |     -      |\n|        GPT-NeoX-20B         |           gptneox           |        https:\u002F\u002Fhuggingface.co\u002FEleutherAI\u002Fgpt-neox-20b        |       20B       |           -            |     44      |       -       |      44       |         825GB         |        -         |     -      |\n|    h2ogpt-oasst1-512-20b    |         2023h2ogpt          |      https:\u002F\u002Fhuggingface.co\u002Fh2oai\u002Fh2ogpt-oasst1-512-20b      |       20B       |      GPT-NeoX-20B      |     44      |       -       |      44       |           -           |      94.6K       |     -      |\n|          Baize-30B          |         xu2023baize         |     https:\u002F\u002Fhuggingface.co\u002Fproject-baize\u002Fbaize-lora-30B      |       33B       |       LLaMA-30B        |     60      |       -       |      60       |           -           |       263K       |     -      |\n|          LLaMA-30B          |      touvron2023llama       |    https:\u002F\u002Fhuggingface.co\u002Fdecapoda-research\u002Fllama-30b-hf     |       33B       |           -            |     60      |       -       |      60       |      1.4T tokens      |        -         |     -      |\n|          LLaMA-65B          |      touvron2023llama       |    
https:\u002F\u002Fhuggingface.co\u002Fdecapoda-research\u002Fllama-65b-hf     |       65B       |           -            |     80      |       -       |      80       |      1.4T tokens      |        -         |     -      |\n|        GPT3-Davinci         |      brown2020language      |        https:\u002F\u002Fplatform.openai.com\u002Fdocs\u002Fmodels\u002Fgpt-3         |      175B       |           -            |     96      |       -       |      96       |      300B tokens      |        -         |     -      |\n|            BLOOM            |        scao2022bloom        |           https:\u002F\u002Fhuggingface.co\u002Fbigscience\u002Fbloom            |      176B       |           -            |     70      |       -       |      70       |      366B tokens      |        -         |     -      |\n|       BLOOMZ \u002Fmt \u002Fp3        | muennighoff2022crosslingual |         https:\u002F\u002Fhuggingface.co\u002Fbigscience\u002Fbloomz-p3          |      176B       |         BLOOM          |     70      |       -       |      70       |           -           |   2.09B tokens   |     -      |\n|    ChatGPT~(2023.05.01)     |        openaichatgpt        |       https:\u002F\u002Fplatform.openai.com\u002Fdocs\u002Fmodels\u002Fgpt-3-5        |        -        |        GPT-3.5         |      -      |       -       |       -       |           -           |    \\checkmark    | \\checkmark |\n|     GPT-4~(2023.05.01)      |       openai2023gpt4        |        https:\u002F\u002Fplatform.openai.com\u002Fdocs\u002Fmodels\u002Fgpt-4         |        -        |           -            |      -      |       -       |       -       |           -           |    \\checkmark    | \\checkmark |\n\n\u003Cbr>\u003Cbr>\n\n\n## 训练框架\n\n- [Accelerate](https:\u002F\u002Fgithub.com\u002Fhuggingface\u002Faccelerate) ![](https:\u002F\u002Fimg.shields.io\u002Fgithub\u002Fstars\u002Fhuggingface\u002Faccelerate.svg?style=social) - 🚀 一种简单的方法，用于在多 GPU、TPU 和混合精度下训练和使用 PyTorch 模型。\n- [Apache MXNet](https:\u002F\u002Fgithub.com\u002Fapache\u002Fmxnet) ![](https:\u002F\u002Fimg.shields.io\u002Fgithub\u002Fstars\u002Fapache\u002Fmxnet.svg?style=social) - 轻量级、可移植、灵活的分布式\u002F移动深度学习框架，具有动态、支持变异的数据流依赖调度器。\n- [Caffe](https:\u002F\u002Fgithub.com\u002FBVLC\u002Fcaffe) ![](https:\u002F\u002Fimg.shields.io\u002Fgithub\u002Fstars\u002FBVLC\u002Fcaffe.svg?style=social) - 一个快速的开源深度学习框架。\n- [ColossalAI](https:\u002F\u002Fgithub.com\u002Fhpcaitech\u002FColossalAI) ![](https:\u002F\u002Fimg.shields.io\u002Fgithub\u002Fstars\u002Fhpcaitech\u002FColossalAI.svg?style=social) - 一个集成的大规模模型训练系统，采用高效的并行化技术。\n- [DeepSpeed](https:\u002F\u002Fgithub.com\u002Fmicrosoft\u002FDeepSpeed) ![](https:\u002F\u002Fimg.shields.io\u002Fgithub\u002Fstars\u002Fmicrosoft\u002FDeepSpeed.svg?style=social) - DeepSpeed 是一个深度学习优化库，使分布式训练和推理变得简单、高效且有效。\n- [Horovod](https:\u002F\u002Fgithub.com\u002Fhorovod\u002Fhorovod) ![](https:\u002F\u002Fimg.shields.io\u002Fgithub\u002Fstars\u002Fhorovod\u002Fhorovod.svg?style=social) - 面向 TensorFlow、Keras、PyTorch 和 Apache MXNet 的分布式训练框架。\n- [Jax](https:\u002F\u002Fgithub.com\u002Fgoogle\u002Fjax) ![](https:\u002F\u002Fimg.shields.io\u002Fgithub\u002Fstars\u002Fgoogle\u002Fjax.svg?style=social) - 用于高性能机器学习研究的自动微分和 XLA。\n- [Kedro](https:\u002F\u002Fgithub.com\u002Fkedro-org\u002Fkedro) ![](https:\u002F\u002Fimg.shields.io\u002Fgithub\u002Fstars\u002Fkedro-org\u002Fkedro.svg?style=social) - Kedro 是一个开源 Python 框架，用于创建可复现、可维护且模块化的数据科学代码。\n- [Keras](https:\u002F\u002Fgithub.com\u002Fkeras-team\u002Fkeras) 
![](https:\u002F\u002Fimg.shields.io\u002Fgithub\u002Fstars\u002Fkeras-team\u002Fkeras.svg?style=social) - Keras 是用 Python 编写的深度学习 API，运行在机器学习平台 TensorFlow 之上。\n- [LightGBM](https:\u002F\u002Fgithub.com\u002Fmicrosoft\u002FLightGBM) ![](https:\u002F\u002Fimg.shields.io\u002Fgithub\u002Fstars\u002Fmicrosoft\u002FLightGBM.svg?style=social) - 基于决策树算法的快速、分布式、高性能梯度提升（GBT、GBDT、GBRT、GBM 或 MART）框架，用于排序、分类以及许多其他机器学习任务。\n- [MegEngine](https:\u002F\u002Fgithub.com\u002FMegEngine\u002FMegEngine) ![](https:\u002F\u002Fimg.shields.io\u002Fgithub\u002Fstars\u002FMegEngine\u002FMegEngine.svg?style=social) - MegEngine 是一个快速、可扩展且易于使用的深度学习框架，支持自动微分。\n- [metric-learn](https:\u002F\u002Fgithub.com\u002Fscikit-learn-contrib\u002Fmetric-learn) ![](https:\u002F\u002Fimg.shields.io\u002Fgithub\u002Fstars\u002Fscikit-learn-contrib\u002Fmetric-learn.svg?style=social) - Python 中的度量学习算法。\n- [MindSpore](https:\u002F\u002Fgithub.com\u002Fmindspore-ai\u002Fmindspore) ![](https:\u002F\u002Fimg.shields.io\u002Fgithub\u002Fstars\u002Fmindspore-ai\u002Fmindspore.svg?style=social) - MindSpore 是一个新的开源深度学习训练\u002F推理框架，可用于移动、边缘和云场景。\n- [Oneflow](https:\u002F\u002Fgithub.com\u002FOneflow-Inc\u002Foneflow) ![](https:\u002F\u002Fimg.shields.io\u002Fgithub\u002Fstars\u002FOneflow-Inc\u002Foneflow.svg?style=social) - OneFlow 是一个以性能为中心的开源深度学习框架。\n- [PaddlePaddle](https:\u002F\u002Fgithub.com\u002FPaddlePaddle\u002FPaddle) ![](https:\u002F\u002Fimg.shields.io\u002Fgithub\u002Fstars\u002FPaddlePaddle\u002FPaddle.svg?style=social) - 来自工业实践的机器学习框架。\n- [PyTorch](https:\u002F\u002Fgithub.com\u002Fpytorch\u002Fpytorch) ![](https:\u002F\u002Fimg.shields.io\u002Fgithub\u002Fstars\u002Fpytorch\u002Fpytorch.svg?style=social) - 在 Python 中使用张量和动态神经网络，并具有强大的 GPU 加速功能。\n- [PyTorch Lightning](https:\u002F\u002Fgithub.com\u002Flightning-AI\u002Flightning) ![](https:\u002F\u002Fimg.shields.io\u002Fgithub\u002Fstars\u002Flightning-AI\u002Flightning.svg?style=social) - 一种深度学习框架，用于闪电般快速地训练、部署和交付 AI 产品。\n- [XGBoost](https:\u002F\u002Fgithub.com\u002Fdmlc\u002Fxgboost) ![](https:\u002F\u002Fimg.shields.io\u002Fgithub\u002Fstars\u002Fdmlc\u002Fxgboost.svg?style=social) - 可扩展、可移植且分布式的梯度提升（GBDT、GBRT 或 GBM）库。\n- [scikit-learn](https:\u002F\u002Fgithub.com\u002Fscikit-learn\u002Fscikit-learn) ![](https:\u002F\u002Fimg.shields.io\u002Fgithub\u002Fstars\u002Fscikit-learn\u002Fscikit-learn.svg?style=social) - Python 中的机器学习。\n- [TensorFlow](https:\u002F\u002Fgithub.com\u002Ftensorflow\u002Ftensorflow) ![](https:\u002F\u002Fimg.shields.io\u002Fgithub\u002Fstars\u002Ftensorflow\u002Ftensorflow.svg?style=social) - 一个面向所有人的开源机器学习框架。\n- [VectorFlow](https:\u002F\u002Fgithub.com\u002FNetflix\u002Fvectorflow) ![](https:\u002F\u002Fimg.shields.io\u002Fgithub\u002Fstars\u002FNetflix\u002Fvectorflow.svg?style=social) - 一个针对稀疏数据和单机环境优化的极简神经网络库。\n\n\n\u003Cbr>\u003Cbr>\n## 大语言模型运维\n\n| 名称 | 星标数 | 描述 |\n| ------------------------------------------------------------ | ------------------------------------------------------------ | ------------------------------------------------------------ |\n| [Byzer-LLM](https:\u002F\u002Fgithub.com\u002Fallwefantasy\u002Fbyzer-llm ) | ![](https:\u002F\u002Fimg.shields.io\u002Fgithub\u002Fstars\u002Fallwefantasy\u002Fbyzer-llm.svg?style=social ) | Byzer-LLM 是一套全面的大模型基础设施，支持预训练、微调、部署和推理等大模型相关能力。Byzer-Retrieval 是专为大模型打造的存储基础设施，支持多种数据源的批量导入、实时单条更新以及全文检索、向量检索和混合检索等功能，从而简化 Byzer-LLM 的数据使用流程。Byzer-SQL\u002FPython 提供了友好的交互式 API，降低了用户使用上述产品的门槛。 |\n| [agenta](https:\u002F\u002Fgithub.com\u002FAgenta-AI\u002Fagenta ) | 
![](https:\u002F\u002Fimg.shields.io\u002Fgithub\u002Fstars\u002FAgenta-AI\u002Fagenta.svg?style=social ) | 一个用于构建强大 LLM 应用的 LLMOps 平台。它允许轻松试验和评估不同的提示词、模型和工作流，以构建健壮的应用程序。 |\n| [Arize-Phoenix](https:\u002F\u002Fgithub.com\u002FArize-ai\u002Fphoenix ) | ![](https:\u002F\u002Fimg.shields.io\u002Fgithub\u002Fstars\u002FArize-ai\u002Fphoenix.svg?style=social ) | 面向 LLM、视觉、语言和表格数据模型的机器学习可观测性平台。 |\n| [BudgetML](https:\u002F\u002Fgithub.com\u002Febhy\u002Fbudgetml ) | ![](https:\u002F\u002Fimg.shields.io\u002Fgithub\u002Fstars\u002Febhy\u002Fbudgetml.svg?style=social ) | 使用不到 10 行代码，在有限预算下部署 ML 推理服务。 |\n| [CometLLM](https:\u002F\u002Fgithub.com\u002Fcomet-ml\u002Fcomet-llm ) | ![](https:\u002F\u002Fimg.shields.io\u002Fgithub\u002Fstars\u002Fcomet-ml\u002Fcomet-llm.svg?style=social ) | 一个开源的 LLMOps 平台，用于记录、管理和可视化 LLM 提示词及链路。它可以跟踪提示模板、变量、执行时长、令牌使用情况等元数据，并对提示输出进行评分，同时在单一 UI 中可视化聊天历史。 |\n| [deeplake](https:\u002F\u002Fgithub.com\u002Factiveloopai\u002Fdeeplake ) | ![](https:\u002F\u002Fimg.shields.io\u002Fgithub\u002Fstars\u002Factiveloopai\u002FHub.svg?style=social ) | 流式传输大型多模态数据集，实现接近 100% 的 GPU 利用率。支持数据查询、可视化和版本控制，无需重新计算嵌入即可访问数据，便于模型微调。 |\n| [Dify](https:\u002F\u002Fgithub.com\u002Flanggenius\u002Fdify ) | ![](https:\u002F\u002Fimg.shields.io\u002Fgithub\u002Fstars\u002Flanggenius\u002Fdify.svg?style=social ) | 一个开源框架，使开发者（甚至非开发者）能够快速构建基于大型语言模型的实用应用，确保这些应用具有可见性、可操作性和可改进性。 |\n| [Dstack](https:\u002F\u002Fgithub.com\u002Fdstackai\u002Fdstack ) | ![](https:\u002F\u002Fimg.shields.io\u002Fgithub\u002Fstars\u002Fdstackai\u002Fdstack.svg?style=social ) | 在任何云环境中（AWS、GCP、Azure、Lambda 等）经济高效地开发 LLM。 |\n| [Embedchain](https:\u002F\u002Fgithub.com\u002Fembedchain\u002Fembedchain ) | ![](https:\u002F\u002Fimg.shields.io\u002Fgithub\u002Fstars\u002Fembedchain\u002Fembedchain.svg?style=social ) | 一个用于基于数据集创建类似 ChatGPT 机器人的工作框架。 |\n| [GPTCache](https:\u002F\u002Fgithub.com\u002Fzilliztech\u002FGPTCache ) | ![](https:\u002F\u002Fimg.shields.io\u002Fgithub\u002Fstars\u002Fzilliztech\u002FGPTCache.svg?style=social ) | 构建语义缓存，用于存储 LLM 查询的响应。 |\n| [Haystack](https:\u002F\u002Fgithub.com\u002Fdeepset-ai\u002Fhaystack ) | ![](https:\u002F\u002Fimg.shields.io\u002Fgithub\u002Fstars\u002Fdeepset-ai\u002Fhaystack.svg?style=social ) | 快速构建包含 LLM 代理、语义搜索、问答等功能的应用程序。 |\n| [langchain](https:\u002F\u002Fgithub.com\u002Fhwchase17\u002Flangchain ) | ![](https:\u002F\u002Fimg.shields.io\u002Fgithub\u002Fstars\u002Fhwchase17\u002Flangchain.svg?style=social ) | 通过组件化方式构建 LLM 应用。 |\n| [LangFlow](https:\u002F\u002Fgithub.com\u002Flogspace-ai\u002Flangflow ) | ![](https:\u002F\u002Fimg.shields.io\u002Fgithub\u002Fstars\u002Flogspace-ai\u002Flangflow.svg?style=social ) | 一种无需繁琐操作的方式，通过拖放组件和聊天界面来试验和原型化 LangChain 流程。 |\n| [LangKit](https:\u002F\u002Fgithub.com\u002Fwhylabs\u002Flangkit ) | ![](https:\u002F\u002Fimg.shields.io\u002Fgithub\u002Fstars\u002Fwhylabs\u002Flangkit.svg?style=social ) | 一个开箱即用的 LLM 遥测收集库，能够提取 LLM 性能随时间变化的指标、提示词、响应及元数据，从而大规模识别问题。 |\n| [LiteLLM 🚅](https:\u002F\u002Fgithub.com\u002FBerriAI\u002Flitellm\u002F ) | ![](https:\u002F\u002Fimg.shields.io\u002Fgithub\u002Fstars\u002FBerriAI\u002Flitellm.svg?style=social ) | 一个简单轻量的 100 行包，用于标准化跨 OpenAI、Azure、Cohere、Anthropic、Replicate 等多个 API 端点的 LLM API 调用。 |\n| [LlamaIndex](https:\u002F\u002Fgithub.com\u002Fjerryjliu\u002Fllama_index ) | ![](https:\u002F\u002Fimg.shields.io\u002Fgithub\u002Fstars\u002Fjerryjliu\u002Fllama_index.svg?style=social ) | 提供一个中心化接口，将您的 LLM 与外部数据连接起来。 |\n| [LLMApp](https:\u002F\u002Fgithub.com\u002Fpathwaycom\u002Fllm-app ) | 
![](https:\u002F\u002Fimg.shields.io\u002Fgithub\u002Fstars\u002Fpathwaycom\u002Fllm-app.svg?style=social ) | LLM App 是一个 Python 库，只需几行代码即可帮助您构建实时的 LLM 驱动数据管道。 |\n| [LLMFlows](https:\u002F\u002Fgithub.com\u002Fstoyan-stoyanov\u002Fllmflows ) | ![](https:\u002F\u002Fimg.shields.io\u002Fgithub\u002Fstars\u002Fstoyan-stoyanov\u002Fllmflows.svg?style=social ) | LLMFlows 是一个用于构建简单、清晰且透明的 LLM 应用程序的框架，例如聊天机器人、问答系统和智能代理。 |\n| [LLMonitor](https:\u002F\u002Fgithub.com\u002Fllmonitor\u002Fllmonitor ) | ![](https:\u002F\u002Fimg.shields.io\u002Fgithub\u002Fstars\u002Fllmonitor\u002Fllmonitor.svg?style=social ) | 面向 AI 应用和智能代理的可观测性与监控工具。通过强大的追踪和日志功能调试智能代理，利用分析工具深入请求历史。提供易于集成到 LangChain 的开发者友好模块。 |\n| [magentic](https:\u002F\u002Fgithub.com\u002Fjackmpcollins\u002Fmagentic ) | ![](https:\u002F\u002Fimg.shields.io\u002Fgithub\u002Fstars\u002Fjackmpcollins\u002Fmagentic.svg?style=social ) | 将 LLM 无缝集成为 Python 函数。使用类型注解指定结构化输出。将 LLM 查询和函数调用与常规 Python 代码结合，以创建复杂的 LLM 驱动功能。 |\n| [Pezzo 🕹️](https:\u002F\u002Fgithub.com\u002Fpezzolabs\u002Fpezzo ) | ![](https:\u002F\u002Fimg.shields.io\u002Fgithub\u002Fstars\u002Fpezzolabs\u002Fpezzo.svg?style=social ) | Pezzo 是一个面向开发者和团队的开源 LLMOps 平台。只需两行代码，即可轻松排查 AI 运营中的问题，协作管理提示词，并从一处即时部署更改。 |\n| [promptfoo](https:\u002F\u002Fgithub.com\u002Ftyppo\u002Fpromptfoo ) | ![](https:\u002F\u002Fimg.shields.io\u002Fgithub\u002Fstars\u002Ftyppo\u002Fpromptfoo.svg?style=social ) | 一个开源的工具，用于测试和评估提示词质量。创建测试用例，自动检查输出质量，并捕捉回归问题，从而降低评估成本。 |\n| [prompttools](https:\u002F\u002Fgithub.com\u002Fhegelai\u002Fprompttools ) | ![](https:\u002F\u002Fimg.shields.io\u002Fgithub\u002Fstars\u002Fhegelai\u002Fprompttools.svg?style=social ) | 一个开源的工具，用于测试和尝试提示词。其核心理念是让开发者能够使用熟悉的界面，如代码和笔记本，来评估提示词。只需几行代码，即可在不同模型上测试提示词和参数（无论您使用的是 OpenAI、Anthropic 还是 LLaMA 模型）。甚至还可以评估向量数据库检索的准确性。 |\n| [TrueFoundry](https:\u002F\u002Fwww.truefoundry.com\u002F ) | 无 GitHub 链接 | 在您自己的 Kubernetes（EKS、AKS、GKE、本地部署）基础设施上部署 LLMOps 工具，包括向量数据库、嵌入式服务器等。这涵盖了用于部署、微调、提示词跟踪以及提供完整数据安全和优化 GPU 管理的开源 LLM 模型。采用最佳软件工程实践，以生产规模训练并上线您的 LLM 应用。 |\n| [ReliableGPT 💪](https:\u002F\u002Fgithub.com\u002FBerriAI\u002FreliableGPT\u002F ) | ![](https:\u002F\u002Fimg.shields.io\u002Fgithub\u002Fstars\u002FBerriAI\u002FreliableGPT.svg?style=social ) | 处理生产级 LLM 应用中遇到的 OpenAI 错误（OpenAI 服务器过载、密钥轮换或上下文窗口错误）。 |\n| [Weights & Biases (Prompts)](https:\u002F\u002Fdocs.wandb.ai\u002Fguides\u002Fprompts ) | 无 GitHub 链接 | W&B MLOps 平台中面向开发者的一套 LLMOps 工具。使用 W&B Prompts 可以可视化和检查 LLM 执行流程，跟踪输入和输出，查看中间结果，并管理提示词及 LLM 链配置。 |\n| [xTuring](https:\u002F\u002Fgithub.com\u002Fstochasticai\u002Fxturing ) | ![](https:\u002F\u002Fimg.shields.io\u002Fgithub\u002Fstars\u002Fstochasticai\u002Fxturing.svg?style=social ) | 使用快速高效的微调技术构建和控制您自己的 LLM。 |\n| [ZenML](https:\u002F\u002Fgithub.com\u002Fzenml-io\u002Fzenml ) | ![](https:\u002F\u002Fimg.shields.io\u002Fgithub\u002Fstars\u002Fzenml-io\u002Fzenml.svg?style=social ) | 一个开源框架，用于编排、实验和部署生产级机器学习解决方案，内置 `langchain` 和 `llama_index` 集成。 |\n\n\u003Cbr>\u003Cbr>\n\n\n## 课程\n\n- [大语言模型课程notebooks集-Large Language Model Course](https:\u002F\u002Fgithub.com\u002Fmlabonne\u002Fllm-course) - 一门包含路线图和notebooks的课程，帮助你入门大型语言模型（LLMs）。\n- [全栈LLM训练营](https:\u002F\u002Fihower.tw\u002Fnotes\u002F技術筆記-AI\u002FFull+Stack+LLM+Bootcamp) - LLM相关学习\u002F应用资源集。\n\n\n\n\n\u003Cbr>\u003Cbr>\n## 其他精选列表\n\n- [Awesome LLM](https:\u002F\u002Fgithub.com\u002FHannibal046\u002FAwesome-LLM\u002F) - 一份精心整理的大语言模型相关论文列表。\n- [高效LLM精选](https:\u002F\u002Fgithub.com\u002Fhorseee\u002FAwesome-Efficient-LLM) - 针对高效大型语言模型的精选资源列表。\n- 
[生产级机器学习精选](https:\u002F\u002Fgithub.com\u002FEthicalML\u002Fawesome-production-machine-learning) - 用于部署、监控、版本管理和扩展机器学习模型的优秀开源库精选列表。\n- [营销数据科学精选](https:\u002F\u002Fgithub.com\u002Funderlines\u002Fawesome-marketing-datascience) - 有用的LLM、分析和数据科学资源精选列表。\n- [LLM工具精选](https:\u002F\u002Fgithub.com\u002Funderlines\u002Fawesome-marketing-datascience\u002Fblob\u002Fmaster\u002Fllm-tools.md) - 有用的LLM工具精选列表。\n- [LLM压缩精选](https:\u002F\u002Fgithub.com\u002FHuangOwen\u002FAwesome-LLM-Compression) - 针对高效LLM压缩技术的精选资源列表。\n- [多模态大语言模型精选](https:\u002F\u002Fgithub.com\u002FBradyFU\u002FAwesome-Multimodal-Large-Language-Models) - 多模态大语言模型的精选资源列表。\n- [LLMOps精选](https:\u002F\u002Fgithub.com\u002Ftensorchord\u002FAwesome-LLMOps) - 为开发者提供的最佳LLMOps工具的精选列表。\n- [MLOps精选](https:\u002F\u002Fgithub.com\u002Fvisenger\u002Fawesome-mlops) - MLOps（机器学习运维）相关参考资料的精选列表。\n- [ChatGPT提示词精选](https:\u002F\u002Fgithub.com\u002Ff\u002Fawesome-chatgpt-prompts) - 一系列可用于ChatGPT模型的提示词示例集合。\n- [中文版ChatGPT提示词精选](https:\u002F\u002Fgithub.com\u002FPlexPt\u002Fawesome-chatgpt-prompts-zh) - 一系列可用于ChatGPT模型的中文提示词示例集合。\n- [ChatGPT精选](https:\u002F\u002Fgithub.com\u002Fhumanloop\u002Fawesome-chatgpt) - OpenAI旗下ChatGPT和GPT-3相关资源的精选列表。\n- [思维链论文集](https:\u002F\u002Fgithub.com\u002FTimothyxxx\u002FChain-of-ThoughtsPapers) - 以“通过思维链提示激发大型语言模型的推理能力”为开端的趋势相关论文集合。\n- [指令微调论文集](https:\u002F\u002Fgithub.com\u002FSinclairCoder\u002FInstruction-Tuning-Papers) - 以`Natural-Instruction`（ACL 2022）、`FLAN`（ICLR 2022）和`T0`（ICLR 2022）为起点的趋势相关论文集合。\n- [LLM阅读清单](https:\u002F\u002Fgithub.com\u002Fcrazyofapple\u002FReading_groups\u002F) - 大型语言模型相关的论文和资源清单。\n- [利用语言模型进行推理](https:\u002F\u002Fgithub.com\u002Fatfortes\u002FLM-Reasoning-Papers) - 关于利用语言模型进行推理的论文和资源集合。\n- [思维链中心](https:\u002F\u002Fgithub.com\u002FFranxYao\u002Fchain-of-thought-hub) - 用于评估大型语言模型推理性能的平台。\n- [GPT精选](https:\u002F\u002Fgithub.com\u002Fformulahendry\u002Fawesome-gpt) - 与GPT、ChatGPT、OpenAI、LLM等相关的优秀项目和资源的精选列表。\n- [GPT-3精选](https:\u002F\u002Fgithub.com\u002Felyase\u002Fawesome-gpt3) - 一系列关于[OpenAI GPT-3 API](https:\u002F\u002Fopenai.com\u002Fblog\u002Fopenai-api\u002F)的演示和文章集合。\n\n\n\u003Cbr>\u003Cbr>\n## 许可证\n\n[![MIT许可证](https:\u002F\u002Fimg.shields.io\u002Fbadge\u002FLicense-MIT-blue.svg)](https:\u002F\u002Flbesson.mit-license.org\u002F)\n\n[MIT许可证](https:\u002F\u002Flbesson.mit-license.org\u002F)。\n\n[![CC BY-NC-SA 4.0](https:\u002F\u002Fimg.shields.io\u002Fbadge\u002FLicense-CC%20BY--NC--SA%204.0-lightgrey.svg)](http:\u002F\u002Fcreativecommons.org\u002Flicenses\u002Fby-nc-sa\u002F4.0\u002F)\n\n[知识共享署名-非商业性使用-相同方式共享4.0国际许可协议](http:\u002F\u002Fcreativecommons.org\u002Flicenses\u002Fby-nc-sa\u002F4.0\u002F)。\n\n\u003Cbr>\u003Cbr>\n\n## 引用\n\n```bibtex\n@misc{llm-eval-anthropomorphic,\n      title={Beyond Benchmark: LLMs Evaluation with an Anthropomorphic and Value-oriented Roadmap},\n      author={Jun Wang and Ninglun Gu and Kailai Zhang and Zijiao Zhang and Yelun Bao and Jin Yang and Xu Yin and Liwei Liu and Yihuan Liu and Pengyong Li and Gary G. Yen and Junchi Yan},\n      year={2025},\n      eprint={2508.18646},\n      archivePrefix={arXiv},\n      primaryClass={cs.AI},\n      url={https:\u002F\u002Farxiv.org\u002Fabs\u002F2508.18646}\n}\n```","# Awesome-LLM-Eval 快速上手指南\n\n**Awesome-LLM-Eval** 并非一个单一的 Python 包或可执行工具，而是一个**精选资源列表（Curated List）**。它汇集了用于大语言模型（LLM）评估的工具、数据集、基准测试、论文、排行榜及模型列表。\n\n本指南将帮助你利用该仓库快速找到适合的评估方案，并演示如何安装和使用其中收录的典型评估框架（以 **OpenCompass** 和 **LightEval** 为例）。\n\n
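在进入下文的环境准备与安装步骤之前，如果你只想快速筛选列表中的资源，也可以先用几行脚本把 README 中带链接的条目抓取出来做粗筛。下面是一个最小示意脚本（假设仓库默认分支为 `main`、中文版文件名为 `README_CN.md`，且运行环境可以访问 GitHub；关键词列表可按需调整）：\n\n```python\nimport urllib.request\n\n# 分支名 main 与文件名 README_CN.md 为假设，请以仓库实际情况为准\nURL = \"https:\u002F\u002Fraw.githubusercontent.com\u002Fonejune2018\u002FAwesome-LLM-Eval\u002Fmain\u002FREADME_CN.md\"\ntext = urllib.request.urlopen(URL, timeout=30).read().decode(\"utf-8\")\n\n# 粗略筛选：打印所有带 Markdown 链接且命中关键词的行\nfor line in text.splitlines():\n    lowered = line.lower()\n    if \"](http\" in line and any(k in lowered for k in (\"eval\", \"benchmark\", \"leaderboard\", \"评测\")):\n        print(line.strip())\n```\n\n筛选出候选工具后，再按下文步骤为选中的框架单独准备环境。\n\n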
## 1. 环境准备\n\n由于本仓库包含多种不同技术栈的评估工具，建议根据你的具体需求准备环境。通用推荐配置如下：\n\n*   **操作系统**: Linux (Ubuntu 20.04+) 或 macOS\n*   **Python 版本**: 3.8 - 3.10 (大多数 LLM 工具推荐 3.9)\n*   **硬件要求**:\n    *   **CPU**: 仅运行轻量级指标计算时，CPU 即可满足需求。\n    *   **GPU**: 运行本地模型推理评估时，推荐 NVIDIA GPU (显存 >= 16GB 用于 7B-13B 模型，>= 24GB 用于更大模型)。\n*   **前置依赖**:\n    *   Git\n    *   Conda 或 venv (强烈建议使用虚拟环境隔离不同工具)\n    *   CUDA Toolkit (如需 GPU 加速)\n\n## 2. 获取资源与安装示例\n\n### 步骤一：克隆仓库\n首先获取完整的评估资源列表：\n\n```bash\ngit clone https:\u002F\u002Fgithub.com\u002Fonejune2018\u002FAwesome-LLM-Eval.git\ncd Awesome-LLM-Eval\n```\n\n### 步骤二：选择并安装具体评估工具\n浏览仓库中的 `Tools` 或 `Datasets \u002F Benchmark` 章节，选择适合你的工具。以下是两个主流工具的安装命令（优先使用国内镜像源加速）：\n\n#### 方案 A：安装 OpenCompass (上海人工智能实验室出品，中文支持好)\n适用于综合性能评估，支持大量中文数据集。\n\n```bash\n# 创建虚拟环境\nconda create -n opencompass python=3.9 -y\nconda activate opencompass\n\n# 使用清华源安装 opencompass\npip install opencompass -i https:\u002F\u002Fpypi.tuna.tsinghua.edu.cn\u002Fsimple\n\n# 安装额外依赖（如需评测特定任务）\npip install datasets pandas tqdm -i https:\u002F\u002Fpypi.tuna.tsinghua.edu.cn\u002Fsimple\n```\n\n#### 方案 B：安装 LightEval (Hugging Face 出品，轻量级)\n适用于快速基准测试，兼容 Hugging Face 生态。\n\n```bash\n# 创建虚拟环境\nconda create -n lighteval python=3.9 -y\nconda activate lighteval\n\n# 使用阿里源安装 lighteval\npip install lighteval -i https:\u002F\u002Fmirrors.aliyun.com\u002Fpypi\u002Fsimple\u002F\n\n# 安装 accelerate 用于多卡推理\npip install accelerate -i https:\u002F\u002Fmirrors.aliyun.com\u002Fpypi\u002Fsimple\u002F\n```\n\n## 3. 基本使用示例\n\n以下展示如何使用上述工具进行最简单的模型评估。请确保你已下载模型权重或知道模型的 Hugging Face ID。\n\n### 示例 1：使用 OpenCompass 评估模型\n创建一个配置文件 `config_demo.py`：\n\n```python\nfrom mmengine.config import read_base\nfrom opencompass.models import HuggingFaceCausalLM  # 补充缺失的模型类导入\n\nwith read_base():\n    from opencompass.configs.summarizers.medium import summarizer\n    from opencompass.configs.datasets.mmlu.mmlu_gen import mmlu_datasets\n\ndatasets = mmlu_datasets\nmodels = [\n    dict(\n        type=HuggingFaceCausalLM,\n        abbr='qwen-7b-demo',\n        path=\"Qwen\u002FQwen-7B\",  # 替换为你要评估的模型路径\n        model_kwargs=dict(\n            device_map='auto',\n            trust_remote_code=True,\n        ),\n        batch_size=8,\n        max_out_len=100,\n        max_seq_len=2048,\n        run_cfg=dict(num_gpus=1),\n    )\n]\n\n# summarizer 已在上方 read_base 中导入，OpenCompass 会直接读取该变量，无需再包装成列表\n```\n\n运行评估命令：\n\n```bash\nopencompass config_demo.py --work-dir .\u002Foutputs\n```\n\n### 示例 2：使用 LightEval 评估 MMLU 基准\n直接使用命令行调用，评估 `meta-llama\u002FLlama-2-7b-hf` 模型：\n\n```bash\n# 注意：lighteval 的命令行参数随版本演进有所调整，请以 lighteval --help 的输出为准\nlighteval \\\n  --model_args \"pretrained=meta-llama\u002FLlama-2-7b-hf\" \\\n  --tasks \"mmlu\" \\\n  --batch_size 4 \\\n  --limit 10 \\\n  --output_dir \".\u002Feval_results\"\n```\n\n
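两个示例都会把结果写入各自的输出目录，但目录布局随工具与版本而异。以方案 A 的 OpenCompass 为例，下面给出一个仅作示意的查看脚本（假设汇总表以 CSV 形式写入 `outputs\u002F<时间戳>\u002Fsummary\u002F` 目录下，实际路径请以运行日志打印为准；`pandas` 已在方案 A 中一并安装），用于快速查看最近一次评测的汇总结果：\n\n```python\nimport glob\nimport os\n\nimport pandas as pd\n\n# 假设的输出布局：outputs\u002F<时间戳>\u002Fsummary\u002Fsummary_*.csv，以运行日志为准\ncandidates = glob.glob(\"outputs\u002F**\u002Fsummary_*.csv\", recursive=True)\nif not candidates:\n    raise SystemExit(\"未找到汇总文件，请核对 --work-dir 与实际输出路径\")\n\nlatest = max(candidates, key=os.path.getmtime)  # 取最近一次评测生成的汇总表\nprint(f\"读取: {latest}\")\nprint(pd.read_csv(latest).to_string(index=False))\n```\n\n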
### 下一步行动\n回到 `Awesome-LLM-Eval` 仓库目录，查看 `README.md` 中的以下章节以获取更多专项资源：\n*   **RAG-Evaluation**: 检索增强生成评估工具。\n*   **Agent-Capabilities**: 智能体能力评估基准。\n*   **Coding-Capabilities**: 代码生成与调试能力评估。\n*   **Leaderboards**: 查看最新的模型排行榜数据。","某金融科技公司算法团队正急需为即将上线的智能投顾大模型进行全方位能力评估，以确保其在专业问答、逻辑推理及安全性上符合监管要求。\n\n### 没有 Awesome-LLM-Eval 时\n- **资源搜集低效**：团队成员需花费数天在 GitHub、Hugging Face 和论文网站间手动搜索分散的评测工具与数据集，极易遗漏关键基准。\n- **评测维度单一**：仅依赖通用的准确率指标，缺乏针对金融领域（如 FinEval）、长上下文理解及 Agent 自主决策能力的专项评估方案。\n- **选型盲目**：面对层出不穷的新模型，缺乏权威的排行榜（Leaderboard）和横向对比数据，难以判断哪个开源模型最适合业务场景。\n- **标准不统一**：不同成员采用的评测脚本和数据处理方式各异，导致评估结果无法复现，团队内部对模型性能争议不断。\n\n### 使用 Awesome-LLM-Eval 后\n- **一站式获取资源**：直接利用其精选列表，快速定位到 ColossalEval、OpenCompass 等成熟工具及 OpenFinData 等专业数据集，将准备周期从数天缩短至几小时。\n- **构建多维评估体系**：参考其分类架构，迅速搭建涵盖通用能力、金融垂直领域、RAG 检索增强及安全对齐的全方位评测框架，无死角探查模型边界。\n- **科学决策选型**：依托集成的最新排行榜和论文综述，精准锁定在金融推理任务上表现最优的开源模型，大幅降低试错成本。\n- **统一评测规范**：采纳社区公认的评测范式与文档，确保团队内部测试流程标准化，输出结果具备高度可比性和说服力。\n\nAwesome-LLM-Eval 通过整合碎片化的评测生态，帮助团队从“盲目摸索”转向“科学度量”，显著提升了大模型落地的效率与可靠性。","https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Fonejune2018_Awesome-LLM-Eval_31e6bafb.png","onejune2018","JUN","https:\u002F\u002Foss.gittoolsai.com\u002Favatars\u002Fonejune2018_02bd66a7.png","IBM Research & CMCC & Peking University & ETH Zurich & CAS & PA","IBM Research China",null,"https:\u002F\u002Fgithub.com\u002Fonejune2018",627,54,"2026-03-30T02:10:36","MIT",1,"","未说明",{"notes":90,"python":88,"dependencies":91},"Awesome-LLM-Eval 是一个 curated list（精选列表）项目，主要收录用于大语言模型评估的工具、数据集、基准测试、论文和模型等资源清单。该项目本身不是一个可执行的软件工具或框架，因此没有特定的操作系统、GPU、内存、Python 版本或依赖库安装要求。用户若需使用列表中提到的具体评估工具（如 OpenCompass, AlpacaEval, DeepEval 等），请参考各工具独立的文档以获取相应的运行环境需求。",[],[54,51,26,13],[94,95,96,97,98,99,100,101,102,103,104,105,106,107,108,109,110,111,112],"awsome-list","benchmark","bert","chatglm","chatgpt","dataset","evaluation","gpt3","large-language-model","leaderboard","llm","machine-learning","nlp","openai","awsome-lists","llama","llm-evaluation","qwen","rag","2026-03-27T02:49:30.150509","2026-04-06T05:44:18.633912",[],[]]