[{"data":1,"prerenderedAt":-1},["ShallowReactive",2],{"similar-Tencent-Hunyuan--Tencent-Hunyuan-Large":3,"tool-Tencent-Hunyuan--Tencent-Hunyuan-Large":65},[4,17,27,35,48,57],{"id":5,"name":6,"github_repo":7,"description_zh":8,"stars":9,"difficulty_score":10,"last_commit_at":11,"category_tags":12,"status":16},1381,"everything-claude-code","affaan-m\u002Feverything-claude-code","everything-claude-code 是一套专为 AI 编程助手（如 Claude Code、Codex、Cursor 等）打造的高性能优化系统。它不仅仅是一组配置文件，而是一个经过长期实战打磨的完整框架，旨在解决 AI 代理在实际开发中面临的效率低下、记忆丢失、安全隐患及缺乏持续学习能力等核心痛点。\n\n通过引入技能模块化、直觉增强、记忆持久化机制以及内置的安全扫描功能，everything-claude-code 能显著提升 AI 在复杂任务中的表现，帮助开发者构建更稳定、更智能的生产级 AI 代理。其独特的“研究优先”开发理念和针对 Token 消耗的优化策略，使得模型响应更快、成本更低，同时有效防御潜在的攻击向量。\n\n这套工具特别适合软件开发者、AI 研究人员以及希望深度定制 AI 工作流的技术团队使用。无论您是在构建大型代码库，还是需要 AI 协助进行安全审计与自动化测试，everything-claude-code 都能提供强大的底层支持。作为一个曾荣获 Anthropic 黑客大奖的开源项目，它融合了多语言支持与丰富的实战钩子（hooks），让 AI 真正成长为懂上",155373,2,"2026-04-14T11:34:08",[13,14,15],"开发框架","Agent","语言模型","ready",{"id":18,"name":19,"github_repo":20,"description_zh":21,"stars":22,"difficulty_score":23,"last_commit_at":24,"category_tags":25,"status":16},4487,"LLMs-from-scratch","rasbt\u002FLLMs-from-scratch","LLMs-from-scratch 是一个基于 PyTorch 的开源教育项目，旨在引导用户从零开始一步步构建一个类似 ChatGPT 的大型语言模型（LLM）。它不仅是同名技术著作的官方代码库，更提供了一套完整的实践方案，涵盖模型开发、预训练及微调的全过程。\n\n该项目主要解决了大模型领域“黑盒化”的学习痛点。许多开发者虽能调用现成模型，却难以深入理解其内部架构与训练机制。通过亲手编写每一行核心代码，用户能够透彻掌握 Transformer 架构、注意力机制等关键原理，从而真正理解大模型是如何“思考”的。此外，项目还包含了加载大型预训练权重进行微调的代码，帮助用户将理论知识延伸至实际应用。\n\nLLMs-from-scratch 特别适合希望深入底层原理的 AI 开发者、研究人员以及计算机专业的学生。对于不满足于仅使用 API，而是渴望探究模型构建细节的技术人员而言，这是极佳的学习资源。其独特的技术亮点在于“循序渐进”的教学设计：将复杂的系统工程拆解为清晰的步骤，配合详细的图表与示例，让构建一个虽小但功能完备的大模型变得触手可及。无论你是想夯实理论基础，还是为未来研发更大规模的模型做准备",90106,3,"2026-04-06T11:19:32",[15,26,14,13],"图像",{"id":28,"name":29,"github_repo":30,"description_zh":31,"stars":32,"difficulty_score":10,"last_commit_at":33,"category_tags":34,"status":16},3704,"NextChat","ChatGPTNextWeb\u002FNextChat","NextChat 是一款轻量且极速的 AI 助手，旨在为用户提供流畅、跨平台的大模型交互体验。它完美解决了用户在多设备间切换时难以保持对话连续性，以及面对众多 AI 
模型不知如何统一管理的痛点。无论是日常办公、学习辅助还是创意激发，NextChat 都能让用户随时随地通过网页、iOS、Android、Windows、MacOS 或 Linux 端无缝接入智能服务。\n\n这款工具非常适合普通用户、学生、职场人士以及需要私有化部署的企业团队使用。对于开发者而言，它也提供了便捷的自托管方案，支持一键部署到 Vercel 或 Zeabur 等平台。\n\nNextChat 的核心亮点在于其广泛的模型兼容性，原生支持 Claude、DeepSeek、GPT-4 及 Gemini Pro 等主流大模型，让用户在一个界面即可自由切换不同 AI 能力。此外，它还率先支持 MCP（Model Context Protocol）协议，增强了上下文处理能力。针对企业用户，NextChat 提供专业版解决方案，具备品牌定制、细粒度权限控制、内部知识库整合及安全审计等功能，满足公司对数据隐私和个性化管理的高标准要求。",87618,"2026-04-05T07:20:52",[13,15],{"id":36,"name":37,"github_repo":38,"description_zh":39,"stars":40,"difficulty_score":10,"last_commit_at":41,"category_tags":42,"status":16},2268,"ML-For-Beginners","microsoft\u002FML-For-Beginners","ML-For-Beginners 是由微软推出的一套系统化机器学习入门课程，旨在帮助零基础用户轻松掌握经典机器学习知识。这套课程将学习路径规划为 12 周，包含 26 节精炼课程和 52 道配套测验，内容涵盖从基础概念到实际应用的完整流程，有效解决了初学者面对庞大知识体系时无从下手、缺乏结构化指导的痛点。\n\n无论是希望转型的开发者、需要补充算法背景的研究人员，还是对人工智能充满好奇的普通爱好者，都能从中受益。课程不仅提供了清晰的理论讲解，还强调动手实践，让用户在循序渐进中建立扎实的技能基础。其独特的亮点在于强大的多语言支持，通过自动化机制提供了包括简体中文在内的 50 多种语言版本，极大地降低了全球不同背景用户的学习门槛。此外，项目采用开源协作模式，社区活跃且内容持续更新，确保学习者能获取前沿且准确的技术资讯。如果你正寻找一条清晰、友好且专业的机器学习入门之路，ML-For-Beginners 将是理想的起点。",85092,"2026-04-10T11:13:16",[26,43,44,45,14,46,15,13,47],"数据工具","视频","插件","其他","音频",{"id":49,"name":50,"github_repo":51,"description_zh":52,"stars":53,"difficulty_score":54,"last_commit_at":55,"category_tags":56,"status":16},5784,"funNLP","fighting41love\u002FfunNLP","funNLP 是一个专为中文自然语言处理（NLP）打造的超级资源库，被誉为\"NLP 民工的乐园”。它并非单一的软件工具，而是一个汇集了海量开源项目、数据集、预训练模型和实用代码的综合性平台。\n\n面对中文 NLP 领域资源分散、入门门槛高以及特定场景数据匮乏的痛点，funNLP 提供了“一站式”解决方案。这里不仅涵盖了分词、命名实体识别、情感分析、文本摘要等基础任务的标准工具，还独特地收录了丰富的垂直领域资源，如法律、医疗、金融行业的专用词库与数据集，甚至包含古诗词生成、歌词创作等趣味应用。其核心亮点在于极高的全面性与实用性，从基础的字典词典到前沿的 BERT、GPT-2 模型代码，再到高质量的标注数据和竞赛方案，应有尽有。\n\n无论是刚刚踏入 NLP 领域的学生、需要快速验证想法的算法工程师，还是从事人工智能研究的学者，都能在这里找到急需的“武器弹药”。对于开发者而言，它能大幅减少寻找数据和复现模型的时间；对于研究者，它提供了丰富的基准测试资源和前沿技术参考。funNLP 以开放共享的精神，极大地降低了中文自然语言处理的开发与研究成本，是中文 AI 
社区不可或缺的宝藏仓库。",79857,1,"2026-04-08T20:11:31",[15,43,46],{"id":58,"name":59,"github_repo":60,"description_zh":61,"stars":62,"difficulty_score":54,"last_commit_at":63,"category_tags":64,"status":16},6590,"gpt4all","nomic-ai\u002Fgpt4all","GPT4All 是一款让普通电脑也能轻松运行大型语言模型（LLM）的开源工具。它的核心目标是打破算力壁垒，让用户无需依赖昂贵的显卡（GPU）或云端 API，即可在普通的笔记本电脑和台式机上私密、离线地部署和使用大模型。\n\n对于担心数据隐私、希望完全掌控本地数据的企业用户、研究人员以及技术爱好者来说，GPT4All 提供了理想的解决方案。它解决了传统大模型必须联网调用或需要高端硬件才能运行的痛点，让日常设备也能成为强大的 AI 助手。无论是希望构建本地知识库的开发者，还是单纯想体验私有化 AI 聊天的普通用户，都能从中受益。\n\n技术上，GPT4All 基于高效的 `llama.cpp` 后端，支持多种主流模型架构（包括最新的 DeepSeek R1 蒸馏模型），并采用 GGUF 格式优化推理速度。它不仅提供界面友好的桌面客户端，支持 Windows、macOS 和 Linux 等多平台一键安装，还为开发者提供了便捷的 Python 库，可轻松集成到 LangChain 等生态中。通过简单的下载和配置，用户即可立即开始探索本地大模型的无限可能。",77307,"2026-04-11T06:52:37",[15,13],{"id":66,"github_repo":67,"name":68,"description_en":69,"description_zh":70,"ai_summary_zh":70,"readme_en":71,"readme_zh":72,"quickstart_zh":73,"use_case_zh":74,"hero_image_url":75,"owner_login":76,"owner_name":76,"owner_avatar_url":77,"owner_bio":78,"owner_company":69,"owner_location":69,"owner_email":69,"owner_twitter":69,"owner_website":79,"owner_url":80,"languages":81,"stars":90,"forks":91,"last_commit_at":92,"license":93,"difficulty_score":94,"env_os":95,"env_gpu":96,"env_ram":95,"env_deps":97,"category_tags":105,"github_topics":69,"view_count":10,"oss_zip_url":69,"oss_zip_packed_at":69,"status":16,"created_at":106,"updated_at":107,"faqs":108,"releases":137},7584,"Tencent-Hunyuan\u002FTencent-Hunyuan-Large","Tencent-Hunyuan-Large",null,"Tencent-Hunyuan-Large 是腾讯开源的一款超大规模语言模型，也是目前行业内最大的基于 Transformer 架构的混合专家（MoE）模型。它旨在解决大模型在规模扩张过程中，如何平衡卓越性能与资源消耗的行业难题。\n\n该模型拥有惊人的 3890 亿总参数量，但在实际推理时仅激活 520 亿参数。这种独特的“混合专家”机制使其能够在保持顶尖任务处理能力的同时，显著降低计算成本和延迟，实现了高效能与低资源的完美统一。无论是复杂的自然语言理解、逻辑推理，还是长文本处理，它都能提供出色的表现。\n\nTencent-Hunyuan-Large 特别适合 AI 研究人员、大模型开发者以及企业技术团队使用。研究人员可借此深入探索 MoE 架构的前沿技术；开发者则能利用其开源权重（包括 FP8 量化版本），在相对有限的算力条件下部署高性能应用，构建智能客服、内容创作或数据分析系统。通过开放技术细节与模型权重，Tencent-Hunyuan-Large 致力于推动社区创新，让超大模型的强大能力更易于被获取和应用。","\u003Cp 
align=\"left\">\n    \u003Ca href=\"README_CN.md\">中文\u003C\u002Fa>&nbsp ｜ English\u003C\u002Fa>\n\u003C\u002Fp>\n\u003Cbr>\u003Cbr>\n\n\u003Cp align=\"center\">\n \u003Cimg src=\"https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002FTencent-Hunyuan_Tencent-Hunyuan-Large_readme_9a3e6454d04e.png\" width=\"400\"\u002F> \u003Cbr>\n\u003C\u002Fp>\u003Cp>\u003C\u002Fp>\n\n\u003Cp align=\"center\">\n    🫣&nbsp\u003Ca href=\"https:\u002F\u002Fhuggingface.co\u002Ftencent\u002FTencent-Hunyuan-Large\">\u003Cb>Hugging Face\u003C\u002Fb>\u003C\u002Fa>&nbsp&nbsp |  &nbsp&nbsp🖥️&nbsp&nbsp\u003Ca href=\"https:\u002F\u002Fllm.hunyuan.tencent.com\u002F\" style=\"color: red;\">\u003Cb>official website\u003C\u002Fb>\u003C\u002Fa>&nbsp&nbsp｜&nbsp&nbsp🕖&nbsp&nbsp \u003Ca href=\"https:\u002F\u002Fcloud.tencent.com\u002Fproduct\u002Fhunyuan\" >\u003Cb>HunyuanAPI\u003C\u002Fb>\u003C\u002Fa>&nbsp&nbsp｜&nbsp&nbsp🐳&nbsp&nbsp \u003Ca href=\"https:\u002F\u002Fgitee.com\u002FTencent\u002FTencent-Hunyuan-Large\" >\u003Cb>Gitee\u003C\u002Fb>\u003C\u002Fa>\n\u003C\u002Fp>\u003Cp align=\"center\">\n    \u003Ca href=\"https:\u002F\u002Farxiv.org\u002Fabs\u002F2411.02265\" style=\"color: red;\">\u003Cb>Technical Report\u003C\u002Fb>\u003C\u002Fa>&nbsp&nbsp｜&nbsp&nbsp \u003Ca href=\"https:\u002F\u002Fhuggingface.co\u002Fspaces\u002Ftencent\u002FHunyuan-Large\">\u003Cb>Demo\u003C\u002Fb>\u003C\u002Fa>&nbsp&nbsp&nbsp｜&nbsp&nbsp \u003Ca href=\"https:\u002F\u002Fcloud.tencent.com\u002Fdocument\u002Fproduct\u002F851\u002F112032\" style=\"color: red;\">\u003Cb>Tencent Cloud TI\u003C\u002Fb>\u003C\u002Fa>&nbsp&nbsp&nbsp\u003C\u002Fp>\n\u003Cp>\u003Cbr>\u003C\u002Fp>\n\u003Cp>\n    \u003Ctable align=\"center\">\n        \u003Ctbody>\n            \u003Ctr>\n                \u003Ctd align=\"center\" colspan=\"3\">\u003Cstrong>Download Models\u003C\u002Fstrong>\u003C\u002Ftd>\n            \u003C\u002Ftr>\n            \u003Ctr>\n                \u003Ctd align=\"center\" style=\"width: 100px;\" 
>Models\u003C\u002Ftd>\n                \u003Ctd align=\"center\" style=\"width: 500px;\">Huggingface Download URL\u003C\u002Ftd>\n                \u003Ctd align=\"center\" style=\"width: 500px;\">Tencent Cloud Download URL\u003C\u002Ftd>\n            \u003C\u002Ftr>\n            \u003Ctr>\n                \u003Ctd style=\"width: 100px;\">Hunyuan-A52B-Instruct-FP8\u003C\u002Ftd>\n                \u003Ctd style=\"width: 500px;\">\u003Ca href=\"https:\u002F\u002Fhuggingface.co\u002Ftencent\u002FTencent-Hunyuan-Large\u002Ftree\u002Fmain\u002FHunyuan-A52B-Instruct-FP8\" style=\"color: red;\">Hunyuan-A52B-Instruct-FP8\u003C\u002Fa>\u003C\u002Ftd>\n                \u003Ctd style=\"width: 500px;\">\u003Ca href=\"https:\u002F\u002Fcdn-large-model.hunyuan.tencent.com\u002FHunyuan-A52B-Instruct-128k-fp8-20241116.zip\" style=\"color: red;\">Hunyuan-A52B-Instruct-FP8\u003C\u002Fa>\u003C\u002Ftd>\n            \u003C\u002Ftr>\n            \u003Ctr>\n                \u003Ctd style=\"width: 100px;\">Hunyuan-A52B-Instruct\u003C\u002Ftd>\n                \u003Ctd style=\"width: 500px;\">\u003Ca href=\"https:\u002F\u002Fhuggingface.co\u002Ftencent\u002FTencent-Hunyuan-Large\u002Ftree\u002Fmain\u002FHunyuan-A52B-Instruct\" style=\"color: red;\">Hunyuan-A52B-Instruct\u003C\u002Fa>\u003C\u002Ftd>\n                \u003Ctd style=\"width: 500px;\">\u003Ca href=\"https:\u002F\u002Fcdn-large-model.hunyuan.tencent.com\u002FHunyuan-A52B-Instruct-128k-20241116.zip\" style=\"color: red;\">Hunyuan-A52B-Instruct\u003C\u002Fa>\u003C\u002Ftd>\n            \u003C\u002Ftr>\n            \u003Ctr>\n                \u003Ctd style=\"width: 100px;\">Hunyuan-A52B-Pretrain\u003C\u002Ftd>\n                \u003Ctd style=\"width: 500px;\">\u003Ca href=\"https:\u002F\u002Fhuggingface.co\u002Ftencent\u002FTencent-Hunyuan-Large\u002Ftree\u002Fmain\u002FHunyuan-A52B-Pretrain\" style=\"color: red;\">Hunyuan-A52B-Pretrain\u003C\u002Fa>\u003C\u002Ftd>\n                \u003Ctd style=\"width: 500px;\">\u003Ca 
href=\"https:\u002F\u002Fcdn-large-model.hunyuan.tencent.com\u002FHunyuan-A52B-Pretrain-256k.zip\" style=\"color: red;\">Hunyuan-A52B-Pretrain\u003C\u002Fa>\u003C\u002Ftd>\n            \u003C\u002Ftr>\n        \u003C\u002Ftbody>\n    \u003C\u002Ftable>\n\u003C\u002Fp>\n\n\u003Cp>\u003C\u002Fp>\n\n\n## Model Introduction\n\nWith the rapid development of artificial intelligence technology, large language models (LLMs) have made significant progress in fields such as natural language processing, computer vision, and scientific tasks. However, as the scale of these models increases, optimizing resource consumption while maintaining high performance has become a key challenge. To address this challenge, we have explored Mixture of Experts (MoE) models. The currently unveiled Hunyuan-Large (Hunyuan-MoE-A52B) model is the largest open-source Transformer-based MoE model in the industry, featuring a total of 389 billion parameters and 52 billion active parameters. \n\nBy open-sourcing the Hunyuan-Large model and revealing related technical details, we hope to inspire more researchers with innovative ideas and collectively advance the progress and application of AI technology. 
We welcome you to join our open-source community to explore and optimize future AI models together!\n \n### Introduction to Technical Advantages\n\n#### Model\n- **High-Quality Synthetic Data**: By enhancing training with synthetic data, Hunyuan-Large can learn richer representations, handle long-context inputs, and generalize better to unseen data.\n\n- **KV Cache Compression**: Utilizes Grouped Query Attention (GQA) and Cross-Layer Attention (CLA) strategies to significantly reduce memory usage and computational overhead of KV caches, improving inference throughput.\n\n- **Expert-Specific Learning Rate Scaling**: Sets different learning rates for different experts to ensure each sub-model effectively learns from the data and contributes to overall performance.\n\n- **Long-Context Processing Capability**: The pre-trained model supports text sequences up to 256K, and the Instruct model supports up to 128K, significantly enhancing the ability to handle long-context tasks.\n\n- **Extensive Benchmarking**: Conducts extensive experiments across various languages and tasks to validate the practical effectiveness and safety of Hunyuan-Large.\n\n#### Inference Framework\n- This open-source release offers two inference backend options tailored for the Hunyuan-Large model: the popular [vLLM-backend](https:\u002F\u002Fgithub.com\u002Fquinnrong94\u002Fvllm\u002Ftree\u002Fdev_hunyuan) and the TensorRT-LLM Backend. Both solutions include optimizations for enhanced performance. For instance, the introduction of a new CLA structure significantly reduces GPU memory usage, achieving a 50% savings in the KV-Cache portion, which ensures efficient handling of long text scenarios. Additionally, by employing FP8 quantization, we achieve a 50% reduction in memory usage compared to traditional FP16\u002FBF16 quantization, while maintaining precision and resulting in a 70% increase in throughput. 
Meanwhile, by leveraging the efficient operators at the core of TRT-LLM, the performance of the TRT-LLM solution surpasses that of vLLM by over 30%. The TRT-LLM solution is widely used in Tencent's Hunyuan project. In this release, we are initially open-sourcing the vLLM solution, with plans to release the TRT-LLM solution in the near future.\n\n#### Training Framework\n\n- The Hunyuan-Large open-source model is fully compatible with the Hugging Face format, enabling researchers and developers to perform model fine-tuning using the hf-deepspeed framework. Additionally, we support training acceleration through the use of flash attention. To further assist in the adoption process, we have made the corresponding training scripts and model implementations publicly available to the community through this release, facilitating subsequent model training and fine-tuning operations based on these resources.\n\n&nbsp;\n\n## Related News\n* 2024.11.25 Our self-developed long-context benchmark, i.e., PenguinScrolls, has been officially released! You can explore the project on [GitHub](https:\u002F\u002Fgithub.com\u002FPenguin-Scrolls\u002FPenguinScrolls) and access the dataset on [Hugging Face](https:\u002F\u002Fhuggingface.co\u002Fdatasets\u002FPenguin-Scrolls\u002FPenguinScrolls).\n* 2024.11.18 **Hunyuan-A52B-Instruct** and **Hunyuan-A52B-Instruct-FP8** model update. \n* 2024.11.5 [TI Platform](https:\u002F\u002Fcloud.tencent.com\u002Fproduct\u002Fti) has integrated Hunyuan-Large model already, you can easily train and deploy it in just a few steps. Visit [Chat with Hunyuan-Large](https:\u002F\u002Fconsole.cloud.tencent.com\u002Ftione\u002Fv2\u002Faimarket\u002Fdetail\u002Fhunyuan_series?PublicAlgoGroupId=hunyuan-large-chat&detailTab=demo) to experience real-time conversations with the model, and explore [Hunyuan-Large Best Practice on TI](https:\u002F\u002Fcloud.tencent.com\u002Fdocument\u002Fproduct\u002F851\u002F112032) to create your own customized Hunyuan-Large model. 
\n* 2024.11.5 We have open-sourced **Hunyuan-A52B-Pretrain**, **Hunyuan-A52B-Instruct**, and **Hunyuan-A52B-Instruct-FP8** on Hugging Face. We also released a technical report and a training and inference operations manual, providing detailed information on the model's capabilities and the procedures for training and inference.\n\n\n\n\n## Benchmark Evaluation\n**Hunyuan-Large pre-trained model** achieves the best overall performance compared to both Dense and MoE based \ncompetitors having similar activated parameter sizes.  For aggregated benchmarks such as MMLU, MMLU-Pro, and CMMLU, \nHunyuan-Large consistently achieves the best performance, confirming its comprehensive abilities on aggregated tasks.\nHunyuan-Large also shows superior performance in commonsense understanding and reasoning, and classical NLP tasks \nsuch as QA and reading comprehension tasks (e.g., CommonsenseQA, PIQA and TriviaQA).  \nFor the mathematics capability, Hunyuan-Large outperforms all baselines in math datasets of GSM8K and MATH, \nand also gains the best results on CMATH in Chinese.We also observe that Hunyuan-Large achieves the overall \nbest performance in all Chinese tasks (e.g., CMMLU, C-Eval).\n\n| Model            | LLama3.1-405B | LLama3.1-70B | Mixtral-8x22B | DeepSeek-V2 | Hunyuan-Large |\n|------------------|---------------|--------------|---------------|-------------|---------------|\n| MMLU             | 85.2          | 79.3         | 77.8          | 78.5        | **88.4**          |\n| MMLU-Pro         | **61.6**          | 53.8         | 49.5          | -           | 60.2          |\n| BBH              | 85.9          | 81.6         | 78.9          | 78.9        | **86.3**          |\n| HellaSwag        | -             | -            | **88.7**      | 87.8        | 86.8          |\n| CommonsenseQA    | 85.8          | 84.1         | 82.4          | -           | **92.9**          |\n| WinoGrande       | 86.7          | 85.3         | 85.0          | 84.9        | 
**88.7**          |\n| PIQA             | -             | -            | 83.6          | 83.7        | **88.3**          |\n| NaturalQuestions | -             | -            | 39.6          | 38.7        | **52.8**          |\n| DROP             | 84.8          | 79.6         | 80.4          | 80.1        | **88.9**          |\n| ARC-C            | **96.1**          | 92.9         | 91.2          | 92.4        | 95.0          |\n| TriviaQA         | -             | -            | 82.1          | 79.9        | **89.2**          |\n| CMMLU            | -             | -            | 60.0          | 84.0        | **90.2**          |\n| C-Eval           | -             | -            | 59.6          | 81.7        | **91.9**          |\n| C3               | -             | -            | 71.4          | 77.4        | **82.3**          |\n| GSM8K            | 89.0          | 83.7         | 83.7          | 79.2        | **92.8**          |\n| MATH             | 53.8          | 41.4         | 42.5          | 43.6        | **69.8**          |\n| CMATH            | -             | -            | 72.3          | 78.7        | **91.3**          |\n| HumanEval        | 61.0          | 58.5         | 53.1          | 48.8        | **71.4**          |\n| MBPP             | **73.4**          | 68.6         | 64.2          | 66.6        | 72.6          |\n\n**Hunyuan-Large-Instruct** achieves consistent improvements on most types of tasks compared to LLMs having similar \nactivated parameters, indicating the effectiveness of our post-training.    Delving into the model performance \nin different categories of benchmarks, we find that our instruct model achieves the best performance on MMLU and MATH dataset.  \nNotably, on the MMLU dataset, our model demonstrates a significant improvement, outperforming the LLama3.1-405B model by 2.6%.   
\nThis enhancement is not just marginal but indicative of the Hunyuan-Large-Instruct’s superior understanding and reasoning \ncapabilities across a wide array of language understanding tasks. The model’s prowess is further underscored in its performance \non the MATH dataset, where it surpasses the LLama3.1-405B by a notable margin of 3.6%.  \nRemarkably, this leap in accuracy is achieved with only 52 billion activated parameters, underscoring the efficiency of our model.\n\n| Model                | LLama3.1 405B Inst. | LLama3.1 70B Inst. | Mixtral 8x22B Inst. | DeepSeekV2.5 Chat | Hunyuan-Large Inst. |\n|----------------------|---------------------|--------------------|---------------------|-------------------|---------------------|\n| MMLU                 | 87.3                | 83.6               | 77.8                | 80.4              | **89.9**            |\n| CMMLU                | -                   | -                  | 61.0                | -                 | **90.4**            |\n| C-Eval               | -                   | -                  | 60.0                | -                 | **88.6**            |\n| BBH                  | -                   | -                  | 78.4                | 84.3              | **89.5**            |\n| HellaSwag            | -                   | -                  | 86.0                | **90.3**          | 88.5                |\n| ARC-C                | **96.9**            | 94.8               | 90.0                | -                 | 94.6                |\n| GPQA_diamond         | **51.1**            | 46.7               | -                   | -                 | 42.4                |\n| MATH                 | 73.8                | 68.0               | 49.8                | 74.7              | **77.4**            |\n| HumanEval            | 89.0                | 80.5               | 75.0                | 89.0              | **90.0**            |\n| AlignBench           | 6.0                 | 5.9       
         | 6.2                 | 8.0               | **8.3**             |\n| MT-Bench             | 9.1                 | 8.8                | 8.1                 | 9.0               | **9.4**             |\n| IFEval strict-prompt | **86.0**            | 83.6               | 71.2                | -                 | 85.0                |\n| Arena-Hard |  69.3            | 55.7               |  -                | 76.2                 | **81.8**            |\n| AlpacaEval-2.0 | 39.3            | 34.3               | 30.9                | 50.5                 | **51.8**            |\n\n\n\n\n\n&nbsp;\n\n## Data\n\nHunyuan-Large provides model training-related processes. In this section, you can process the training data format for use in model training.\n\n### Training Data Format and Processing\n\nTraining data is processed into a messages format as follows. The default system prompt for training and inference is \"You are a helpful assistant.\" Below are examples of single-turn and multi-turn data:\n\n\n```python\n# Single-Turn Data Format\n\nmessages = [\n    {\"role\": \"system\", \"content\": \"You are a helpful assistant.\"},\n    {\"role\": \"user\", \"content\": \"海水为什么是咸的\" },\n    {\"role\": \"assistant\", \"content\": \"海水是咸的主要是因为其中含有许多溶解在水中的盐类和矿物质。这些盐类和矿物质来自于地球表面的岩石和土壤中的化学物质，随着时间的推移，它们被带到了海洋中。当海水蒸发时，水分蒸发掉了，但盐类和矿物质仍然留在水中，导致海水变得更加咸味。因此，海水的咸度是由其中的盐类和矿物质的含量决定的。\"}\n]\n\n# Multi-Turn Data Format\nmessages = [\n    {\"role\": \"system\", \"content\": \"You are a helpful assistant.\"},\n    {\"role\": \"user\", \"content\": \"知道重庆森林这部电影吗？\" },\n    {\"role\": \"assistant\", \"content\": \"知道呀，是一部由王家卫导演的片子。\" },\n    {\"role\": \"user\", \"content\": \" 而主演里更是有王菲，一上映便受到追捧。\" },\n    {\"role\": \"assistant\", \"content\": \"所以此片获得了第14届香港电影金像奖最佳影片奖。\"}\n]\n\nfrom tokenizers import ByteLevelBPETokenizer\nfrom transformers import AutoTokenizer\n\ntokenizer = AutoTokenizer.from_pretrained(\".\u002Fmodels\", use_fast=False, trust_remote_code=True)\n\nids = 
tokenizer.apply_chat_template(messages)\n\n```\n\n\n\nFor more usage references, see the `.\u002Fmodels\u002Ftest.py` file.\n\n&nbsp;\n\n## Quick Start\n\nYou can quickly get started by referring to the content in the \u003Ca href=\"examples\u002FREADME.md\">Quick Start Guide\u003C\u002Fa>.\n\n## Model Training\n\nTo simplify the Training process, HunyuanLLM provides a pre-built Docker image:\n\n [hunyuaninfer\u002Fhunyuan-large](https:\u002F\u002Fhub.docker.com\u002Frepository\u002Fdocker\u002Fhunyuaninfer\u002Fhunyuan-large\u002Fgeneral). \n\n### Hardware Requirements\n\nTested on H20, without enabling `make_moe_param_leaf_module` and using `zero3+offload`, with a `max_seq_length` of 2048, full fine-tuning requires at least 32 GPUs, and LoRA fine-tuning requires at least 8 GPUs.\n\n### Training Performance\n\nWith the minimum configuration (8 GPUs for LoRA fine-tuning), `per_device_train_batch_size` is set to 1, and `gradient_accumulation_steps` is set to 1, resulting in approximately 35 seconds per iteration.\n\n### Launch Method\n\nRefer to: [HuggingFace Transformers Trainer](https:\u002F\u002Fhuggingface.co\u002Fdocs\u002Ftransformers\u002Fv4.19.2\u002Fen\u002Fmain_classes\u002Ftrainer)\n\n#### Single-Machine Training\n\nIn the `train` directory, execute:\n\n```sh\npip install -r requirements.txt\nbash train.sh\n```\n\n#### Multi-Machine Training\n\nTo start training on multiple machines, follow the steps below and ensure that all machines are within the same cluster.\n\n##### Configure Passwordless SSH Login Between Machines\n\nThe following steps use two machines as an example, with their IPs represented as `${ip1}` and `${ip2}`. 
These operations are performed within a Docker container.\n\nFirst, configure passwordless SSH between containers on each machine.\n\n\n```sh\nssh-keygen\t\t\t# Generate id_rsa and id_rsa.pub for passwordless login\nssh-keygen -t rsa -A    # Generate \u002Fetc\u002Fssh\u002Fssh_host_rsa_key and ssh_host_ecdsa_key for starting 'SSH listen' later\n\u002Fusr\u002Fsbin\u002Fsshd -p 36005 -o ListenAddress=0.0.0.0        # Start SSH listen\necho \"Port 36005\" > ~\u002F.ssh\u002Fconfig   # Change SSH connection port to 36005\npasswd root    # Set root password to avoid alerts from monitoring platforms\n```\n\n\nNote: The `36005` here is an example. You can choose any port, but ensure that the port is **open** and **not occupied by other processes**.\n\nNext, within the container on each machine, execute:\n\n```sh\ncat ~\u002F.ssh\u002Fid_rsa.pub\n```\n\n**Copy the output SSH public key and paste it into the `~\u002F.ssh\u002Fauthorized_keys` file, with one public key per line. This must be done on every machine.** Ultimately, the `~\u002F.ssh\u002Fauthorized_keys` file on each machine should be identical and contain the public keys of all machines.\n\nIt's important to note that during multi-node training, the code executed on each node must be consistent. It is recommended to mount a shared network drive. 
If mounting a shared drive is not possible, you need to manually copy the dataset, scripts, and code to the same directory on all machines.\n\n##### Start Multi-Machine Training\n\nOnce the preparation steps are completed and dependencies are confirmed to be installed (if not, execute `pip install -r requirements.txt` to install), you can add the following configuration at the beginning of `train.sh`:\n\n```shell\nexport HOST_GPU_NUM=8\n# Current machine IP\nexport LOCAL_IP=${ip1}\n# Multi-node machine IPs, separated by commas\nexport NODE_IP_LIST=\"${ip1}:8,${ip2}:8\"\n# Number of machine nodes\nexport NODES=2\nexport NODE_NUM=$((${NODES} * ${HOST_GPU_NUM}))\n```\n\nNote: Replace `${ip1}` and `${ip2}` with the actual IP addresses!\n\nThen, on the machine with `${ip1}`, execute `bash train.sh` in the `train\u002F` directory. Note that on the first run, you might see the following output:\n\n```ssh\nThe authenticity of host '[ip]:36005 ([ip]:36005)' can't be established.\nECDSA key fingerprint is xxxxxx.\nECDSA key fingerprint is MD5:xxxxxx.\nAre you sure you want to continue connecting (yes\u002Fno)?\n```\n\nAt this point, type `yes` to continue.\n\n##### Key Parameters\n\nThe key parameters in the script are as follows:\n\n- `--deepspeed`: This parameter should point to a DeepSpeed configuration file. The `train` folder provides three default DeepSpeed configuration files: `ds_zero2_no_offload.json`, `ds_zero3_no_offload.json`, `ds_zero3_offload.json`. The required GPU memory decreases in this order.\n- `--model_name_or_path`: The path to the HF pre-trained model. Ensure this path contains the `modeling_hunyuan.py` and `configuration_hunyuan.py` files; otherwise, it cannot be loaded.\n- `--tokenizer_name_or_path`: The path to the tokenizer folder. 
Ensure this path contains the `tokenization_hy.py` file; otherwise, it cannot be loaded.\n- `--train_data_file`: The path to the training file, which should be a JSONL file.\n- `--output_dir`: The output directory where logs, tensorboard files, and model weights will be stored.\n- `--per_device_train_batch_size`: The batch size per GPU.\n- `--gradient_accumulation_steps`: The number of gradient accumulation steps. The global batch size is `per_device_train_batch_size * gradient_accumulation_steps * dp_size`.\n- `--max_steps`: The total number of training steps.\n- `--save_steps`: The number of steps between saving checkpoints.\n- `--use_lora`: Whether to use LoRA for training. This also accepts `--lora_rank`, `--lora_alpha`, and `--lora_dropout` parameters. LoRA is applied by default to the 'q_proj', 'k_proj', 'v_proj', 'o_proj' parameters. If you need to change this, modify it in the code. Note: **When using LoRA for training, only the LoRA weights are saved, not the base model weights**. If you need to merge LoRA weights, see the \"LoRA Weight Merging\" section below.\n- `--make_moe_param_leaf_module`: When using zero3 and MoE training, treat the MoE module as a leaf module, meaning its parameters are not split by zero3. This option is expected to significantly increase memory usage.\n- `--gradient_checkpointing`: Enable gradient checkpointing.\n- `--train_attention_params_only`: Whether to train only the attention parameters.\n- `--learning_rate`: The maximum learning rate during training.\n- `--min_lr`: The minimum learning rate during training.\n- `--use_flash_attn`: 开启 flash-attention 进行训练加速\n\n**Note:**\n\n- If you want to continue training from a previously saved checkpoint instead of loading pre-trained weights, specify `--resume_from_checkpoint` with the path to the checkpoint from the previous training. 
Do not specify `--model_name_or_path`, as this will only load the weights and not the training state.\n- When continuing training from a checkpoint, there might be slight deviations in loss due to randomness introduced by some non-deterministic algorithms, which is considered normal. Refer to: [HuggingFace Transformers Trainer Randomness](https:\u002F\u002Fhuggingface.co\u002Fdocs\u002Ftransformers\u002Fmain\u002Fen\u002Fperf_train_gpu_one#randomness)\n- When `--model_name_or_path` is specified, all model-related parameters will be ignored.\n- Samples within a batch will be padded to align with the longest sample in the batch, with each sample having a maximum length of `max_seq_length`. Any excess will be truncated.\n- If you encounter warnings about bias weights not being loaded, you can ignore them, as biases are not used in Hunyuan-Large.\n\n\n#### What to Do If Out of Memory?\n\nRefer to: [DeepSpeed Configuration](https:\u002F\u002Fwww.deepspeed.ai\u002Fdocs\u002Fconfig-json\u002F)\n\nYou can try modifying the DeepSpeed configuration by removing the auto attribute from these parameters and reducing their values:\n\n- `stage3_param_persistence_threshold`\n- `stage3_prefetch_bucket_size`\n- `stage3_max_reuse_distance`\n\n#### Merging LoRA Models\n\nThe saved LoRA weights cannot be merged into the zero3 model during training because, with zero3 enabled, model weights are split across different data parallel ranks. If you want to merge LoRA weights into the base model, you can do so offline to obtain the merged weight file. Execute `merge_lora_weight.sh` to merge the LoRA weights with the base model weights. 
The parameters include:\n\n- `--base_model_path`: Directory of the base model weights\n- `--adapter_model_path`: Directory of the LoRA weights\n- `--output_path`: Directory to save the merged weights\n- `--save_dtype`: Data format for storing the merged weights, available options include: fp16, bf16, fp32\n\n&nbsp;\n\n## Inference and Deployment\n\nHunyuanLLM uses TRT-LLM and vLLM for deployment. We are open sourcing the [vLLM-backend](https:\u002F\u002Fgithub.com\u002Fquinnrong94\u002Fvllm\u002Ftree\u002Fdev_hunyuan) deployment (see Reasoning with vLLM), and the TRT-LLM deployment (see Reasoning with TRT-LLM) will be available in the near future.\n\n## Using TRT-LLM for Inference\n\nTo be opened\n\n## Using vLLM for Inference\n\n### Docker:\n\nTo simplify the deployment process, HunyuanLLM provides a pre-built Docker image:\n\n [hunyuaninfer\u002Fhunyuan-large](https:\u002F\u002Fhub.docker.com\u002Frepository\u002Fdocker\u002Fhunyuaninfer\u002Fhunyuan-large\u002Fgeneral). You only need to download the model files and start the Docker container using the code below to begin model inference.\n\n```shell\ndocker run --name hunyuanLLM_infer -itd --privileged --user root --net=host --ipc=host --gpus=8 hunyuaninfer\u002Fhunyuan-large:infer-open-source\n```\n\nNote: Docker container privilege management. The above code uses privileged mode (`--privileged`) to start the Docker container, which grants the container higher privileges, increasing the risk of data leakage and cluster security threats. It is recommended to avoid using privileged mode unless necessary to reduce security risks. For scenarios where privileged mode is required, conduct a thorough security assessment and implement appropriate security monitoring and hardening measures.\n\n### Configure Passwordless SSH Login Between Machines\n\nThe following steps use two machines as an example, with their IPs represented as `${ip1}` and `${ip2}`. 
These operations are performed within a Docker container.\n\nFirst, run `passwd` on both machines to set a password, for example: `Tmp123,.\u002F`\n\nCopy `inference\u002Flogin_ssh.py` into the container and execute the following command, ensuring the IP and password are correctly entered.\n\n```shell\npython3 login_ssh.py --ips ${ip1},${ip2} --port 36000 --password=Tmp123,.\u002F\n```\n\n**Note 📢: Before starting, be sure to verify multi-machine communication using VLLM's debugging script: https:\u002F\u002Fdocs.vllm.ai\u002Fen\u002Flatest\u002Fgetting_started\u002Fdebugging.html**\n\n### BF16 Deployment\n\nBF16 requires 16 H20 GPUs for deployment. After verifying that multi-machine communication is correct, execute the following steps:\n\nBefore running the commands, set the following environment variables:\n\n```shell\n${LOCAL_IP}: The IP corresponding to bond1 on the current machine\n${MODEL_PATH}: Path to the Hunyuan LLM model\n```\n\n#### Step 1: Start Ray\n\nRay is an open-source library for parallel and distributed Python. In this section, we use Ray to achieve multi-machine communication.\n\nRay Component Configuration Hardening: The default configuration of Ray components does not enable authentication mechanisms for service ports (e.g., 6379, 8265), posing risks of unauthorized access and command execution. 
It is recommended to deploy Ray components only in trusted internal network environments or ensure strict access control list (ACL) policies are implemented for these ports to prevent unauthorized network access.\n\nFirst, start Ray on each node (either in the background or by keeping the terminal running):\n\nOn the head node:\n```shell\nexport VLLM_HOST_IP=${LOCAL_IP}\nexport NCCL_SOCKET_IFNAME=bond1\nexport GLOO_SOCKET_IFNAME=bond1\nray start --block --head --node-ip-address=${LOCAL_IP} --port=6379\n```\n\nOn all worker nodes:\n\nNote: Replace `{HEAD NODE $LOCAL_IP}` with the actual `${LOCAL_IP}` of the head node.\n```shell\nexport VLLM_HOST_IP=${LOCAL_IP}\nexport NCCL_SOCKET_IFNAME=bond1\nexport GLOO_SOCKET_IFNAME=bond1\nray start --block --address={HEAD NODE $LOCAL_IP}:6379 --node-ip-address=${LOCAL_IP}\n```\nIf Ray fails to start, execute `ray stop` and then run the above commands again.\n\n#### Step 2: Execute Inference\n\n#### Method 1: Command Line Inference\n\nBelow is a code snippet demonstrating how to quickly request the chat model using `vLLM`:\n\nNote: vLLM Component Remote Code Execution Protection. In the code below, if the `trust-remote-code` configuration option of the vLLM component is enabled, it will allow loading and executing code from remote model repositories, which may lead to the execution of malicious code. 
Unless explicitly required by business needs, it is recommended to keep this configuration option disabled to reduce potential security threats.\n\n```python\nimport os\nfrom vllm import LLM, SamplingParams\n\nmodel_path=os.environ.get('MODEL_PATH')\n\nllm = LLM(model=model_path,\n        tokenizer=model_path,\n        trust_remote_code=True,\n        max_model_len=10240,\n        dtype='bfloat16',\n        tensor_parallel_size=16,\n        pipeline_parallel_size=1,\n        disable_log_stats=False,\n        gpu_memory_utilization=0.98,\n        disable_custom_all_reduce=True,\n        #distributed_executor_backend='ray',\n        enforce_eager=True,\n        max_num_seqs=8,\n        use_v2_block_manager=True,\n        quantization=None)\n\nprompts = [\"海水为什么是咸的\"]\n\nsampling_params = SamplingParams(\n    temperature=0.7, top_p=0.6, max_tokens=200, top_k=20, repetition_penalty=1.05)\n\noutputs = llm.generate(prompts, sampling_params)\n\n# Print the outputs.\nfor output in outputs:\n    prompt = output.prompt\n    generated_text = output.outputs[0].text\n    print(f\"Prompt: {prompt!r}, Generated text: {generated_text!r}\")\n```\n\n#### Method 2: Service-Based Inference\n\nBelow we demonstrate how to deploy the model using `vLLM` in a service-based manner and make requests.\n\nRun the following on the head node:\n\n```shell\nexport VLLM_HOST_IP=${LOCAL_IP}\nexport NCCL_SOCKET_IFNAME=bond1\nexport GLOO_SOCKET_IFNAME=bond1\n```\n\nNext, start the service by running:\n\n```shell\ncd inference\nsh run_server.sh\n```\n\n*Tips*: Troubleshooting, if you encounter the following error:\n\n```python\nray.exceptions.RaySystemError: System error: No module named 'transformers_modules' traceback: Traceback (most recent call last):\nModuleNotFoundError: No module named 'transformers_modules'\n```\n\nCopy the `~\u002F.cache\u002Fhuggingface\u002Fmodules\u002F` directory from the head node to the corresponding path on all worker nodes.\n\nAfter successfully running 
`run_server.sh`, execute the request script:\n\n```shell\nsh openapi.sh\n```\n\nBe sure to modify `${LOCAL_IP}` and `${MODEL_PATH}` in `openapi.sh` to values match the corresponding service.\n\n\n### Quantized Model Deployment:\n\nThis section describes the process of deploying a quantized model using vLLM.\n\nImage: The deployment image is the same as for BF16.\n\n#### Int8 Quantized Model Deployment:\n\nTo deploy the Int8-weight-only version of the Hunyuan-L model, simply set the environment variables in `run_server_int8.sh`:\n\n```shell\n${MODEL_PATH}: Path to the BF16 model\n${LOCAL_IP}: The IP corresponding to bond1 on the current machine\n```\n\nThen, start the Int8 service by running:\n\n```shell\nsh run_server_int8.sh\n```\n\nAfter successfully running `run_server_int8.sh`, execute the request script:\n\n```shell\nsh openapi.sh\n```\n\n#### FP8 Quantized Model Deployment:\n\nTo deploy the W8A8C8 version of the Hunyuan-L model, simply set the environment variables in `run_server_fp8.sh`:\n\n```shell\n${MODEL_PATH}: Path to the FP8 model\n${LOCAL_IP}: The IP corresponding to bond1 on the current machine\n```\n\nThen, start the FP8 service by running:\n\n```shell\nsh run_server_fp8.sh\n```\n\nAfter successfully running `run_server_fp8.sh`, execute the request script:\n\n```shell\nsh openapi.sh\n```\n\n#### FP8 BENCHMARK\n\nThis part introduces the Benchmark of Hunyuan Large Instruct FP8 quantitative model.\n\n| Dataset | BF16 | W8A8C8-FP8 |\n|---------|------|------------|\n| ARC-C   | 94.6 | 94.2       |\n| C-Eval  | 88.6 | 89.2       |\n| CMMLU   | 90.4 | 89.8       |\n| MMLU    | 89.9 | 88.9       |\n\n### Inference Performance\n\nThis section presents the efficiency test results of deploying various models (original and quantized) using vLLM, including inference speed (tokens\u002Fs) under different batch sizes.\n\n| Inference Framework | Model                                                                                                  | Number of GPUs 
(H20) | input_length | batch=1 | batch=4 |\n| ------------------- | ------------------------------------------------------------------------------------------------------ | -------------------- | ------------ |---------|---------|\n| vLLM                | Hunyuan-Large                                                                                              | 16                   | 2048         | 20.2    | 75.5    |\n| vLLM                | Hunyuan-Large(int8 weight only)                                                                            | 8                    | 2048         | 19.3    | 73.6    |\n| vLLM                | Hunyuan-Large(W8A8C8-FP8)                                                                                  | 8                    | 2048         | 19.8    | 74.9    |\n\n## Tokenizer\n\nThe tokenizer used in the HunYuan-Large model balances compression rate and effectiveness, ensuring that embeddings are sufficiently trained. The vocabulary includes 100K tokens integrated from tiktoken. Additionally, we trained an extra 29K Chinese tokens using a large amount of high-quality Chinese training data to enhance the model's Chinese capabilities and the tokenizer's compression rate. Combined, our new tokenizer improves the compression rate compared to the LLaMA3 tokenizer, increasing from 2.78 characters\u002Ftoken to 3.13 characters\u002Ftoken.\n\n## Hunyuan API\n\nYou can experience our Hunyuan-Large model on Tencent Cloud. For details, please visit: https:\u002F\u002Fcloud.tencent.com\u002Fdocument\u002Fproduct\u002F1729\u002F97730.\n\n## Interactive Demo Web\n\nThe Hunyuan-Large web demo is now open. Visit https:\u002F\u002Fhuggingface.co\u002Fspaces\u002Ftencent\u002FHunyuan-Large to easily experience our model.\n\n## Training\u002FInference on TI\nTencent Cloud's [TI Platform](https:\u002F\u002Fcloud.tencent.com\u002Fproduct\u002Fti) is a comprehensive machine learning platform tailored for AI engineers. 
With the Hunyuan-Large model already integrated, you can easily train and deploy it in just a few steps. Visit [Chat with Hunyuan-Large](https:\u002F\u002Fconsole.cloud.tencent.com\u002Ftione\u002Fv2\u002Faimarket\u002Fdetail\u002Fhunyuan_series?PublicAlgoGroupId=hunyuan-large-chat&detailTab=demo) to experience real-time conversations with the model, and explore [Hunyuan-Large Best Practice on TI](https:\u002F\u002Fcloud.tencent.com\u002Fdocument\u002Fproduct\u002F851\u002F112032) to create your own customized Hunyuan-Large model. \n\n\n## Citation\nIf you find our work helpful, feel free to give us a cite.\n\n```\n@misc{sun2024hunyuanlargeopensourcemoemodel,\n      title={Hunyuan-Large: An Open-Source MoE Model with 52 Billion Activated Parameters by Tencent}, \n      author={Xingwu Sun and Yanfeng Chen and Yiqing Huang and Ruobing Xie and Jiaqi Zhu and Kai Zhang and Shuaipeng Li and Zhen Yang and Jonny Han and Xiaobo Shu and Jiahao Bu and Zhongzhi Chen and Xuemeng Huang and Fengzong Lian and Saiyong Yang and Jianfeng Yan and Yuyuan Zeng and Xiaoqin Ren and Chao Yu and Lulu Wu and Yue Mao and Tao Yang and Suncong Zheng and Kan Wu and Dian Jiao and Jinbao Xue and Xipeng Zhang and Decheng Wu and Kai Liu and Dengpeng Wu and Guanghui Xu and Shaohua Chen and Shuang Chen and Xiao Feng and Yigeng Hong and Junqiang Zheng and Chengcheng Xu and Zongwei Li and Xiong Kuang and Jianglu Hu and Yiqi Chen and Yuchi Deng and Guiyang Li and Ao Liu and Chenchen Zhang and Shihui Hu and Zilong Zhao and Zifan Wu and Yao Ding and Weichao Wang and Han Liu and Roberts Wang and Hao Fei and Peijie She and Ze Zhao and Xun Cao and Hai Wang and Fusheng Xiang and Mengyuan Huang and Zhiyuan Xiong and Bin Hu and Xuebin Hou and Lei Jiang and Jiajia Wu and Yaping Deng and Yi Shen and Qian Wang and Weijie Liu and Jie Liu and Meng Chen and Liang Dong and Weiwen Jia and Hu Chen and Feifei Liu and Rui Yuan and Huilin Xu and Zhenxiang Yan and Tengfei Cao and Zhichao Hu and Xinhua Feng and Dong Du and 
Tinghao She and Yangyu Tao and Feng Zhang and Jianchen Zhu and Chengzhong Xu and Xirui Li and Chong Zha and Wen Ouyang and Yinben Xia and Xiang Li and Zekun He and Rongpeng Chen and Jiawei Song and Ruibin Chen and Fan Jiang and Chongqing Zhao and Bo Wang and Hao Gong and Rong Gan and Winston Hu and Zhanhui Kang and Yong Yang and Yuhong Liu and Di Wang and Jie Jiang},\n      year={2024},\n      eprint={2411.02265},\n      archivePrefix={arXiv},\n      primaryClass={cs.CL},\n      url={https:\u002F\u002Farxiv.org\u002Fabs\u002F2411.02265}, \n}\n```\n\u003Cbr>\n\n## Contact Us\n\nIf you would like to leave a message for our R&D and product teams, Welcome to contact our open-source team . You can also contact us via email (hunyuan_opensource@tencent.com).\n\n\n","\u003Cp align=\"left\">\n    \u003Ca href=\"README_CN.md\">中文\u003C\u002Fa>&nbsp ｜ 英文\u003C\u002Fa>\n\u003C\u002Fp>\n\u003Cbr>\u003Cbr>\n\n\u003Cp align=\"center\">\n \u003Cimg src=\"https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002FTencent-Hunyuan_Tencent-Hunyuan-Large_readme_9a3e6454d04e.png\" width=\"400\"\u002F> \u003Cbr>\n\u003C\u002Fp>\u003Cp>\u003C\u002Fp>\n\n\u003Cp align=\"center\">\n    🫣&nbsp\u003Ca href=\"https:\u002F\u002Fhuggingface.co\u002Ftencent\u002FTencent-Hunyuan-Large\">\u003Cb>Hugging Face\u003C\u002Fb>\u003C\u002Fa>&nbsp&nbsp |  &nbsp&nbsp🖥️&nbsp&nbsp\u003Ca href=\"https:\u002F\u002Fllm.hunyuan.tencent.com\u002F\" style=\"color: red;\">\u003Cb>官方网站\u003C\u002Fb>\u003C\u002Fa>&nbsp&nbsp｜&nbsp&nbsp🕖&middot;&nbsp \u003Ca href=\"https:\u002F\u002Fcloud.tencent.com\u002Fproduct\u002Fhunyuan\" >\u003Cb>HunyuanAPI\u003C\u002Fb>\u003C\u002Fa>&nbsp&nbsp｜&nbsp&nbsp🐳&middot;&nbsp \u003Ca href=\"https:\u002F\u002Fgitee.com\u002FTencent\u002FTencent-Hunyuan-Large\" >\u003Cb>Gitee\u003C\u002Fb>\u003C\u002Fa>\n\u003C\u002Fp>\u003Cp align=\"center\">\n    \u003Ca href=\"https:\u002F\u002Farxiv.org\u002Fabs\u002F2411.02265\" style=\"color: 
red;\">\u003Cb>技术报告\u003C\u002Fb>\u003C\u002Fa>&nbsp&nbsp｜&nbsp&nbsp \u003Ca href=\"https:\u002F\u002Fhuggingface.co\u002Fspaces\u002Ftencent\u002FHunyuan-Large\">\u003Cb>演示\u003C\u002Fb>\u003C\u002Fa>&nbsp&nbsp&nbsp｜&nbsp&nbsp \u003Ca href=\"https:\u002F\u002Fcloud.tencent.com\u002Fdocument\u002Fproduct\u002F851\u002F112032\" style=\"color: red;\">\u003Cb>Tencent Cloud TI\u003C\u002Fb>\u003C\u002Fa>&nbsp&nbsp&nbsp\u003C\u002Fp>\n\u003Cp>\u003Cbr>\u003C\u002Fp>\n\u003Cp>\n    \u003Ctable align=\"center\">\n        \u003Ctbody>\n            \u003Ctr>\n                \u003Ctd align=\"center\" colspan=\"3\">\u003Cstrong>模型下载\u003C\u002Fstrong>\u003C\u002Ftd>\n            \u003C\u002Ftr>\n            \u003Ctr>\n                \u003Ctd align=\"center\" style=\"width: 100px;\" >模型\u003C\u002Ftd>\n                \u003Ctd align=\"center\" style=\"width: 500px;\">Hugging Face 下载链接\u003C\u002Ftd>\n                \u003Ctd align=\"center\" style=\"width: 500px;\">腾讯云下载链接\u003C\u002Ftd>\n            \u003C\u002Ftr>\n            \u003Ctr>\n                \u003Ctd style=\"width: 100px;\">Hunyuan-A52B-Instruct-FP8\u003C\u002Ftd>\n                \u003Ctd style=\"width: 500px;\">\u003Ca href=\"https:\u002F\u002Fhuggingface.co\u002Ftencent\u002FTencent-Hunyuan-Large\u002Ftree\u002Fmain\u002FHunyuan-A52B-Instruct-FP8\" style=\"color: red;\">Hunyuan-A52B-Instruct-FP8\u003C\u002Fa>\u003C\u002Ftd>\n                \u003Ctd style=\"width: 500px;\">\u003Ca href=\"https:\u002F\u002Fcdn-large-model.hunyuan.tencent.com\u002FHunyuan-A52B-Instruct-128k-fp8-20241116.zip\" style=\"color: red;\">Hunyuan-A52B-Instruct-FP8\u003C\u002Fa>\u003C\u002Ftd>\n            \u003C\u002Ftr>\n            \u003Ctr>\n                \u003Ctd style=\"width: 100px;\">Hunyuan-A52B-Instruct\u003C\u002Ftd>\n                \u003Ctd style=\"width: 500px;\">\u003Ca href=\"https:\u002F\u002Fhuggingface.co\u002Ftencent\u002FTencent-Hunyuan-Large\u002Ftree\u002Fmain\u002FHunyuan-A52B-Instruct\" style=\"color: 
red;\">Hunyuan-A52B-Instruct\u003C\u002Fa>\u003C\u002Ftd>\n                \u003Ctd style=\"width: 500px;\">\u003Ca href=\"https:\u002F\u002Fcdn-large-model.hunyuan.tencent.com\u002FHunyuan-A52B-Instruct-128k-20241116.zip\" style=\"color: red;\">Hunyuan-A52B-Instruct\u003C\u002Fa>\u003C\u002Ftd>\n            \u003C\u002Ftr>\n            \u003Ctr>\n                \u003Ctd style=\"width: 100px;\">Hunyuan-A52B-Pretrain\u003C\u002Ftd>\n                \u003Ctd style=\"width: 500px;\">\u003Ca href=\"https:\u002F\u002Fhuggingface.co\u002Ftencent\u002FTencent-Hunyuan-Large\u002Ftree\u002Fmain\u002FHunyuan-A52B-Pretrain\" style=\"color: red;\">Hunyuan-A52B-Pretrain\u003C\u002Fa>\u003C\u002Ftd>\n                \u003Ctd style=\"width: 500px;\">\u003Ca href=\"https:\u002F\u002Fcdn-large-model.hunyuan.tencent.com\u002FHunyuan-A52B-Pretrain-256k.zip\" style=\"color: red;\">Hunyuan-A52B-Pretrain\u003C\u002Fa>\u003C\u002Ftd>\n            \u003C\u002Ftr>\n        \u003C\u002Ftbody>\n    \u003C\u002Ftable>\n\u003C\u002Fp>\n\n\u003Cp>\u003C\u002Fp>\n\n\n## 模型介绍\n\n随着人工智能技术的快速发展，大型语言模型（LLMs）在自然语言处理、计算机视觉以及科学任务等领域取得了显著进展。然而，随着模型规模的不断扩大，如何在保持高性能的同时优化资源消耗已成为关键挑战。为应对这一挑战，我们探索了专家混合（MoE）模型。目前发布的 Hunyuan-Large（Hunyuan-MoE-A52B）模型是业界最大的开源 Transformer 基础 MoE 模型，总参数量达 3890 亿，活跃参数量为 520 亿。这是目前业界最大的开源 Transformer 基础 MoE 模型，总参数量达 3890 亿，活跃参数量为 520 亿。\n\n通过开源 Hunyuan-Large 模型并公开相关技术细节，我们希望激发更多研究人员的创新灵感，共同推动人工智能技术的进步与应用。我们诚挚邀请您加入我们的开源社区，一起探索和优化未来的 AI 模型！\n\n### 技术优势介绍\n\n#### 模型\n- **高质量合成数据**：通过引入合成数据进行训练，Hunyuan-Large 能够学习到更丰富的表征，更好地处理长上下文输入，并对未见数据具有更强的泛化能力。\n\n- **KV 缓存压缩**：采用分组查询注意力（GQA）和跨层注意力（CLA）策略，显著降低 KV 缓存的内存占用和计算开销，从而提升推理吞吐量。\n\n- **专家特定学习率缩放**：为不同专家设置不同的学习率，确保每个子模型都能有效从数据中学习，并为整体性能做出贡献。\n\n- **长上下文处理能力**：预训练模型支持高达 256K 的文本序列，而 Instruct 模型则支持高达 128K，极大地提升了处理长上下文任务的能力。\n\n- **广泛的基准测试**：我们在多种语言和任务上进行了大量实验，以验证 Hunyuan-Large 的实际效果和安全性。\n\n#### 推理框架\n- 此次开源提供了两种针对 Hunyuan-Large 模型的推理后端选项：流行的 [vLLM 后端](https:\u002F\u002Fgithub.com\u002Fquinnrong94\u002Fvllm\u002Ftree\u002Fdev_hunyuan) 和 
TensorRT-LLM 后端。这两种方案都包含了性能优化措施。例如，引入新的 CLA 结构显著减少了 GPU 内存的使用，使 KV 缓存部分节省了 50%，从而确保高效处理长文本场景。此外，通过采用 FP8 量化，相比传统的 FP16\u002FBF16 量化，内存占用减少了 50%，同时保持了精度，并使吞吐量提高了 70%。与此同时，借助 TRT-LLM 核心的高效算子，TRT-LLM 方案的性能比 vLLM 高出 30% 以上。TRT-LLM 方案已在腾讯的 Hunyuan 项目中广泛应用。本次发布我们首先开源的是 vLLM 方案，计划在不久的将来推出 TRT-LLM 方案。\n\n#### 训练框架\n\n- Hunyuan-Large 开源模型完全兼容 Hugging Face 格式，研究人员和开发者可以使用 hf-deepspeed 框架进行模型微调。此外，我们还支持通过 Flash Attention 加速训练。为进一步帮助大家快速上手，我们通过此次发布将相应的训练脚本和模型实现公开给社区，方便后续基于这些资源进行模型训练和微调操作。\n\n&nbsp;\n\n## 相关新闻\n* 2024年11月25日 我们自主研发的长文本基准评测集——PenguinScrolls正式发布！您可以在[GitHub](https:\u002F\u002Fgithub.com\u002FPenguin-Scrolls\u002FPenguinScrolls)上探索该项目，并在[Hugging Face](https:\u002F\u002Fhuggingface.co\u002Fdatasets\u002FPenguin-Scrolls\u002FPenguinScrolls)上获取数据集。\n* 2024年11月18日 **Hunyuan-A52B-Instruct** 和 **Hunyuan-A52B-Instruct-FP8** 模型更新。\n* 2024年11月5日 [TI 平台](https:\u002F\u002Fcloud.tencent.com\u002Fproduct\u002Fti) 已集成 Hunyuan-Large 模型，您只需简单几步即可轻松进行训练和部署。请访问[与 Hunyuan-Large 聊天](https:\u002F\u002Fconsole.cloud.tencent.com\u002Ftione\u002Fv2\u002Faimarket\u002Fdetail\u002Fhunyuan_series?PublicAlgoGroupId=hunyuan-large-chat&detailTab=demo)，体验与该模型的实时对话；同时，您还可以参阅[TI 上的 Hunyuan-Large 最佳实践](https:\u002F\u002Fcloud.tencent.com\u002Fdocument\u002Fproduct\u002F851\u002F112032)，打造属于您的定制化 Hunyuan-Large 模型。\n* 2024年11月5日 我们已在 Hugging Face 上开源了 **Hunyuan-A52B-Pretrain**、**Hunyuan-A52B-Instruct** 和 **Hunyuan-A52B-Instruct-FP8**。此外，我们还发布了技术报告以及训练与推理操作手册，详细介绍了模型的能力及训练和推理的具体流程。\n\n## 基准评估\n**Hunyuan-Large 预训练模型** 在与激活参数规模相近的密集型和 MoE 架构竞争对手相比，整体性能表现最佳。在 MMLU、MMLU-Pro 和 CMMLU 等综合基准测试中，Hunyuan-Large 始终取得最优成绩，充分证明了其在综合性任务上的全面能力。\n\n此外，Hunyuan-Large 在常识理解与推理，以及经典 NLP 任务（如问答和阅读理解）方面也展现出卓越性能，例如 CommonsenseQA、PIQA 和 TriviaQA 等数据集。在数学能力方面，Hunyuan-Large 在 GSM8K 和 MATH 数据集上均超越所有基线模型，并且在中国语种的 CMATH 数据集中也取得了最佳成绩。同时，在所有中文任务中（如 CMMLU 和 C-Eval），Hunyuan-Large 同样表现出整体最优性能。\n\n| 模型            | LLama3.1-405B | LLama3.1-70B | Mixtral-8x22B | DeepSeek-V2 | Hunyuan-Large 
|\n|------------------|---------------|--------------|---------------|-------------|---------------|\n| MMLU             | 85.2          | 79.3         | 77.8          | 78.5        | **88.4**          |\n| MMLU-Pro         | **61.6**          | 53.8         | 49.5          | -           | 60.2          |\n| BBH              | 85.9          | 81.6         | 78.9          | 78.9        | **86.3**          |\n| HellaSwag        | -             | -            | **88.7**      | 87.8        | 86.8          |\n| CommonsenseQA    | 85.8          | 84.1         | 82.4          | -           | **92.9**          |\n| WinoGrande       | 86.7          | 85.3         | 85.0          | 84.9        | **88.7**          |\n| PIQA             | -             | -            | 83.6          | 83.7        | **88.3**          |\n| NaturalQuestions | -             | -            | 39.6          | 38.7        | **52.8**          |\n| DROP             | 84.8          | 79.6         | 80.4          | 80.1        | **88.9**          |\n| ARC-C            | **96.1**          | 92.9         | 91.2          | 92.4        | 95.0          |\n| TriviaQA         | -             | -            | 82.1          | 79.9        | **89.2**          |\n| CMMLU            | -             | -            | 60.0          | 84.0        | **90.2**          |\n| C-Eval           | -             | -            | 59.6          | 81.7        | **91.9**          |\n| C3               | -             | -            | 71.4          | 77.4        | **82.3**          |\n| GSM8K            | 89.0          | 83.7         | 83.7          | 79.2        | **92.8**          |\n| MATH             | 53.8          | 41.4         | 42.5          | 43.6        | **69.8**          |\n| CMATH            | -             | -            | 72.3          | 78.7        | **91.3**          |\n| HumanEval        | 61.0          | 58.5         | 53.1          | 48.8        | **71.4**          |\n| MBPP             | **73.4**          | 68.6   
      | 64.2          | 66.6        | 72.6          |\n\n**Hunyuan-Large-Instruct** 相较于具有相似激活参数量的 LLM，在大多数类型的任务上都实现了持续提升，这表明我们的后训练方法非常有效。深入分析不同类别基准测试中的模型表现，我们发现该指令微调模型在 MMLU 和 MATH 数据集上取得了最佳成绩。尤其值得一提的是，在 MMLU 数据集中，我们的模型表现显著提升，比 LLama3.1-405B 模型高出 2.6%。这一进步并非微小，而是充分体现了 Hunyuan-Large-Instruct 在广泛的语言理解任务中所具备的卓越理解和推理能力。此外，该模型在 MATH 数据集上的表现同样令人瞩目，以 3.6% 的优势领先 LLama3.1-405B。值得注意的是，如此大幅度的准确率提升仅依靠 520 亿个激活参数便得以实现，进一步凸显了我们模型的高效性。\n\n| 模型                | LLama3.1 405B Inst. | LLama3.1 70B Inst. | Mixtral 8x22B Inst. | DeepSeekV2.5 Chat | Hunyuan-Large Inst. |\n|----------------------|---------------------|--------------------|---------------------|-------------------|---------------------|\n| MMLU                 | 87.3                | 83.6               | 77.8                | 80.4              | **89.9**            |\n| CMMLU                | -                   | -                  | 61.0                | -                 | **90.4**            |\n| C-Eval               | -                   | -                  | 60.0                | -                 | **88.6**            |\n| BBH                  | -                   | -                  | 78.4                | 84.3              | **89.5**            |\n| HellaSwag            | -                   | -                  | 86.0                | **90.3**          | 88.5                |\n| ARC-C                | **96.9**            | 94.8               | 90.0                | -                 | 94.6                |\n| GPQA_diamond         | **51.1**            | 46.7               | -                   | -                 | 42.4                |\n| MATH                 | 73.8                | 68.0               | 49.8                | 74.7              | **77.4**            |\n| HumanEval            | 89.0                | 80.5               | 75.0                | 89.0              | **90.0**            |\n| AlignBench           | 6.0                 | 5.9                | 6.2                 | 8.0               | 
**8.3**             |\n| MT-Bench             | 9.1                 | 8.8                | 8.1                 | 9.0               | **9.4**             |\n| IFEval strict-prompt | **86.0**            | 83.6               | 71.2                | -                 | 85.0                |\n| Arena-Hard |  69.3            | 55.7               |  -                | 76.2                 | **81.8**            |\n| AlpacaEval-2.0 | 39.3            | 34.3               | 30.9                | 50.5                 | **51.8**            |\n\n\n\n\n\n&nbsp;\n\n## 数据\n\nHunyuan-Large 提供了与模型训练相关的流程。在本节中，您可以对训练数据格式进行处理，以便用于模型训练。\n\n### 训练数据格式及处理\n\n训练数据被处理为如下消息格式。训练和推理时的默认系统提示词是“你是一个乐于助人的助手”。以下是单轮和多轮数据的示例：\n\n\n```python\n# 单轮数据格式\n\nmessages = [\n    {\"role\": \"system\", \"content\": \"你是一个乐于助人的助手。\"},\n    {\"role\": \"user\", \"content\": \"海水为什么是咸的\" },\n    {\"role\": \"assistant\", \"content\": \"海水是咸的主要是因为其中含有许多溶解在水中的盐类和矿物质。这些盐类和矿物质来自于地球表面的岩石和土壤中的化学物质，随着时间的推移，它们被带到了海洋中。当海水蒸发时，水分蒸发掉了，但盐类和矿物质仍然留在水中，导致海水变得更加咸味。因此，海水的咸度是由其中的盐类和矿物质的含量决定的。\"}\n]\n\n# 多轮对话数据格式\nmessages = [\n    {\"role\": \"system\", \"content\": \"你是一个 helpful assistant。\"},\n    {\"role\": \"user\", \"content\": \"知道重庆森林这部电影吗？\" },\n    {\"role\": \"assistant\", \"content\": \"知道呀，是一部由王家卫导演的片子。\" },\n    {\"role\": \"user\", \"content\": \" 而主演里更是有王菲，一上映便受到追捧。\" },\n    {\"role\": \"assistant\", \"content\": \"所以此片获得了第14届香港电影金像奖最佳影片奖。\"}\n]\n\nfrom tokenizers import ByteLevelBPETokenizer\nfrom transformers import AutoTokenizer\n\ntokenizer = AutoTokenizer.from_pretrained(\".\u002Fmodels\", use_fast=False, trust_remote_code=True)\n\nids = tokenizer.apply_chat_template(messages)\n\n```\n\n\n\n更多使用参考，请参阅 `.\u002Fmodels\u002Ftest.py` 文件。\n\n&nbsp;\n\n## 快速入门\n\n您可以通过参考\u003Ca href=\"examples\u002FREADME.md\">快速入门指南\u003C\u002Fa>中的内容，快速上手。\n\n## 模型训练\n\n为了简化训练流程，HunyuanLLM 提供了一个预构建的 Docker 
镜像：\n\n[hunyuaninfer\u002Fhunyuan-large](https:\u002F\u002Fhub.docker.com\u002Frepository\u002Fdocker\u002Fhunyuaninfer\u002Fhunyuan-large\u002Fgeneral)。\n\n### 硬件要求\n\n已在 H20 上进行测试，在未启用 `make_moe_param_leaf_module` 并使用 `zero3+offload` 的情况下，当 `max_seq_length` 设置为 2048 时，全量微调至少需要 32 张 GPU 卡，而 LoRA 微调则至少需要 8 张 GPU 卡。\n\n### 训练性能\n\n在最低配置下（8 张 GPU 卡用于 LoRA 微调），将 `per_device_train_batch_size` 设置为 1，同时将 `gradient_accumulation_steps` 设置为 1，每轮迭代大约需要 35 秒。\n\n### 启动方法\n\n请参考：[HuggingFace Transformers Trainer](https:\u002F\u002Fhuggingface.co\u002Fdocs\u002Ftransformers\u002Fv4.19.2\u002Fen\u002Fmain_classes\u002Ftrainer)\n\n#### 单机训练\n\n在 `train` 目录下，执行以下命令：\n\n```sh\npip install -r requirements.txt\nbash train.sh\n```\n\n#### 多机训练\n\n要在多台机器上启动训练，请按照以下步骤操作，并确保所有机器位于同一个集群内。\n\n##### 配置机器间的无密码 SSH 登录\n\n以下步骤以两台机器为例，其 IP 地址分别表示为 `${ip1}` 和 `${ip2}`。这些操作均在 Docker 容器中进行。\n\n首先，在每台机器上的容器之间配置无密码 SSH 连接。\n\n\n```sh\nssh-keygen\t\t\t# 生成用于无密码登录的 id_rsa 和 id_rsa.pub\nssh-keygen -t rsa -A    # 生成 \u002Fetc\u002Fssh\u002Fssh_host_rsa_key 和 ssh_host_ecdsa_key，以便后续启动 'SSH listen'\n\u002Fusr\u002Fsbin\u002Fsshd -p 36005 -o ListenAddress=0.0.0.0        # 启动 SSH 监听\necho \"Port 36005\" > ~\u002F.ssh\u002Fconfig   # 将 SSH 连接端口更改为 36005\npasswd root    # 设置 root 密码，以避免监控平台发出警报\n```\n\n\n注意：这里的 `36005` 是一个示例。您可以选择任意端口，但务必确保该端口**已开放**且**未被其他进程占用**。\n\n接下来，在每台机器的容器内执行：\n\n```sh\ncat ~\u002F.ssh\u002Fid_rsa.pub\n```\n\n**复制输出的 SSH 公钥，并将其粘贴到 `~\u002F.ssh\u002Fauthorized_keys` 文件中，每行放置一个公钥。这一步骤必须在每一台机器上完成。** 最终，每台机器上的 `~\u002F.ssh\u002Fauthorized_keys` 文件应完全相同，包含所有机器的公钥。\n\n需要注意的是，在多节点训练过程中，每个节点上运行的代码必须保持一致。建议挂载共享网络驱动器。如果无法挂载共享驱动器，则需要手动将数据集、脚本和代码复制到所有机器的同一目录下。\n\n##### 开始多机训练\n\n当准备工作完成后，并确认依赖项已安装（若未安装，请执行 `pip install -r requirements.txt` 进行安装）时，可以在 `train.sh` 文件的开头添加以下配置：\n\n```shell\nexport HOST_GPU_NUM=8\n# 当前机器 IP\nexport LOCAL_IP=${ip1}\n# 多节点机器 IP 列表，用逗号分隔\nexport NODE_IP_LIST=\"${ip1}:8,${ip2}:8\"\n\n# 机器节点数量\nexport NODES=2\nexport NODE_NUM=$((${NODES} * ${HOST_GPU_NUM}))\n```\n\n注意：请将 
`${ip1}` 和 `${ip2}` 替换为实际的 IP 地址！\n\n然后，在 `${ip1}` 所在的机器上，进入 `train\u002F` 目录并执行 `bash train.sh`。请注意，首次运行时，您可能会看到如下提示：\n\n```ssh\n无法确认主机 '[ip]:36005 ([ip]:36005)' 的真实性。\nECDSA 密钥指纹是 xxxxxx。\nECDSA 密钥指纹的 MD5 哈希值是 xxxxxx。\n您确定要继续连接吗（是\u002F否）？\n```\n\n此时，请输入 `yes` 继续。\n\n##### 关键参数\n\n脚本中的关键参数如下：\n\n- `--deepspeed`：该参数应指向 DeepSpeed 配置文件。`train` 文件夹提供了三个默认的 DeepSpeed 配置文件：`ds_zero2_no_offload.json`、`ds_zero3_no_offload.json`、`ds_zero3_offload.json`。所需显存依次减少。\n- `--model_name_or_path`：HF 预训练模型的路径。请确保该路径包含 `modeling_hunyuan.py` 和 `configuration_hunyuan.py` 文件，否则无法加载。\n- `--tokenizer_name_or_path`：分词器文件夹的路径。请确保该路径包含 `tokenization_hy.py` 文件，否则无法加载。\n- `--train_data_file`：训练数据文件的路径，应为 JSONL 格式。\n- `--output_dir`：输出目录，用于存储日志、TensorBoard 文件和模型权重。\n- `--per_device_train_batch_size`：每个 GPU 的批量大小。\n- `--gradient_accumulation_steps`：梯度累积步数。全局批量大小为 `per_device_train_batch_size * gradient_accumulation_steps * dp_size`。\n- `--max_steps`：总训练步数。\n- `--save_steps`：保存检查点的间隔步数。\n- `--use_lora`：是否使用 LoRA 进行训练。此参数还接受 `--lora_rank`、`--lora_alpha` 和 `--lora_dropout` 参数。LoRA 默认应用于 `'q_proj'`、`'k_proj'`、`'v_proj'`、`'o_proj'` 参数。如需更改，请在代码中修改。注意：**使用 LoRA 训练时，仅保存 LoRA 权重，不保存基础模型权重**。若需合并 LoRA 权重，请参阅下方的“LoRA 权重合并”部分。\n- `--make_moe_param_leaf_module`：在使用 zero3 和 MoE 训练时，将 MoE 模块视为叶模块，即其参数不会被 zero3 切分。此选项预计将显著增加显存占用。\n- `--gradient_checkpointing`：启用梯度检查点。\n- `--train_attention_params_only`：是否仅训练注意力参数。\n- `--learning_rate`：训练过程中的最大学习率。\n- `--min_lr`：训练过程中的最小学习率。\n- `--use_flash_attn`：开启 flash-attention 以加速训练。\n\n**注意：**\n\n- 如果您希望从先前保存的检查点继续训练，而不是加载预训练权重，请指定 `--resume_from_checkpoint` 并提供上一次训练的检查点路径。请勿同时指定 `--model_name_or_path`，因为那样只会加载权重，而不会恢复训练状态。\n- 从检查点继续训练时，由于某些非确定性算法引入的随机性，损失值可能会出现轻微偏差，这属于正常现象。详情请参阅：[HuggingFace Transformers Trainer Randomness](https:\u002F\u002Fhuggingface.co\u002Fdocs\u002Ftransformers\u002Fmain\u002Fen\u002Fperf_train_gpu_one#randomness)。\n- 当指定了 `--model_name_or_path` 时，所有与模型相关的参数将被忽略。\n- 同一批次内的样本会被填充至与该批次中最长样本一致的长度，每个样本的最大长度为 `max_seq_length`。超出部分将被截断。\n- 
如果遇到关于偏置权重未加载的警告，可以忽略，因为 Hunyuan-Large 中并未使用偏置项。\n\n\n#### 显存不足怎么办？\n\n请参考：[DeepSpeed 配置文档](https:\u002F\u002Fwww.deepspeed.ai\u002Fdocs\u002Fconfig-json\u002F)\n\n您可以尝试修改 DeepSpeed 配置文件，移除这些参数的 `auto` 属性，并降低其数值：\n\n- `stage3_param_persistence_threshold`\n- `stage3_prefetch_bucket_size`\n- `stage3_max_reuse_distance`\n- `stage3_max_reuse_distance`\n\n#### LoRA 模型的合并\n\n在启用 zero3 的情况下，由于模型权重被分割到不同的数据并行 Rank 上，因此无法在训练过程中直接将 LoRA 权重合并到零3模型中。如果需要将 LoRA 权重合并到基础模型中，可以在离线状态下完成合并操作，生成合并后的权重文件。执行 `merge_lora_weight.sh` 脚本即可将 LoRA 权重与基础模型权重合并。该脚本的参数包括：\n\n- `--base_model_path`：基础模型权重的目录。\n- `--adapter_model_path`：LoRA 权重的目录。\n- `--output_path`：保存合并后权重的目录。\n- `--save_dtype`：合并后权重的存储数据类型，可选格式有：fp16、bf16、fp32。\n\n&nbsp;\n\n## 推理与部署\n\nHunyuanLLM 使用 TRT-LLM 和 vLLM 进行部署。我们已开源 [vLLM-backend](https:\u002F\u002Fgithub.com\u002Fquinnrong94\u002Fvllm\u002Ftree\u002Fdev_hunyuan) 部署方案（详见“使用 vLLM 进行推理”），而 TRT-LLM 部署方案（详见“使用 TRT-LLM 进行推理”）将在不久的将来发布。\n\n## 使用 TRT-LLM 进行推理\n\n待开放\n\n## 使用 vLLM 进行推理\n\n### Docker：\n\n为简化部署流程，HunyuanLLM 提供了一个预构建的 Docker 镜像：\n\n[hunyuaninfer\u002Fhunyuan-large](https:\u002F\u002Fhub.docker.com\u002Frepository\u002Fdocker\u002Fhunyuaninfer\u002Fhunyuan-large\u002Fgeneral)。您只需下载模型文件，并使用以下命令启动 Docker 容器，即可开始模型推理。\n\n```shell\ndocker run --name hunyuanLLM_infer -itd --privileged --user root --net=host --ipc=host --gpus=8 hunyuaninfer\u002Fhunyuan-large:infer-open-source\n```\n\n注意：Docker 容器权限管理。上述命令以特权模式（`--privileged`）启动容器，这会赋予容器更高的权限，从而增加数据泄露和集群安全风险。建议尽量避免使用特权模式，以降低安全风险。对于必须使用特权模式的场景，请进行全面的安全评估，并采取适当的安全监控和加固措施。\n\n### 配置多机之间的无密码 SSH 登录\n\n以下步骤以两台机器为例，其 IP 分别表示为 `${ip1}` 和 `${ip2}`。这些操作均在 Docker 容器内进行。\n\n首先，在两台机器上分别运行 `passwd` 设置密码，例如：`Tmp123,.\u002F`。\n\n将 `inference\u002Flogin_ssh.py` 复制到容器中，并执行以下命令，确保正确输入 IP 和密码。\n\n```shell\npython3 login_ssh.py --ips ${ip1},${ip2} --port 36000 --password=Tmp123,.\u002F\n```\n\n**注意 📢：在开始之前，请务必使用 vLLM 的调试脚本验证多机通信是否正常：https:\u002F\u002Fdocs.vllm.ai\u002Fen\u002Flatest\u002Fgetting_started\u002Fdebugging.html**\n\n### BF16 
部署\n\nBF16 部署需要 16 块 H20 GPU。在确认多机通信正常后，执行以下步骤：\n\n在运行命令之前，请设置以下环境变量：\n\n```shell\n${LOCAL_IP}：当前机器上 bond1 对应的 IP 地址\n${MODEL_PATH}：Hunyuan LLM 模型路径\n```\n\n#### 第一步：启动 Ray\n\nRay 是一个用于并行和分布式 Python 的开源库。在本节中，我们使用 Ray 实现多机通信。\n\nRay 组件配置加固：Ray 组件的默认配置未启用服务端口（例如 6379、8265）的身份验证机制，存在未授权访问和命令执行的风险。建议仅在受信任的内部网络环境中部署 Ray 组件，或确保对这些端口实施严格的访问控制列表（ACL）策略，以防止未经授权的网络访问。\n\n首先，在每个节点上启动 Ray（可以在后台运行，也可以保持终端打开）：\n\n在主节点上：\n```shell\nexport VLLM_HOST_IP=${LOCAL_IP}\nexport NCCL_SOCKET_IFNAME=bond1\nexport GLOO_SOCKET_IFNAME=bond1\nray start --block --head --node-ip-address=${LOCAL_IP} --port=6379\n```\n\n在所有工作节点上：\n\n注意：将 `{HEAD NODE $LOCAL_IP}` 替换为主节点的实际 ${LOCAL_IP}。\n```shell\nexport VLLM_HOST_IP=${LOCAL_IP}\nexport NCCL_SOCKET_IFNAME=bond1\nexport GLOO_SOCKET_IFNAME=bond1\nray start --block --address={HEAD NODE $LOCAL_IP}:6379 --node-ip-address=${LOCAL_IP}\n```\n\n如果 Ray 启动失败，请先执行 `ray stop`，然后再重新运行上述命令。\n\n#### 第二步：执行推理\n\n#### 方法一：命令行推理\n\n以下代码片段展示了如何使用 `vLLM` 快速请求聊天模型：\n\n注意：vLLM 组件远程代码执行防护。在下面的代码中，如果启用了 vLLM 组件的 `trust-remote-code` 配置选项，将允许从远程模型仓库加载并执行代码，这可能导致恶意代码被执行。除非业务明确要求，建议保持该配置选项关闭，以降低潜在的安全威胁。\n\n```python\nimport os\nfrom vllm import LLM, SamplingParams\n\nmodel_path=os.environ.get('MODEL_PATH')\n\nllm = LLM(model=model_path,\n        tokenizer=model_path,\n        trust_remote_code=True,\n        max_model_len=10240,\n        dtype='bfloat16',\n        tensor_parallel_size=16,\n        pipeline_parallel_size=1,\n        disable_log_stats=False,\n        gpu_memory_utilization=0.98,\n        disable_custom_all_reduce=True,\n        #distributed_executor_backend='ray',\n        enforce_eager=True,\n        max_num_seqs=8,\n        use_v2_block_manager=True,\n        quantization=None)\n\nprompts = [\"海水为什么是咸的\"]\n\nsampling_params = SamplingParams(\n    temperature=0.7, top_p=0.6, max_tokens=200, top_k=20, repetition_penalty=1.05)\n\noutputs = llm.generate(prompts, sampling_params)\n\n# 打印输出结果。\nfor output in outputs:\n    prompt = output.prompt\n    generated_text = 
output.outputs[0].text\n    print(f\"Prompt: {prompt!r}, Generated text: {generated_text!r}\")\n```\n\n#### 方法二：基于服务的推理\n\n下面我们演示如何使用 `vLLM` 以服务方式部署模型并进行请求。\n\n在主节点上运行以下命令：\n\n```shell\nexport VLLM_HOST_IP=${LOCAL_IP}\nexport NCCL_SOCKET_IFNAME=bond1\nexport GLOO_SOCKET_IFNAME=bond1\n```\n\n然后通过运行以下命令启动服务：\n\n```shell\ncd inference\nsh run_server.sh\n```\n\n*提示*：故障排除，如果遇到以下错误：\n\n```python\nray.exceptions.RaySystemError: System error: No module named 'transformers_modules' traceback: Traceback (most recent call last):\nModuleNotFoundError: No module named 'transformers_modules'\n```\n\n请将主节点上的 `~\u002F.cache\u002Fhuggingface\u002Fmodules\u002F` 目录复制到所有工作节点的相应路径。\n\n成功运行 `run_server.sh` 后，执行请求脚本：\n\n```shell\nsh openapi.sh\n```\n\n务必修改 `openapi.sh` 中的 `${LOCAL_IP}` 和 `${MODEL_PATH}`，使其与对应服务的值一致。\n\n### 量化模型部署：\n\n本节介绍使用 vLLM 部署量化模型的过程。\n\n图片：部署镜像与 BF16 相同。\n\n#### Int8 量化模型部署：\n\n要部署 Hunyuan-L 模型的 Int8 权重专用版本，只需在 `run_server_int8.sh` 中设置以下环境变量：\n\n```shell\n${MODEL_PATH}：BF16 模型路径\n${LOCAL_IP}：当前机器上 bond1 对应的 IP 地址\n```\n\n然后通过运行以下命令启动 Int8 服务：\n\n```shell\nsh run_server_int8.sh\n```\n\n成功运行 `run_server_int8.sh` 后，执行请求脚本：\n\n```shell\nsh openapi.sh\n```\n\n#### FP8 量化模型部署：\n\n要部署 Hunyuan-L 模型的 W8A8C8 版本，只需在 `run_server_fp8.sh` 中设置以下环境变量：\n\n```shell\n${MODEL_PATH}：FP8 模型路径\n${LOCAL_IP}：当前机器上 bond1 对应的 IP 地址\n```\n\n然后通过运行以下命令启动 FP8 服务：\n\n```shell\nsh run_server_fp8.sh\n```\n\n成功运行 `run_server_fp8.sh` 后，执行请求脚本：\n\n```shell\nsh openapi.sh\n```\n\n#### FP8 基准测试\n\n本部分介绍 Hunyuan Large Instruct FP8 量化模型的基准测试结果。\n\n| 数据集 | BF16 | W8A8C8-FP8 |\n|--------|------|------------|\n| ARC-C  | 94.6 | 94.2       |\n| C-Eval | 88.6 | 89.2       |\n| CMMLU  | 90.4 | 89.8       |\n| MMLU   | 89.9 | 88.9       |\n\n### 推理性能\n\n本节展示了使用 vLLM 部署各种模型（原始模型及量化模型）的效率测试结果，包括不同批处理大小下的推理速度（tokens\u002Fs）。\n\n| 推理框架 | 模型                                                                                                  | GPU 数量（H20） | input_length | batch=1 | batch=4 
|\n|----------|------------------------------------------------------------------------------------------------------ | ---------------- | ------------ |---------|---------|\n| vLLM     | Hunyuan-Large                                                                                              | 16               | 2048         | 20.2    | 75.5    |\n| vLLM     | Hunyuan-Large（仅权重为 Int8）                                                                            | 8                | 2048         | 19.3    | 73.6    |\n| vLLM     | Hunyuan-Large（W8A8C8-FP8）                                                                                  | 8                | 2048         | 19.8    | 74.9    |\n\n## 分词器\n\nHunYuan-Large 模型所使用的分词器在压缩率和效果之间取得了平衡，确保嵌入向量能够得到充分训练。该分词器的词汇表包含 10 万个来自 tiktoken 的词元，并在此基础上利用大量高质量的中文训练数据额外训练了 2.9 万个中文词元，以增强模型的中文能力及分词器的压缩率。综合来看，我们的新分词器相比 LLaMA3 的分词器，压缩率从每词元 2.78 个字符提升至 3.13 个字符。\n\n## Hunyuan API\n\n您可以在腾讯云上体验我们的 Hunyuan-Large 模型。详情请访问：https:\u002F\u002Fcloud.tencent.com\u002Fdocument\u002Fproduct\u002F1729\u002F97730。\n\n## 交互式 Demo 网页\n\nHunyuan-Large 的网页版 Demo 已正式开放。请访问 https:\u002F\u002Fhuggingface.co\u002Fspaces\u002Ftencent\u002FHunyuan-Large，轻松体验我们的模型。\n\n## 在 TI 平台上进行训练与推理\n\n腾讯云的 [TI 平台](https:\u002F\u002Fcloud.tencent.com\u002Fproduct\u002Fti) 是专为 AI 工程师打造的综合性机器学习平台。Hunyuan-Large 模型已集成其中，您只需简单几步即可完成训练与部署。访问 [与 Hunyuan-Large 对话](https:\u002F\u002Fconsole.cloud.tencent.com\u002Ftione\u002Fv2\u002Faimarket\u002Fdetail\u002Fhunyuan_series?PublicAlgoGroupId=hunyuan-large-chat&detailTab=demo)，实时体验与模型的对话；同时，您还可以参阅 [Hunyuan-Large 在 TI 上的最佳实践](https:\u002F\u002Fcloud.tencent.com\u002Fdocument\u002Fproduct\u002F851\u002F112032)，创建属于您自己的定制化 Hunyuan-Large 模型。\n\n\n## 引用\n\n如果您觉得我们的工作对您有所帮助，请随时引用我们。\n\n```\n@misc{sun2024hunyuanlargeopensourcemoemodel,\n      title={Hunyuan-Large: 腾讯开源的拥有 520 亿激活参数的 MoE 模型}, \n      
author={孙兴武、陈彦峰、黄一青、谢若冰、朱嘉琪、张凯、李帅鹏、杨震、韩约翰、舒晓波、卜家豪、陈中志、黄雪萌、连奉宗、杨赛勇、严建峰、曾宇远、任小琴、余超、吴露露、毛悦、杨涛、郑孙聪、吴侃、焦典、薛金宝、张希鹏、吴德成、刘凯、吴登鹏、徐广辉、陈绍华、陈爽、冯晓、洪义庚、郑俊强、许成成、李宗伟、匡雄、胡江陆、陈怡琪、邓雨驰、李贵阳、刘傲、张晨晨、胡世辉、赵子龙、吴子凡、丁耀、王伟超、刘汉、罗伯茨·王、费浩、佘佩杰、赵泽、曹勋、王海、项福生、黄梦源、熊志远、胡斌、侯学斌、姜磊、吴佳佳、邓亚平、沈毅、王倩、刘伟杰、刘杰、陈猛、董亮、贾卫文、陈虎、刘飞飞、袁睿、徐慧琳、闫振祥、曹腾飞、胡志超、冯新华、杜东、佘廷昊、陶阳宇、张峰、朱建臣、许成忠、李西瑞、查重、欧阳文、夏寅本、李翔、何泽坤、陈荣鹏、宋嘉伟、陈瑞彬、蒋凡、赵重庆、王博、龚浩、甘荣、胡温斯顿、康展辉、杨勇、刘玉红、王迪、江杰},\n      year={2024},\n      eprint={2411.02265},\n      archivePrefix={arXiv},\n      primaryClass={cs.CL},\n      url={https:\u002F\u002Farxiv.org\u002Fabs\u002F2411.02265}, \n}\n```\n\u003Cbr>\n\n## 联系我们\n\n如果您想向我们的研发和产品团队留言，欢迎联系我们的开源团队。您也可以通过电子邮件（hunyuan_opensource@tencent.com）与我们取得联系。","# Tencent-Hunyuan-Large 快速上手指南\n\n腾讯混元大模型（Hunyuan-Large）是目前业界最大的开源 Transformer 架构 MoE（混合专家）模型，拥有 3890 亿总参数和 520 亿激活参数。本指南将帮助您快速在本地部署和使用该模型。\n\n## 1. 环境准备\n\n### 系统要求\n*   **操作系统**: Linux (推荐 Ubuntu 20.04+)\n*   **GPU**: 推荐使用 NVIDIA A100\u002FH100 或多卡环境。\n    *   运行 FP16\u002FBF16 版本需较大显存。\n    *   运行 **FP8** 量化版本可显著降低显存需求（相比 FP16 节省约 50% 显存），推荐优先使用。\n*   **CUDA**: 11.8 或更高版本\n*   **Python**: 3.10+\n\n### 前置依赖\n确保已安装 `git`、`python3-pip` 和 `wget` (用于下载模型)。\n\n```bash\nsudo apt-get update\nsudo apt-get install -y git python3-pip wget\n```\n\n## 2. 
安装步骤\n\n### 2.1 克隆推理框架\nHunyuan-Large 官方优化了 **vLLM** 后端以支持其特有的 MoE 结构和长上下文特性。请克隆专用的 vLLM 分支：\n\n```bash\ngit clone -b dev_hunyuan https:\u002F\u002Fgithub.com\u002Fquinnrong94\u002Fvllm.git\ncd vllm\npip install -e .\n```\n\n> **注意**：请勿直接使用 PyPI 上的标准 vLLM 版本，必须使用上述针对混元优化的分支以获得最佳性能（如 CLA 结构支持和 FP8 加速）。\n\n### 2.2 下载模型\n推荐通过国内镜像源或腾讯云高速下载。以下以 **Hunyuan-A52B-Instruct-FP8**（指令微调版，量化版，推荐首选）为例：\n\n**方案 A：使用腾讯云 CDN 下载（国内速度最快）**\n```bash\nmkdir -p ~\u002Fmodels\u002FHunyuan-A52B-Instruct-FP8\ncd ~\u002Fmodels\u002FHunyuan-A52B-Instruct-FP8\nwget https:\u002F\u002Fcdn-large-model.hunyuan.tencent.com\u002FHunyuan-A52B-Instruct-128k-fp8-20241116.zip\nunzip Hunyuan-A52B-Instruct-128k-fp8-20241116.zip\n# 解压后请确认模型文件位于当前目录\n```\n\n**方案 B：使用 Hugging Face (需网络代理)**\n```bash\n# 确保已安装 huggingface-cli\npip install huggingface_hub\nhuggingface-cli download tencent\u002FTencent-Hunyuan-Large --local-dir ~\u002Fmodels\u002FHunyuan-A52B-Instruct-FP8 --include \"Hunyuan-A52B-Instruct-FP8\u002F*\"\n```\n\n## 3. 基本使用\n\n### 3.1 启动 API 服务\n使用优化后的 vLLM 启动服务。以下命令假设您使用的是 FP8 版本，并开启了多卡并行（如需单卡请调整 `--tensor-parallel-size`）。\n\n```bash\n# 替换 \u003CMODEL_PATH> 为您实际的模型路径，例如 \u002Froot\u002Fmodels\u002FHunyuan-A52B-Instruct-FP8\nexport MODEL_PATH=\u002Froot\u002Fmodels\u002FHunyuan-A52B-Instruct-FP8\n\npython -m vllm.entrypoints.api_server \\\n    --model $MODEL_PATH \\\n    --trust-remote-code \\\n    --tensor-parallel-size 4 \\\n    --max-model-len 128000 \\\n    --port 8000\n```\n\n*   `--tensor-parallel-size`: 根据可用 GPU 数量调整（FP8 版本建议至少 4 卡，具体视显存而定）。\n*   `--max-model-len`: 混元指令版支持 128K 上下文，预训练版支持 256K。\n\n### 3.2 发送请求测试\n服务启动后，使用 `curl` 或 Python 脚本进行测试。\n\n**使用 curl 测试：**\n```bash\ncurl http:\u002F\u002Flocalhost:8000\u002Fgenerate \\\n    -d '{\n        \"prompt\": \"你好，请介绍一下腾讯混元大模型。\",\n        \"max_tokens\": 512,\n        \"temperature\": 0.7\n    }'\n```\n\n**使用 Python 客户端测试：**\n```python\nfrom openai import OpenAI\n\n# 指向本地启动的 vLLM 服务\nclient = OpenAI(\n    
base_url=\"http:\u002F\u002Flocalhost:8000\u002Fv1\",\n    api_key=\"EMPTY\" \n)\n\nresponse = client.chat.completions.create(\n    model=\"tencent-hunyuan-large\",\n    messages=[\n        {\"role\": \"user\", \"content\": \"你好，请介绍一下腾讯混元大模型。\"}\n    ],\n    max_tokens=512,\n    temperature=0.7\n)\n\nprint(response.choices[0].message.content)\n```\n\n### 3.3 直接推理脚本 (可选)\n如果不想启动服务器，可直接使用 vLLM 的离线推理功能：\n\n```bash\npython -m vllm.entrypoints.llm_api \\\n    --model $MODEL_PATH \\\n    --trust-remote-code \\\n    --prompt \"你好，请介绍一下腾讯混元大模型。\" \\\n    --max_tokens 512\n```","某大型金融科技公司正在构建新一代智能投研系统，需要处理海量长篇财报、研报及实时新闻，以生成深度市场分析摘要。\n\n### 没有 Tencent-Hunyuan-Large 时\n- **长文档理解能力不足**：现有模型上下文窗口有限，无法一次性读完百页级的完整财报，导致关键信息遗漏或分析碎片化。\n- **推理成本高昂**：为了覆盖复杂逻辑，团队被迫调用多个专用小模型串联，不仅维护成本高，且整体响应延迟严重。\n- **专业深度欠缺**：通用模型在金融术语理解和复杂因果推导上表现平平，生成的报告常出现事实性幻觉，需人工大量复核。\n- **资源利用率低**：传统稠密模型在处理简单查询时也全量激活参数，造成算力浪费，难以支撑高并发场景。\n\n### 使用 Tencent-Hunyuan-Large 后\n- **超长上下文精准掌控**：依托其 256k 的超长上下文窗口，Tencent-Hunyuan-Large 能一次性“吃透”整本财报，确保摘要逻辑连贯、数据准确无误。\n- **MoE 架构降本增效**：作为拥有 3890 亿总参数但仅激活 520 亿的混合专家（MoE）模型，Tencent-Hunyuan-Large 在保持顶尖性能的同时，大幅降低了单次推理的算力消耗。\n- **行业洞察更深刻**：得益于超大规模参数的知识储备，Tencent-Hunyuan-Large 能精准识别金融隐喻与复杂市场关联，输出的分析结论具备专家级深度。\n- **动态路由提升速度**：针对不同难度的任务，Tencent-Hunyuan-Large 自动调度相应专家网络，显著提升了系统在高并发下的响应速度与稳定性。\n\nTencent-Hunyuan-Large 通过业界最大的开源 MoE 架构，完美平衡了超长文本处理能力与推理成本，让企业级深度智能分析变得触手可及。","https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002FTencent-Hunyuan_Tencent-Hunyuan-Large_a584060b.png","Tencent-Hunyuan","https:\u002F\u002Foss.gittoolsai.com\u002Favatars\u002FTencent-Hunyuan_c6e5ecd4.png","","https:\u002F\u002Fhunyuan.tencent.com\u002F","https:\u002F\u002Fgithub.com\u002FTencent-Hunyuan",[82,86],{"name":83,"color":84,"percentage":85},"Python","#3572A5",96.9,{"name":87,"color":88,"percentage":89},"Shell","#89e051",3.1,1593,122,"2026-04-12T20:03:07","NOASSERTION",5,"未说明","必需 NVIDIA GPU。支持 FP8 量化（显存占用比 FP16\u002FBF16 减少 50%）和全精度运行。推荐使用支持 TensorRT-LLM 或 vLLM 后端的显卡以利用 CLA 结构优化（KV Cache 显存节省 
50%）。具体显存大小取决于模型版本（FP8 或 FP16）及上下文长度（最高 256K），未给出具体最低显存数值，但鉴于模型规模（389B 总参数\u002F52B 激活参数），需多卡或多节点环境。",{"notes":98,"python":95,"dependencies":99},"该模型为业界最大的开源 MoE 模型（总参数 3890 亿，激活参数 520 亿）。提供 FP8 量化版本以大幅降低显存需求并提升吞吐量。推理后端支持定制的 vLLM（已开源）和 TensorRT-LLM（即将开源），其中 TRT-LLM 性能优于 vLLM 30% 以上。训练兼容 Hugging Face 格式，可使用 hf-deepspeed 框架进行微调。支持超长上下文（预训练 256K，指令版 128K）。",[100,101,102,103,104],"vLLM (定制分支)","TensorRT-LLM (计划开源)","Hugging Face Transformers","DeepSpeed (hf-deepspeed)","Flash Attention",[15],"2026-03-27T02:49:30.150509","2026-04-15T07:11:42.508819",[109,114,119,124,129,133],{"id":110,"question_zh":111,"answer_zh":112,"source_url":113},33992,"后续会发布 int8 量化版本吗？如何在 8 卡 A100 上进行推理？","int8 量化版本已经发布。您可以使用 int8 进行推理，具体部署方式请参考官方文档中的 [int8-quantized-model-deployment](https:\u002F\u002Fgithub.com\u002FTencent\u002FTencent-Hunyuan-Large?tab=readme-ov-file#int8-quantized-model-deployment) 章节。请注意，使用的模型应为 [Hunyuan-A52B-Instruct](https:\u002F\u002Fhuggingface.co\u002Ftencent\u002FTencent-Hunyuan-Large\u002Ftree\u002Fmain\u002FHunyuan-A52B-Instruct)。","https:\u002F\u002Fgithub.com\u002FTencent-Hunyuan\u002FTencent-Hunyuan-Large\u002Fissues\u002F12",{"id":115,"question_zh":116,"answer_zh":117,"source_url":118},33993,"FP8 模型是否支持单机 8 卡部署？","是的，FP8 模型支持单机 8 卡部署。这在 README 的部署资源说明中已有提及，无需像 BF16 那样需要 2 机 16 卡。","https:\u002F\u002Fgithub.com\u002FTencent-Hunyuan\u002FTencent-Hunyuan-Large\u002Fissues\u002F2",{"id":120,"question_zh":121,"answer_zh":122,"source_url":123},33994,"Tencent-Hunyuan-Large 模型在 OCR Benchmark 中的得分是多少？","Hunyuan-Large 目前是一个文生文（文本生成）模型，不支持多模态任务，因此没有 OCR Benchmark 的得分。团队计划在未来陆续开源多模态模型，届时可能会包含相关指标。","https:\u002F\u002Fgithub.com\u002FTencent-Hunyuan\u002FTencent-Hunyuan-Large\u002Fissues\u002F5",{"id":125,"question_zh":126,"answer_zh":127,"source_url":128},33995,"运行该模型推理最少需要多少张显卡？8 张 A100 (80G) 够用吗？","对于全精度（如 BF16）模型，8 张 A100 (80G) 通常是不够的，官方推荐配置为 2 机 16 卡。如果资源有限，建议使用 FP8 版本（可单机 8 卡部署）或已发布的 int8 量化版本（配合 Hunyuan-A52B-Instruct 模型），这两种方案可以在 8 卡 A100 
环境下运行。","https:\u002F\u002Fgithub.com\u002FTencent-Hunyuan\u002FTencent-Hunyuan-Large\u002Fissues\u002F11",{"id":130,"question_zh":131,"answer_zh":132,"source_url":113},33996,"8 卡 A100 (80G) 运行 Int8 模型出现 OOM（显存溢出）怎么办？","请确保您下载并使用的是正确的 Int8 模型版本，即 [Hunyuan-A52B-Instruct](https:\u002F\u002Fhuggingface.co\u002Ftencent\u002FTencent-Hunyuan-Large\u002Ftree\u002Fmain\u002FHunyuan-A52B-Instruct)，并严格按照官方文档 [int8-quantized-model-deployment](https:\u002F\u002Fgithub.com\u002FTencent\u002FTencent-Hunyuan-Large?tab=readme-ov-file#int8-quantized-model-deployment) 中的步骤进行部署。如果仍然报错，请检查 vLLM 版本及启动参数配置是否与文档一致。",{"id":134,"question_zh":135,"answer_zh":136,"source_url":123},33997,"Hunyuan-Large 是多模态模型吗？支持图片识别吗？","不是。目前的 Hunyuan-Large 是纯文本生成模型（文生文），不具备图像识别或多模态处理能力。团队表示多模态模型会在完善后于近期逐步开源，请关注后续更新。",[]]