[{"data":1,"prerenderedAt":-1},["ShallowReactive",2],{"similar-NVIDIA--Megatron-LM":3,"tool-NVIDIA--Megatron-LM":64},[4,17,25,39,48,56],{"id":5,"name":6,"github_repo":7,"description_zh":8,"stars":9,"difficulty_score":10,"last_commit_at":11,"category_tags":12,"status":16},1381,"everything-claude-code","affaan-m\u002Feverything-claude-code","everything-claude-code 是一套专为 AI 编程助手（如 Claude Code、Codex、Cursor 等）打造的高性能优化系统。它不仅仅是一组配置文件，而是一个经过长期实战打磨的完整框架，旨在解决 AI 代理在实际开发中面临的效率低下、记忆丢失、安全隐患及缺乏持续学习能力等核心痛点。\n\n通过引入技能模块化、直觉增强、记忆持久化机制以及内置的安全扫描功能，everything-claude-code 能显著提升 AI 在复杂任务中的表现，帮助开发者构建更稳定、更智能的生产级 AI 代理。其独特的“研究优先”开发理念和针对 Token 消耗的优化策略，使得模型响应更快、成本更低，同时有效防御潜在的攻击向量。\n\n这套工具特别适合软件开发者、AI 研究人员以及希望深度定制 AI 工作流的技术团队使用。无论您是在构建大型代码库，还是需要 AI 协助进行安全审计与自动化测试，everything-claude-code 都能提供强大的底层支持。作为一个曾荣获 Anthropic 黑客大奖的开源项目，它融合了多语言支持与丰富的实战钩子（hooks），让 AI 真正成长为懂上",138956,2,"2026-04-05T11:33:21",[13,14,15],"开发框架","Agent","语言模型","ready",{"id":18,"name":19,"github_repo":20,"description_zh":21,"stars":22,"difficulty_score":10,"last_commit_at":23,"category_tags":24,"status":16},3704,"NextChat","ChatGPTNextWeb\u002FNextChat","NextChat 是一款轻量且极速的 AI 助手，旨在为用户提供流畅、跨平台的大模型交互体验。它完美解决了用户在多设备间切换时难以保持对话连续性，以及面对众多 AI 模型不知如何统一管理的痛点。无论是日常办公、学习辅助还是创意激发，NextChat 都能让用户随时随地通过网页、iOS、Android、Windows、MacOS 或 Linux 端无缝接入智能服务。\n\n这款工具非常适合普通用户、学生、职场人士以及需要私有化部署的企业团队使用。对于开发者而言，它也提供了便捷的自托管方案，支持一键部署到 Vercel 或 Zeabur 等平台。\n\nNextChat 的核心亮点在于其广泛的模型兼容性，原生支持 Claude、DeepSeek、GPT-4 及 Gemini Pro 等主流大模型，让用户在一个界面即可自由切换不同 AI 能力。此外，它还率先支持 MCP（Model Context Protocol）协议，增强了上下文处理能力。针对企业用户，NextChat 提供专业版解决方案，具备品牌定制、细粒度权限控制、内部知识库整合及安全审计等功能，满足公司对数据隐私和个性化管理的高标准要求。",87618,"2026-04-05T07:20:52",[13,15],{"id":26,"name":27,"github_repo":28,"description_zh":29,"stars":30,"difficulty_score":10,"last_commit_at":31,"category_tags":32,"status":16},2268,"ML-For-Beginners","microsoft\u002FML-For-Beginners","ML-For-Beginners 是由微软推出的一套系统化机器学习入门课程，旨在帮助零基础用户轻松掌握经典机器学习知识。这套课程将学习路径规划为 12 周，包含 26 节精炼课程和 52 道配套测验，内容涵盖从基础概念到实际应用的完整流程，有效解决了初学者面对庞大知识体系时无从下手、缺乏结构化指导的痛点。\n\n无论是希望转型的开发者、需要补充算法背景的研究人员，还是对人工智能充满好奇的普通爱好者，都能从中受益。课程不仅提供了清晰的理论讲解，还强调动手实践，让用户在循序渐进中建立扎实的技能基础。其独特的亮点在于强大的多语言支持，通过自动化机制提供了包括简体中文在内的 50 多种语言版本，极大地降低了全球不同背景用户的学习门槛。此外，项目采用开源协作模式，社区活跃且内容持续更新，确保学习者能获取前沿且准确的技术资讯。如果你正寻找一条清晰、友好且专业的机器学习入门之路，ML-For-Beginners 将是理想的起点。",84991,"2026-04-05T10:45:23",[33,34,35,36,14,37,15,13,38],"图像","数据工具","视频","插件","其他","音频",{"id":40,"name":41,"github_repo":42,"description_zh":43,"stars":44,"difficulty_score":45,"last_commit_at":46,"category_tags":47,"status":16},3128,"ragflow","infiniflow\u002Fragflow","RAGFlow 是一款领先的开源检索增强生成（RAG）引擎，旨在为大语言模型构建更精准、可靠的上下文层。它巧妙地将前沿的 RAG 技术与智能体（Agent）能力相结合，不仅支持从各类文档中高效提取知识，还能让模型基于这些知识进行逻辑推理和任务执行。\n\n在大模型应用中，幻觉问题和知识滞后是常见痛点。RAGFlow 通过深度解析复杂文档结构（如表格、图表及混合排版），显著提升了信息检索的准确度，从而有效减少模型“胡编乱造”的现象，确保回答既有据可依又具备时效性。其内置的智能体机制更进一步，使系统不仅能回答问题，还能自主规划步骤解决复杂问题。\n\n这款工具特别适合开发者、企业技术团队以及 AI 研究人员使用。无论是希望快速搭建私有知识库问答系统，还是致力于探索大模型在垂直领域落地的创新者，都能从中受益。RAGFlow 提供了可视化的工作流编排界面和灵活的 API 接口，既降低了非算法背景用户的上手门槛，也满足了专业开发者对系统深度定制的需求。作为基于 Apache 2.0 协议开源的项目，它正成为连接通用大模型与行业专有知识之间的重要桥梁。",77062,3,"2026-04-04T04:44:48",[14,33,13,15,37],{"id":49,"name":50,"github_repo":51,"description_zh":52,"stars":53,"difficulty_score":45,"last_commit_at":54,"category_tags":55,"status":16},519,"PaddleOCR","PaddlePaddle\u002FPaddleOCR","PaddleOCR 是一款基于百度飞桨框架开发的高性能开源光学字符识别工具包。它的核心能力是将图片、PDF 等文档中的文字提取出来，转换成计算机可读取的结构化数据，让机器真正“看懂”图文内容。\n\n面对海量纸质或电子文档，PaddleOCR 解决了人工录入效率低、数字化成本高的问题。尤其在人工智能领域，它扮演着连接图像与大型语言模型（LLM）的桥梁角色，能将视觉信息直接转化为文本输入，助力智能问答、文档分析等应用场景落地。\n\nPaddleOCR 适合开发者、算法研究人员以及有文档自动化需求的普通用户。其技术优势十分明显：不仅支持全球 100 多种语言的识别，还能在 
Windows、Linux、macOS 等多个系统上运行，并灵活适配 CPU、GPU、NPU 等各类硬件。作为一个轻量级且社区活跃的开源项目，PaddleOCR 既能满足快速集成的需求，也能支撑前沿的视觉语言研究，是处理文字识别任务的理想选择。",74913,"2026-04-05T10:44:17",[15,33,13,37],{"id":57,"name":58,"github_repo":59,"description_zh":60,"stars":61,"difficulty_score":45,"last_commit_at":62,"category_tags":63,"status":16},2181,"OpenHands","OpenHands\u002FOpenHands","OpenHands 是一个专注于 AI 驱动开发的开源平台，旨在让智能体（Agent）像人类开发者一样理解、编写和调试代码。它解决了传统编程中重复性劳动多、环境配置复杂以及人机协作效率低等痛点，通过自动化流程显著提升开发速度。\n\n无论是希望提升编码效率的软件工程师、探索智能体技术的研究人员，还是需要快速原型验证的技术团队，都能从中受益。OpenHands 提供了灵活多样的使用方式：既可以通过命令行（CLI）或本地图形界面在个人电脑上轻松上手，体验类似 Devin 的流畅交互；也能利用其强大的 Python SDK 自定义智能体逻辑，甚至在云端大规模部署上千个智能体并行工作。\n\n其核心技术亮点在于模块化的软件智能体 SDK，这不仅构成了平台的引擎，还支持高度可组合的开发模式。此外，OpenHands 在 SWE-bench 基准测试中取得了 77.6% 的优异成绩，证明了其解决真实世界软件工程问题的能力。平台还具备完善的企业级功能，支持与 Slack、Jira 等工具集成，并提供细粒度的权限管理，适合从个人开发者到大型企业的各类用户场景。",70612,"2026-04-05T11:12:22",[15,14,13,36],{"id":65,"github_repo":66,"name":67,"description_en":68,"description_zh":69,"ai_summary_zh":69,"readme_en":70,"readme_zh":71,"quickstart_zh":72,"use_case_zh":73,"hero_image_url":74,"owner_login":75,"owner_name":76,"owner_avatar_url":77,"owner_bio":78,"owner_company":79,"owner_location":79,"owner_email":79,"owner_twitter":79,"owner_website":80,"owner_url":81,"languages":82,"stars":102,"forks":103,"last_commit_at":104,"license":105,"difficulty_score":106,"env_os":107,"env_gpu":108,"env_ram":109,"env_deps":110,"category_tags":116,"github_topics":117,"view_count":10,"oss_zip_url":79,"oss_zip_packed_at":79,"status":16,"created_at":121,"updated_at":122,"faqs":123,"releases":153},2128,"NVIDIA\u002FMegatron-LM","Megatron-LM","Ongoing research training transformer models at scale","Megatron-LM 是 NVIDIA 推出的开源项目，专注于在大规模 GPU 集群上高效训练 Transformer 模型。它主要解决了大模型训练中遇到的显存受限、计算效率低以及分布式并行策略复杂等核心难题，让千亿甚至万亿参数模型的训练成为可能。\n\n该项目包含两个核心部分：Megatron-LM 提供了一套完整的参考实现和预配置脚本，非常适合研究团队快速上手、学习分布式训练原理或进行实验验证；而 Megatron Core 则是一个模块化的底层库，提供了经过 GPU 深度优化的构建块，适合框架开发者和机器学习工程师用来定制专属的训练流水线。\n\n其技术亮点在于支持多种先进的并行策略（包括张量并行、流水线并行、数据并行及上下文并行等），并全面适配混合精度训练（如 FP16、BF16 乃至最新的 FP8\u002FFP4）。此外，它还提供了与 Hugging Face 生态双向互通的模型转换工具，极大提升了工程落地的便利性。无论是希望探索前沿模型架构的研究人员，还是致力于构建高性能训练平台的工程师，Megatron-LM 都是值得信赖的强大助手。","\u003Cdiv align=\"center\">\n\nMegatron-LM and Megatron Core\n=============================\n\n\u003Ch4>GPU-optimized library for training transformer models at scale\u003C\u002Fh4>\n\n[![Documentation](https:\u002F\u002Fimg.shields.io\u002Fbadge\u002Fdocs-latest-brightgreen.svg?style=flat)](https:\u002F\u002Fdocs.nvidia.com\u002Fmegatron-core\u002Fdeveloper-guide\u002Flatest\u002Findex.html)\n[![version](https:\u002F\u002Fimg.shields.io\u002Fbadge\u002Frelease-0.15.0-green)](.\u002FCHANGELOG.md)\n[![license](https:\u002F\u002Fimg.shields.io\u002Fbadge\u002Flicense-Apache-blue)](.\u002FLICENSE)\n\n\u003Cdiv align=\"left\">\n\n## About\n\nThis repository contains two components: **Megatron-LM** and **Megatron Core**.\n\n**Megatron-LM** is a reference example that includes Megatron Core plus pre-configured training scripts. Best for research teams, learning distributed training, and quick experimentation.\n\n**Megatron Core** is a composable library with GPU-optimized building blocks for custom training frameworks. It provides transformer building blocks, advanced parallelism strategies (TP, PP, DP, EP, CP), mixed precision support (FP16, BF16, FP8, FP4), and model architectures. 
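A loose sketch of how these building blocks compose, following the quickstart in the docs (toy sizes; the exact signatures are assumptions to verify against the documentation):\n\n```python\nimport torch\nfrom megatron.core import parallel_state\nfrom megatron.core.models.gpt.gpt_model import GPTModel\nfrom megatron.core.models.gpt.gpt_layer_specs import get_gpt_layer_local_spec\nfrom megatron.core.transformer.transformer_config import TransformerConfig\n\n# Launch under torchrun so the process-group environment variables are set\ntorch.distributed.init_process_group(backend='nccl')\nparallel_state.initialize_model_parallel(tensor_model_parallel_size=1, pipeline_model_parallel_size=1)\n\n# Toy sizes; real runs scale num_layers, hidden_size and num_attention_heads up\nconfig = TransformerConfig(num_layers=2, hidden_size=128, num_attention_heads=4, use_cpu_initialization=True, pipeline_dtype=torch.float32)\nmodel = GPTModel(config=config, transformer_layer_spec=get_gpt_layer_local_spec(), vocab_size=1024, max_sequence_length=64)\n```\n\n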
Best for framework developers and ML engineers building custom training pipelines.\n\n**[Megatron Bridge](https:\u002F\u002Fgithub.com\u002FNVIDIA-NeMo\u002FMegatron-Bridge)** provides bidirectional Hugging Face ↔ Megatron checkpoint conversion with production-ready recipes.\n\n## Getting Started\n\n**Install from PyPI:**\n\n```bash\nuv pip install megatron-core\n```\n\n**Or clone and install from source:**\n\n```bash\ngit clone https:\u002F\u002Fgithub.com\u002FNVIDIA\u002FMegatron-LM.git\ncd Megatron-LM\nuv pip install -e .\n```\n\n> **Note:** Building from source can use a lot of memory. If the build runs out of memory, limit parallel compilation jobs by setting `MAX_JOBS` (e.g. `MAX_JOBS=4 uv pip install -e .`).\n\nFor NGC container setup and all installation options, see the **[Installation Guide](https:\u002F\u002Fdocs.nvidia.com\u002Fmegatron-core\u002Fdeveloper-guide\u002Flatest\u002Fget-started\u002Finstall.html)**.\n\n- **[Your First Training Run](https:\u002F\u002Fdocs.nvidia.com\u002Fmegatron-core\u002Fdeveloper-guide\u002Flatest\u002Fget-started\u002Fquickstart.html)** - End-to-end training examples with data preparation\n- **[Parallelism Strategies](https:\u002F\u002Fdocs.nvidia.com\u002Fmegatron-core\u002Fdeveloper-guide\u002Flatest\u002Fuser-guide\u002Fparallelism-guide.html)** - Scale training across GPUs with TP, PP, DP, EP, and CP\n- **[Contribution Guide](https:\u002F\u002Fdocs.nvidia.com\u002Fmegatron-core\u002Fdeveloper-guide\u002Flatest\u002Fdeveloper\u002Fcontribute.html)** - How to contribute to Megatron Core\n\n# Latest News\n\n- **[2026\u002F03]** **Deprecating Python 3.10 support:** We're officially dropping Python 3.10 support with the upcoming 0.17.0 release. Downstream applications must raise their lower boundary to 3.12 to stay compatible with MCore.\n- **[2026\u002F01]** **[Dynamic Context Parallelism](https:\u002F\u002Fdeveloper.nvidia.com\u002Fblog\u002Fspeeding-up-variable-length-training-with-dynamic-context-parallelism-and-nvidia-megatron-core\u002F)** - Up to 1.48x speedup for variable-length sequence training with adaptive CP sizing.\n- **[2025\u002F12]** **Megatron Core development has moved to GitHub!** All development and CI now happens in the open. 
We welcome community contributions.\n- **[2025\u002F10]** **[Megatron Dev Branch](https:\u002F\u002Fgithub.com\u002FNVIDIA\u002FMegatron-LM\u002Ftree\u002Fdev)** - early access branch with experimental features.\n- **[2025\u002F10]** **[Megatron Bridge](https:\u002F\u002Fgithub.com\u002FNVIDIA-NeMo\u002FMegatron-Bridge)** - Bidirectional converter for interoperability between Hugging Face and Megatron checkpoints, featuring production-ready recipes for popular models.\n- **[2025\u002F08]** **[MoE Q3-Q4 2025 Roadmap](https:\u002F\u002Fgithub.com\u002FNVIDIA\u002FMegatron-LM\u002Fissues\u002F1729)** - Comprehensive roadmap for MoE features including DeepSeek-V3, Qwen3, advanced parallelism strategies, FP8 optimizations, and Blackwell performance enhancements.\n- **[2025\u002F08]** **[GPT-OSS Model](https:\u002F\u002Fgithub.com\u002FNVIDIA\u002FMegatron-LM\u002Fissues\u002F1739)** - Advanced features including YaRN RoPE scaling, attention sinks, and custom activation functions are being integrated into Megatron Core.\n- **[2025\u002F06]** **[Megatron MoE Model Zoo](https:\u002F\u002Fgithub.com\u002Fyanring\u002FMegatron-MoE-ModelZoo)** - Best practices and optimized configurations for training DeepSeek-V3, Mixtral, and Qwen3 MoE models with performance benchmarking and checkpoint conversion tools.\n- **[2025\u002F05]** Megatron Core v0.11.0 brings new capabilities for multi-data center LLM training ([blog](https:\u002F\u002Fdeveloper.nvidia.com\u002Fblog\u002Fturbocharge-llm-training-across-long-haul-data-center-networks-with-nvidia-nemo-framework\u002F)).\n\n\u003Cdetails>\n\u003Csummary>Previous News\u003C\u002Fsummary>\n\n- **[2024\u002F07]** Megatron Core v0.7 improves scalability and training resiliency and adds support for multimodal training ([blog](https:\u002F\u002Fdeveloper.nvidia.com\u002Fblog\u002Ftrain-generative-ai-models-more-efficiently-with-new-nvidia-Megatron-Core-functionalities\u002F)).\n- **[2024\u002F06]** Megatron Core added support for Mamba-based models. Check out our paper [An Empirical Study of Mamba-based Language Models](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2406.07887) and [code example](https:\u002F\u002Fgithub.com\u002FNVIDIA\u002FMegatron-LM\u002Ftree\u002Fssm\u002Fexamples\u002Fmamba).\n- **[2024\u002F01 Announcement]** NVIDIA has released the core capabilities in **Megatron-LM** into [**Megatron Core**](https:\u002F\u002Fgithub.com\u002FNVIDIA\u002FMegatron-LM\u002Ftree\u002Fmain\u002Fmegatron\u002Fcore) in this repository. Megatron Core expands upon Megatron-LM's GPU-optimized techniques with more cutting-edge innovations on system-level optimizations, featuring composable and modular APIs.\n\n\u003C\u002Fdetails>\n\n# Project Structure\n\n```\nMegatron-LM\u002F\n├── megatron\u002F\n│   ├── core\u002F                    # Megatron Core (kernels, parallelism, building blocks)\n│   │   ├── models\u002F              # Transformer models\n│   │   ├── transformer\u002F         # Transformer building blocks\n│   │   ├── tensor_parallel\u002F     # Tensor parallelism\n│   │   ├── pipeline_parallel\u002F   # Pipeline parallelism\n│   │   ├── distributed\u002F         # Distributed training (FSDP, DDP)\n│   │   ├── optimizer\u002F           # Optimizers\n│   │   ├── datasets\u002F            # Dataset loaders\n│   │   ├── inference\u002F           # Inference engines and server\n│   │   └── export\u002F              # Model export (e.g. 
TensorRT-LLM)\n│   ├── training\u002F                # Training scripts\n│   ├── legacy\u002F                  # Legacy components\n│   ├── post_training\u002F           # Post-training (quantization, distillation, pruning, etc.)\n│   └── rl\u002F                      # Reinforcement learning (RLHF, etc.)\n├── examples\u002F                    # Ready-to-use training examples\n├── tools\u002F                       # Utility tools\n├── tests\u002F                       # Comprehensive test suite\n└── docs\u002F                        # Documentation\n```\n\n# Performance Benchmarking\n\nFor our latest performance benchmarking results, please refer to [NVIDIA Megatron Bridge Performance Summary](https:\u002F\u002Fdocs.nvidia.com\u002Fnemo\u002Fmegatron-bridge\u002Flatest\u002Fperformance-summary.html).\n\nOur codebase efficiently trains models from 2B to 462B parameters across thousands of GPUs, achieving up to **47% Model FLOP Utilization (MFU)** on H100 clusters.\n\n![Model table](https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002FNVIDIA_Megatron-LM_readme_9a815fec9341.png)\n\n**Benchmark Configuration:**\n\n- **Vocabulary size**: 131,072 tokens\n- **Sequence length**: 4096 tokens\n- **Model scaling**: Varied hidden size, attention heads, and layers to achieve target parameter counts\n- **Communication optimizations**: Fine-grained overlapping with DP (`--overlap-grad-reduce`, `--overlap-param-gather`), TP (`--tp-comm-overlap`), and PP (enabled by default)\n\n**Key Results:**\n\n- **6144 H100 GPUs**: Successfully benchmarked 462B parameter model training\n- **Superlinear scaling**: MFU increases from 41% to 47-48% with model size\n- **End-to-end measurement**: Throughputs include all operations (data loading, optimizer steps, communication, logging)\n- **Production ready**: Full training pipeline with checkpointing and fault tolerance\n- *Note: Performance results measured without training to convergence*\n\n## Weak Scaling Results\n\nOur weak scaled results show superlinear scaling (MFU increases from 41% for the smallest model considered to 47-48% for the largest models); this is because larger GEMMs have higher arithmetic intensity and are consequently more efficient to execute.\n\n![Weak scaling](https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002FNVIDIA_Megatron-LM_readme_d6a470986a0a.png)\n\n## Strong Scaling Results\n\nWe also strong scaled the standard GPT-3 model (our version has slightly more than 175 billion parameters due to larger vocabulary size) from 96 H100 GPUs to 4608 GPUs, using the same batch size of 1152 sequences throughout. Communication becomes more exposed at larger scale, leading to a reduction in MFU from 47% to 42%.\n\n![Strong scaling](https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002FNVIDIA_Megatron-LM_readme_f42da0264d72.png)\n\n# Roadmaps\n\n- **[MoE Roadmap](https:\u002F\u002Fgithub.com\u002FNVIDIA\u002FMegatron-LM\u002Fissues\u002F1729)** - DeepSeek-V3, Qwen3, advanced parallelism, FP8 optimizations, and Blackwell enhancements\n\n# Resources\n\n## Getting Help\n\n- 📖 **[Documentation](https:\u002F\u002Fdocs.nvidia.com\u002Fmegatron-core\u002Fdeveloper-guide\u002Flatest\u002Findex.html)** - Official documentation\n- 🐛 **[Issues](https:\u002F\u002Fgithub.com\u002FNVIDIA\u002FMegatron-LM\u002Fissues)** - Bug reports and feature requests\n\n## Contributing\n\nWe ❤️ contributions! 
Ways to contribute:\n\n- 🐛 **Report bugs** - Help us improve reliability\n- 💡 **Suggest features** - Shape the future of Megatron Core\n- 📝 **Improve docs** - Make Megatron Core more accessible\n- 🔧 **Submit PRs** - Contribute code improvements\n\n**→ [Contributing Guide](https:\u002F\u002Fdocs.nvidia.com\u002Fmegatron-core\u002Fdeveloper-guide\u002Flatest\u002Fdeveloper\u002Fcontribute.html)**\n\n## Citation\n\nIf you use Megatron in your research or project, we appreciate that you use the following citations:\n\n```bibtex\n@article{megatron-lm,\n  title={Megatron-LM: Training Multi-Billion Parameter Language Models Using Model Parallelism},\n  author={Shoeybi, Mohammad and Patwary, Mostofa and Puri, Raul and LeGresley, Patrick and Casper, Jared and Catanzaro, Bryan},\n  journal={arXiv preprint arXiv:1909.08053},\n  year={2019}\n}\n```\n","\u003Cdiv align=\"center\">\n\nMegatron-LM 与 Megatron Core\n=============================\n\n\u003Ch4>面向 GPU 优化的库，用于大规模训练 Transformer 模型\u003C\u002Fh4>\n\n[![文档](https:\u002F\u002Fimg.shields.io\u002Fbadge\u002Fdocs-latest-brightgreen.svg?style=flat)](https:\u002F\u002Fdocs.nvidia.com\u002Fmegatron-core\u002Fdeveloper-guide\u002Flatest\u002Findex.html)\n[![版本](https:\u002F\u002Fimg.shields.io\u002Fbadge\u002Frelease-0.15.0-green)](.\u002FCHANGELOG.md)\n[![许可证](https:\u002F\u002Fimg.shields.io\u002Fbadge\u002Flicense-Apache-blue)](.\u002FLICENSE)\n\n\u003Cdiv align=\"left\">\n\n## 关于\n\n本仓库包含两个组件：**Megatron-LM** 和 **Megatron Core**。\n\n**Megatron-LM** 是一个参考示例，包含了 Megatron Core 以及预配置的训练脚本。非常适合研究团队、学习分布式训练以及快速实验。\n\n**Megatron Core** 是一个可组合的库，提供针对 GPU 优化的构建模块，适用于自定义训练框架。它提供了 Transformer 的基础组件、先进的并行化策略（TP、PP、DP、EP、CP）、混合精度支持（FP16、BF16、FP8、FP4）以及模型架构。最适合框架开发者和构建自定义训练流水线的机器学习工程师使用。\n\n**[Megatron Bridge](https:\u002F\u002Fgithub.com\u002FNVIDIA-NeMo\u002FMegatron-Bridge)** 提供 Hugging Face ↔ Megatron 检查点的双向转换，并配有生产就绪的配方。\n\n## 快速入门\n\n**从 PyPI 安装：**\n\n```bash\nuv pip install megatron-core\n```\n\n**或从源码克隆并安装：**\n\n```bash\ngit clone https:\u002F\u002Fgithub.com\u002FNVIDIA\u002FMegatron-LM.git\ncd Megatron-LM\nuv pip install -e .\n```\n\n> **注意：** 从源码构建可能会占用大量内存。如果构建过程中出现内存不足的情况，可以通过设置 `MAX_JOBS` 来限制并行编译任务数（例如 `MAX_JOBS=4 uv pip install -e .`）。\n\n有关 NGC 容器的设置及所有安装选项，请参阅 **[安装指南](https:\u002F\u002Fdocs.nvidia.com\u002Fmegatron-core\u002Fdeveloper-guide\u002Flatest\u002Fget-started\u002Finstall.html)**。\n\n- **[你的首次训练运行](https:\u002F\u002Fdocs.nvidia.com\u002Fmegatron-core\u002Fdeveloper-guide\u002Flatest\u002Fget-started\u002Fquickstart.html)** - 包含数据准备的端到端训练示例\n- **[并行化策略](https:\u002F\u002Fdocs.nvidia.com\u002Fmegatron-core\u002Fdeveloper-guide\u002Flatest\u002Fuser-guide\u002Fparallelism-guide.html)** - 使用 TP、PP、DP、EP 和 CP 在多 GPU 上扩展训练规模\n- **[贡献指南](https:\u002F\u002Fdocs.nvidia.com\u002Fmegatron-core\u002Fdeveloper-guide\u002Flatest\u002Fdeveloper\u002Fcontribute.html)** - 如何为 Megatron Core 做出贡献\n\n# 最新动态\n\n- **[2026\u002F03]** **弃用 Python 3.10 支持：** 我们将在即将发布的 0.17.0 版本中正式停止对 Python 3.10 的支持。下游应用必须将最低兼容版本提升至 3.12，才能与 MCore 保持兼容。\n- **[2026\u002F01]** **[动态上下文并行化](https:\u002F\u002Fdeveloper.nvidia.com\u002Fblog\u002Fspeeding-up-variable-length-training-with-dynamic-context-parallelism-and-nvidia-megatron-core\u002F)** - 通过自适应调整 CP 大小，可使变长序列训练速度提升高达 1.48 倍。\n- **[2025\u002F12]** **Megatron Core 开发已迁移到 GitHub！** 所有开发和 CI 现在都在开源环境中进行。我们欢迎社区贡献。\n- **[2025\u002F10]** **[Megatron 开发分支](https:\u002F\u002Fgithub.com\u002FNVIDIA\u002FMegatron-LM\u002Ftree\u002Fdev)** - 提供早期访问的实验性功能分支。\n- **[2025\u002F10]** **[Megatron 
Bridge](https:\u002F\u002Fgithub.com\u002FNVIDIA-NeMo\u002FMegatron-Bridge)** - 用于 Hugging Face 和 Megatron 检查点之间互操作性的双向转换工具，附带热门模型的生产就绪配方。\n- **[2025\u002F08]** **[MoE 2025 年第三至第四季度路线图](https:\u002F\u002Fgithub.com\u002FNVIDIA\u002FMegatron-LM\u002Fissues\u002F1729)** - 包含 DeepSeek-V3、Qwen3、高级并行化策略、FP8 优化以及 Blackwell 性能增强在内的 MoE 功能综合路线图。\n- **[2025\u002F08]** **[GPT-OSS 模型](https:\u002F\u002Fgithub.com\u002FNVIDIA\u002FMegatron-LM\u002Fissues\u002F1739)** - YaRN RoPE 缩放、注意力汇流以及自定义激活函数等高级特性正被整合到 Megatron Core 中。\n- **[2025\u002F06]** **[Megatron MoE 模型库](https:\u002F\u002Fgithub.com\u002Fyanring\u002FMegatron-MoE-ModelZoo)** - 提供 DeepSeek-V3、Mixtral 和 Qwen3 MoE 模型训练的最佳实践与优化配置，并配有性能基准测试和检查点转换工具。\n- **[2025\u002F05]** Megatron Core v0.11.0 为多数据中心 LLM 训练带来了新能力（[博客](https:\u002F\u002Fdeveloper.nvidia.com\u002Fblog\u002Fturbocharge-llm-training-across-long-haul-data-center-networks-with-nvidia-nemo-framework\u002F))。\n\n\u003Cdetails>\n\u003Csummary>往期新闻\u003C\u002Fsummary>\n\n- **[2024\u002F07]** Megatron Core v0.7 提升了可扩展性和训练容错能力，并新增了多模态训练支持（[博客](https:\u002F\u002Fdeveloper.nvidia.com\u002Fblog\u002Ftrain-generative-ai-models-more-efficiently-with-new-nvidia-Megatron-Core-functionalities\u002F))。\n- **[2024\u002F06]** Megatron Core 新增了对 Mamba 模型的支持。请参阅我们的论文 [基于 Mamba 的语言模型实证研究](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2406.07887) 和 [代码示例](https:\u002F\u002Fgithub.com\u002FNVIDIA\u002FMegatron-LM\u002Ftree\u002Fssm\u002Fexamples\u002Fmamba)。\n- **[2024\u002F01 公告]** NVIDIA 已将 **Megatron-LM** 中的核心功能剥离出来，发布到本仓库的 **[Megatron Core](https:\u002F\u002Fgithub.com\u002FNVIDIA\u002FMegatron-LM\u002Ftree\u002Fmain\u002Fmegatron\u002Fcore)** 中。Megatron Core 在 Megatron-LM 的 GPU 优化技术基础上，进一步引入了更前沿的系统级优化创新，具备可组合、模块化的 API。\n\n\u003C\u002Fdetails>\n\n# 项目结构\n\n```\nMegatron-LM\u002F\n├── megatron\u002F\n│   ├── core\u002F                    # Megatron Core（内核、并行化、基础组件）\n│   │   ├── models\u002F              # Transformer 模型\n│   │   ├── transformer\u002F         # Transformer 基础组件\n│   │   ├── tensor_parallel\u002F     # 张量并行\n│   │   ├── pipeline_parallel\u002F   # 流水线并行\n│   │   ├── distributed\u002F         # 分布式训练（FSDP、DDP）\n│   │   ├── optimizer\u002F           # 优化器\n│   │   ├── datasets\u002F            # 数据集加载器\n│   │   ├── inference\u002F           # 推理引擎与服务端\n│   │   └── export\u002F              # 模型导出（如 TensorRT-LLM）\n│   ├── training\u002F                # 训练脚本\n│   ├── legacy\u002F                  # 遗留组件\n│   ├── post_training\u002F           # 后训练处理（量化、蒸馏、剪枝等）\n│   └── rl\u002F                      # 强化学习（RLHF 等）\n├── examples\u002F                    # 即用型训练示例\n├── tools\u002F                       # 工具集\n├── tests\u002F                       # 全面的测试套件\n└── docs\u002F                        # 文档\n```\n\n# 性能基准测试\n\n有关我们最新的性能基准测试结果，请参阅 [NVIDIA Megatron Bridge 性能摘要](https:\u002F\u002Fdocs.nvidia.com\u002Fnemo\u002Fmegatron-bridge\u002Flatest\u002Fperformance-summary.html)。\n\n我们的代码库能够在数千张 GPU 上高效训练参数量从 20 亿到 4620 亿的模型，在 H100 集群上最高可达到 **47% 的模型浮点运算利用率 (MFU)**。\n\n![模型表格](https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002FNVIDIA_Megatron-LM_readme_9a815fec9341.png)\n\n**基准测试配置：**\n\n- **词汇表大小**：131,072 个词元\n- **序列长度**：4096 个词元\n- **模型缩放**：通过调整隐藏层大小、注意力头数和层数来实现目标参数量\n- **通信优化**：与数据并行（`--overlap-grad-reduce`、`--overlap-param-gather`）、张量并行（`--tp-comm-overlap`）以及流水线并行（默认启用）进行细粒度重叠\n\n**关键结果：**\n\n- **6144 张 H100 GPU**：成功完成了 4620 亿参数模型的训练基准测试\n- **超线性扩展**：随着模型规模增大，MFU 从 41% 提升至 47%-48%\n- **端到端测量**：吞吐量包括所有操作（数据加载、优化器步骤、通信、日志记录）\n- **生产就绪**：完整的训练流水线，具备检查点保存和容错能力\n- *注：性能结果是在未训练至收敛的情况下测得的*\n\n## 
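通信重叠开关示例\n\n下面是一个示意性的启动片段，演示如何把上文基准配置中提到的通信重叠开关附加到训练命令上（命令与参数仅为示例，实际取值需按集群和模型配置调整；`--tp-comm-overlap` 一般要求张量并行规模大于 1）：\n\n```bash\n# 示意：在既有 GPT 预训练命令上追加通信重叠相关开关\ntorchrun --nproc_per_node=8 pretrain_gpt.py \\\n  --tensor-model-parallel-size 2 \\\n  --overlap-grad-reduce \\\n  --overlap-param-gather \\\n  --tp-comm-overlap\n```\n\n## 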
弱扩展结果\n\n我们的弱扩展结果显示了超线性扩展特性（MFU 从最小模型的 41% 增加到最大模型的 47%-48%）；这是因为较大的 GEMM 具有更高的算术强度，因此执行效率更高。\n\n![弱扩展](https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002FNVIDIA_Megatron-LM_readme_d6a470986a0a.png)\n\n## 强扩展结果\n\n我们还对标准 GPT-3 模型进行了强扩展测试（由于词汇表更大，我们的版本参数略多于 1750 亿），从 96 张 H100 GPU 扩展到 4608 张 GPU，期间始终使用 1152 个序列的固定批大小。在更大规模下，通信开销变得更加显著，导致 MFU 从 47% 降至 42%。\n\n![强扩展](https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002FNVIDIA_Megatron-LM_readme_f42da0264d72.png)\n\n# 路线图\n\n- **[MoE 路线图](https:\u002F\u002Fgithub.com\u002FNVIDIA\u002FMegatron-LM\u002Fissues\u002F1729)** - DeepSeek-V3、Qwen3、高级并行化、FP8 优化以及 Blackwell 加速\n\n# 资源\n\n## 获取帮助\n\n- 📖 **[文档](https:\u002F\u002Fdocs.nvidia.com\u002Fmegatron-core\u002Fdeveloper-guide\u002Flatest\u002Findex.html)** - 官方文档\n- 🐛 **[问题](https:\u002F\u002Fgithub.com\u002FNVIDIA\u002FMegatron-LM\u002Fissues)** - Bug 报告和功能请求\n\n## 参与贡献\n\n我们非常欢迎各种形式的贡献！您可以这样参与：\n\n- 🐛 **报告 Bug** - 帮助我们提升系统可靠性\n- 💡 **提出建议** - 共同塑造 Megatron Core 的未来\n- 📝 **改进文档** - 让 Megatron Core 更加易用\n- 🔧 **提交 Pull Request** - 贡献代码改进\n\n**→ [贡献指南](https:\u002F\u002Fdocs.nvidia.com\u002Fmegatron-core\u002Fdeveloper-guide\u002Flatest\u002Fdeveloper\u002Fcontribute.html)**\n\n## 引用\n\n如果您在研究或项目中使用 Megatron，我们非常感谢您采用以下引用格式（BibTeX 条目请保留英文原文，以免引用信息失效）：\n\n```bibtex\n@article{megatron-lm,\n  title={Megatron-LM: Training Multi-Billion Parameter Language Models Using Model Parallelism},\n  author={Shoeybi, Mohammad and Patwary, Mostofa and Puri, Raul and LeGresley, Patrick and Casper, Jared and Catanzaro, Bryan},\n  journal={arXiv preprint arXiv:1909.08053},\n  year={2019}\n}\n```","# Megatron-LM 快速上手指南\n\nMegatron-LM 是 NVIDIA 推出的 GPU 优化库，专为大规模训练 Transformer 模型设计。本项目包含 **Megatron-LM**（参考示例与脚本）和 **Megatron Core**（可组合的核心构建块），支持张量并行 (TP)、流水线并行 (PP) 等多种高级策略，适用于从研究实验到生产级定制训练框架的开发。\n\n## 环境准备\n\n在开始之前，请确保您的开发环境满足以下要求：\n\n*   **操作系统**: Linux (推荐 Ubuntu 20.04\u002F22.04 或兼容发行版)。\n*   **GPU**: NVIDIA GPU (推荐 Ampere 架构及以上，如 A100\u002FH100)，需安装对应的 NVIDIA Driver。\n*   **CUDA**: 需安装与 PyTorch 版本匹配的 CUDA Toolkit (通常建议 CUDA 11.8 或 12.x)。\n*   **Python**: \n    *   当前稳定版支持 Python 3.11\u002F3.12。\n    *   **注意**: Python 3.10 支持将在未来的 0.17.0 版本中移除，建议使用 Python 3.12 以获得最佳兼容性。\n*   **包管理工具**: 推荐使用 `uv` 进行快速依赖管理，也可使用 `pip`。\n\n> **提示**: 对于最简便的环境配置，推荐直接使用 NVIDIA NGC 容器，其中已预装所有必要依赖。详见 [安装指南](https:\u002F\u002Fdocs.nvidia.com\u002Fmegatron-core\u002Fdeveloper-guide\u002Flatest\u002Fget-started\u002Finstall.html)。\n\n## 安装步骤\n\n您可以选择通过 PyPI 直接安装核心库，或克隆源码进行开发。\n\n### 方式一：通过 PyPI 安装（推荐用于快速体验）\n\n使用 `uv` 安装 `megatron-core`：\n\n```bash\nuv pip install megatron-core\n```\n\n若使用 `pip`：\n\n```bash\npip install megatron-core\n```\n\n### 方式二：从源码安装（推荐用于开发与自定义）\n\n克隆仓库并安装可编辑版本：\n\n```bash\ngit clone https:\u002F\u002Fgithub.com\u002FNVIDIA\u002FMegatron-LM.git\ncd Megatron-LM\nuv pip install -e .\n```\n\n> **注意**: 源码编译可能消耗大量内存。如果构建过程中出现内存不足 (OOM)，请限制并行编译任务数，例如：\n> ```bash\n> MAX_JOBS=4 uv pip install -e .\n> ```\n\n## 基本使用\n\nMegatron-LM 的核心优势在于其灵活的并行策略和高效的训练脚本。以下是启动一次基础训练的简要流程。\n\n### 1. 数据准备\n训练前需将原始文本数据预处理为二进制格式 (`.bin` 和 `.idx`)。可以使用项目自带的 `tools\u002Fpreprocess_data.py` 脚本。\n\n### 2. 
运行训练脚本\nMegatron-LM 提供了丰富的预配置脚本位于 `megatron\u002Ftraining\u002F` 或通过 `examples\u002F` 目录调用。以下是一个使用 `torchrun` 启动分布式训练的最小化示例命令（需根据实际硬件调整参数）：\n\n```bash\ntorchrun --nproc_per_node=8 \\\n  pretrain_gpt.py \\\n  --tensor-model-parallel-size 2 \\\n  --pipeline-model-parallel-size 2 \\\n  --num-layers 24 \\\n  --hidden-size 1024 \\\n  --num-attention-heads 16 \\\n  --seq-length 2048 \\\n  --max-position-embeddings 2048 \\\n  --micro-batch-size 4 \\\n  --global-batch-size 128 \\\n  --train-iters 1000 \\\n  --lr-decay-iters 990 \\\n  --save .\u002Fcheckpoints \\\n  --load .\u002Fcheckpoints \\\n  --data-path .\u002Fdata\u002Fmy_dataset \\\n  --vocab-file .\u002Fvocab.json \\\n  --merge-file .\u002Fmerges.txt \\\n  --data-impl mmap \\\n  --split 949,50,1 \\\n  --distributed-backend nccl \\\n  --lr 0.00015 \\\n  --min-lr 0.000015 \\\n  --lr-warmup-fraction .01 \\\n  --clip-grad 1.0 \\\n  --weight-decay 0.01 \\\n  --adam-beta1 0.9 \\\n  --adam-beta2 0.95 \\\n  --init-method-std 0.006 \\\n  --fp16\n```\n\n### 3. 关键配置说明\n*   **并行策略**: 通过 `--tensor-model-parallel-size` (TP) 和 `--pipeline-model-parallel-size` (PP) 控制多卡间的模型切分。\n*   **精度支持**: 支持 `--fp16`, `--bf16`, 以及最新的 `--fp8` (需硬件支持)。\n*   **断点续训**: 使用 `--save` 和 `--load` 指定检查点路径，程序会自动加载最新状态继续训练。\n\n更多详细的端到端示例、数据准备教程及高级并行策略配置，请参阅官方文档 **[Your First Training Run](https:\u002F\u002Fdocs.nvidia.com\u002Fmegatron-core\u002Fdeveloper-guide\u002Flatest\u002Fget-started\u002Fquickstart.html)**。","某大型金融科技公司的大模型团队正试图在自建的 GPU 集群上训练一个拥有千亿参数的行业专属大语言模型，以处理复杂的合规审查任务。\n\n### 没有 Megatron-LM 时\n- **显存瓶颈难以突破**：单卡甚至单机显存无法容纳巨大模型参数，团队需花费数周手动编写复杂的模型切分代码，且极易出错。\n- **训练效率低下**：缺乏优化的并行策略（如张量并行、流水线并行），导致多卡通信开销巨大，GPU 利用率长期低于 30%，训练周期被无限拉长。\n- **精度与稳定性风险**：自行实现的混合精度训练（FP16\u002FBF16）常引发梯度溢出或数值不稳定，导致训练中途频繁崩溃，实验迭代成本极高。\n- **生态兼容困难**：训练好的模型权重格式独特，难以直接转换为 Hugging Face 格式进行部署或二次开发，阻碍了业务落地。\n\n### 使用 Megatron-LM 后\n- **轻松实现大规模扩展**：利用内置的张量并行（TP）、流水线并行（PP）及上下文并行（CP）策略，一键将千亿模型分布到数百张 GPU 上，无需手动重构网络结构。\n- **极致算力释放**：依托 NVIDIA 深度优化的算子内核和动态上下文并行技术，将有效训练吞吐量提升数倍，显著缩短从实验到生产的时间。\n- **稳定高效的精度控制**：原生支持 FP8、BF16 等先进混合精度格式，自动处理梯度缩放与数值稳定性，确保长周期训练平稳运行。\n- **无缝生态互通**：通过 Megatron Bridge 工具，实现与 Hugging Face 检查点的双向无损转换，让模型训练完即可直接接入现有推理服务框架。\n\nMegatron-LM 将原本需要数月攻坚的底层分布式训练难题，转化为可配置的工程流程，让团队能专注于模型算法创新而非基础设施修补。","https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002FNVIDIA_Megatron-LM_9a815fec.png","NVIDIA","NVIDIA Corporation","https:\u002F\u002Foss.gittoolsai.com\u002Favatars\u002FNVIDIA_7dcf6000.png","",null,"https:\u002F\u002Fnvidia.com","https:\u002F\u002Fgithub.com\u002FNVIDIA",[83,87,91,95,99],{"name":84,"color":85,"percentage":86},"Python","#3572A5",99.1,{"name":88,"color":89,"percentage":90},"Shell","#89e051",0.6,{"name":92,"color":93,"percentage":94},"C++","#f34b7d",0.3,{"name":96,"color":97,"percentage":98},"Cuda","#3A4E3A",0,{"name":100,"color":101,"percentage":98},"Makefile","#427819",15918,3785,"2026-04-05T10:11:33","NOASSERTION",4,"Linux","必需 NVIDIA GPU（支持 H100 等），显存需求视模型规模而定（大规模训练需高显存），支持 FP16\u002FBF16\u002FFP8\u002FFP4 混合精度","未说明（源码编译时若内存不足需限制并行编译任务数 MAX_JOBS）",{"notes":111,"python":112,"dependencies":113},"该工具主要针对大规模 Transformer 模型训练优化。推荐使用 NGC 容器或参考官方安装指南进行部署。从源码构建时消耗内存较大，若遇到内存溢出错误，可通过设置环境变量 MAX_JOBS（如 MAX_JOBS=4）来限制并行编译任务数。支持多种并行策略（TP, PP, DP, EP, CP）。","3.12+ (即将弃用 3.10，建议升级至 3.12 以兼容未来版本)",[114,115],"megatron-core","uv",[15],[118,119,120],"large-language-models","model-para","transformers","2026-03-27T02:49:30.150509","2026-04-06T06:44:36.561815",[124,129,134,139,144,148],{"id":125,"question_zh":126,"answer_zh":127,"source_url":128},9795,"训练过程中遇到 'found NaN in local grad norm' 
错误，常见原因有哪些？如何排查？","这是一个通用错误，可能由多种原因引起。已确认的常见原因包括：\n1. **数据问题**：数据格式错误、全填充序列（all-padding sequences）、填充长度不匹配。\n2. **数值不稳定**：BF16 精度问题、学习率设置不当、损失缩放（loss scaling）问题。\n3. **Transformer Engine 版本**：部分用户通过从源码重新安装 Transformer Engine 解决了该问题。\n4. **序列长度配置**：Prompt 或序列长度配置不当。\n\n建议排查步骤：\n- 检查数据预处理流程和序列填充情况。\n- 尝试调整 Prompt 长度或序列长度配置。\n- 如果是 BF16 训练，检查是否涉及数值溢出，尝试调整学习率或关闭某些优化。\n- 重新从源码编译安装最新版的 Transformer Engine。\n- 若问题依旧，请提供模型配置、并行策略、数据细节及完整堆栈跟踪以便进一步定位。","https:\u002F\u002Fgithub.com\u002FNVIDIA\u002FMegatron-LM\u002Fissues\u002F780",{"id":130,"question_zh":131,"answer_zh":132,"source_url":133},9796,"如何将 HuggingFace 格式的 LLaMA-2 模型权重转换为 Megatron-LM 格式，反之亦然？","**1. 从 HuggingFace 转换到 Megatron-LM：**\n```bash\nPYTHONPATH=$(pwd) python tools\u002Fcheckpoint\u002Futil.py --model-type=GPT --loader=llama2_hf --load-dir=\u003CHF_MODEL_DIR> --save-dir=\u003CSAVE_DIR> --tokenizer-model=\u003CTOKENIZER_MODEL_FILE>\n```\n\n**2. 从 Megatron-LM 转换回 HuggingFace：**\n第一步：下载并保存转换脚本 [saver_llama2_hf.py](https:\u002F\u002Fgist.github.com\u002Fdevymex\u002F734ff89ffb7ba047de177201ba90b3d1) 到 `Megatron-LM\u002Ftools\u002Fcheckpoint\u002Fsaver_llama2_hf.py`。\n\n第二步：执行转换命令：\n```bash\nPYTHONPATH=$(pwd) python tools\u002Fcheckpoint\u002Futil.py --model-type=GPT --saver=llama2_hf --load-dir=\u003CMEGATRON_CHECKPOINT_DIR> --save-dir=\u003CSAVE_DIR>\n```\n\n**注意**：在将 LLaMA-2 从 Megatron 转回 HF 之前，必须确保训练时以下参数与 HF 默认值一致：\n- 设置 `--norm-epsilon=1e-6`\n- 不要启用 `--apply-query-key-layer-scaling`（或在旧版本中启用 `--no-query-key-layer-scaling`）","https:\u002F\u002Fgithub.com\u002FNVIDIA\u002FMegatron-LM\u002Fissues\u002F37",{"id":135,"question_zh":136,"answer_zh":137,"source_url":138},9797,"在多节点训练 Mixtral 8x7B 时，如何配置并行策略以获得最佳吞吐量？","并行策略的选择取决于硬件规模：\n- **多节点场景**：使用流水线并行（PP）通常比专家并行（EP）性能更好。\n- **单节点内**：专家并行（EP）仍是首选。\n\n例如，在 64 张 H100 上训练 Mixtral 8x7B 的最佳配置是 `TP=1, EP=8, PP=8`，可达到超过 400 TFLOPS 的吞吐量。\n\n但在较小规模（如仅 16 张 H100）时，若使用 `PP=2` 可能导致显存溢出（OOM），此时建议改用 `TP=2` 来平衡显存和计算效率。","https:\u002F\u002Fgithub.com\u002FNVIDIA\u002FMegatron-LM\u002Fissues\u002F756",{"id":140,"question_zh":141,"answer_zh":142,"source_url":143},9798,"Context Parallel (CP) 中的 loss scaling 逻辑是否存在错误？正确的处理方式是什么？","是的，早期版本中存在一个 bug：在 Context Parallel 模式下，损失值被错误地乘以了 `cp_size`。\n\n**正确逻辑应为**：\n1. 在每个 CP 组内对局部 loss 求和并进行 allreduce，得到每 token 的损失（这与 Data Parallel 中每个 rank 的操作类似）。\n2. **不应再乘以 `cp_size`**。该步骤已被移除。\n3. 梯度缓冲区应按 `1\u002F(dp_size * cp_size)` 进行缩放（即 dp 与 cp 组合组的大小）。\n4. 最后在 `(dp + cp)` 组合组内进行梯度 allreduce。\n\n修复后的代码已提交，确保 loss 使用的是 CP 组内 allreduce 后的结果（即对应序列切片的损失），而不是乘以 `cp_size` 后的值。","https:\u002F\u002Fgithub.com\u002FNVIDIA\u002FMegatron-LM\u002Fissues\u002F673",{"id":145,"question_zh":146,"answer_zh":147,"source_url":138},9799,"Grouped GEMM Experts 是否支持分布式检查点（distributed checkpointing）？能否加载 SequentialMLP 的检查点？","是的，目前已支持 GroupedMLP 的分布式检查点（distckpt）。这意味着你可以：\n- 保存带有 Grouped GEMM 专家的模型检查点。\n- **加载原本使用 SequentialMLP 训练的检查点**，即使当前模型启用了 GroupedMLP。\n\n该功能允许用户在升级架构（如从 SequentialMLP 切换到更高效的 GroupedMLP）时，无需从头训练，可直接复用已有检查点。",{"id":149,"question_zh":150,"answer_zh":151,"source_url":152},9800,"加载 Llama-Megatron 检查点时出现 'Unexpected key(s) in state_dict' 错误，如何解决？","该错误通常发生在尝试加载由 HuggingFace 转换而来的 Llama2 检查点时，原因是键名不匹配或结构差异。\n\n**解决方案**：\n1. 确保使用官方推荐的转换工具（如 `tools\u002Fcheckpoint\u002Futil.py`）进行格式转换，并指定正确的 `--loader` 和 `--saver` 参数（例如 `llama2_hf`）。\n2. 检查训练时使用的参数是否与原始 HF 模型一致，特别是：\n   - `--norm-epsilon=1e-6`\n   - 禁用 `--apply-query-key-layer-scaling`\n3. 如果手动修改过检查点文件或使用了非标准转换脚本，请核对 `state_dict` 中的键名是否与 Megatron-LM 预期一致。\n4. 
参考官方文档中关于 Llama\u002FMistral 模型转换的最新指南，确保流程符合当前版本要求。","https:\u002F\u002Fgithub.com\u002FNVIDIA\u002FMegatron-LM\u002Fissues\u002F1132",[154,159,164,169,173,177,182,187,191,195,200,204,209,214,219,224,229,234,239,244],{"id":155,"version":156,"summary_zh":157,"released_at":158},107081,"core_v0.16.1","\n\n\n\u003Cdetails>\u003Csummary>Changelog Details\u003C\u002Fsummary>\n\n- cp: `ci: Skip cleanup-taint-node jobs during deployments (3612)` into `core_r0.16.0` by @ko3n1g :: PR: #3613\n- beep boop 🤖: Bumping versions by @svcnvidia-nemo-ci :: PR: #3616\n- cp: `docs: Fix version picker urls (3621)` into `core_r0.16.0` by @ko3n1g :: PR: #3622\n- cp: `ci: Increase changelog generation max PRs fetched (3620)` into `core_r0.16.0` by @ko3n1g :: PR: #3623\n- Cherry-pick #3399 for Mamba Uneven PP fix by @kevalmorabia97 :: PR: #3544\n- cp: `fix: async_utils: explicit GC in persistent checkpoint worker loop (3591)` into `core_r0.16.0` by @ko3n1g :: PR: #3628\n\n\u003C\u002Fdetails>\n","2026-03-20T21:24:09",{"id":160,"version":161,"summary_zh":162,"released_at":163},107082,"core_v0.16.0","\u003Cdetails>\u003Csummary>Changelog Details\u003C\u002Fsummary>\r\n\r\n- ci: Fix copyright checker by @ko3n1g :: PR: #1893\r\n- chore: Add codeowners by @ko3n1g :: PR: #1897\r\n- ci: Extend queue-manager for dev branch by @ko3n1g :: PR: #1906\r\n- ci: Move test optimizer into its own bucket by @ko3n1g :: PR: #1909\r\n- ci: Configure cherrypick bot by @ko3n1g :: PR: #1925\r\n- Ci approve dev by @ko3n1g :: PR: #1933\r\n- ci: Update nightly schedule by @ko3n1g :: PR: #1934\r\n- ci: Bump pre-flight for runs on main\u002Fdev by @ko3n1g :: PR: #1935\r\n- ci: Allow skipping on main by @ko3n1g :: PR: #1936\r\n- Ko3n1g\u002Fci\u002Fpr template community bot by @ko3n1g :: PR: #1937\r\n- ci: More granular unit tests buckets by @ko3n1g :: PR: #1932\r\n- Add sequence packing to RL by @tdene :: PR: #1911\r\n- chore: Update template by @ko3n1g :: PR: #1939\r\n- chore: Add description about who can merge by @ko3n1g :: PR: #1940\r\n- Ko3n1g\u002Fci\u002Ffix main on eos by @ko3n1g :: PR: #1938\r\n- Ko3n1g\u002Fci\u002Finternal mrs by @ko3n1g :: PR: #1942\r\n- ci: Fix branch of approval bot by @ko3n1g :: PR: #1944\r\n- ci: Approvalbot for other branches by @ko3n1g :: PR: #1947\r\n- ci(fix): Approval bot by @ko3n1g :: PR: #1949\r\n- Ko3n1g\u002Fci\u002Fsync branches by @ko3n1g :: PR: #1956\r\n- Ko3n1g\u002Fci\u002Fadd milestone by @ko3n1g :: PR: #1951\r\n- Remove M-FSDP testing under LTS environment by @shjwudp :: PR: #1959\r\n- ci: Run on push to release branch by @ko3n1g :: PR: #1960\r\n- Fix typo in rl section of CODEOWNERS by @tdene :: PR: #1968\r\n- ci: Update copyright checker by @ko3n1g :: PR: #1973\r\n- Ko3n1g\u002Fci\u002Fauto reminder GitHub by @ko3n1g :: PR: #1955\r\n- ci(fix): `Run tests` label by @ko3n1g :: PR: #1970\r\n- Make `get_asyncio_loop` safe to use repeatedly by @tdene :: PR: #1990\r\n- chore: Update codeowners by @ko3n1g :: PR: #2012\r\n- zarr soft deprecation by @dimapihtar :: PR: #2004\r\n- Deduplicate dynamic engine + coordinator. 
by @lmcafee-nvidia :: PR: #1981\r\n- Update symmetric registration interface to sync-up with upstream pytorch change by @youngeunkwon0405 :: PR: #1924\r\n- Safely access state dict args in load ckpt by @maanug-nv :: PR: #1957\r\n- Allow mixed-batch sampling in dynamic inference by @tdene :: PR: #1927\r\n- Stop Nemo_CICD_Test from failing in forks by @tdene :: PR: #2024\r\n- Clean up dynamic inference step by @tdene :: PR: #1992\r\n- ci: Auto-update copy-pr-bot vetters by @ko3n1g :: PR: #1850\r\n- ci: Fix build-push-wheel workflow by @ko3n1g :: PR: #2022\r\n- ci: Enable integration tests by @ko3n1g :: PR: #2023\r\n- chore: Update tooling for interactive jobs by @ko3n1g :: PR: #2032\r\n- Have datasets account for tokenizers which incorrectly define PAD by @tdene :: PR: #2017\r\n- revert(hotfix): ci: trustees_override by @ko3n1g :: PR: #2041\r\n- add missing warnings import in model parallel config by @yashaswikarnati :: PR: #2039\r\n- Reduce-scatter implementation with FP32 accumulation by @deepakn94 :: PR: #1967\r\n- ci(fix): Workflows on `main` by @ko3n1g :: PR: #2045\r\n- build: Bump modelopt by @ko3n1g :: PR: #2046\r\n- Remove TestCaptureFreezeGC unit test. by @lmcafee-nvidia :: PR: #1978\r\n- ci: Add multi-approval action by @ko3n1g :: PR: #2051\r\n- Ko3n1g\u002Fci\u002Ftest iteration time by @ko3n1g :: PR: #2067\r\n- Allow inference test throughput to vary by 10% by @mathemakitten :: PR: #2070\r\n- chore: Fix autoformatter by @ko3n1g :: PR: #2073\r\n- ci(hotfix): Bypass approvalbot in merge-queue by @ko3n1g :: PR: #2082\r\n- chore: Update local tooling by @ko3n1g :: PR: #2066\r\n- Add extra RL files by @tdene :: PR: #2077\r\n- Prevent summary jobs from running in forks by @tdene :: PR: #2083\r\n- ci: Fix test scope by @ko3n1g :: PR: #2091\r\n- Refactor the attention metadata into separate classes by @kanz-nv :: PR: #2001\r\n- Guard against incorrectly using MoE prefill graphs by @tdene :: PR: #2030\r\n- Run mr-slim tests in lightweight-mode by @chtruong814 :: PR: #2106\r\n- Inference | Lazy compile UVM allocator. by @lmcafee-nvidia :: PR: #1977\r\n- chore: Reenable trustees by @ko3n1g :: PR: #2108\r\n- Ko3n1g\u002Fchore\u002Fupdate release settings by @ko3n1g :: PR: #2097\r\n- ci(fix): Changeset of copyright checker by @ko3n1g :: PR: #2110\r\n- Remove unnecessary check on rotary_pos_cos by @santhnm2 :: PR: #2003\r\n- (Reverted) Inference | Lazy compile UVM allocator. by @lmcafee-nvidia :: PR: #2125\r\n- Refactor Attention Metadata to Separate Classes by @kanz-nv :: PR: #2112\r\n- Refactor model_provider to model_builder format for ModelOpt examples by @AAnoosheh :: PR: #2107\r\n- wandb Inference stats logging by @wdykas :: PR: #2026\r\n- Make `PipelineParallelLayout` always return str from ` __repr__` by @ananthsub :: PR: #2055\r\n- Add flash_attn_3 as first option for FA3 import by @santhnm2 :: PR: #2010\r\n- Add debugging hint for case when cudagraphs are created but no matching runner is found by @mathemakitten :: PR: #2129\r\n- ci: LTS container by @ko3n1g :: PR: #2133\r\n- Fix param init by @cuichenx :: PR: #2033\r\n- Hotfix to unit tests on hopper FA3 by @tdene :: PR: #2143\r\n- Add BytesIO to safe_globals by @tdene :: PR: #2074\r\n- add deprecation warning for legacy tokenizer system by @dimapihtar :: PR: #2145\r\n- r","2026-02-26T04:17:50",{"id":165,"version":166,"summary_zh":167,"released_at":168},107083,"core_v0.15.3","This release addresses known security issues. 
For the latest NVIDIA Vulnerability Disclosure Information visit \u003Chttps:\u002F\u002Fwww.nvidia.com\u002Fen-us\u002Fsecurity\u002F>, for acknowledgement please reach out to the NVIDIA PSIRT team at \u003CPSIRT@nvidia.com>","2026-02-06T16:30:51",{"id":170,"version":171,"summary_zh":79,"released_at":172},107084,"core_v0.15.2","2026-01-08T15:42:04",{"id":174,"version":175,"summary_zh":79,"released_at":176},107085,"core_v0.15.1","2026-01-07T18:23:32",{"id":178,"version":179,"summary_zh":180,"released_at":181},107086,"core_v0.15.0","* Features  \r\n  * Performance  \r\n    * Fused QKV preprocessing with precomputed RoPE caches (3x preprocessing speedup, 10-14% E2E) ([MR \\!3912](https:\u002F\u002Fgithub.com\u002FNVIDIA\u002FMegatron-LM\u002Fcommit\u002Ff0d9fa97fead9825ae3eada36ee2df568bfa415b))  \r\n    * Use new TE interface for user buffers ([MR \\!3886](https:\u002F\u002Fgithub.com\u002FNVIDIA\u002FMegatron-LM\u002Fcommit\u002Fd47b83807142b6490c7a000e63d25a479b106fd9))  \r\n    * Add CPU activation offloading via TE ([MR \\!4286](https:\u002F\u002Fgithub.com\u002FNVIDIA\u002FMegatron-LM\u002Fcommit\u002F310671436c36e6bd198e92c4f30bc84469cc31d8))  \r\n    * Add setting to support Adam or AdamW optimizer ([MR \\!3866](https:\u002F\u002Fgithub.com\u002FNVIDIA\u002FMegatron-LM\u002Fcommit\u002F03fd0b41b3840c6f19558161d98373a9242402e5))  \r\n  * MoE  \r\n    * Add DTensor support for EP and DSv3 modules ([MR \\!3955](https:\u002F\u002Fgithub.com\u002FNVIDIA\u002FMegatron-LM\u002Fcommit\u002F268fda08592528b7bc1a21aadaed259980ca8efb))  \r\n    * Add HybridEP backend to Flex Dispatcher ([PR \\!2176](https:\u002F\u002Fgithub.com\u002FNVIDIA\u002FMegatron-LM\u002Fpull\u002F2176))  \r\n    * Implement NVFP4 Zero Padding for MoE ([PR \\!1985](https:\u002F\u002Fgithub.com\u002FNVIDIA\u002FMegatron-LM\u002Fpull\u002F1985))  \r\n    * Compute shared experts before router ([MR \\!4068](https:\u002F\u002Fgithub.com\u002FNVIDIA\u002FMegatron-LM\u002Fcommit\u002Fe8024d716f3036ebcef8c5254c7830ad09aaf41b))  \r\n    * Enable bias in expert MLP ([MR \\!3858](https:\u002F\u002Fgithub.com\u002FNVIDIA\u002FMegatron-LM\u002Fcommit\u002Fa329dd6da586261a45a8f7d04c1e659ffedd80ae))  \r\n  * Model support  \r\n    * Add YaRN support for GPT-OSS ([MR \\!4044](https:\u002F\u002Fgithub.com\u002FNVIDIA\u002FMegatron-LM\u002Fcommit\u002F2c1b77a9984bfa978e7cf1f58522e5f8e045d017))  \r\n    * Add FP8 init for MTP ([MR \\!3958](https:\u002F\u002Fgithub.com\u002FNVIDIA\u002FMegatron-LM\u002Fcommit\u002Fd6c6e54ec5eb43d4e196c7ae84e0e88f28613e6b))  \r\n    * Add fp8\\_dpa option for FP8 scaling ([MR \\!4053](https:\u002F\u002Fgithub.com\u002FNVIDIA\u002FMegatron-LM\u002Fcommit\u002F61047e60e617e71ebe120ec293b62df6b0efc84f))  \r\n  * FSDP  \r\n    * Enable joint training of parallel modules ([MR \\!3850](https:\u002F\u002Fgithub.com\u002FNVIDIA\u002FMegatron-LM\u002Fcommit\u002F53008b844f98886a2144c216ecd25952cb2dda58))  \r\n  * Inference  \r\n    * Add CUDA Graph runner lookup table cache (up to 2x E2E speedup) ([MR \\!4082](https:\u002F\u002Fgithub.com\u002FNVIDIA\u002FMegatron-LM\u002Fcommit\u002Fab43252fdbedcc3662014ae0e110bd3278d844f4))  \r\n    * Add MoE dropping and padding router for CUDA Graph \\+ decode ([MR \\!3816](https:\u002F\u002Fgithub.com\u002FNVIDIA\u002FMegatron-LM\u002Fcommit\u002F56818f9e5090ff9eb0f13f10bfe408aae4031c5c))  \r\n    * Integrate unified memory for dynamic inference context ([MR 
\\!3985](https:\u002F\u002Fgithub.com\u002FNVIDIA\u002FMegatron-LM\u002Fcommit\u002Fef4ae4528a0924159069b9f3a2719616156bafa2))  \r\n  * Post-training  \r\n    * Add GPT-OSS ModelOpt support with quantization, import\u002Fexport ([MR \\!4169](https:\u002F\u002Fgithub.com\u002FNVIDIA\u002FMegatron-LM\u002Fcommit\u002Fa2d8c806b35bc708b13e6c069e19e5dfb49b8481))  \r\n    * Enable KD support with hybrid training loop ([MR \\!4021](https:\u002F\u002Fgithub.com\u002FNVIDIA\u002FMegatron-LM\u002Fcommit\u002F48d7275062a8307f82bd0fa6c1504032c7f3af96))  \r\n    * Add ModelOpt pruning example ([MR \\!4022](https:\u002F\u002Fgithub.com\u002FNVIDIA\u002FMegatron-LM\u002Fcommit\u002F5a58976ebe007064c2ff5e76e815aa5fcf1a8787))  \r\n  * RL  \r\n    * Add importance sampling and partial rollouts to Megatron RL ([MR \\!4000](https:\u002F\u002Fgithub.com\u002FNVIDIA\u002FMegatron-LM\u002Fcommit\u002F8399280ed3b72a183f44820896a67392c0a47e3e))  \r\n    * Add sequence packing for RL ([MR \\!4191](https:\u002F\u002Fgithub.com\u002FNVIDIA\u002FMegatron-LM\u002Fcommit\u002Fee8e9307f3ad655e6a46f98a483d8192995b02c2))  \r\n  * Ease of use  \r\n    * Handle CUDA absence during import ([MR \\!4120](https:\u002F\u002Fgithub.com\u002FNVIDIA\u002FMegatron-LM\u002Fcommit\u002Fae44e49271dc45b51a7400ecf6debc598ba90b54))  \r\n    * Enable SWA mixing with attention ([MR \\!3855](https:\u002F\u002Fgithub.com\u002FNVIDIA\u002FMegatron-LM\u002Fcommit\u002Fe5bc9249d7ad34355f5db4c8ff7d7a9080f94dc2))  \r\n* Bug fixes  \r\n  * Fix convergence bug in MXFP8 parameter gradient buffer reuse ([MR \\!3999](https:\u002F\u002Fgithub.com\u002FNVIDIA\u002FMegatron-LM\u002Fcommit\u002Fc2c36f77cf7a0476daee5bb2dec604c2764de320))  \r\n  * Fix loss mask cloning to prevent incorrect updates ([MR \\!4164](https:\u002F\u002Fgithub.com\u002FNVIDIA\u002FMegatron-LM\u002Fcommit\u002Fc94d58f3260aa568588265e07b3c06bb58cbde41))  \r\n  * Fix metadata loss in checkpoints ([MR \\!4182](https:\u002F\u002Fgithub.com\u002FNVIDIA\u002FMegatron-LM\u002Fcommit\u002Fd8c6aa4c0b5d4c15ec1196802bce292d4580ed4a))  \r\n  * Fix FSDP grad accum fusion support ([MR \\!4018](https:\u002F\u002Fgithub.com\u002FNVIDIA\u002FMegatron-LM\u002Fcommit\u002F9f72f4775509668173c75eaab5d58a49f4473748))  \r\n  * Fix non-TE optimizer checkpoint issue ([MR \\!3931](https:\u002F\u002Fgithub.com\u002FNVIDIA\u002FMegatron-LM\u002Fcommit\u002F2ebb6ee95af8b547e3c0ac394d494cb189b890bc))  \r\n  * Fix BERT virtual pipeline parallelism ([MR \\!3993](https:\u002F\u002Fgithub.com\u002FNVIDIA\u002FMegatron-LM\u002Fcommit\u002F18420b63408101fe5a49d125fb29625f1ad6ab26))  \r\n  * Fix gc.freeze() slowdown by adding gc.collect() on last layer ([MR \\!4003](https:\u002F\u002Fgithub.com\u002FNVIDIA\u002FMegatron-LM\u002Fcommit\u002Fa3f9e566c9595753553a73d403b2a481ad283fc0))  \r\n  * Fix full iteration CUDA graph non-tensor handling ([MR \\!4019](https:\u002F\u002Fgithub.com\u002FNVIDIA\u002FMegatron-LM\u002Fcommit\u002F8479eb35fbca9631acb846c3ad5d868e02214227))  \r\n  * Fix model\\_auto\\_sync mis-set and add gradient assertion ([MR \\!4062](https:\u002F\u002Fgithub.com\u002FNVIDIA\u002FMegatron-LM\u002Fcommit\u002F03045f2d880813695f75707e3262a2bfb4206dfe","2025-12-17T23:08:29",{"id":183,"version":184,"summary_zh":185,"released_at":186},107087,"core_v0.14.0","* Features  \n  * Inference  \n    * Add async support for DynamicInferenceEngine ([MR \\!3187](https:\u002F\u002Fgithub.com\u002FNVIDIA\u002FMegatron-LM\u002Fcommit\u002F05079d55a5bfcc7a43f4619e36a40a9e8db3f882))  \n    * Pad input tensors and enable FP8 
weights for FP8 inference ([MR \\!3341](https:\u002F\u002Fgithub.com\u002FNVIDIA\u002FMegatron-LM\u002Fcommit\u002F6a6cd478839d90cf09a837adf8c79cbc844bc920))  \n    * Force inference to always gather logits with tensor parallelism ([MR \\!3442](https:\u002F\u002Fgithub.com\u002FNVIDIA\u002FMegatron-LM\u002Fcommit\u002F7c9cdcb794089968278c7272e0261a68edf5d369))  \n    * Multi batch size CUDA Graphs for Dynamic Inference ([MR \\!3402](https:\u002F\u002Fgithub.com\u002FNVIDIA\u002FMegatron-LM\u002Fcommit\u002F30aabe5e3133c6d70aa55aaabad4ea8cb04ce63c))  \n  * Post-training  \n    * ModelOpt updates ([MR \\!3268](https:\u002F\u002Fgithub.com\u002FNVIDIA\u002FMegatron-LM\u002Fcommit\u002F550ed5243c3a18e39430c15e8918ee63e41d7eaf))  \n      * Add speculative decoding AR validation feature  \n      * Add DeepSeek and Qwen model configs  \n  * Performance  \n    * ModelCommProcessGroup integration ([MR \\!3391](https:\u002F\u002Fgithub.com\u002FNVIDIA\u002FMegatron-LM\u002Fcommit\u002F26adc2dfde53fbc2b063e2fdd1d9ed26578811a6))  \n    * Add HyperCommGrid: N-Dimensional Communication Grid for Model Parallelism ([MR \\!3398](https:\u002F\u002Fgithub.com\u002FNVIDIA\u002FMegatron-LM\u002Fcommit\u002F45400df7da7fa23e3aff86804e5ac254d9a8d3c0))  \n      * Flexible creation and management of communication groups  \n    * Add support for Spike No More embedding initializations and weight decay skipping ([MR \\!3500](https:\u002F\u002Fgithub.com\u002FNVIDIA\u002FMegatron-LM\u002Fcommit\u002Fee74aa66a06b24e511270f285db475941ef63bfd))  \n  * MoE  \n    * We're actively optimizing large-scale fine-grained MoE performance on Blackwell Platform.  \n    * Features:  \n      * Support Expert Parallel A2A Overlapping ([MR \\!3470](https:\u002F\u002Fgithub.com\u002FNVIDIA\u002FMegatron-LM\u002Fcommit\u002F0c6c1176fb3e3e00534b3591f1ad023d4ecad6fb); [MR \\!3074](https:\u002F\u002Fgithub.com\u002FNVIDIA\u002FMegatron-LM\u002Fcommit\u002F4b30ec54aba97e16a083eca33d2df1dd48e1b48f))  \n      * Support CP and recompute for MTP ([MR \\!3330](https:\u002F\u002Fgithub.com\u002FNVIDIA\u002FMegatron-LM\u002Fcommit\u002F650ab87d04105869f197f2ddc441e3b18ca93724))  \n      * Add support for global aux loss ([MR \\!3318](https:\u002F\u002Fgithub.com\u002FNVIDIA\u002FMegatron-LM\u002Fcommit\u002Fe58d9080ea212e005ccba0b6607bfcc86451285d))  \n    * Memory Optimization  \n      * Support recomputation for FP8 layernorm\u002Fmoe\\_act\u002Fshared\\_experts ([MR \\!3465](https:\u002F\u002Fgithub.com\u002FNVIDIA\u002FMegatron-LM\u002Fcommit\u002F6850cc6a739d168f8c84db6cdacf4fe2931c0c49))  \n      * Support optimizer offloading for DSV3 FP8 training  ([MR \\!3659](https:\u002F\u002Fgithub.com\u002FNVIDIA\u002FMegatron-LM\u002Fcommit\u002Fabbde02f54b62a5194ebe951218e98feceba6d42))  \n    * Performance Optimization  \n      * Add MoE router fusion ([MR \\!3809](https:\u002F\u002Fgithub.com\u002FNVIDIA\u002FMegatron-LM\u002Fcommit\u002Fd93743a9f11d5d17824b8b49868cc90f2904896f))  \n      * Updates for MoE cudagraph ([MR \\!3631](https:\u002F\u002Fgithub.com\u002FNVIDIA\u002FMegatron-LM\u002Fcommit\u002F95452706d7aa16dc174813e12639a8c8356fbe87))  \n    * Bug fixes:  \n      * Fix router input jitter dtype ([MR \\!3774](https:\u002F\u002Fgithub.com\u002FNVIDIA\u002FMegatron-LM\u002Fcommit\u002F20b395424d2e2bbfaab57b2f954294eb57c90c82))\n  * Model support  \n    * Add MiMo video VLM train example ([MR \\!3543](https:\u002F\u002Fgithub.com\u002FNVIDIA\u002FMegatron-LM\u002Fcommit\u002F786f5629d3462aff2f8855f51db70e882c475116))  \n    * Add AVLM for 
MIMO ([MR \\!3624](https:\u002F\u002Fgithub.com\u002FNVIDIA\u002FMegatron-LM\u002Fcommit\u002Fdb41707430bff743f986b5779712c74242b99caa))  \n  * Ease of use  \n    * Add uv support for source installs ([MR \\!3615](https:\u002F\u002Fgithub.com\u002FNVIDIA\u002FMegatron-LM\u002Fcommit\u002F164204cd7216e642bdef7299c569d95f02f9a79e))  \n    * Automated weekly prereleases ([MR \\!3574](https:\u002F\u002Fgithub.com\u002FNVIDIA\u002FMegatron-LM\u002Fcommit\u002F7e59266c70ef34a246438640af690b55c7ecac28))  \n* Bug fixes  \n  * Use mscale\\_all\\_dim for softmax\\_factor ([MR \\!2800](https:\u002F\u002Fgithub.com\u002FNVIDIA\u002FMegatron-LM\u002Fcommit\u002Fe96a358f60c82b8ac8d965d91c3cc4ad0230a4e0))  \n  * Fix FP8 param blockwise scaling unit test ([MR \\!3480](https:\u002F\u002Fgithub.com\u002FNVIDIA\u002FMegatron-LM\u002Fcommit\u002F57082f946a04c3390fcfc43634dc546ec3ded033))  \n  * Fix unit test blockwise scaling ([MR \\!3491](https:\u002F\u002Fgithub.com\u002FNVIDIA\u002FMegatron-LM\u002Fcommit\u002F6d95fe63658f967e56a3fda88a9c30a424fcb520))  \n  * Optimize prefill for token-less requests ([MR \\!3499](https:\u002F\u002Fgithub.com\u002FNVIDIA\u002FMegatron-LM\u002Fcommit\u002Fdaaa650a9ac4291d4027ca2fdeb4298ce024efd2))  \n  * Add default values for Fp8Padding and Fp8Unpadding ([MR \\!3501](https:\u002F\u002Fgithub.com\u002FNVIDIA\u002FMegatron-LM\u002Fcommit\u002F42b2b1d10a9cb699b7e5aa40f6bfba9c2a1348aa))  \n  * Fix CUDA graph logic for flexible pp layout ([MR \\!3505](https:\u002F\u002Fgithub.com\u002FNVIDIA\u002FMegatron-LM\u002Fcommit\u002F020d85e50ddf0f0282802002acb3662129a519c5))  \n  * Load FP8 models with strict=False ([MR \\!3508](https:\u002F\u002Fgithub.com\u002FNVIDIA\u002FMegatron-LM\u002Fcommit\u002F1ab876ddc4c1893c76f26d775226a8d1dcdfb3d2))  \n  * Skip rope check for torch \\\u003C 1.4.0 ([MR \\!3528](https:\u002F\u002Fgithub.com\u002FNVIDIA\u002FMegatron-LM\u002Fcommit\u002Fd8180ef8ed0bb6f305dcdedf1b27d91304f361a3))  \n  * Disable Apex tests for stability ([MR \\!3539](https:\u002F\u002Fgithub.com\u002FNVIDIA\u002FMegatron-LM\u002Fcommit\u002Fd1256277fe378add0a2cfd7251f5a350b6d126ec))  \n  * Fix typo in parallel\\_state expert parallelism ([M","2025-10-08T15:04:19",{"id":188,"version":189,"summary_zh":79,"released_at":190},107088,"25.09-alpha.rc1","2025-10-03T14:41:56",{"id":192,"version":193,"summary_zh":79,"released_at":194},107089,"core_v0.13.1","2025-08-12T18:33:56",{"id":196,"version":197,"summary_zh":198,"released_at":199},107090,"core_v0.14.0rc5","Prerelease: NVIDIA Megatron Core 0.14.0rc5 (2025-08-11)","2025-08-11T04:12:15",{"id":201,"version":202,"summary_zh":79,"released_at":203},107091,"core_v0.12.3","2025-08-12T18:12:28",{"id":205,"version":206,"summary_zh":207,"released_at":208},107092,"core_v0.14.0rc4","Prerelease: NVIDIA Megatron Core 0.14.0rc4 (2025-08-04)","2025-08-04T04:12:52",{"id":210,"version":211,"summary_zh":212,"released_at":213},107093,"core_v0.14.0rc3","Prerelease: NVIDIA Megatron Core 0.14.0rc3 (2025-07-28)","2025-07-28T04:13:52",{"id":215,"version":216,"summary_zh":217,"released_at":218},107094,"core_v0.13.0","* Support bf16 dtype for optimizer states to use precision-aware optimizer in TransformerEngine  \r\n* MoE   \r\n  * Features:  \r\n    * Flexible Asymmetric Virtual Pipeline Parallelism with Custom Pipeline Layout (--pipeline-model-parallel-layout)  \r\n    * Add support to pass custom parallelism groups to MoE modules.  
\r\n    * Add Hybrid Shard Data-Parallel support for MoE models (--num-distributed-optimizer-instances)  \r\n    * Support EP \\+ custom FSDP training for DeepSeek-V3  \r\n    * FP8 support for Multi-Token-Prediction  \r\n  * Memory Optimization  \r\n    * Fine-grained recomputation to reduce activation memory. (--recompute-modules with \\--recompute-granularity selective)  \r\n    * Memory efficient token permutation by moving the probs multiplication from unpermutation to activation function of GroupedMLP.  \r\n  * Performance Optimization  \r\n    * MLA RoPE fusion kernel and YARN embedding cache.  \r\n    * FP8 padding optimization of MoE models by padding the routing map.  \r\n  * Bug fixes:  \r\n    * Fix the aux loss calculation when expert\\_bias or group limited routing is used. This leads to load\\_balancing\\_loss values change compared to the previous version.  \r\n    * Fix packed sequence support for MLA  \r\n  * Known Issues:  \r\n    * MTP is not compatible with flexible pipeline layout, will be fixed at \\!3594.  \r\n    * MTP convergence issue with TP2, will be fixed at \\!3594.","2025-07-25T18:04:10",{"id":220,"version":221,"summary_zh":222,"released_at":223},107095,"core_v0.14.0rc2","Prerelease: NVIDIA Megatron Core 0.14.0rc2 (2025-07-21)","2025-07-21T04:12:06",{"id":225,"version":226,"summary_zh":227,"released_at":228},107096,"core_v0.13.0rc4","Prerelease: NVIDIA Megatron Core 0.13.0rc4 (2025-07-22)","2025-07-22T08:03:14",{"id":230,"version":231,"summary_zh":232,"released_at":233},107097,"core_v0.13.0rc3","Prerelease: NVIDIA Megatron Core 0.13.0rc3 (2025-07-17)","2025-07-17T15:04:37",{"id":235,"version":236,"summary_zh":237,"released_at":238},107098,"core_v0.14.0rc1","Prerelease: NVIDIA Megatron Core 0.14.0rc1 (2025-07-14)","2025-07-14T04:12:19",{"id":240,"version":241,"summary_zh":242,"released_at":243},107099,"core_v0.14.0rc0","Prerelease: NVIDIA Megatron Core 0.14.0rc0 (2025-07-07)","2025-07-07T04:10:44",{"id":245,"version":246,"summary_zh":247,"released_at":248},107100,"core_v0.13.0rc2","Prerelease: NVIDIA Megatron Core 0.13.0rc2 (2025-07-02)","2025-07-02T21:36:48"]