[{"data":1,"prerenderedAt":-1},["ShallowReactive",2],{"similar-NVIDIA--FasterTransformer":3,"tool-NVIDIA--FasterTransformer":64},[4,17,27,35,43,56],{"id":5,"name":6,"github_repo":7,"description_zh":8,"stars":9,"difficulty_score":10,"last_commit_at":11,"category_tags":12,"status":16},3808,"stable-diffusion-webui","AUTOMATIC1111\u002Fstable-diffusion-webui","stable-diffusion-webui 是一个基于 Gradio 构建的网页版操作界面，旨在让用户能够轻松地在本地运行和使用强大的 Stable Diffusion 图像生成模型。它解决了原始模型依赖命令行、操作门槛高且功能分散的痛点，将复杂的 AI 绘图流程整合进一个直观易用的图形化平台。\n\n无论是希望快速上手的普通创作者、需要精细控制画面细节的设计师，还是想要深入探索模型潜力的开发者与研究人员，都能从中获益。其核心亮点在于极高的功能丰富度：不仅支持文生图、图生图、局部重绘（Inpainting）和外绘（Outpainting）等基础模式，还独创了注意力机制调整、提示词矩阵、负向提示词以及“高清修复”等高级功能。此外，它内置了 GFPGAN 和 CodeFormer 等人脸修复工具，支持多种神经网络放大算法，并允许用户通过插件系统无限扩展能力。即使是显存有限的设备，stable-diffusion-webui 也提供了相应的优化选项，让高质量的 AI 艺术创作变得触手可及。",162132,3,"2026-04-05T11:01:52",[13,14,15],"开发框架","图像","Agent","ready",{"id":18,"name":19,"github_repo":20,"description_zh":21,"stars":22,"difficulty_score":23,"last_commit_at":24,"category_tags":25,"status":16},1381,"everything-claude-code","affaan-m\u002Feverything-claude-code","everything-claude-code 是一套专为 AI 编程助手（如 Claude Code、Codex、Cursor 等）打造的高性能优化系统。它不仅仅是一组配置文件，而是一个经过长期实战打磨的完整框架，旨在解决 AI 代理在实际开发中面临的效率低下、记忆丢失、安全隐患及缺乏持续学习能力等核心痛点。\n\n通过引入技能模块化、直觉增强、记忆持久化机制以及内置的安全扫描功能，everything-claude-code 能显著提升 AI 在复杂任务中的表现，帮助开发者构建更稳定、更智能的生产级 AI 代理。其独特的“研究优先”开发理念和针对 Token 消耗的优化策略，使得模型响应更快、成本更低，同时有效防御潜在的攻击向量。\n\n这套工具特别适合软件开发者、AI 研究人员以及希望深度定制 AI 工作流的技术团队使用。无论您是在构建大型代码库，还是需要 AI 协助进行安全审计与自动化测试，everything-claude-code 都能提供强大的底层支持。作为一个曾荣获 Anthropic 黑客大奖的开源项目，它融合了多语言支持与丰富的实战钩子（hooks），让 AI 真正成长为懂上",138956,2,"2026-04-05T11:33:21",[13,15,26],"语言模型",{"id":28,"name":29,"github_repo":30,"description_zh":31,"stars":32,"difficulty_score":23,"last_commit_at":33,"category_tags":34,"status":16},2271,"ComfyUI","Comfy-Org\u002FComfyUI","ComfyUI 是一款功能强大且高度模块化的视觉 AI 引擎，专为设计和执行复杂的 Stable Diffusion 图像生成流程而打造。它摒弃了传统的代码编写模式，采用直观的节点式流程图界面，让用户通过连接不同的功能模块即可构建个性化的生成管线。\n\n这一设计巧妙解决了高级 AI 
绘图工作流配置复杂、灵活性不足的痛点。用户无需具备编程背景，也能自由组合模型、调整参数并实时预览效果，轻松实现从基础文生图到多步骤高清修复等各类复杂任务。ComfyUI 拥有极佳的兼容性，不仅支持 Windows、macOS 和 Linux 全平台，还广泛适配 NVIDIA、AMD、Intel 及苹果 Silicon 等多种硬件架构，并率先支持 SDXL、Flux、SD3 等前沿模型。\n\n无论是希望深入探索算法潜力的研究人员和开发者，还是追求极致创作自由度的设计师与资深 AI 绘画爱好者，ComfyUI 都能提供强大的支持。其独特的模块化架构允许社区不断扩展新功能，使其成为当前最灵活、生态最丰富的开源扩散模型工具之一，帮助用户将创意高效转化为现实。",107662,"2026-04-03T11:11:01",[13,14,15],{"id":36,"name":37,"github_repo":38,"description_zh":39,"stars":40,"difficulty_score":23,"last_commit_at":41,"category_tags":42,"status":16},3704,"NextChat","ChatGPTNextWeb\u002FNextChat","NextChat 是一款轻量且极速的 AI 助手，旨在为用户提供流畅、跨平台的大模型交互体验。它完美解决了用户在多设备间切换时难以保持对话连续性，以及面对众多 AI 模型不知如何统一管理的痛点。无论是日常办公、学习辅助还是创意激发，NextChat 都能让用户随时随地通过网页、iOS、Android、Windows、MacOS 或 Linux 端无缝接入智能服务。\n\n这款工具非常适合普通用户、学生、职场人士以及需要私有化部署的企业团队使用。对于开发者而言，它也提供了便捷的自托管方案，支持一键部署到 Vercel 或 Zeabur 等平台。\n\nNextChat 的核心亮点在于其广泛的模型兼容性，原生支持 Claude、DeepSeek、GPT-4 及 Gemini Pro 等主流大模型，让用户在一个界面即可自由切换不同 AI 能力。此外，它还率先支持 MCP（Model Context Protocol）协议，增强了上下文处理能力。针对企业用户，NextChat 提供专业版解决方案，具备品牌定制、细粒度权限控制、内部知识库整合及安全审计等功能，满足公司对数据隐私和个性化管理的高标准要求。",87618,"2026-04-05T07:20:52",[13,26],{"id":44,"name":45,"github_repo":46,"description_zh":47,"stars":48,"difficulty_score":23,"last_commit_at":49,"category_tags":50,"status":16},2268,"ML-For-Beginners","microsoft\u002FML-For-Beginners","ML-For-Beginners 是由微软推出的一套系统化机器学习入门课程，旨在帮助零基础用户轻松掌握经典机器学习知识。这套课程将学习路径规划为 12 周，包含 26 节精炼课程和 52 道配套测验，内容涵盖从基础概念到实际应用的完整流程，有效解决了初学者面对庞大知识体系时无从下手、缺乏结构化指导的痛点。\n\n无论是希望转型的开发者、需要补充算法背景的研究人员，还是对人工智能充满好奇的普通爱好者，都能从中受益。课程不仅提供了清晰的理论讲解，还强调动手实践，让用户在循序渐进中建立扎实的技能基础。其独特的亮点在于强大的多语言支持，通过自动化机制提供了包括简体中文在内的 50 多种语言版本，极大地降低了全球不同背景用户的学习门槛。此外，项目采用开源协作模式，社区活跃且内容持续更新，确保学习者能获取前沿且准确的技术资讯。如果你正寻找一条清晰、友好且专业的机器学习入门之路，ML-For-Beginners 将是理想的起点。",84991,"2026-04-05T10:45:23",[14,51,52,53,15,54,26,13,55],"数据工具","视频","插件","其他","音频",{"id":57,"name":58,"github_repo":59,"description_zh":60,"stars":61,"difficulty_score":10,"last_commit_at":62,"category_tags":63,"status":16},3128,"ragflow","infiniflow\u002Fragflow","RAGFlow 
是一款领先的开源检索增强生成（RAG）引擎，旨在为大语言模型构建更精准、可靠的上下文层。它巧妙地将前沿的 RAG 技术与智能体（Agent）能力相结合，不仅支持从各类文档中高效提取知识，还能让模型基于这些知识进行逻辑推理和任务执行。\n\n在大模型应用中，幻觉问题和知识滞后是常见痛点。RAGFlow 通过深度解析复杂文档结构（如表格、图表及混合排版），显著提升了信息检索的准确度，从而有效减少模型“胡编乱造”的现象，确保回答既有据可依又具备时效性。其内置的智能体机制更进一步，使系统不仅能回答问题，还能自主规划步骤解决复杂问题。\n\n这款工具特别适合开发者、企业技术团队以及 AI 研究人员使用。无论是希望快速搭建私有知识库问答系统，还是致力于探索大模型在垂直领域落地的创新者，都能从中受益。RAGFlow 提供了可视化的工作流编排界面和灵活的 API 接口，既降低了非算法背景用户的上手门槛，也满足了专业开发者对系统深度定制的需求。作为基于 Apache 2.0 协议开源的项目，它正成为连接通用大模型与行业专有知识之间的重要桥梁。",77062,"2026-04-04T04:44:48",[15,14,13,26,54],{"id":65,"github_repo":66,"name":67,"description_en":68,"description_zh":69,"ai_summary_zh":70,"readme_en":71,"readme_zh":72,"quickstart_zh":73,"use_case_zh":74,"hero_image_url":75,"owner_login":76,"owner_name":77,"owner_avatar_url":78,"owner_bio":79,"owner_company":80,"owner_location":80,"owner_email":80,"owner_twitter":80,"owner_website":81,"owner_url":82,"languages":83,"stars":114,"forks":115,"last_commit_at":116,"license":117,"difficulty_score":118,"env_os":119,"env_gpu":120,"env_ram":121,"env_deps":122,"category_tags":132,"github_topics":133,"view_count":23,"oss_zip_url":80,"oss_zip_packed_at":80,"status":16,"created_at":138,"updated_at":139,"faqs":140,"releases":171},3079,"NVIDIA\u002FFasterTransformer","FasterTransformer","Transformer related optimization, including BERT, GPT","FasterTransformer 是 NVIDIA 推出的一款高性能推理加速库，专为优化基于 Transformer 架构的模型（如 BERT、GPT 等）而设计。它通过深度定制 CUDA 内核，充分利用 GPU 的 Tensor Core 算力，显著提升了自然语言处理中编码器与解码器的推理速度，有效解决了大模型在实际部署中延迟高、吞吐量低的痛点。\n\n该工具主要面向 AI 开发者、算法工程师及研究人员，特别是那些需要在 TensorFlow、PyTorch 或 Triton 后端中集成高效推理能力的团队。FasterTransformer 支持多种精度格式（包括 FP16、INT8 及稀疏化计算），并具备张量并行与流水线并行能力，能够灵活适配从 Volta 到 Hopper 等多代 NVIDIA GPU 架构。\n\n值得注意的是，目前 FasterTransformer 的核心开发已迁移至新一代项目 TensorRT-LLM，建议新用户优先关注后者以获取最新的大语言模型推理优化特性。不过，FasterTransformer 依然是一个成熟稳定的选择，尤其适合需要在现有框架中快速落地高精","FasterTransformer 是 NVIDIA 推出的一款高性能推理加速库，专为优化基于 Transformer 架构的模型（如 BERT、GPT 等）而设计。它通过深度定制 CUDA 内核，充分利用 GPU 的 Tensor Core 
算力，显著提升了自然语言处理中编码器与解码器的推理速度，有效解决了大模型在实际部署中延迟高、吞吐量低的痛点。\n\n该工具主要面向 AI 开发者、算法工程师及研究人员，特别是那些需要在 TensorFlow、PyTorch 或 Triton 后端中集成高效推理能力的团队。FasterTransformer 支持多种精度格式（包括 FP16、INT8 及稀疏化计算），并具备张量并行与流水线并行能力，能够灵活适配从 Volta 到 Hopper 等多代 NVIDIA GPU 架构。\n\n值得注意的是，目前 FasterTransformer 的核心开发已迁移至新一代项目 TensorRT-LLM，建议新用户优先关注后者以获取最新的大语言模型推理优化特性。不过，FasterTransformer 依然是一个成熟稳定的选择，尤其适合需要在现有框架中快速落地高精度、低延迟 NLP 应用的场景。","**Note: FasterTransformer development has transitioned to [TensorRT-LLM](https:\u002F\u002Fgithub.com\u002FNVIDIA\u002FTensorRT-LLM\u002Ftree\u002Frelease\u002F0.5.0). All developers are encouraged to leverage TensorRT-LLM to get the latest improvements on LLM Inference. The NVIDIA\u002FFasterTransformer repo will stay up, but will not have further development.**\n\n# FasterTransformer\n\nThis repository provides a script and recipe to run the highly optimized transformer-based encoder and decoder component, and it is tested and maintained by NVIDIA.\n\n## Table Of Contents\n\n- [FasterTransformer](#fastertransformer)\n  - [Table Of Contents](#table-of-contents)\n  - [Model overview](#model-overview)\n    - [Support matrix](#support-matrix)\n  - [Advanced](#advanced)\n    - [Global Environment](#global-environment)\n  - [Performance](#performance)\n    - [BERT base performance](#bert-base-performance)\n      - [BERT base performances of FasterTransformer new features](#bert-base-performances-of-fastertransformer-new-features)\n      - [BERT base performance on TensorFlow](#bert-base-performance-on-tensorflow)\n      - [BERT base performance on PyTorch](#bert-base-performance-on-pytorch)\n    - [Decoding and Decoder performance](#decoding-and-decoder-performance)\n      - [Decoder and Decoding end-to-end translation performance on TensorFlow](#decoder-and-decoding-end-to-end-translation-performance-on-tensorflow)\n      - [Decoder and Decoding end-to-end translation performance on PyTorch](#decoder-and-decoding-end-to-end-translation-performance-on-pytorch)\n    - [GPT 
performance](#gpt-performance)\n  - [Release notes](#release-notes)\n    - [Changelog](#changelog)\n    - [Known issues](#known-issues)\n\n## Model overview\n\nIn NLP, encoder and decoder are two important components, with the transformer layer becoming a popular architecture for both components. FasterTransformer implements a highly optimized transformer layer for both the encoder and decoder for inference. On Volta, Turing and Ampere GPUs, the computing power of Tensor Cores are used automatically when the precision of the data and weights are FP16.\n\nFasterTransformer is built on top of CUDA, cuBLAS, cuBLASLt and C++. We provide at least one API of the following frameworks: TensorFlow, PyTorch and Triton backend. Users can integrate FasterTransformer into these frameworks directly. For supporting frameworks, we also provide example codes to demonstrate how to use, and show the performance on these frameworks.\n\n### Support matrix\n\n| Models           | Framework      | FP16 | INT8 (after Turing) | Sparsity (after Ampere) | Tensor parallel | Pipeline parallel | FP8 (after Hopper) |\n| ---------------- | -------------- | ---- | ------------------- | ----------------------- | --------------- | ----------------- | ------------------ |\n| BERT             | TensorFlow     | Yes  | Yes                 | -                       | -               | -                 | -                  |\n| BERT             | PyTorch        | Yes  | Yes                 | Yes                     | Yes             | Yes               | -                  |\n| BERT             | Triton backend | Yes  | -                   | -                       | Yes             | Yes               | -                  |\n| BERT             | C++            | Yes  | Yes                 | -                       | -               | -                 | Yes                |\n| XLNet            | C++            | Yes  | -                   | -                       | -               | -                 
| -                  |\n| Encoder          | TensorFlow     | Yes  | Yes                 | -                       | -               | -                 | -                  |\n| Encoder          | PyTorch        | Yes  | Yes                 | Yes                     | -               | -                 | -                  |\n| Decoder          | TensorFlow     | Yes  | -                   | -                       | -               | -                 | -                  |\n| Decoder          | PyTorch        | Yes  | -                   | -                       | -               | -                 | -                  |\n| Decoding         | TensorFlow     | Yes  | -                   | -                       | -               | -                 | -                  |\n| Decoding         | PyTorch        | Yes  | -                   | -                       | -               | -                 | -                  |\n| GPT              | TensorFlow     | Yes  | -                   | -                       | -               | -                 | -                  |\n| GPT\u002FOPT          | PyTorch        | Yes  | -                   | -                       | Yes             | Yes               | Yes                |\n| GPT\u002FOPT          | Triton backend | Yes  | -                   | -                       | Yes             | Yes               | -                  |\n| GPT-MoE          | PyTorch        | Yes  | -                   | -                       | Yes             | Yes               | -                  |\n| BLOOM            | PyTorch        | Yes  | -                   | -                       | Yes             | Yes               | -                  |\n| BLOOM            | Triton backend | Yes  | -                   | -                       | Yes             | Yes               | -                  |\n| GPT-J            | Triton backend | Yes  | -                   | -                       | Yes             | Yes               
| -                  |\n| Longformer       | PyTorch        | Yes  | -                   | -                       | -               | -                 | -                  |\n| T5\u002FUL2           | PyTorch        | Yes  | -                   | -                       | Yes             | Yes               | -                  |\n| T5               | TensorFlow 2   | Yes  | -                   | -                       | -               | -                 | -                  |\n| T5\u002FUL2           | Triton backend | Yes  | -                   | -                       | Yes             | Yes               | -                  |\n| T5               | TensorRT       | Yes  | -                   | -                       | Yes             | Yes               | -                  |\n| T5-MoE           | PyTorch        | Yes  | -                   | -                       | Yes             | Yes               | -                  |\n| Swin Transformer | PyTorch        | Yes  | Yes                 | -                       | -               | -                 | -                  |\n| Swin Transformer | TensorRT       | Yes  | Yes                 | -                       | -               | -                 | -                  |\n| ViT              | PyTorch        | Yes  | Yes                 | -                       | -               | -                 | -                  |\n| ViT              | TensorRT       | Yes  | Yes                 | -                       | -               | -                 | -                  |\n| GPT-NeoX         | PyTorch        | Yes  | -                   | -                       | Yes             | Yes               | -                  |\n| GPT-NeoX         | Triton backend | Yes  | -                   | -                       | Yes             | Yes               | -                  |\n| BART\u002FmBART       | PyTorch        | Yes  | -                   | -                       | Yes             | Yes           
    | -                  |\n| WeNet            | C++            | Yes  | -                   | -                       | -               | -                 | -                  |\n| DeBERTa          | TensorFlow 2   | Yes  | -                   | -                       | On-going        | On-going          | -                  |\n| DeBERTa          | PyTorch        | Yes  | -                   | -                       | On-going        | On-going          | -                  |\n\n* Note that the FasterTransformer supports the models above on C++ because all source codes are built on C++.\n\nMore details of specific models are put in `xxx_guide.md` of [`docs\u002F`](docs), where `xxx` means the model name. Some common questions and the respective answers are put in [`docs\u002FQAList.md`](docs\u002FQAList.md). Note that the model of Encoder and BERT are similar and we put the explanation into `bert_guide.md` together.\n\n## Advanced\n\nThe following code lists the directory structure of FasterTransformer:\n\n```\n\u002Fsrc\u002Ffastertransformer: source code of FasterTransformer\n    |--\u002Fcutlass_extensions: Implementation of cutlass gemm\u002Fkernels.\n    |--\u002Fkernels: CUDA kernels for different models\u002Flayers and operations, like addBiasResiual.\n    |--\u002Flayers: Implementation of layer modules, like attention layer, ffn layer.\n    |--\u002Fmodels: Implementation of different models, like BERT, GPT.\n    |--\u002Ftensorrt_plugin: encapluate FasterTransformer into TensorRT plugin.\n    |--\u002Ftf_op: custom Tensorflow OP implementation\n    |--\u002Fth_op: custom PyTorch OP implementation\n    |--\u002Ftriton_backend: custom triton backend implementation\n    |--\u002Futils: Contains common cuda utils, like cublasMMWrapper, memory_utils\n\u002Fexamples: C++, tensorflow and pytorch interface examples\n    |--\u002Fcpp: C++ interface examples\n    |--\u002Fpytorch: PyTorch OP examples\n    |--\u002Ftensorflow: TensorFlow OP examples\n    
|--\u002Ftensorrt: TensorRT examples\n\u002Fdocs: Documents to explain the details of implementation of different models, and show the benchmark\n\u002Fbenchmark: Contains the scripts to run the benchmarks of different models\n\u002Ftests: Unit tests\n\u002Ftemplates: Documents to explain how to add a new model\u002Fexample into FasterTransformer repo\n```\n\nNote that many folders contain many sub-folders to split different models. Quantization tools are moved to `examples`, like `examples\u002Ftensorflow\u002Fbert\u002Fbert-quantization\u002F` and `examples\u002Fpytorch\u002Fbert\u002Fbert-quantization-sparsity\u002F`.\n\n\n### Global Environment\n\nFasterTransformer provides some convenient environment variables for debugging and testing.\n\n1. `FT_LOG_LEVEL`: This environment variable controls the log level of debug messages. More details are in `src\u002Ffastertransformer\u002Futils\u002Flogger.h`. Note that the program will print lots of messages when the level is lower than `DEBUG`, and the program will become very slow.\n2. `FT_NVTX`: If it is set to be `ON` like `FT_NVTX=ON .\u002Fbin\u002Fgpt_example`, the program will insert NVTX tags to help profile the program.\n3. `FT_DEBUG_LEVEL`: If it is set to be `DEBUG`, then the program will run `cudaDeviceSynchronize()` after every kernel. Otherwise, the kernel is executed asynchronously by default. It is helpful to locate the error point during debugging. But this flag affects the performance of the program significantly. 
So, it should be used only for debugging.\n\n## Performance\n\nHardware settings:\n\n* 8xA100-80GBs (with mclk 1593MHz, pclk 1410MHz) with AMD EPYC 7742 64-Core Processor\n* T4 (with mclk 5000MHz, pclk 1590MHz) with Intel(R) Xeon(R) CPU E5-2670 0 @ 2.60GHz\n\nIn order to run the following benchmark, we need to install the unix computing tool \"bc\" by\n\n```bash\napt-get install bc\n```\n\n### BERT base performance\n\nThe FP16 results of TensorFlow were obtained by running the `benchmarks\u002Fbert\u002Ftf_benchmark.sh`.\n\nThe INT8 results of TensorFlow were obtained by running the `benchmarks\u002Fbert\u002Ftf_int8_benchmark.sh`.\n\nThe FP16 results of PyTorch were obtained by running the `benchmarks\u002Fbert\u002Fpyt_benchmark.sh`.\n\nThe INT8 results of PyTorch were obtained by running the `benchmarks\u002Fbert\u002Fpyt_int8_benchmark.sh`.\n\nMore benchmarks are put in [`docs\u002Fbert_guide.md`](docs\u002Fbert_guide.md#bert-performance).\n\n#### BERT base performances of FasterTransformer new features\n\nThe following figure compares the performances of different features of FasterTransformer and FasterTransformer under FP16 on T4.\n\nFor large batch size and sequence length, both EFF-FT and FT-INT8-v2 bring about 2x speedup. 
Using Effective FasterTransformer and int8v2 at the same time can bring about 3.5x speedup compared to FasterTransformer FP16 for large cases.\n\n\u003Cdiv align=center>\u003Cimg  width=80% src =\"docs\u002Fimages\u002FFT_Encoder_T4.png\"\u002F>\u003C\u002Fdiv>\n\n#### BERT base performance on TensorFlow\n\nThe following figure compares the performances of different features of FasterTransformer and TensorFlow XLA under FP16 on T4.\n\nFor small batch size and sequence length, using FasterTransformer can bring about 3x speedup.\n\nFor large batch size and sequence length, using Effective FasterTransformer with INT8-v2 quantization can bring about 5x speedup.\n\n\u003Cdiv align=center>\u003Cimg  width=80% src =\"docs\u002Fimages\u002FTF_Encoder_T4.png\"\u002F>\u003C\u002Fdiv>\n\n#### BERT base performance on PyTorch\n\nThe following figure compares the performances of different features of FasterTransformer and PyTorch TorchScript under FP16 on T4.\n\nFor small batch size and sequence length, using FasterTransformer CustomExt can bring about 4x ~ 6x speedup.\n\nFor large batch size and sequence length, using Effective FasterTransformer with INT8-v2 quantization can bring about 5x speedup.\n\n\u003Cdiv align=center>\u003Cimg  width=80% src =\"docs\u002Fimages\u002FPy_Encoder_T4.png\"\u002F>\u003C\u002Fdiv>\n\n### Decoding and Decoder performance\n\nThe results of TensorFlow were obtained by running the `benchmarks\u002Fdecoding\u002Ftf_decoding_beamsearch_benchmark.sh` and `benchmarks\u002Fdecoding\u002Ftf_decoding_sampling_benchmark.sh`.\n\nThe results of PyTorch were obtained by running the `benchmarks\u002Fdecoding\u002Fpyt_decoding_beamsearch_benchmark.sh`.\n\nIn the experiments of decoding, we updated the following parameters:\n\n* head_num = 8\n* size_per_head = 64\n* num_layers = 6 for both encoder and decoder\n* vocabulary_size = 32001 for TensorFlow sample codes, 31538 for PyTorch sample codes\n* memory_hidden_dim = 512\n* max sequence length = 128\n\nMore 
benchmarks are put in [`docs\u002Fdecoder_guide.md`](docs\u002Fdecoder_guide.md#decoding-performance).\n\n#### Decoder and Decoding end-to-end translation performance on TensorFlow\n\nThe following figure shows the speedup of FT-Decoder op and FT-Decoding op compared to TensorFlow under FP16 with T4. Here, we compare the throughput of translating a test set, because the total number of tokens generated by each method may differ. Compared to TensorFlow, FT-Decoder provides 1.5x ~ 3x speedup, while FT-Decoding provides 4x ~ 18x speedup.\n\n\u003Cdiv align=center>\u003Cimg  width=80% src =\"docs\u002Fimages\u002FTF_Decoder_T4.png\"\u002F>\u003C\u002Fdiv>\n\n#### Decoder and Decoding end-to-end translation performance on PyTorch\n\nThe following figure shows the speedup of FT-Decoder op and FT-Decoding op compared to PyTorch under FP16 with T4. Here, we compare the throughput of translating a test set, because the total number of tokens generated by each method may differ. Compared to PyTorch, FT-Decoder provides 1.2x ~ 3x speedup, while FT-Decoding provides 3.8x ~ 13x speedup.\n\n\u003Cdiv align=center>\u003Cimg  width=80% src =\"docs\u002Fimages\u002FPy_Decoder_T4.png\"\u002F>\u003C\u002Fdiv>\n\n### GPT performance\n\nThe following figure compares the performances of Megatron and FasterTransformer under FP16 on A100.\n\nIn the experiments of decoding, we updated the following parameters:\n\n* head_num = 96\n* size_per_head = 128\n* num_layers = 48 for GPT-89B model, 96 for GPT-175B model\n* data_type = FP16\n* vocab_size = 51200\n* top_p = 0.9\n* tensor parallel size = 8\n* input sequence length = 512\n* output sequence length = 32\n\n\u003Cdiv align=center>\u003Cimg  width=80% src =\"docs\u002Fimages\u002FFT_GPT_A100.png\"\u002F>\u003C\u002Fdiv>\n\n## Release notes\n\n### Changelog\n\nMay 2023\n- Fix bugs of generation early stopping\n\nJanuary 2023\n- Support GPT MoE\n- Support FP8 for Bert and GPT (**Experimental**)\n- Support DeBERTa on TensorFlow 2 and PyTorch\n\nDec 2022\n- **Release 
the FasterTransformer 5.2**\n- Support min length penalty\n\nNov 2022\n- Support T5 Tensorflow 2 custom op.\n- Support T5 MoE\n- Support WeNet\n- Support BART & mBART\n- Support SwinV2\n- Initial support for w8a8 int8 mode with GPT (preview)\n- Support fused mha in GPT\n\nOct 2022\n- Support BLOOM\n\nSep 2022\n- Support factual sampling ([link](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2206.04624.pdf)) in gpt\n- Support for IA3 adapting scheme in T5\n\nAug 2022\n- Support returning context tokens embeddings in GPT\n- **Release the FasterTransformer 5.1**\n- Support for interactive generation\n- Support for attention time-limited memory\n- Support mt5 and t5-v1.1\n\nJuly 2022\n- Support UL2 huggingface ckpt. ([link](https:\u002F\u002Fhuggingface.co\u002Fgoogle\u002Ful2))\n  - Fix bug of T5 under bfloat16.\n- Add ViT INT8 TensorRT Plugin\n- Support batch sampling\n- Support shared context optimization in GPT model\n\nJune 2022\n- Support streaming generation for triton backend.\n- Support OPT.\n- Support multi-node multi-GPU BERT under FP32, FP16 and BF16.\n\nMay 2022\n- Support bfloat16 on most models.\n- Support [prefix-prompt](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2101.00190.pdf) for GPT-J.\n- Support GPT-NeoX.\n  - epsilon value used in layernorm is now a parameter\n  - rotary embedding GPT-NeoX style (only GPT-J was implemented)\n  - load per-GPU layernorm and bias parameters\n  - weight conversion from EleutherAI checkpoint\n\nApril 2022\n- **Release the FasterTransformer 5.0**\n  - Change the default accumulation type of all gemm to FP32.\n  - Support bfloat16 inference in GPT model.\n  - Support Nemo Megatron T5 and Megatron-LM T5 model.\n  - Support ViT.\n\nMarch 2022\n- Support `stop_ids` and `ban_bad_ids` in GPT-J.\n- Support dynamice `start_id` and `end_id` in GPT-J, GPT, T5 and Decoding.\n\nFebruary 2022\n- Support Swin Transformer.\n- Optimize the k\u002Fv cache update of beam search by in-direction buffer.\n- Support runtime input for GPT-J, T5 and 
GPT.\n- Support soft prompt in GPT and GPT-J.\n- Support custom all reduce kernel.\n  - Limitation: \n    1. Only support tensor parallel size = 8 on DGX-A100.\n    2. Only support CUDA with cudaMallocAsync.\n\nDecember 2021\n- Add TensorRT plugin of T5 model.\n- Change some hyper-parameters of GPT model to runtime query.\n- Optimize the memory allocator under C++ code.\n- Fix bug of CUB including when using CUDA 11.5 or newer version.\n\nNovember 2021\n- **Update the FasterTransformer 5.0 beta**\n- Add GPT-3 INT8 weight only qauntization for batch size \u003C= 2.\n- Support multi-node multi-gpu support on T5.\n- Enhance the multi-node multi-gpu supporting in GPT-3.\n\nAugust 2021\n- **Release the FasterTransformer 5.0 beta**\n  - Refactor the repo and codes\n  - And special thanks to NAVER Corp. for contributing a lot to this version, as listed below.\n    - Bugs fix\n      - Fix error that occurs when batch_size is less than max_batch_size for gpt pytorch wrapper.\n      - Fix memory leak that occurs every forward because of reused allocator.\n      - Fix race condition that occurs in repetition penalty kernel.\n    - Enhancement\n      - Add random seed setting.\n      - Fix GEMM buffer overflow on FP16 of GPT.\n      - Change to invalidate finished buffer for every completion.\n      - Introduce stop_before for early stop.\n  - Support Longformer.\n  - Rename `layer_para` to `pipeline_para`.\n  - Optimize the sorting of top p sampling.\n  - Support sparsity for Ampere GPUs on BERT.\n  - Support `size_per_head` 96, 160, 192, 224, 256 for GPT model.\n  - Support multi-node inference for GPT Triton backend.\n\nJune 2021\n- Support XLNet\n\nApril 2021\n- **Release the FasterTransformer 4.0**\n  - Support multi-gpus and multi-nodes inference for GPT model on C++ and PyTorch.\n  - Support single node, multi-gpus inference for GPT model on triton.\n  - Add the int8 fused multi-head attention kernel for bert.\n  - Add the FP16 fused multi-head attention kernel of V100 
for bert.\n  - Optimize the kernel of decoder.\n  - Move to independent repo.\n  - Eager mode PyTorch extension is deprecated.\n\nDec 2020\n- **Release the FasterTransformer 3.1**\n  - Optimize the decoding by adding the finished mask to prevent useless computing.\n  - Support opennmt encoder.\n  - Remove the TensorRT plugin support.\n  - TorchScript custom op is deprecated.\n\nNov 2020\n- Optimize the INT8 inference.\n- Support PyTorch INT8 inference.\n- Provide PyTorch INT8 quantization tools.\n- Integrate the fused multi-head attention kernel of TensorRT into FasterTransformer.\n- Add unit test of SQuAD.\n- Update the missed NGC checkpoints.\n\nSep 2020\n- Support GPT2\n- **Release the FasterTransformer 3.0**\n  - Support INT8 quantization of encoder of cpp and TensorFlow op.\n  - Add bert-tf-quantization tool.\n  - Fix the issue that Cmake 15 or Cmake 16 fail to build this project.\n\nAug 2020\n- Fix the bug of trt plugin.\n\nJune 2020\n- **Release the FasterTransformer 2.1**\n  - Add Effective FasterTransformer based on the idea of [Effective Transformer](https:\u002F\u002Fgithub.com\u002Fbytedance\u002Feffective_transformer).\n  - Optimize the beam search kernels.\n  - Add PyTorch op support\n\nMay 2020\n- Fix the bug that seq_len of encoder must be larger than 3.\n- Add the position_encoding of decoding as the input of FasterTransformer decoding. This is convenient to use different types of position encoding. 
FasterTransformer does not compute the position encoding value, but only lookup the table.\n- Modifying the method of loading model in `translate_sample.py`.\n\nApril 2020\n- Rename `decoding_opennmt.h` to `decoding_beamsearch.h`\n- Add DiverseSiblingsSearch for decoding.\n- Add sampling into Decoding\n  - The implementation is in the `decoding_sampling.h`\n  - Add top_k sampling, top_p sampling for decoding.\n- Refactor the tensorflow custom op codes.\n  - Merge `bert_transformer_op.h`, `bert_transformer_op.cu.cc` into `bert_transformer_op.cc`\n  - Merge `decoder.h`, `decoder.cu.cc` into `decoder.cc`\n  - Merge `decoding_beamsearch.h`, `decoding_beamsearch.cu.cc` into `decoding_beamsearch.cc`\n- Fix the bugs of finalize function decoding.py.\n- Fix the bug of tf DiverseSiblingSearch.\n- Add BLEU scorer `bleu_score.py` into `utils`. Note that the BLEU score requires python3.\n- Fuse QKV Gemm of encoder and masked_multi_head_attention of decoder.\n- Add dynamic batch size and dynamic sequence length features into all ops.\n\nMarch 2020\n- Add feature in FasterTransformer 2.0\n  - Add `translate_sample.py` to demonstrate how to translate a sentence by restoring the pretrained model of OpenNMT-tf.\n- Fix bugs of Fastertransformer 2.0\n  - Fix the bug of maximum sequence length of decoder cannot be larger than 128.\n  - Fix the bug that decoding does not check finish or not after each step.\n  - Fix the bug of decoder about max_seq_len.\n  - Modify the decoding model structure to fit the OpenNMT-tf decoding model.\n    - Add a layer normalization layer after decoder.\n    - Add a normalization for inputs of decoder\n\nFebruary 2020\n- **Release the FasterTransformer 2.0**\n  - Provide a highly optimized OpenNMT-tf based decoder and decoding, including C++ API and TensorFlow op.\n  - Refine the sample codes of encoder.\n  - Add dynamic batch size feature into encoder op.\n\nJuly 2019\n- **Release the FasterTransformer 1.0**\n  - Provide a highly optimized bert 
equivalent transformer layer, including C++ API, TensorFlow op and TensorRT plugin.\n\n### Known issues\n\n- Cannot compile on tensorflow 2.10 due to undefined symbol issue.\n- Undefined symbol errors when import the extension\n  - Please `import torch` first. If this has been done, it is due to the incompatible C++ ABI. You may need to check the PyTorch used during compilation and execution are the same, or you need to check how your PyTorch is compiled, or the version of your GCC, etc.\n- Results of TensorFlow and OP would be different in decoding. This problem is caused by the accumulated log probability, and we do not avoid this problem.\n- If encounter some problem in the custom environment, try to use the gcc\u002Fg++ 4.8 to build the project of TensorFlow op, especially for TensorFlow 1.14.\n","**注意：FasterTransformer 的开发已迁移至 [TensorRT-LLM](https:\u002F\u002Fgithub.com\u002FNVIDIA\u002FTensorRT-LLM\u002Ftree\u002Frelease\u002F0.5.0)。我们鼓励所有开发者使用 TensorRT-LLM 来获得 LLM 推理方面的最新改进。NVIDIA\u002FFasterTransformer 仓库将继续保留，但将不再进行进一步开发。**\n\n# FasterTransformer\n\n本仓库提供用于运行高度优化的基于 Transformer 的编码器和解码器组件的脚本和配方，并由 NVIDIA 进行测试和维护。\n\n## 目录\n\n- [FasterTransformer](#fastertransformer)\n  - [目录](#table-of-contents)\n  - [模型概述](#model-overview)\n    - [支持矩阵](#support-matrix)\n  - [高级用法](#advanced)\n    - [全局环境配置](#global-environment)\n  - [性能](#performance)\n    - [BERT base 性能](#bert-base-performance)\n      - [FasterTransformer 新特性下的 BERT base 性能](#bert-base-performances-of-fastertransformer-new-features)\n      - [TensorFlow 上的 BERT base 性能](#bert-base-performance-on-tensorflow)\n      - [PyTorch 上的 BERT base 性能](#bert-base-performance-on-pytorch)\n    - [解码与解码器性能](#decoding-and-decoder-performance)\n      - [TensorFlow 上的解码器及端到端翻译性能](#decoder-and-decoding-end-to-end-translation-performance-on-tensorflow)\n      - [PyTorch 上的解码器及端到端翻译性能](#decoder-and-decoding-end-to-end-translation-performance-on-pytorch)\n    - [GPT 性能](#gpt-performance)\n  - [发布说明](#release-notes)\n    - 
[变更日志](#changelog)\n    - [已知问题](#known-issues)\n\n## 模型概述\n\n在自然语言处理领域，编码器和解码器是两个重要的组成部分，而 Transformer 层已成为这两种组件中非常流行的一种架构。FasterTransformer 为推理任务实现了高度优化的 Transformer 编码器和解码器层。在 Volta、Turing 和 Ampere 架构的 GPU 上，当数据和权重的精度为 FP16 时，Tensor Core 的计算能力会自动被利用。\n\nFasterTransformer 基于 CUDA、cuBLAS、cuBLASLt 和 C++ 构建。我们提供了至少一种以下框架的 API：TensorFlow、PyTorch 和 Triton 后端。用户可以直接将 FasterTransformer 集成到这些框架中。对于支持的框架，我们还提供了示例代码来演示如何使用，并展示其在这些框架上的性能。\n\n### 支持矩阵\n\n| 模型           | 框架      | FP16 | INT8（Turing之后） | 稀疏化（Ampere之后） | 张量并行 | 流水线并行 | FP8（Hopper之后） |\n| ---------------- | ---------- | ---- | ------------------ | ------------------- | -------- | ---------- | ----------------- |\n| BERT             | TensorFlow | 是   | 是                 | -                   | -        | -          | -                 |\n| BERT             | PyTorch    | 是   | 是                 | 是                  | 是       | 是         | -                 |\n| BERT             | Triton后端 | 是   | -                  | -                   | 是       | 是         | -                 |\n| BERT             | C++        | 是   | 是                 | -                   | -        | -          | 是                |\n| XLNet            | C++        | 是   | -                  | -                   | -        | -          | -                 |\n| Encoder          | TensorFlow | 是   | 是                 | -                   | -        | -          | -                 |\n| Encoder          | PyTorch    | 是   | 是                 | 是                  | -        | -          | -                 |\n| Decoder          | TensorFlow | 是   | -                  | -                   | -        | -          | -                 |\n| Decoder          | PyTorch    | 是   | -                  | -                   | -        | -          | -                 |\n| Decoding         | TensorFlow | 是   | -                  | -                   | -        | -          | -                 |\n| Decoding         | PyTorch    | 是   | -                
  | -                   | -        | -          | -                 |\n| GPT              | TensorFlow | 是   | -                  | -                   | -        | -          | -                 |\n| GPT\u002FOPT          | PyTorch    | 是   | -                  | -                   | 是       | 是         | 是                |\n| GPT\u002FOPT          | Triton后端 | 是   | -                  | -                   | 是       | 是         | -                 |\n| GPT-MoE          | PyTorch    | 是   | -                  | -                   | 是       | 是         | -                 |\n| BLOOM            | PyTorch    | 是   | -                  | -                   | 是       | 是         | -                 |\n| BLOOM            | Triton后端 | 是   | -                  | -                   | 是       | 是         | -                 |\n| GPT-J            | Triton后端 | 是   | -                  | -                   | 是       | 是         | -                 |\n| Longformer       | PyTorch    | 是   | -                  | -                   | -        | -          | -                 |\n| T5\u002FUL2           | PyTorch    | 是   | -                  | -                   | 是       | 是         | -                 |\n| T5               | TensorFlow 2 | 是   | -                  | -                   | -        | -          | -                 |\n| T5\u002FUL2           | Triton后端 | 是   | -                  | -                   | 是       | 是         | -                 |\n| T5               | TensorRT   | 是   | -                  | -                   | 是       | 是         | -                 |\n| T5-MoE           | PyTorch    | 是   | -                  | -                   | 是       | 是         | -                 |\n| Swin Transformer | PyTorch    | 是   | 是                 | -                   | -        | -          | -                 |\n| Swin Transformer | TensorRT   | 是   | 是                 | -                   | -        | -          | -                 |\n| ViT             
 | PyTorch    | 是   | 是                 | -                   | -        | -          | -                 |\n| ViT              | TensorRT   | 是   | 是                 | -                   | -        | -          | -                 |\n| GPT-NeoX         | PyTorch    | 是   | -                  | -                   | 是       | 是         | -                 |\n| GPT-NeoX         | Triton后端 | 是   | -                  | -                   | 是       | 是         | -                 |\n| BART\u002FmBART       | PyTorch    | 是   | -                  | -                   | 是       | 是         | -                 |\n| WeNet            | C++        | 是   | -                  | -                   | -        | -          | -                 |\n| DeBERTa          | TensorFlow 2 | 是   | -                  | -                   | 进行中   | 进行中     | -                 |\n| DeBERTa          | PyTorch    | 是   | -                  | -                   | 进行中   | 进行中     | -                 |\n\n* 注意：FasterTransformer 在 C++ 上支持上述模型，因为所有源代码都是基于 C++ 构建的。\n\n有关具体模型的更多详细信息，请参阅 [`docs\u002F`](docs) 中的 `xxx_guide.md` 文件，其中 `xxx` 表示模型名称。一些常见问题及其解答已收录在 [`docs\u002FQAList.md`](docs\u002FQAList.md) 中。请注意，Encoder 和 BERT 的模型较为相似，因此我们将相关说明合并到了 `bert_guide.md` 中。\n\n## 高级\n\n以下代码列出了 FasterTransformer 的目录结构：\n\n```\n\u002Fsrc\u002Ffastertransformer: FasterTransformer 的源代码\n    |--\u002Fcutlass_extensions: Cutlass GEMM\u002F内核的实现。\n    |--\u002Fkernels: 用于不同模型\u002F层和操作的 CUDA 内核，例如 addBiasResiual。\n    |--\u002Flayers: 层模块的实现，如注意力层、FFN 层。\n    |--\u002Fmodels: 不同模型的实现，如 BERT、GPT。\n    |--\u002Ftensorrt_plugin: 将 FasterTransformer 封装为 TensorRT 插件。\n    |--\u002Ftf_op: 自定义 TensorFlow OP 实现\n    |--\u002Fth_op: 自定义 PyTorch OP 实现\n    |--\u002Ftriton_backend: 自定义 Triton 后端实现\n    |--\u002Futils: 包含常用的 CUDA 工具，如 cublasMMWrapper、memory_utils\n\u002Fexamples: C++、TensorFlow 和 PyTorch 接口示例\n    |--\u002Fcpp: C++ 接口示例\n    |--\u002Fpytorch: PyTorch OP 示例\n    |--\u002Ftensorflow: TensorFlow OP 示例\n    
|--\u002Ftensorrt: TensorRT 示例\n\u002Fdocs: 解释不同模型实现细节并展示基准测试结果的文档\n\u002Fbenchmark: 包含运行不同模型基准测试的脚本\n\u002Ftests: 单元测试\n\u002Ftemplates: 解释如何将新模型\u002F示例添加到 FasterTransformer 仓库的文档\n```\n\n请注意，许多文件夹包含多个子文件夹，用于划分不同的模型。量化工具已移至 `examples` 目录下，例如 `examples\u002Ftensorflow\u002Fbert\u002Fbert-quantization\u002F` 和 `examples\u002Fpytorch\u002Fbert\u002Fbert-quantization-sparsity\u002F`。\n\n\n### 全局环境\n\nFasterTransformer 提供了一些方便的环境变量，用于调试和测试。\n\n1. `FT_LOG_LEVEL`: 此环境变量控制调试消息的日志级别。更多详细信息请参阅 `src\u002Ffastertransformer\u002Futils\u002Flogger.h`。请注意，当日志级别低于 `DEBUG` 时，程序会打印大量信息，从而导致程序运行速度显著变慢。\n2. `FT_NVTX`: 如果将其设置为 `ON`，例如 `FT_NVTX=ON .\u002Fbin\u002Fgpt_example`，程序将插入 nvtx 标签，以帮助对程序进行性能分析。\n3. `FT_DEBUG_LEVEL`: 如果将其设置为 `DEBUG`，则程序将在每个内核执行后调用 `cudaDeviceSynchronize()`。否则，默认情况下内核是异步执行的。这有助于在调试过程中定位错误点。但此标志会显著影响程序性能，因此应仅在调试时使用。\n\n## 性能\n\n硬件配置：\n\n* 8xA100-80GB（mclk 1593MHz，pclk 1410MHz），搭配 AMD EPYC 7742 64 核处理器\n* T4（mclk 5000MHz，pclk 1590MHz），搭配 Intel(R) Xeon(R) CPU E5-2670 0 @ 2.60GHz\n\n为了运行以下基准测试，我们需要安装 Unix 计算工具 \"bc\"，命令如下：\n\n```bash\napt-get install bc\n```\n\n### BERT base 性能\n\nTensorFlow 的 FP16 结果通过运行 `benchmarks\u002Fbert\u002Ftf_benchmark.sh` 获得。\n\nTensorFlow 的 INT8 结果通过运行 `benchmarks\u002Fbert\u002Ftf_int8_benchmark.sh` 获得。\n\nPyTorch 的 FP16 结果通过运行 `benchmarks\u002Fbert\u002Fpyt_benchmark.sh` 获得。\n\nPyTorch 的 INT8 结果通过运行 `benchmarks\u002Fbert\u002Fpyt_int8_benchmark.sh` 获得。\n\n更多基准测试结果请参阅 [`docs\u002Fbert_guide.md`](docs\u002Fbert_guide.md#bert-performance)。\n\n#### FasterTransformer 新特性下的 BERT base 性能\n\n下图比较了 T4 上 FP16 条件下，FasterTransformer 不同功能与原版 FasterTransformer 的性能。\n\n对于较大的批次大小和序列长度，EFF-FT 和 FT-INT8-v2 都带来了约 2 倍的加速。同时使用 Effective FasterTransformer 和 int8v2 可以使大型任务的加速比达到原版 FasterTransformer FP16 的 3.5 倍。\n\n\u003Cdiv align=center>\u003Cimg  width=80% src =\"docs\u002Fimages\u002FFT_Encoder_T4.png\"\u002F>\u003C\u002Fdiv>\n\n#### TensorFlow 上的 BERT base 性能\n\n下图比较了 T4 上 FP16 条件下，FasterTransformer 不同功能与 TensorFlow XLA 的性能。\n\n对于较小的批次大小和序列长度，使用 FasterTransformer 
可以带来约 3 倍的加速。\n\n对于较大的批次大小和序列长度，使用 Effective FasterTransformer 并结合 INT8-v2 量化技术，可以带来约 5 倍的加速。\n\n\u003Cdiv align=center>\u003Cimg  width=80% src =\"docs\u002Fimages\u002FTF_Encoder_T4.png\"\u002F>\u003C\u002Fdiv>\n\n#### PyTorch 上的 BERT base 性能\n\n下图比较了 T4 上 FP16 条件下，FasterTransformer 不同功能与 PyTorch TorchScript 的性能。\n\n对于较小的批次大小和序列长度，使用 FasterTransformer CustomExt 可以带来约 4 到 6 倍的加速。\n\n对于较大的批次大小和序列长度，使用 Effective FasterTransformer 并结合 INT8-v2 量化技术，可以带来约 5 倍的加速。\n\n\u003Cdiv align=center>\u003Cimg  width=80% src =\"docs\u002Fimages\u002FPy_Encoder_T4.png\"\u002F>\u003C\u002Fdiv>\n\n### 解码与解码器性能\n\nTensorFlow 的结果通过运行 `benchmarks\u002Fdecoding\u002Ftf_decoding_beamsearch_benchmark.sh` 和 `benchmarks\u002Fdecoding\u002Ftf_decoding_sampling_benchmark.sh` 获得。\n\nPyTorch 的结果通过运行 `benchmarks\u002Fdecoding\u002Fpyt_decoding_beamsearch_benchmark.sh` 获得。\n\n在解码实验中，我们更新了以下参数：\n\n* head_num = 8\n* size_per_head = 64\n* num_layers = 6（编码器和解码器均为 6 层）\n* vocabulary_size = 32001（TensorFlow 示例代码）或 31538（PyTorch 示例代码）\n* memory_hidden_dim = 512\n* max sequence length = 128\n\n更多基准测试结果请参阅 [`docs\u002Fdecoder_guide.md`](docs\u002Fdecoder_guide.md#decoding-performance)。\n\n#### 解码器及解码端到端翻译性能（TensorFlow）\n\n下图展示了在 T4 上 FP16 条件下，FT-Decoder op 和 FT-Decoding op 相较于 TensorFlow 的加速效果。由于不同方法生成的总 token 数可能不同，这里我们以翻译整个测试集的吞吐量作为衡量指标。相比 TensorFlow，FT-Decoder 提供了 1.5 到 3 倍的加速；而 FT-Decoding 则提供了 4 到 18 倍的加速。\n\n\u003Cdiv align=center>\u003Cimg  width=80% src =\"docs\u002Fimages\u002FTF_Decoder_T4.png\"\u002F>\u003C\u002Fdiv>\n\n#### 解码器及解码端到端翻译性能（PyTorch）\n\n下图展示了在 T4 上 FP16 条件下，FT-Decoder op 和 FT-Decoding op 相较于 PyTorch 的加速效果。由于不同方法生成的总 token 数可能不同，这里我们以翻译整个测试集的吞吐量作为衡量指标。相比 PyTorch，FT-Decoder 提供了 1.2 到 3 倍的加速；而 FT-Decoding 则提供了 3.8 到 13 倍的加速。\n\n\u003Cdiv align=center>\u003Cimg  width=80% src =\"docs\u002Fimages\u002FPy_Decoder_T4.png\"\u002F>\u003C\u002Fdiv>\n\n### GPT 性能\n\n下图比较了 A100 上 FP16 条件下，Megatron 和 FasterTransformer 的性能。\n\n在解码实验中，我们更新了以下参数：\n\n* head_num = 96\n* size_per_head = 128\n* num_layers = 48 对于 GPT-89B 
模型，96 对于 GPT-175B 模型\n* data_type = FP16\n* vocab_size = 51200\n* top_p = 0.9\n* tensor parallel size = 8\n* input sequence length = 512\n* output sequence length = 32\n\n\u003Cdiv align=center>\u003Cimg  width=80% src =\"docs\u002Fimages\u002FFT_GPT_A100.png\"\u002F>\u003C\u002Fdiv>\n\n## 发行说明\n\n### 更改日志\n\n2023年5月\n- 修复生成过程中的提前停止相关 bug\n\n2023年1月\n- 支持 GPT MoE 模型\n- 支持 Bert 和 GPT 的 FP8 精度（**实验性功能**）\n- 支持在 TensorFlow 2 和 PyTorch 上运行 DeBERTa 模型\n\n2022年12月\n- **发布 FasterTransformer 5.2 版本**\n- 支持最小长度惩罚机制\n\n2022年11月\n- 支持 T5 Tensorflow 2 自定义算子\n- 支持 T5 MoE 模型\n- 支持 WeNet 模型\n- 支持 BART 和 mBART 模型\n- 支持 SwinV2 模型\n- 初步支持 GPT 模型的 w8a8 int8 运行模式（预览版）\n- 支持 GPT 中的融合多头注意力机制\n\n2022年10月\n- 支持 BLOOM 模型\n\n2022年9月\n- 在 GPT 模型中支持事实采样（[链接](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2206.04624.pdf)）\n- 支持 T5 模型中的 IA3 适配方案\n\n2022年8月\n- 支持在 GPT 模型中返回上下文 token 的嵌入表示\n- **发布 FasterTransformer 5.1 版本**\n- 支持交互式生成\n- 支持注意力机制的有限内存功能\n- 支持 mt5 和 t5-v1.1 模型\n\n2022年7月\n- 支持 UL2 Hugging Face 检查点（[链接](https:\u002F\u002Fhuggingface.co\u002Fgoogle\u002Ful2)）\n  - 修复 T5 模型在 bfloat16 精度下的 bug\n- 添加 ViT INT8 TensorRT 插件\n- 支持批量采样\n- 支持 GPT 模型中的共享上下文优化\n\n2022年6月\n- 支持 Triton 后端的流式生成\n- 支持 OPT 模型\n- 支持在 FP32、FP16 和 BF16 精度下进行多节点多 GPU 的 BERT 推理\n\n2022年5月\n- 支持大多数模型使用 bfloat16 精度\n- 支持 GPT-J 模型的 [前缀提示](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2101.00190.pdf)\n- 支持 GPT-NeoX 模型\n  - 层归一化中使用的 epsilon 值现为可配置参数\n  - 引入 GPT-NeoX 风格的旋转位置编码（此前仅实现 GPT-J）\n  - 支持按 GPU 加载层归一化和偏置参数\n  - 支持从 EleutherAI 检查点转换权重\n\n2022年4月\n- **发布 FasterTransformer 5.0 版本**\n  - 将所有矩阵乘法运算的默认累加类型更改为 FP32\n  - 支持 GPT 模型中的 bfloat16 推理\n  - 支持 Nemo Megatron T5 和 Megatron-LM T5 模型\n  - 支持 ViT 模型\n\n2022年3月\n- 支持 GPT-J 模型中的 `stop_ids` 和 `ban_bad_ids`\n- 支持 GPT-J、GPT 和 T5 模型中动态设置 `start_id` 和 `end_id`\n- 支持解码过程中动态调整输入\n\n2022年2月\n- 支持 Swin Transformer 模型\n- 通过定向缓冲优化束搜索的 k\u002Fv 缓存更新\n- 支持 GPT-J、T5 和 GPT 模型的运行时输入\n- 支持 GPT 和 GPT-J 模型中的软提示\n- 支持自定义 AllReduce 核心\n  - 限制：\n    1. 仅支持 DGX-A100 上的张量并行规模为 8\n    2. 
仅支持使用 cudaMallocAsync 的 CUDA 版本\n\n2021年12月\n- 添加 T5 模型的 TensorRT 插件\n- 将 GPT 模型的部分超参数改为运行时可查询\n- 优化 C++ 代码中的内存分配器\n- 修复 CUB 库的 bug，包括在使用 CUDA 11.5 或更高版本时的问题\n\n2021年11月\n- **更新 FasterTransformer 5.0 测试版**\n- 添加 GPT-3 INT8 权重量化支持，适用于批大小不超过 2 的场景\n- 支持 T5 模型的多节点多 GPU 推理\n- 增强 GPT-3 模型的多节点多 GPU 支持能力\n\n2021年8月\n- **发布 FasterTransformer 5.0 测试版**\n  - 重构代码库和代码结构\n  - 特别感谢 NAVER 公司对本版本的大量贡献，具体如下：\n    - Bug 修复：\n      - 修复 GPT PyTorch 封装中 batch_size 小于 max_batch_size 时出现的错误\n      - 修复因重复使用内存分配器导致的每次前向传播时的内存泄漏问题\n      - 修复重复惩罚核函数中的竞态条件\n    - 功能增强：\n      - 添加随机种子设置\n      - 修复 GPT 在 FP16 精度下 GEMM 缓冲区溢出问题\n      - 更改完成缓冲区的失效策略，确保每次推理完成后及时释放\n      - 引入 stop_before 参数用于提前停止\n  - 支持 Longformer 模型\n  - 将 `layer_para` 重命名为 `pipeline_para`\n  - 优化 top p 采样的排序逻辑\n  - 支持 Ampere 架构 GPU 上 BERT 模型的稀疏化\n  - 支持 GPT 模型中 size_per_head 取值为 96、160、192、224、256\n  - 支持 GPT Triton 后端的多节点推理\n\n2021年6月\n- 支持 XLNet 模型\n\n2021年4月\n- **发布 FasterTransformer 4.0 版本**\n  - 支持 GPT 模型在 C++ 和 PyTorch 下的多 GPU 多节点推理\n  - 支持 GPT 模型在 Triton 上的单节点多 GPU 推理\n  - 为 BERT 模型添加 INT8 融合多头注意力核\n  - 为 V100 显卡上的 BERT 模型添加 FP16 融合多头注意力核\n  - 优化解码器核函数\n  - 迁移到独立代码库\n  - 废弃 Eager 模式下的 PyTorch 扩展\n\n2020年12月\n- **发布 FasterTransformer 3.1 版本**\n  - 通过添加已完成掩码来优化解码过程，避免无效计算\n  - 支持 OpenNMT 编码器\n  - 移除 TensorRT 插件支持\n  - 废弃 TorchScript 自定义算子\n\n2020年11月\n- 优化 INT8 推理流程\n- 支持 PyTorch 的 INT8 推理\n- 提供 PyTorch 的 INT8 量化工具\n- 将 TensorRT 的融合多头注意力核集成到 FasterTransformer 中\n- 添加 SQuAD 数据集的单元测试\n- 更新缺失的 NGC 检查点\n\n2020年9月\n- 支持 GPT2 模型\n- **发布 FasterTransformer 3.0 版本**\n  - 支持 C++ 和 TensorFlow 算子的编码器 INT8 量化\n  - 添加 bert-tf-quantization 工具\n  - 修复 CMake 15 或 CMake 16 无法构建该项目的问题\n\n2020年8月\n- 修复 trt 插件的 bug\n\n2020年6月\n- **发布 FasterTransformer 2.1 版本**\n  - 基于 [Effective Transformer](https:\u002F\u002Fgithub.com\u002Fbytedance\u002Feffective_transformer) 思想添加 Effective FasterTransformer\n  - 优化束搜索核函数\n  - 添加 PyTorch 算子支持\n\n2020年5月\n- 修复编码器序列长度必须大于 3 的 bug\n- 将解码的位置编码作为 FasterTransformer 解码的输入。这样可以方便使用不同类型的位置编码。FasterTransformer 
不会自行计算位置编码值，而是直接查表。\n- 修改 `translate_sample.py` 中加载模型的方法\n\n2020年4月\n- 将 `decoding_opennmt.h` 重命名为 `decoding_beamsearch.h`\n- 为解码添加 DiverseSiblingsSearch 策略\n- 在解码中加入采样功能\n  - 实现位于 `decoding_sampling.h`\n  - 添加 top_k 采样和 top_p 采样\n- 重构 TensorFlow 自定义算子代码\n  - 将 `bert_transformer_op.h` 和 `bert_transformer_op.cu.cc` 合并为 `bert_transformer_op.cc`\n  - 将 `decoder.h` 和 `decoder.cu.cc` 合并为 `decoder.cc`\n  - 将 `decoding_beamsearch.h` 和 `decoding_beamsearch.cu.cc` 合并为 `decoding_beamsearch.cc`\n- 修复 decoding.py 中 finalize 函数的 bug\n- 修复 TF 版本的 DiverseSiblingSearch 的 bug\n- 将 BLEU 评分器 `bleu_score.py` 添加到 `utils` 目录中。注意，BLEU 评分需要 Python 3 环境。\n- 将编码器的 QKV GEMM 和解码器的 masked_multi_head_attention 融合在一起\n- 在所有算子中添加动态批大小和动态序列长度功能。\n\n2020年3月\n- 在 FasterTransformer 2.0 中新增功能\n  - 添加 `translate_sample.py`，演示如何通过恢复 OpenNMT-tf 的预训练模型来翻译一句话。\n- 修复 FasterTransformer 2.0 的 bug\n  - 修复解码器最大序列长度不能超过 128 的问题。\n  - 修复解码过程中每一步未检查是否已完成的 bug。\n  - 修复解码器关于 `max_seq_len` 的 bug。\n  - 修改解码模型结构以适配 OpenNMT-tf 的解码模型。\n    - 在解码器后添加一层层归一化。\n    - 为解码器的输入添加归一化。\n\n2020年2月\n- **发布 FasterTransformer 2.0**\n  - 提供基于 OpenNMT-tf 的高度优化解码器及解码功能，包含 C++ API 和 TensorFlow 操作。\n  - 优化编码器示例代码。\n  - 在编码器操作中加入动态批大小功能。\n\n2019年7月\n- **发布 FasterTransformer 1.0**\n  - 提供高度优化的 BERT 等效 Transformer 层，包含 C++ API、TensorFlow 操作和 TensorRT 插件。\n\n\n\n### 已知问题\n\n- 由于未定义符号问题，无法在 TensorFlow 2.10 上编译。\n- 导入扩展时出现未定义符号错误。\n  - 请先执行 `import torch`。如果已执行仍出现错误，则可能是 C++ ABI 不兼容所致。建议检查编译和运行时使用的 PyTorch 版本是否一致，或检查 PyTorch 的编译方式、GCC 版本等。\n- TensorFlow 实现与 OP 实现的解码结果会有所不同。该问题由累积的对数概率引起，目前尚未解决。\n- 如果在自定义环境中遇到问题，尝试使用 gcc\u002Fg++ 4.8 来构建 TensorFlow 操作项目，尤其针对 TensorFlow 1.14。","# FasterTransformer 快速上手指南\n\n> **重要提示**：FasterTransformer 的开发已正式迁移至 **[TensorRT-LLM](https:\u002F\u002Fgithub.com\u002FNVIDIA\u002FTensorRT-LLM)**。NVIDIA 建议所有开发者使用 TensorRT-LLM 以获取最新的大语言模型（LLM）推理优化。本仓库将不再进行新功能开发，仅作为历史版本维护。\n\nFasterTransformer 是 NVIDIA 提供的高性能 Transformer 推理库，基于 CUDA、cuBLAS 和 C++ 构建，专为 Volta、Turing、Ampere 及 Hopper 架构的 GPU 优化，支持 FP16、INT8 等精度加速。\n\n## 
环境准备\n\n### 系统要求\n*   **操作系统**: Linux (推荐 Ubuntu 18.04\u002F20.04\u002F22.04)\n*   **GPU**: NVIDIA GPU (Volta 架构及以上，如 V100, T4, A100, H100)\n*   **CUDA**: 需安装与显卡驱动匹配的 CUDA Toolkit (通常建议 11.x 或 12.x)\n*   **编译器**: GCC 7.5+ 或更高版本\n*   **CMake**: 3.13 或更高版本\n\n### 前置依赖\n根据您使用的框架，需预先安装以下组件：\n*   **基础依赖**: `git`, `curl`, `build-essential`\n*   **Python 框架 (可选)**:\n    *   PyTorch (需匹配 CUDA 版本)\n    *   TensorFlow (需匹配 CUDA 版本)\n*   **性能测试工具**: `bc` (用于运行基准测试脚本)\n\n```bash\n# Ubuntu 系统安装基础依赖\nsudo apt-get update\nsudo apt-get install -y git curl build-essential bc\n```\n\n## 安装步骤\n\nFasterTransformer 主要通过源码编译安装。以下是基于 C++ 核心库的标准编译流程（以 PyTorch 扩展为例）：\n\n1.  **克隆仓库**\n    ```bash\n    git clone https:\u002F\u002Fgithub.com\u002FNVIDIA\u002FFasterTransformer.git\n    cd FasterTransformer\n    git submodule init\n    git submodule update\n    ```\n\n2.  **配置编译环境**\n    创建构建目录并运行 CMake。请根据您的实际环境调整 `-DSM` (Compute Capability) 和框架路径。\n\n    ```bash\n    mkdir -p build && cd build\n    \n    # 示例：编译支持 PyTorch 的版本\n    # 请替换 \u003CYOUR_CUDA_PATH> 和 \u003CYOUR_PYTORCH_PATH> 为实际路径\n    cmake -DSM=80 \\\n          -DCMAKE_BUILD_TYPE=Release \\\n          -DBUILD_PYT=ON \\\n          -DCUDA_TOOLKIT_ROOT_DIR=\u003CYOUR_CUDA_PATH> \\\n          -DPYTHON_EXECUTABLE=$(which python3) \\\n          ..\n    ```\n    *注：`-DSM=80` 对应 Ampere 架构 (如 A100)，T4 请使用 `-DSM=75`，V100 请使用 `-DSM=70`。*\n\n3.  **编译安装**\n    ```bash\n    make -j$(nproc)\n    ```\n    编译完成后，生成的动态库位于 `build\u002Flib` 目录下，Python 扩展模块位于 `build\u002Fpytorch` (如果启用了 PyTorch)。\n\n## 基本使用\n\n以下展示如何在 PyTorch 中调用 FasterTransformer 的 BERT 模型进行推理。\n\n### 1. 设置环境变量 (可选)\n用于控制日志级别或调试：\n```bash\nexport FT_LOG_LEVEL=INFO\n# export FT_DEBUG_LEVEL=DEBUG  # 仅在调试内核错误时开启，会显著降低性能\n```\n\n### 2. 
Python 代码示例\n确保将编译生成的 `.so` 文件路径加入 `LD_LIBRARY_PATH`，或在 Python 脚本中加载。\n\n```python\nimport torch\nfrom fastertransformer import BertLayer\n\n# 配置模型参数\nconfig = {\n    'head_num': 12,\n    'size_per_head': 64,\n    'inter_size': 3072,\n    'num_layer': 12,\n    'vocab_size': 30524,\n    'max_seq_len': 512,\n    'data_type': 'fp16',  # 推荐使用 fp16 以启用 Tensor Core\n    'int8_mode': 0        # 0: FP16\u002FFP32, 1: INT8\n}\n\n# 初始化模型层\n# device_id 指定使用的 GPU ID\nbert_layer = BertLayer(config, device_id=0)\n\n# 准备输入数据 (batch_size, seq_len)\ninput_ids = torch.randint(0, config['vocab_size'], (4, 128)).cuda().int()\nattention_mask = torch.ones_like(input_ids).cuda().int()\n\n# 执行推理 (半精度计算)\nwith torch.cuda.amp.autocast():\n    output = bert_layer.forward(input_ids, attention_mask)\n\nprint(f\"Output shape: {output.shape}\")\n```\n\n### 3. C++ 示例 (简述)\n若直接使用 C++ 接口，可参考 `examples\u002Fcpp` 目录下的示例。核心流程包括初始化 `CublasMMWrapper`，构建 `BertEncoder` 对象，并调用 `forward` 方法。\n\n```cpp\n\u002F\u002F 伪代码示例，详细实现请参考 examples\u002Fcpp\u002Fbert_example.cc\n#include \"fastertransformer\u002Fmodels\u002Fbert.h\"\n\n\u002F\u002F 初始化参数\nft::BertParam\u003CT> bert_param;\n\u002F\u002F ... 
设置 head_num, size_per_head 等参数 ...\n\n\u002F\u002F 构建模型\nft::BertEncoder\u003CT> bert_encoder(..., bert_param);\n\n\u002F\u002F 执行推理\nbert_encoder.forward(...);\n```\n\n> **提示**：更多特定模型（如 GPT, T5, Swin Transformer）的详细用法和参数配置，请参阅仓库中 `docs\u002F` 目录下对应的 `xxx_guide.md` 文档。","某大型电商平台的智能客服团队正在部署基于 BERT 模型的实时意图识别系统，以应对大促期间每秒数万次的用户咨询请求。\n\n### 没有 FasterTransformer 时\n- **响应延迟高**：原生 PyTorch 实现的 BERT 模型在 GPU 上推理耗时较长，平均响应时间超过 80 毫秒，导致用户感觉对话有明显卡顿。\n- **硬件资源浪费**：无法自动调用 NVIDIA GPU 特有的 Tensor Core 进行 FP16 混合精度加速，算力利用率低，需增加大量显卡才能维持并发量。\n- **扩容成本高昂**：为了支撑高峰流量，不得不采购更多服务器，且因单卡吞吐量低，集群规模庞大，运维复杂度急剧上升。\n- **集成难度大**：缺乏针对 TensorFlow 或 Triton 后端的高度优化算子，自行编写 CUDA 内核进行加速不仅开发周期长，还容易引入稳定性风险。\n\n### 使用 FasterTransformer 后\n- **延迟显著降低**：通过高度优化的 Transformer 层和自动启用 Tensor Core，FP16 精度下推理延迟降至 20 毫秒以内，实现丝滑的实时交互体验。\n- **吞吐量翻倍**：充分利用 cuBLASLt 等底层库优势，单卡并发处理能力大幅提升，在相同硬件配置下可处理的请求量增长 3-4 倍。\n- **基础设施降本**：凭借更高的计算密度，所需服务器数量减少 60% 以上，直接降低了硬件采购成本和机房电力消耗。\n- **无缝框架集成**：直接通过提供的 API 将优化后的算子嵌入现有的 PyTorch 或 Triton 流程中，无需重写业务代码，一周内即可完成上线切换。\n\nFasterTransformer 通过深度挖掘 GPU 硬件潜能，将昂贵的 Transformer 推理任务转化为低成本、低延迟的生产力工具。","https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002FNVIDIA_FasterTransformer_b7a19043.png","NVIDIA","NVIDIA Corporation","https:\u002F\u002Foss.gittoolsai.com\u002Favatars\u002FNVIDIA_7dcf6000.png","",null,"https:\u002F\u002Fnvidia.com","https:\u002F\u002Fgithub.com\u002FNVIDIA",[84,88,92,96,100,104,108,111],{"name":85,"color":86,"percentage":87},"C++","#f34b7d",67,{"name":89,"color":90,"percentage":91},"Cuda","#3A4E3A",29.2,{"name":93,"color":94,"percentage":95},"CMake","#DA3434",1.8,{"name":97,"color":98,"percentage":99},"Python","#3572A5",1.3,{"name":101,"color":102,"percentage":103},"Shell","#89e051",0.5,{"name":105,"color":106,"percentage":107},"Makefile","#427819",0,{"name":109,"color":110,"percentage":107},"C","#555555",{"name":112,"color":113,"percentage":107},"HCL","#844FBA",6410,935,"2026-04-02T21:09:39","Apache-2.0",4,"Linux","必需 NVIDIA GPU。支持 Volta (如 V100), Turing (如 T4), Ampere (如 A100), 
Hopper 架构。在 FP16 精度下自动利用 Tensor Cores。示例硬件包括 8xA100-80GB 和 T4。","未说明",{"notes":123,"python":121,"dependencies":124},"1. 该项目开发已停止，官方建议开发者迁移至 TensorRT-LLM 以获取最新的大语言模型推理改进。\n2. 核心代码基于 C++、CUDA、cuBLAS 和 cuBLASLt 构建。\n3. 提供 TensorFlow、PyTorch 和 Triton backend 的 API 接口及示例代码。\n4. 支持多种精度：FP16（通用），INT8（Turing 架构后），稀疏化（Ampere 架构后），FP8（Hopper 架构后）。\n5. 支持张量并行（Tensor parallel）和流水线并行（Pipeline parallel）。\n6. 需安装 'bc' 工具以运行性能基准测试脚本。",[125,126,127,85,128,129,130,131],"CUDA","cuBLAS","cuBLASLt","TensorFlow (可选)","PyTorch (可选)","Triton Inference Server (可选)","TensorRT (可选)",[13,15,26],[134,135,136,137],"pytorch","transformer","gpt","bert","2026-03-27T02:49:30.150509","2026-04-06T06:54:52.831498",[141,146,151,156,161,166],{"id":142,"question_zh":143,"answer_zh":144,"source_url":145},14170,"FasterTransformer 的 Triton 后端是否支持 T5 模型？是否有相关文档或示例？","是的，FasterTransformer 支持 T5 模型。相关的 Triton 后端文件可以在 FasterTransformer 仓库的 `src\u002Ffastertransformer\u002Ftriton_backend\u002Ft5` 目录下找到（例如 dev\u002Fv5.0_beta_2021.11_tag 分支）。虽然独立的 `triton-inference-server\u002Ffastertransformer_backend` 仓库主要展示 GPT 示例，但您可以参考主仓库中的 T5 实现来配置和运行 T5 模型。如果在转换模型或使用 start_id\u002Feos_token_id 时遇到结果不一致的问题，请检查 config.ini 中的配置是否正确，并注意 FT 后端输出的 `output_ids` 中默认不包含 `start_id`，这是预期行为。","https:\u002F\u002Fgithub.com\u002FNVIDIA\u002FFasterTransformer\u002Fissues\u002F202",{"id":147,"question_zh":148,"answer_zh":149,"source_url":150},14171,"为什么在 FP16 模式下，TensorFlow 版本的 GEMM 测试性能结果与 PyTorch 版本不同（甚至显示为 0.00ms）？","维护者表示在标准环境中未观察到此类问题。这种差异通常是由环境设置不当引起的。建议在实验和测试时使用官方提供的 Docker 容器，以避免因本地环境配置（如 CUDA 版本、库依赖等）导致的性能异常或测试结果不准确。","https:\u002F\u002Fgithub.com\u002FNVIDIA\u002FFasterTransformer\u002Fissues\u002F173",{"id":152,"question_zh":153,"answer_zh":154,"source_url":155},14172,"在修改 TensorFlow Op (gpt_op.cc) 以支持 start_ids 和 attention_mask 时，遇到类型不兼容错误（__half vs const DataType_）如何解决？","该错误通常是由于模板参数推导失败，导致 `const DataType_` 与 `__half` 类型的 cv-qualifiers 不兼容。在尝试复现或调试此类问题时，如果是为了比较 TensorFlow 和 PyTorch 的性能，需注意首 token 延迟的差异可能源于模型权重初始化的不同。对于大 
batch size 或长序列，这种初始化带来的影响较小。如果必须修改底层 C++ 代码，需确保传入 tensor 的类型指针与模板定义严格匹配，或者参考 TensorFlow 官方关于如何在 Op 属性中添加 tensor 的文档进行处理。","https:\u002F\u002Fgithub.com\u002FNVIDIA\u002FFasterTransformer\u002Fissues\u002F86",{"id":157,"question_zh":158,"answer_zh":159,"source_url":160},14173,"GPT 内核中为什么要运行 `forward_context`？这是否意味着第一个 token 被计算了两次？能否合并？","`forward_context` 和 `forward` 的主要区别在于输入处理方式：`start_id_embedding_position_lookups_kernel_launcher` 用于处理包含 start_ids 的输入（batch_size, seq_len），而 `embedding_position_lookups_kernel_launcher` 仅处理上一个输出的 id（batch_size）。`forward_context` 中使用 `unfused_masked_multi_head_attention` 是为了正确处理上下文注意力掩码。虽然看起来像运行了两次，但这是为了区分上下文编码阶段和自回归生成阶段的必要步骤，类似于 HuggingFace 或其他实现中对第一个 token 的特殊处理逻辑，不能简单合并。","https:\u002F\u002Fgithub.com\u002FNVIDIA\u002FFasterTransformer\u002Fissues\u002F99",{"id":162,"question_zh":163,"answer_zh":164,"source_url":165},14174,"编译时遇到 `identifier \"uint32_t\" is undefined` 错误怎么办？","这是一个常见的编译环境问题，通常是因为编译器未自动包含定义 `uint32_t` 的头文件。解决方法是在执行 `cmake` 命令前，设置环境变量 `CUDAFLAGS` 以显式包含 `stdio.h`。具体命令如下：\n`export CUDAFLAGS=\"-include stdio.h\"`\n然后再运行原有的 cmake 和 make 命令即可解决该报错。","https:\u002F\u002Fgithub.com\u002FNVIDIA\u002FFasterTransformer\u002Fissues\u002F315",{"id":167,"question_zh":168,"answer_zh":169,"source_url":170},14175,"当采样参数 Top_p 大于 0.9 时，模型生成的内容变成乱码（Gibberish），这是什么原因？","当 Top_p 设置过高（如 > 0.9）时，模型采样的随机性显著增加，容易选中概率极低的不合理 token，从而导致生成的文本逻辑混乱或变成乱码。这是核采样（Nucleus Sampling）机制的正常现象。建议适当降低 Top_p 的值（例如 0.9 或更低），或者结合 Temperature 参数进行调整，以在生成多样性和文本质量之间取得平衡。如果初始样本正常但后续变乱，也需检查是否设置了合理的 `max_length` 或 `early_stopping` 策略。","https:\u002F\u002Fgithub.com\u002FNVIDIA\u002FFasterTransformer\u002Fissues\u002F234",[172,176,181,185,189,194,199,203,207,211,215,219,223,227],{"id":173,"version":174,"summary_zh":80,"released_at":175},80919,"release\u002Fv5.3_tag","2023-01-23T13:01:46",{"id":177,"version":178,"summary_zh":179,"released_at":180},80920,"release\u002Fv5.2.1_tag","修复 v5.2 的一些 
bug","2023-01-01T07:02:15",{"id":182,"version":183,"summary_zh":80,"released_at":184},80921,"release\u002Fv5.2_bug_fix_tag","2022-12-06T06:32:30",{"id":186,"version":187,"summary_zh":80,"released_at":188},80922,"release\u002Fv5.2_tag","2022-12-03T00:58:09",{"id":190,"version":191,"summary_zh":192,"released_at":193},80923,"release\u002Fv5.1.1_tag","1. 修复停止条件。\n2. 修复启用共享上下文优化时注意力掩码选择的错误。\n3. 修复 Swin 模型中的 QK 缩放系数。\n4. 修复 T5 模型在束搜索下重复惩罚机制的错误。\n5. 修复 gpt_guide.md 中的错误。\n6. 修复 decoder_masked_multihead_attention_template 中的错误。","2022-10-17T06:44:25",{"id":195,"version":196,"summary_zh":197,"released_at":198},80924,"release\u002Fv5.1_bugfix_tag","修复 v5.1 版本中 T5 模型并行设置的 bug。","2022-08-23T01:21:02",{"id":200,"version":201,"summary_zh":80,"released_at":202},80925,"release\u002Fv5.1_tag","2022-08-16T03:02:39",{"id":204,"version":205,"summary_zh":80,"released_at":206},80926,"release\u002Fv5.0_tag","2022-04-15T16:15:18",{"id":208,"version":209,"summary_zh":80,"released_at":210},80927,"release\u002Fv1.0_tag","2021-04-03T00:27:21",{"id":212,"version":213,"summary_zh":80,"released_at":214},80928,"release\u002Fv4.0_tag","2021-04-05T07:39:40",{"id":216,"version":217,"summary_zh":80,"released_at":218},80929,"release\u002Fv3.1_tag","2021-07-29T00:39:29",{"id":220,"version":221,"summary_zh":80,"released_at":222},80930,"release\u002Fv3.0_tag","2021-04-03T00:41:24",{"id":224,"version":225,"summary_zh":80,"released_at":226},80931,"release\u002Fv2.1_tag","2021-04-03T00:36:44",{"id":228,"version":229,"summary_zh":80,"released_at":230},80932,"release\u002Fv2.0_tag","2021-04-03T00:33:02"]