[{"data":1,"prerenderedAt":-1},["ShallowReactive",2],{"similar-triton-inference-server--server":3,"tool-triton-inference-server--server":61},[4,18,26,36,44,53],{"id":5,"name":6,"github_repo":7,"description_zh":8,"stars":9,"difficulty_score":10,"last_commit_at":11,"category_tags":12,"status":17},4358,"openclaw","openclaw\u002Fopenclaw","OpenClaw 是一款专为个人打造的本地化 AI 助手，旨在让你在自己的设备上拥有完全可控的智能伙伴。它打破了传统 AI 助手局限于特定网页或应用的束缚，能够直接接入你日常使用的各类通讯渠道，包括微信、WhatsApp、Telegram、Discord、iMessage 等数十种平台。无论你在哪个聊天软件中发送消息，OpenClaw 都能即时响应，甚至支持在 macOS、iOS 和 Android 设备上进行语音交互，并提供实时的画布渲染功能供你操控。\n\n这款工具主要解决了用户对数据隐私、响应速度以及“始终在线”体验的需求。通过将 AI 部署在本地，用户无需依赖云端服务即可享受快速、私密的智能辅助，真正实现了“你的数据，你做主”。其独特的技术亮点在于强大的网关架构，将控制平面与核心助手分离，确保跨平台通信的流畅性与扩展性。\n\nOpenClaw 非常适合希望构建个性化工作流的技术爱好者、开发者，以及注重隐私保护且不愿被单一生态绑定的普通用户。只要具备基础的终端操作能力（支持 macOS、Linux 及 Windows WSL2），即可通过简单的命令行引导完成部署。如果你渴望拥有一个懂你",349277,3,"2026-04-06T06:32:30",[13,14,15,16],"Agent","开发框架","图像","数据工具","ready",{"id":19,"name":20,"github_repo":21,"description_zh":22,"stars":23,"difficulty_score":10,"last_commit_at":24,"category_tags":25,"status":17},3808,"stable-diffusion-webui","AUTOMATIC1111\u002Fstable-diffusion-webui","stable-diffusion-webui 是一个基于 Gradio 构建的网页版操作界面，旨在让用户能够轻松地在本地运行和使用强大的 Stable Diffusion 图像生成模型。它解决了原始模型依赖命令行、操作门槛高且功能分散的痛点，将复杂的 AI 绘图流程整合进一个直观易用的图形化平台。\n\n无论是希望快速上手的普通创作者、需要精细控制画面细节的设计师，还是想要深入探索模型潜力的开发者与研究人员，都能从中获益。其核心亮点在于极高的功能丰富度：不仅支持文生图、图生图、局部重绘（Inpainting）和外绘（Outpainting）等基础模式，还独创了注意力机制调整、提示词矩阵、负向提示词以及“高清修复”等高级功能。此外，它内置了 GFPGAN 和 CodeFormer 等人脸修复工具，支持多种神经网络放大算法，并允许用户通过插件系统无限扩展能力。即使是显存有限的设备，stable-diffusion-webui 也提供了相应的优化选项，让高质量的 AI 艺术创作变得触手可及。",162132,"2026-04-05T11:01:52",[14,15,13],{"id":27,"name":28,"github_repo":29,"description_zh":30,"stars":31,"difficulty_score":32,"last_commit_at":33,"category_tags":34,"status":17},1381,"everything-claude-code","affaan-m\u002Feverything-claude-code","everything-claude-code 是一套专为 AI 编程助手（如 Claude Code、Codex、Cursor 等）打造的高性能优化系统。它不仅仅是一组配置文件，而是一个经过长期实战打磨的完整框架，旨在解决 AI 代理在实际开发中面临的效率低下、记忆丢失、安全隐患及缺乏持续学习能力等核心痛点。\n\n通过引入技能模块化、直觉增强、记忆持久化机制以及内置的安全扫描功能，everything-claude-code 能显著提升 AI 在复杂任务中的表现，帮助开发者构建更稳定、更智能的生产级 AI 代理。其独特的“研究优先”开发理念和针对 Token 消耗的优化策略，使得模型响应更快、成本更低，同时有效防御潜在的攻击向量。\n\n这套工具特别适合软件开发者、AI 研究人员以及希望深度定制 AI 工作流的技术团队使用。无论您是在构建大型代码库，还是需要 AI 协助进行安全审计与自动化测试，everything-claude-code 都能提供强大的底层支持。作为一个曾荣获 Anthropic 黑客大奖的开源项目，它融合了多语言支持与丰富的实战钩子（hooks），让 AI 真正成长为懂上",160784,2,"2026-04-19T11:32:54",[14,13,35],"语言模型",{"id":37,"name":38,"github_repo":39,"description_zh":40,"stars":41,"difficulty_score":32,"last_commit_at":42,"category_tags":43,"status":17},2271,"ComfyUI","Comfy-Org\u002FComfyUI","ComfyUI 是一款功能强大且高度模块化的视觉 AI 引擎，专为设计和执行复杂的 Stable Diffusion 图像生成流程而打造。它摒弃了传统的代码编写模式，采用直观的节点式流程图界面，让用户通过连接不同的功能模块即可构建个性化的生成管线。\n\n这一设计巧妙解决了高级 AI 绘图工作流配置复杂、灵活性不足的痛点。用户无需具备编程背景，也能自由组合模型、调整参数并实时预览效果，轻松实现从基础文生图到多步骤高清修复等各类复杂任务。ComfyUI 拥有极佳的兼容性，不仅支持 Windows、macOS 和 Linux 全平台，还广泛适配 NVIDIA、AMD、Intel 及苹果 Silicon 等多种硬件架构，并率先支持 SDXL、Flux、SD3 等前沿模型。\n\n无论是希望深入探索算法潜力的研究人员和开发者，还是追求极致创作自由度的设计师与资深 AI 绘画爱好者，ComfyUI 都能提供强大的支持。其独特的模块化架构允许社区不断扩展新功能，使其成为当前最灵活、生态最丰富的开源扩散模型工具之一，帮助用户将创意高效转化为现实。",109154,"2026-04-18T11:18:24",[14,15,13],{"id":45,"name":46,"github_repo":47,"description_zh":48,"stars":49,"difficulty_score":32,"last_commit_at":50,"category_tags":51,"status":17},6121,"gemini-cli","google-gemini\u002Fgemini-cli","gemini-cli 是一款由谷歌推出的开源 AI 命令行工具，它将强大的 Gemini 大模型能力直接集成到用户的终端环境中。对于习惯在命令行工作的开发者而言，它提供了一条从输入提示词到获取模型响应的最短路径，无需切换窗口即可享受智能辅助。\n\n这款工具主要解决了开发过程中频繁上下文切换的痛点，让用户能在熟悉的终端界面内直接完成代码理解、生成、调试以及自动化运维任务。无论是查询大型代码库、根据草图生成应用，还是执行复杂的 Git 操作，gemini-cli 
都能通过自然语言指令高效处理。\n\n它特别适合广大软件工程师、DevOps 人员及技术研究人员使用。其核心亮点包括支持高达 100 万 token 的超长上下文窗口，具备出色的逻辑推理能力；内置 Google 搜索、文件操作及 Shell 命令执行等实用工具；更独特的是，它支持 MCP（模型上下文协议），允许用户灵活扩展自定义集成，连接如图像生成等外部能力。此外，个人谷歌账号即可享受免费的额度支持，且项目基于 Apache 2.0 协议完全开源，是提升终端工作效率的理想助手。",100752,"2026-04-10T01:20:03",[52,13,15,14],"插件",{"id":54,"name":55,"github_repo":56,"description_zh":57,"stars":58,"difficulty_score":32,"last_commit_at":59,"category_tags":60,"status":17},4721,"markitdown","microsoft\u002Fmarkitdown","MarkItDown 是一款由微软 AutoGen 团队打造的轻量级 Python 工具，专为将各类文件高效转换为 Markdown 格式而设计。它支持 PDF、Word、Excel、PPT、图片（含 OCR）、音频（含语音转录）、HTML 乃至 YouTube 链接等多种格式的解析，能够精准提取文档中的标题、列表、表格和链接等关键结构信息。\n\n在人工智能应用日益普及的今天，大语言模型（LLM）虽擅长处理文本，却难以直接读取复杂的二进制办公文档。MarkItDown 恰好解决了这一痛点，它将非结构化或半结构化的文件转化为模型“原生理解”且 Token 效率极高的 Markdown 格式，成为连接本地文件与 AI 分析 pipeline 的理想桥梁。此外，它还提供了 MCP（模型上下文协议）服务器，可无缝集成到 Claude Desktop 等 LLM 应用中。\n\n这款工具特别适合开发者、数据科学家及 AI 研究人员使用，尤其是那些需要构建文档检索增强生成（RAG）系统、进行批量文本分析或希望让 AI 助手直接“阅读”本地文件的用户。虽然生成的内容也具备一定可读性，但其核心优势在于为机器",93400,"2026-04-06T19:52:38",[52,14],{"id":62,"github_repo":63,"name":64,"description_en":65,"description_zh":66,"ai_summary_zh":67,"readme_en":68,"readme_zh":69,"quickstart_zh":70,"use_case_zh":71,"hero_image_url":72,"owner_login":73,"owner_name":74,"owner_avatar_url":75,"owner_bio":76,"owner_company":77,"owner_location":77,"owner_email":77,"owner_twitter":77,"owner_website":78,"owner_url":79,"languages":80,"stars":109,"forks":110,"last_commit_at":111,"license":112,"difficulty_score":10,"env_os":113,"env_gpu":114,"env_ram":115,"env_deps":116,"category_tags":127,"github_topics":128,"view_count":32,"oss_zip_url":77,"oss_zip_packed_at":77,"status":17,"created_at":136,"updated_at":137,"faqs":138,"releases":167},9846,"triton-inference-server\u002Fserver","server","The Triton Inference Server provides an optimized cloud and edge inferencing solution. ","Triton Inference Server 是一款由 NVIDIA 推出的开源推理服务软件，旨在为云端、数据中心及边缘设备提供高效优化的 AI 模型部署方案。它主要解决了企业在生产环境中面临的模型框架繁杂、硬件适配困难以及高并发请求下性能瓶颈等痛点。无论是研究人员还是工程开发者，都能利用它将来自 TensorFlow、PyTorch、ONNX、TensorRT 等多种主流框架训练的模型，统一部署在 NVIDIA GPU、x86\u002FARM CPU 或 AWS Inferentia 等不同硬件上。\n\n该工具的核心优势在于其强大的灵活性与高性能调度能力。它支持动态批处理和序列批处理，能显著提升实时推理和流媒体处理的吞吐量；同时具备并发模型执行功能，允许多个模型共享资源并行工作。此外，Triton 提供了开放的后端 API，允许用户通过 Python 轻松编写自定义后端或集成复杂的业务逻辑脚本（BLS），从而构建灵活的模型流水线。作为 NVIDIA AI Enterprise 平台的重要组成部分，Triton Inference Server 非常适合需要将 AI 模型从实验阶段推向大规模生","Triton Inference Server 是一款由 NVIDIA 推出的开源推理服务软件，旨在为云端、数据中心及边缘设备提供高效优化的 AI 模型部署方案。它主要解决了企业在生产环境中面临的模型框架繁杂、硬件适配困难以及高并发请求下性能瓶颈等痛点。无论是研究人员还是工程开发者，都能利用它将来自 TensorFlow、PyTorch、ONNX、TensorRT 等多种主流框架训练的模型，统一部署在 NVIDIA GPU、x86\u002FARM CPU 或 AWS Inferentia 等不同硬件上。\n\n该工具的核心优势在于其强大的灵活性与高性能调度能力。它支持动态批处理和序列批处理，能显著提升实时推理和流媒体处理的吞吐量；同时具备并发模型执行功能，允许多个模型共享资源并行工作。此外，Triton 提供了开放的后端 API，允许用户通过 Python 轻松编写自定义后端或集成复杂的业务逻辑脚本（BLS），从而构建灵活的模型流水线。作为 NVIDIA AI Enterprise 平台的重要组成部分，Triton Inference Server 非常适合需要将 AI 模型从实验阶段推向大规模生产应用的开发团队和数据科学家使用，帮助其 streamlined 地构建稳定、高效的推理服务系统。","\u003C!--\n# Copyright 2018-2026, NVIDIA CORPORATION & AFFILIATES. 
All rights reserved.\n#\n# Redistribution and use in source and binary forms, with or without\n# modification, are permitted provided that the following conditions\n# are met:\n#  * Redistributions of source code must retain the above copyright\n#    notice, this list of conditions and the following disclaimer.\n#  * Redistributions in binary form must reproduce the above copyright\n#    notice, this list of conditions and the following disclaimer in the\n#    documentation and\u002For other materials provided with the distribution.\n#  * Neither the name of NVIDIA CORPORATION nor the names of its\n#    contributors may be used to endorse or promote products derived\n#    from this software without specific prior written permission.\n#\n# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY\n# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE\n# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR\n# PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR\n# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,\n# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,\n# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR\n# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY\n# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT\n# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE\n# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.\n-->\n[![License](https:\u002F\u002Fimg.shields.io\u002Fbadge\u002FLicense-BSD3-lightgrey.svg)](https:\u002F\u002Fopensource.org\u002Flicenses\u002FBSD-3-Clause)\n\n>[!WARNING]\n>You are currently on the `main` branch which tracks under-development progress\n>towards the next release. The current release is version [2.67.0](https:\u002F\u002Fgithub.com\u002Ftriton-inference-server\u002Fserver\u002Freleases\u002Flatest)\n>and corresponds to the 26.03 container release on NVIDIA GPU Cloud (NGC).\n\n# Triton Inference Server\n\nTriton Inference Server is an open source inference serving software that\nstreamlines AI inferencing. Triton enables teams to deploy any AI model from\nmultiple deep learning and machine learning frameworks, including TensorRT,\nPyTorch, ONNX, OpenVINO, Python, RAPIDS FIL, and more. Triton\nInference Server supports inference across cloud, data center, edge and embedded\ndevices on NVIDIA GPUs, x86 and ARM CPU, or AWS Inferentia. Triton Inference\nServer delivers optimized performance for many query types, including real time,\nbatched, ensembles and audio\u002Fvideo streaming. 
Triton inference Server is part of\n[NVIDIA AI Enterprise](https:\u002F\u002Fwww.nvidia.com\u002Fen-us\u002Fdata-center\u002Fproducts\u002Fai-enterprise\u002F),\na software platform that accelerates the data science pipeline and streamlines\nthe development and deployment of production AI.\n\nMajor features include:\n\n- [Supports multiple deep learning\n  frameworks](https:\u002F\u002Fgithub.com\u002Ftriton-inference-server\u002Fbackend#where-can-i-find-all-the-backends-that-are-available-for-triton)\n- [Supports multiple machine learning\n  frameworks](https:\u002F\u002Fgithub.com\u002Ftriton-inference-server\u002Ffil_backend)\n- [Concurrent model\n  execution](docs\u002Fuser_guide\u002Farchitecture.md#concurrent-model-execution)\n- [Dynamic batching](docs\u002Fuser_guide\u002Fbatcher.md#dynamic-batcher)\n- [Sequence batching](docs\u002Fuser_guide\u002Fbatcher.md#sequence-batcher) and\n  [implicit state management](docs\u002Fuser_guide\u002Farchitecture.md#implicit-state-management)\n  for stateful models\n- Provides [Backend API](https:\u002F\u002Fgithub.com\u002Ftriton-inference-server\u002Fbackend) that\n  allows adding custom backends and pre\u002Fpost processing operations\n- Supports writing custom backends in python, a.k.a.\n  [Python-based backends.](https:\u002F\u002Fgithub.com\u002Ftriton-inference-server\u002Fbackend\u002Fblob\u002Fmain\u002Fdocs\u002Fpython_based_backends.md#python-based-backends)\n- Model pipelines using\n  [Ensembling](docs\u002Fuser_guide\u002Farchitecture.md#ensemble-models) or [Business\n  Logic Scripting\n  (BLS)](https:\u002F\u002Fgithub.com\u002Ftriton-inference-server\u002Fpython_backend#business-logic-scripting)\n- [HTTP\u002FREST and GRPC inference\n  protocols](docs\u002Fcustomization_guide\u002Finference_protocols.md) based on the community\n  developed [KServe\n  protocol](https:\u002F\u002Fgithub.com\u002Fkserve\u002Fkserve\u002Ftree\u002Fmaster\u002Fdocs\u002Fpredict-api\u002Fv2)\n- A [C API](docs\u002Fcustomization_guide\u002Finprocess_c_api.md) and\n  [Java API](docs\u002Fcustomization_guide\u002Finprocess_java_api.md)\n  allow Triton to link directly into your application for edge and other in-process use cases\n- [Metrics](docs\u002Fuser_guide\u002Fmetrics.md) indicating GPU utilization, server\n  throughput, server latency, and more\n\n**New to Triton Inference Server?** Make use of\n[these tutorials](https:\u002F\u002Fgithub.com\u002Ftriton-inference-server\u002Ftutorials)\nto begin your Triton journey!\n\nJoin the [Triton and TensorRT community](https:\u002F\u002Fwww.nvidia.com\u002Fen-us\u002Fdeep-learning-ai\u002Ftriton-tensorrt-newsletter\u002F) and\nstay current on the latest product updates, bug fixes, content, best practices,\nand more.  Need enterprise support?  
NVIDIA global support is available for Triton\nInference Server with the\n[NVIDIA AI Enterprise software suite](https:\u002F\u002Fwww.nvidia.com\u002Fen-us\u002Fdata-center\u002Fproducts\u002Fai-enterprise\u002F).\n\n## Serve a Model in 3 Easy Steps\n\n```bash\n# Step 1: Create the example model repository\ngit clone -b r26.03 https:\u002F\u002Fgithub.com\u002Ftriton-inference-server\u002Fserver.git\ncd server\u002Fdocs\u002Fexamples\n.\u002Ffetch_models.sh\n\n# Step 2: Launch triton from the NGC Triton container\ndocker run --gpus=1 --rm --net=host -v ${PWD}\u002Fmodel_repository:\u002Fmodels nvcr.io\u002Fnvidia\u002Ftritonserver:26.03-py3 tritonserver --model-repository=\u002Fmodels --model-control-mode explicit --load-model densenet_onnx\n\n# Step 3: Sending an Inference Request\n# In a separate console, launch the image_client example from the NGC Triton SDK container\ndocker run -it --rm --net=host nvcr.io\u002Fnvidia\u002Ftritonserver:26.03-py3-sdk \u002Fworkspace\u002Finstall\u002Fbin\u002Fimage_client -m densenet_onnx -c 3 -s INCEPTION \u002Fworkspace\u002Fimages\u002Fmug.jpg\n\n# Inference should return the following\nImage '\u002Fworkspace\u002Fimages\u002Fmug.jpg':\n    15.346230 (504) = COFFEE MUG\n    13.224326 (968) = CUP\n    10.422965 (505) = COFFEEPOT\n```\nPlease read the [QuickStart](docs\u002Fgetting_started\u002Fquickstart.md) guide for additional information\nregarding this example. The quickstart guide also contains an example of how to launch Triton on [CPU-only systems](docs\u002Fgetting_started\u002Fquickstart.md#run-on-cpu-only-system). New to Triton and wondering where to get started? Watch the [Getting Started video](https:\u002F\u002Fyoutu.be\u002FNQDtfSi5QF4).\n\n## Examples and Tutorials\n\nCheck out [NVIDIA LaunchPad](https:\u002F\u002Fwww.nvidia.com\u002Fen-us\u002Fdata-center\u002Fproducts\u002Fai-enterprise-suite\u002Ftrial\u002F)\nfor free access to a set of hands-on labs with Triton Inference Server hosted on\nNVIDIA infrastructure.\n\nSpecific end-to-end examples for popular models, such as ResNet, BERT, and DLRM\nare located in the\n[NVIDIA Deep Learning Examples](https:\u002F\u002Fgithub.com\u002FNVIDIA\u002FDeepLearningExamples)\npage on GitHub. 
The\n[NVIDIA Developer Zone](https:\u002F\u002Fdeveloper.nvidia.com\u002Fnvidia-triton-inference-server)\ncontains additional documentation, presentations, and examples.\n\n## Documentation\n\n### Build and Deploy\n\nThe recommended way to build and use Triton Inference Server is with Docker\nimages.\n\n- [Install Triton Inference Server with Docker containers](docs\u002Fcustomization_guide\u002Fbuild.md#building-with-docker) (*Recommended*)\n- [Install Triton Inference Server without Docker containers](docs\u002Fcustomization_guide\u002Fbuild.md#building-without-docker)\n- [Build a custom Triton Inference Server Docker container](docs\u002Fcustomization_guide\u002Fcompose.md)\n- [Build Triton Inference Server from source](docs\u002Fcustomization_guide\u002Fbuild.md#building-on-unsupported-platforms)\n- [Build Triton Inference Server for Windows 10](docs\u002Fcustomization_guide\u002Fbuild.md#building-for-windows-10)\n- Examples for deploying Triton Inference Server with Kubernetes and Helm on [GCP](deploy\u002Fgcp\u002FREADME.md),\n  [AWS](deploy\u002Faws\u002FREADME.md), and [NVIDIA FleetCommand](deploy\u002Ffleetcommand\u002FREADME.md)\n- [Secure Deployment Considerations](docs\u002Fcustomization_guide\u002Fdeploy.md)\n\n### Using Triton\n\n#### Preparing Models for Triton Inference Server\n\nThe first step in using Triton to serve your models is to place one or\nmore models into a [model repository](docs\u002Fuser_guide\u002Fmodel_repository.md). Depending on\nthe type of the model and on what Triton capabilities you want to enable for\nthe model, you may need to create a [model\nconfiguration](docs\u002Fuser_guide\u002Fmodel_configuration.md) for the model.\n\n- [Add custom operations to Triton if needed by your model](docs\u002Fuser_guide\u002Fcustom_operations.md)\n- Enable model pipelining with [Model Ensemble](docs\u002Fuser_guide\u002Farchitecture.md#ensemble-models)\n  and [Business Logic Scripting (BLS)](https:\u002F\u002Fgithub.com\u002Ftriton-inference-server\u002Fpython_backend#business-logic-scripting)\n- Optimize your models setting [scheduling and batching](docs\u002Fuser_guide\u002Farchitecture.md#models-and-schedulers)\n  parameters and [model instances](docs\u002Fuser_guide\u002Fmodel_configuration.md#instance-groups).\n- Use the [Model Analyzer tool](https:\u002F\u002Fgithub.com\u002Ftriton-inference-server\u002Fmodel_analyzer)\n  to help optimize your model configuration with profiling\n- Learn how to [explicitly manage what models are available by loading and\n  unloading models](docs\u002Fuser_guide\u002Fmodel_management.md)\n\n#### Configure and Use Triton Inference Server\n\n- Read the [Quick Start Guide](docs\u002Fgetting_started\u002Fquickstart.md) to run Triton Inference\n  Server on both GPU and CPU\n- Triton supports multiple execution engines, called\n  [backends](https:\u002F\u002Fgithub.com\u002Ftriton-inference-server\u002Fbackend#where-can-i-find-all-the-backends-that-are-available-for-triton), including\n  [TensorRT](https:\u002F\u002Fgithub.com\u002Ftriton-inference-server\u002Ftensorrt_backend),\n  [PyTorch](https:\u002F\u002Fgithub.com\u002Ftriton-inference-server\u002Fpytorch_backend),\n  [ONNX](https:\u002F\u002Fgithub.com\u002Ftriton-inference-server\u002Fonnxruntime_backend),\n  [OpenVINO](https:\u002F\u002Fgithub.com\u002Ftriton-inference-server\u002Fopenvino_backend),\n  [Python](https:\u002F\u002Fgithub.com\u002Ftriton-inference-server\u002Fpython_backend), and more\n- Not all the above backends are supported on every platform supported by 
Triton.\n  Look at the\n  [Backend-Platform Support Matrix](https:\u002F\u002Fgithub.com\u002Ftriton-inference-server\u002Fbackend\u002Fblob\u002Fmain\u002Fdocs\u002Fbackend_platform_support_matrix.md)\n  to learn which backends are supported on your target platform.\n- Learn how to [optimize performance](docs\u002Fuser_guide\u002Foptimization.md) using the\n  [Performance Analyzer](https:\u002F\u002Fgithub.com\u002Ftriton-inference-server\u002Fperf_analyzer\u002Fblob\u002Fmain\u002FREADME.md)\n  and\n  [Model Analyzer](https:\u002F\u002Fgithub.com\u002Ftriton-inference-server\u002Fmodel_analyzer)\n- Learn how to [manage loading and unloading models](docs\u002Fuser_guide\u002Fmodel_management.md) in\n  Triton\n- Send requests directly to Triton with the [HTTP\u002FREST JSON-based\n  or gRPC protocols](docs\u002Fcustomization_guide\u002Finference_protocols.md#httprest-and-grpc-protocols)\n\n#### Client Support and Examples\n\nA Triton *client* application sends inference and other requests to Triton. The\n[Python and C++ client libraries](https:\u002F\u002Fgithub.com\u002Ftriton-inference-server\u002Fclient)\nprovide APIs to simplify this communication.\n\n- Review client examples for [C++](https:\u002F\u002Fgithub.com\u002Ftriton-inference-server\u002Fclient\u002Fblob\u002Fmain\u002Fsrc\u002Fc%2B%2B\u002Fexamples),\n  [Python](https:\u002F\u002Fgithub.com\u002Ftriton-inference-server\u002Fclient\u002Fblob\u002Fmain\u002Fsrc\u002Fpython\u002Fexamples),\n  and [Java](https:\u002F\u002Fgithub.com\u002Ftriton-inference-server\u002Fclient\u002Fblob\u002Fmain\u002Fsrc\u002Fjava\u002Fsrc\u002Fmain\u002Fjava\u002Ftriton\u002Fclient\u002Fexamples)\n- Configure [HTTP](https:\u002F\u002Fgithub.com\u002Ftriton-inference-server\u002Fclient#http-options)\n  and [gRPC](https:\u002F\u002Fgithub.com\u002Ftriton-inference-server\u002Fclient#grpc-options)\n  client options\n- Send input data (e.g. 
a jpeg image) directly to Triton in the [body of an HTTP\n  request without any additional metadata](https:\u002F\u002Fgithub.com\u002Ftriton-inference-server\u002Fserver\u002Fblob\u002Fmain\u002Fdocs\u002Fprotocol\u002Fextension_binary_data.md#raw-binary-request)\n\n### Extend Triton\n\n[Triton Inference Server's architecture](docs\u002Fuser_guide\u002Farchitecture.md) is specifically\ndesigned for modularity and flexibility\n\n- [Customize Triton Inference Server container](docs\u002Fcustomization_guide\u002Fcompose.md) for your use case\n- [Create custom backends](https:\u002F\u002Fgithub.com\u002Ftriton-inference-server\u002Fbackend)\n  in either [C\u002FC++](https:\u002F\u002Fgithub.com\u002Ftriton-inference-server\u002Fbackend\u002Fblob\u002Fmain\u002FREADME.md#triton-backend-api)\n  or [Python](https:\u002F\u002Fgithub.com\u002Ftriton-inference-server\u002Fpython_backend)\n- Create [decoupled backends and models](docs\u002Fuser_guide\u002Fdecoupled_models.md) that can send\n  multiple responses for a request or not send any responses for a request\n- Use a [Triton repository agent](docs\u002Fcustomization_guide\u002Frepository_agents.md) to add functionality\n  that operates when a model is loaded and unloaded, such as authentication,\n  decryption, or conversion\n- Deploy Triton on [Jetson and JetPack](docs\u002Fuser_guide\u002Fjetson.md)\n- [Use Triton on AWS\n   Inferentia](https:\u002F\u002Fgithub.com\u002Ftriton-inference-server\u002Fpython_backend\u002Ftree\u002Fmain\u002Finferentia)\n\n### Additional Documentation\n\n- [FAQ](docs\u002Fuser_guide\u002Ffaq.md)\n- [User Guide](docs\u002FREADME.md#user-guide)\n- [Customization Guide](docs\u002FREADME.md#customization-guide)\n- [Release Notes](https:\u002F\u002Fdocs.nvidia.com\u002Fdeeplearning\u002Ftriton-inference-server\u002Frelease-notes\u002Findex.html)\n- [GPU, Driver, and CUDA Support\nMatrix](https:\u002F\u002Fdocs.nvidia.com\u002Fdeeplearning\u002Fdgx\u002Fsupport-matrix\u002Findex.html)\n\n## Contributing\n\nContributions to Triton Inference Server are more than welcome. To\ncontribute please review the [contribution\nguidelines](CONTRIBUTING.md). If you have a backend, client,\nexample or similar contribution that is not modifying the core of\nTriton, then you should file a PR in the [contrib\nrepo](https:\u002F\u002Fgithub.com\u002Ftriton-inference-server\u002Fcontrib).\n\n## Reporting problems, asking questions\n\nWe appreciate any feedback, questions or bug reporting regarding this project.\nWhen posting [issues in GitHub](https:\u002F\u002Fgithub.com\u002Ftriton-inference-server\u002Fserver\u002Fissues),\nfollow the process outlined in the [Stack Overflow document](https:\u002F\u002Fstackoverflow.com\u002Fhelp\u002Fmcve).\nEnsure posted examples are:\n- minimal – use as little code as possible that still produces the\n  same problem\n- complete – provide all parts needed to reproduce the problem. Check\n  if you can strip external dependencies and still show the problem. The\n  less time we spend on reproducing problems the more time we have to\n  fix it\n- verifiable – test the code you're about to provide to make sure it\n  reproduces the problem. 
Remove all other problems that are not\n  related to your request\u002Fquestion.\n\nFor issues, please use the provided bug report and feature request templates.\n\nFor questions, we recommend posting in our community\n[GitHub Discussions.](https:\u002F\u002Fgithub.com\u002Ftriton-inference-server\u002Fserver\u002Fdiscussions)\n\n## For more information\n\nPlease refer to the [NVIDIA Developer Triton page](https:\u002F\u002Fdeveloper.nvidia.com\u002Fnvidia-triton-inference-server)\nfor more information.\n","\u003C!--\n# 版权所有 © 2018-2026，英伟达公司及其关联公司。保留所有权利。\n#\n# 在遵守以下条件的前提下，允许以源代码和二进制形式进行再分发和使用，\n# 不论是否修改：\n#  * 源代码的再分发必须保留上述版权声明、本条件列表及以下免责声明。\n#  * 二进制形式的再分发必须在随分发提供的文档和\u002F或其他材料中复制上述版权声明、\n#    本条件列表及以下免责声明。\n#  * 未经事先书面许可，不得使用英伟达公司或其贡献者的名称来背书或推广由此软件衍生的产品。\n#\n# 本软件由版权所有者“按原样”提供，不提供任何明示或暗示的担保，包括但不限于\n# 对适销性和特定用途适用性的暗示担保。在任何情况下，版权所有者或贡献者均不对任何直接、\n# 间接、偶然、特殊、示范性或后果性损害承担责任（包括但不限于替代品或服务的获取、\n# 使用损失、数据丢失、利润损失或业务中断），即使已被告知发生此类损害的可能性。\n-->\n[![许可证](https:\u002F\u002Fimg.shields.io\u002Fbadge\u002FLicense-BSD3-lightgrey.svg)](https:\u002F\u002Fopensource.org\u002Flicenses\u002FBSD-3-Clause)\n\n>[!警告]\n>您当前位于 `main` 分支，该分支跟踪下一版本的开发进展。当前发布的稳定版本为 [2.67.0](https:\u002F\u002Fgithub.com\u002Ftriton-inference-server\u002Fserver\u002Freleases\u002Flatest)，对应 NVIDIA GPU Cloud (NGC) 上的 26.03 容器版本。\n\n# Triton 推理服务器\n\nTriton 推理服务器是一款开源的推理服务软件，旨在简化 AI 推理流程。它使团队能够部署来自多种深度学习和机器学习框架的 AI 模型，包括 TensorRT、PyTorch、ONNX、OpenVINO、Python、RAPIDS FIL 等。Triton 推理服务器支持跨云、数据中心、边缘和嵌入式设备的推理工作负载，可在 NVIDIA GPU、x86 和 ARM CPU，或 AWS Inferentia 上运行。Triton 推理服务器针对多种查询类型提供了优化性能，包括实时推理、批处理推理、模型集成以及音频\u002F视频流式推理。Triton 推理服务器是 [NVIDIA AI Enterprise](https:\u002F\u002Fwww.nvidia.com\u002Fen-us\u002Fdata-center\u002Fproducts\u002Fai-enterprise\u002F) 的一部分，该软件平台可加速数据科学流水线，并简化生产级 AI 的开发与部署。\n\n主要特性包括：\n\n- [支持多种深度学习框架](https:\u002F\u002Fgithub.com\u002Ftriton-inference-server\u002Fbackend#where-can-i-find-all-the-backends-that-are-available-for-triton)\n- [支持多种机器学习框架](https:\u002F\u002Fgithub.com\u002Ftriton-inference-server\u002Ffil_backend)\n- [并发模型执行](docs\u002Fuser_guide\u002Farchitecture.md#concurrent-model-execution)\n- [动态批处理](docs\u002Fuser_guide\u002Fbatcher.md#dynamic-batcher)\n- [序列批处理](docs\u002Fuser_guide\u002Fbatcher.md#sequence-batcher) 和针对有状态模型的 [隐式状态管理](docs\u002Fuser_guide\u002Farchitecture.md#implicit-state-management)\n- 提供 [Backend API](https:\u002F\u002Fgithub.com\u002Ftriton-inference-server\u002Fbackend)，允许添加自定义后端及预\u002F后处理操作\n- 支持用 Python 编写自定义后端，即 [基于 Python 的后端](https:\u002F\u002Fgithub.com\u002Ftriton-inference-server\u002Fbackend\u002Fblob\u002Fmain\u002Fdocs\u002Fpython_based_backends.md#python-based-backends)\n- 使用 [模型集成](docs\u002Fuser_guide\u002Farchitecture.md#ensemble-models) 或 [业务逻辑脚本 (BLS)](https:\u002F\u002Fgithub.com\u002Ftriton-inference-server\u002Fpython_backend#business-logic-scripting) 构建模型流水线\n- 基于社区开发的 [KServe 协议](https:\u002F\u002Fgithub.com\u002Fkserve\u002Fkserve\u002Ftree\u002Fmaster\u002Fdocs\u002Fpredict-api\u002Fv2) 的 [HTTP\u002FREST 和 GRPC 推理协议](docs\u002Fcustomization_guide\u002Finference_protocols.md)\n- [C API](docs\u002Fcustomization_guide\u002Finprocess_c_api.md) 和 [Java API](docs\u002Fcustomization_guide\u002Finprocess_java_api.md) 允许 Triton 直接集成到您的应用程序中，适用于边缘及其他进程内应用场景\n- [指标](docs\u002Fuser_guide\u002Fmetrics.md)，用于监控 GPU 利用率、服务器吞吐量、服务器延迟等\n\n**初次使用 Triton 推理服务器？** 请参考 [这些教程](https:\u002F\u002Fgithub.com\u002Ftriton-inference-server\u002Ftutorials)，开始您的 Triton 学习之旅！\n\n加入 [Triton 和 TensorRT 
社区](https:\u002F\u002Fwww.nvidia.com\u002Fen-us\u002Fdeep-learning-ai\u002Ftriton-tensorrt-newsletter\u002F) ，及时了解最新产品更新、漏洞修复、内容、最佳实践等信息。如需企业级支持，请联系 NVIDIA 全球技术支持，Triton 推理服务器可与 [NVIDIA AI Enterprise 软件套件](https:\u002F\u002Fwww.nvidia.com\u002Fen-us\u002Fdata-center\u002Fproducts\u002Fai-enterprise\u002F) 配合使用。\n\n## 三步轻松部署模型\n\n```bash\n# 第一步：创建示例模型仓库\ngit clone -b r26.03 https:\u002F\u002Fgithub.com\u002Ftriton-inference-server\u002Fserver.git\ncd server\u002Fdocs\u002Fexamples\n.\u002Ffetch_models.sh\n\n# 第二步：从 NGC Triton 容器启动 Triton\ndocker run --gpus=1 --rm --net=host -v ${PWD}\u002Fmodel_repository:\u002Fmodels nvcr.io\u002Fnvidia\u002Ftritonserver:26.03-py3 tritonserver --model-repository=\u002Fmodels --model-control-mode explicit --load-model densenet_onnx\n\n# 第三步：发送推理请求\n# 在另一个终端窗口中，从 NGC Triton SDK 容器启动 image_client 示例\ndocker run -it --rm --net=host nvcr.io\u002Fnvidia\u002Ftritonserver:26.03-py3-sdk \u002Fworkspace\u002Finstall\u002Fbin\u002Fimage_client -m densenet_onnx -c 3 -s INCEPTION \u002Fworkspace\u002Fimages\u002Fmug.jpg\n\n# 推理结果应如下所示：\n图像 '\u002Fworkspace\u002Fimages\u002Fmug.jpg':\n    15.346230 (504) = 咖啡杯\n    13.224326 (968) = 杯子\n    10.422965 (505) = 咖啡壶\n```\n有关此示例的更多信息，请参阅 [快速入门](docs\u002Fgetting_started\u002Fquickstart.md) 指南。快速入门指南还包含如何在仅 CPU 系统上启动 Triton 的示例 ([docs\u002Fgetting_started\u002Fquickstart.md#run-on-cpu-only-system])。如果您是 Triton 新手并想知道从哪里开始，请观看 [入门视频](https:\u002F\u002Fyoutu.be\u002FNQDtfSi5QF4)。\n\n## 示例与教程\n\n请访问 [NVIDIA LaunchPad](https:\u002F\u002Fwww.nvidia.com\u002Fen-us\u002Fdata-center\u002Fproducts\u002Fai-enterprise-suite\u002Ftrial\u002F)，\n免费获取一系列动手实验，这些实验使用在 NVIDIA 基础设施上托管的 Triton 推理服务器。\n\n针对 ResNet、BERT 和 DLRM 等流行模型的具体端到端示例，可在 GitHub 上的\n[NVIDIA 深度学习示例](https:\u002F\u002Fgithub.com\u002FNVIDIA\u002FDeepLearningExamples) 页面中找到。此外，\n[NVIDIA 开发者专区](https:\u002F\u002Fdeveloper.nvidia.com\u002Fnvidia-triton-inference-server)\n还包含更多文档、演示文稿和示例。\n\n## 文档\n\n### 构建与部署\n\n构建和使用 Triton 推理服务器的推荐方式是通过 Docker 镜像。\n\n- 使用 Docker 容器安装 Triton 推理服务器（推荐）[docs\u002Fcustomization_guide\u002Fbuild.md#building-with-docker]\n- 不使用 Docker 容器安装 Triton 推理服务器 [docs\u002Fcustomization_guide\u002Fbuild.md#building-without-docker]\n- 构建自定义的 Triton 推理服务器 Docker 容器 [docs\u002Fcustomization_guide\u002Fcompose.md]\n- 从源代码构建 Triton 推理服务器 [docs\u002Fcustomization_guide\u002Fbuild.md#building-on-unsupported-platforms]\n- 为 Windows 10 构建 Triton 推理服务器 [docs\u002Fcustomization_guide\u002Fbuild.md#building-for-windows-10]\n- 在 [GCP](deploy\u002Fgcp\u002FREADME.md)、[AWS](deploy\u002Faws\u002FREADME.md) 和 [NVIDIA FleetCommand](deploy\u002Ffleetcommand\u002FREADME.md) 上使用 Kubernetes 和 Helm 部署 Triton 推理服务器的示例\n- 安全部署注意事项 [docs\u002Fcustomization_guide\u002Fdeploy.md]\n\n### 使用 Triton\n\n#### 为 Triton 推理服务器准备模型\n\n使用 Triton 提供模型服务的第一步，是将一个或多个模型放入 [模型存储库](docs\u002Fuser_guide\u002Fmodel_repository.md)。\n根据模型类型以及您希望为该模型启用的 Triton 功能，可能需要为模型创建 [模型配置](docs\u002Fuser_guide\u002Fmodel_configuration.md)。\n\n- 如果您的模型需要，可向 Triton 添加自定义操作 [docs\u002Fuser_guide\u002Fcustom_operations.md]\n- 使用 [模型集成](docs\u002Fuser_guide\u002Farchitecture.md#ensemble-models) 和 [业务逻辑脚本 (BLS)](https:\u002F\u002Fgithub.com\u002Ftriton-inference-server\u002Fpython_backend#business-logic-scripting) 启用模型流水线\n- 通过设置 [调度与批处理](docs\u002Fuser_guide\u002Farchitecture.md#models-and-schedulers) 参数以及 [模型实例](docs\u002Fuser_guide\u002Fmodel_configuration.md#instance-groups) 来优化您的模型\n- 使用 [模型分析工具](https:\u002F\u002Fgithub.com\u002Ftriton-inference-server\u002Fmodel_analyzer) 通过性能剖析帮助优化您的模型配置\n- 学习如何通过加载和卸载模型来 
[明确管理可用的模型](docs\u002Fuser_guide\u002Fmodel_management.md)\n\n#### 配置并使用 Triton 推理服务器\n\n- 阅读 [快速入门指南](docs\u002Fgetting_started\u002Fquickstart.md)，了解如何在 GPU 和 CPU 上运行 Triton 推理服务器\n- Triton 支持多种执行引擎，称为 [后端](https:\u002F\u002Fgithub.com\u002Ftriton-inference-server\u002Fbackend#where-can-i-find-all-the-backends-that-are-available-for-triton)，包括\n  [TensorRT](https:\u002F\u002Fgithub.com\u002Ftriton-inference-server\u002Ftensorrt_backend)、\n  [PyTorch](https:\u002F\u002Fgithub.com\u002Ftriton-inference-server\u002Fpytorch_backend)、\n  [ONNX](https:\u002F\u002Fgithub.com\u002Ftriton-inference-server\u002Fonnxruntime_backend)、\n  [OpenVINO](https:\u002F\u002Fgithub.com\u002Ftriton-inference-server\u002Fopenvino_backend)、\n  [Python](https:\u002F\u002Fgithub.com\u002Ftriton-inference-server\u002Fpython_backend) 等。\n- 并非所有上述后端都支持 Triton 支持的所有平台。请查看\n  [后端-平台支持矩阵](https:\u002F\u002Fgithub.com\u002Ftriton-inference-server\u002Fbackend\u002Fblob\u002Fmain\u002Fdocs\u002Fbackend_platform_support_matrix.md)，\n  以了解哪些后端在您的目标平台上受支持。\n- 学习如何使用 [性能分析器](https:\u002F\u002Fgithub.com\u002Ftriton-inference-server\u002Fperf_analyzer\u002Fblob\u002Fmain\u002FREADME.md) 和\n  [模型分析器](https:\u002F\u002Fgithub.com\u002Ftriton-inference-server\u002Fmodel_analyzer) 来 [优化性能](docs\u002Fuser_guide\u002Foptimization.md)\n- 学习如何在 Triton 中 [管理模型的加载和卸载](docs\u002Fuser_guide\u002Fmodel_management.md)\n- 直接使用 [基于 HTTP\u002FREST JSON 或 gRPC 协议](docs\u002Fcustomization_guide\u002Finference_protocols.md#httprest-and-grpc-protocols) 向 Triton 发送请求\n\n#### 客户端支持与示例\n\nTriton *客户端* 应用程序会向 Triton 发送推理和其他请求。[Python 和 C++ 客户端库](https:\u002F\u002Fgithub.com\u002Ftriton-inference-server\u002Fclient)\n提供了简化此通信的 API。\n\n- 查看 [C++](https:\u002F\u002Fgithub.com\u002Ftriton-inference-server\u002Fclient\u002Fblob\u002Fmain\u002Fsrc\u002Fc%2B%2B\u002Fexamples)、\n  [Python](https:\u002F\u002Fgithub.com\u002Ftriton-inference-server\u002Fclient\u002Fblob\u002Fmain\u002Fsrc\u002Fpython\u002Fexamples) 和\n  [Java](https:\u002F\u002Fgithub.com\u002Ftriton-inference-server\u002Fclient\u002Fblob\u002Fmain\u002Fsrc\u002Fjava\u002Fsrc\u002Fmain\u002Fjava\u002Ftriton\u002Fclient\u002Fexamples) 的客户端示例\n- 配置 [HTTP](https:\u002F\u002Fgithub.com\u002Ftriton-inference-server\u002Fclient#http-options) 和 [gRPC](https:\u002F\u002Fgithub.com\u002Ftriton-inference-server\u002Fclient#grpc-options) 客户端选项\n- 将输入数据（例如一张 JPEG 图片）直接作为 HTTP 请求的主体发送给 Triton，而无需任何额外的元数据 [https:\u002F\u002Fgithub.com\u002Ftriton-inference-server\u002Fserver\u002Fblob\u002Fmain\u002Fdocs\u002Fprotocol\u002Fextension_binary_data.md#raw-binary-request]\n\n### 扩展 Triton\n\n[Triton 推理服务器的架构](docs\u002Fuser_guide\u002Farchitecture.md) 专为模块化和灵活性而设计。\n\n- 根据您的用例定制 Triton 推理服务器容器 [docs\u002Fcustomization_guide\u002Fcompose.md]\n- 创建自定义后端 [https:\u002F\u002Fgithub.com\u002Ftriton-inference-server\u002Fbackend]，\n  可以使用 [C\u002FC++]（参见 [Triton 后端 API](https:\u002F\u002Fgithub.com\u002Ftriton-inference-server\u002Fbackend\u002Fblob\u002Fmain\u002FREADME.md#triton-backend-api)）\n  或 [Python](https:\u002F\u002Fgithub.com\u002Ftriton-inference-server\u002Fpython_backend) 实现。\n- 创建 [解耦后端和模型](docs\u002Fuser_guide\u002Fdecoupled_models.md)，这些后端和模型可以对一个请求返回多个响应，或者不返回任何响应。\n- 使用 [Triton 存储库代理](docs\u002Fcustomization_guide\u002Frepository_agents.md) 添加在模型加载和卸载时运行的功能，例如身份验证、解密或转换。\n- 在 [Jetson 和 JetPack](docs\u002Fuser_guide\u002Fjetson.md) 上部署 Triton\n- 在 AWS Inferentia 上使用 Triton [https:\u002F\u002Fgithub.com\u002Ftriton-inference-server\u002Fpython_backend\u002Ftree\u002Fmain\u002Finferentia]\n\n### 
其他文档\n\n- [常见问题解答](docs\u002Fuser_guide\u002Ffaq.md)\n- [用户指南](docs\u002FREADME.md#user-guide)\n- [自定义指南](docs\u002FREADME.md#customization-guide)\n- [发行说明](https:\u002F\u002Fdocs.nvidia.com\u002Fdeeplearning\u002Ftriton-inference-server\u002Frelease-notes\u002Findex.html)\n- [GPU、驱动程序和 CUDA 支持矩阵](https:\u002F\u002Fdocs.nvidia.com\u002Fdeeplearning\u002Fdgx\u002Fsupport-matrix\u002Findex.html)\n\n## 贡献\n\n我们非常欢迎对 Triton 推理服务器的贡献。请先阅读 [贡献指南](CONTRIBUTING.md)。如果您有后端、客户端、示例代码或其他类似贡献，且这些内容并不涉及修改 Triton 的核心功能，则应将您的 Pull Request 提交到 [contrib 仓库](https:\u002F\u002Fgithub.com\u002Ftriton-inference-server\u002Fcontrib)。\n\n## 报告问题与提问\n\n我们非常感谢您对该项目的任何反馈、问题或 bug 报告。在 GitHub 上提交 [问题](https:\u002F\u002Fgithub.com\u002Ftriton-inference-server\u002Fserver\u002Fissues) 时，请遵循 [Stack Overflow 文档](https:\u002F\u002Fstackoverflow.com\u002Fhelp\u002Fmcve) 中概述的流程。请确保您提供的示例：\n- 尽量精简——使用尽可能少的代码，但仍能复现相同的问题。\n- 完整——提供复现问题所需的所有部分。尝试去除外部依赖，看看是否仍能展示该问题。我们花在复现问题上的时间越少，就能有更多时间来修复它。\n- 可验证——在提交代码之前，请先测试以确认其确实能够复现问题。同时，请移除所有与您的请求或问题无关的内容。\n\n对于问题，请使用提供的 bug 报告和功能请求模板。\n\n对于疑问，我们建议您在我们的社区 [GitHub Discussions](https:\u002F\u002Fgithub.com\u002Ftriton-inference-server\u002Fserver\u002Fdiscussions) 中发帖。\n\n## 更多信息\n\n如需更多信息，请参阅 [NVIDIA 开发者 Triton 页面](https:\u002F\u002Fdeveloper.nvidia.com\u002Fnvidia-triton-inference-server)。","# Triton Inference Server 快速上手指南\n\nTriton Inference Server 是一款开源的推理服务软件，支持在云、数据中心、边缘及嵌入式设备上部署来自 TensorFlow、PyTorch、ONNX、TensorRT 等多种框架的 AI 模型。\n\n## 环境准备\n\n### 系统要求\n- **操作系统**: Linux (推荐 Ubuntu 18.04\u002F20.04\u002F22.04) 或 Windows 10。\n- **硬件**:\n  - **GPU 模式**: NVIDIA GPU (需安装兼容的 CUDA 驱动)。\n  - **CPU 模式**: x86 或 ARM 架构 CPU。\n- **容器运行时**: 已安装 Docker 和 NVIDIA Container Toolkit (用于 GPU 支持)。\n\n### 前置依赖\n确保已安装 Docker 并配置好 NVIDIA 运行时：\n```bash\n# 验证 NVIDIA Docker 是否正常工作\ndocker run --rm --gpus all nvidia\u002Fcuda:11.0-base nvidia-smi\n```\n> **注意**: 国内用户若拉取 `nvcr.io` 镜像较慢，可配置 Docker 镜像加速器（如阿里云、腾讯云等），但官方镜像仓库 `nvcr.io` 目前暂无官方中国镜像源，建议配置通用 Docker 加速。\n\n## 安装步骤\n\n推荐使用 Docker 容器方式运行，无需手动编译源码。\n\n### 1. 获取示例模型仓库\n克隆服务器代码库并下载示例模型：\n```bash\ngit clone -b r26.03 https:\u002F\u002Fgithub.com\u002Ftriton-inference-server\u002Fserver.git\ncd server\u002Fdocs\u002Fexamples\n.\u002Ffetch_models.sh\n```\n\n### 2. 
启动 Triton 服务器\n使用 NVIDIA NGC 提供的预构建容器启动服务。以下命令以加载 `densenet_onnx` 模型为例：\n\n```bash\ndocker run --gpus=1 --rm --net=host -v ${PWD}\u002Fmodel_repository:\u002Fmodels nvcr.io\u002Fnvidia\u002Ftritonserver:26.03-py3 tritonserver --model-repository=\u002Fmodels --model-control-mode explicit --load-model densenet_onnx\n```\n*参数说明:*\n- `--gpus=1`: 指定使用 1 块 GPU。若在纯 CPU 环境运行，请移除此参数并参考官方文档的 CPU 运行指南。\n- `-v ${PWD}\u002Fmodel_repository:\u002Fmodels`: 将本地模型目录挂载到容器内。\n- `--load-model densenet_onnx`: 显式加载指定模型。\n\n## 基本使用\n\n启动服务器后，在**另一个终端窗口**中，使用 Triton SDK 容器发送推理请求。\n\n### 发送推理请求\n运行以下命令测试图像分类模型：\n\n```bash\ndocker run -it --rm --net=host nvcr.io\u002Fnvidia\u002Ftritonserver:26.03-py3-sdk \u002Fworkspace\u002Finstall\u002Fbin\u002Fimage_client -m densenet_onnx -c 3 -s INCEPTION \u002Fworkspace\u002Fimages\u002Fmug.jpg\n```\n\n### 预期输出\n如果一切正常，您将看到类似以下的分类结果：\n```text\nImage '\u002Fworkspace\u002Fimages\u002Fmug.jpg':\n    15.346230 (504) = COFFEE MUG\n    13.224326 (968) = CUP\n    10.422965 (505) = COFFEEPOT\n```\n\n### 下一步\n- **开发客户端**: 使用官方提供的 [Python](https:\u002F\u002Fgithub.com\u002Ftriton-inference-server\u002Fclient) 或 C++ 客户端库集成到您的应用中。\n- **自定义模型**: 将您的模型文件放入 `model_repository` 目录，并编写对应的 `config.pbtxt` 配置文件即可发布新服务。\n- **性能优化**: 使用 `perf_analyzer` 工具进行性能基准测试。","某大型电商平台的推荐团队需要在高并发大促期间，实时运行由 PyTorch 和 TensorRT 混合构建的复杂商品排序模型。\n\n### 没有 Triton Inference Server 时\n- **框架割裂严重**：团队需为不同框架（PyTorch、TensorRT）分别编写独立的推理服务代码，导致维护多套后端逻辑，开发效率极低。\n- **资源利用率低下**：缺乏动态批处理机制，服务器在面对零散请求时无法自动合并计算，导致 GPU 算力大量闲置，吞吐量难以提升。\n- **延迟波动剧烈**：在流量洪峰到来时，由于缺少序列批处理和并发执行优化，单个请求的响应时间忽高忽低，严重影响用户购物体验。\n- **部署流程繁琐**：每次更新模型或调整预处理逻辑，都需要重新打包容器并重启服务，无法实现热加载和敏捷迭代。\n\n### 使用 Triton Inference Server 后\n- **统一推理入口**：通过单一接口即可调度 PyTorch、TensorRT 等多种框架模型，利用其丰富的后端支持消除了异构框架带来的集成壁垒。\n- **算力极致释放**：启用动态批处理功能后，系统自动将毫秒级到达的零散请求合并计算，显著提升了 GPU 利用率和服务吞吐量。\n- **性能稳定可控**：借助并发模型执行与序列批处理技术，即使在高负载下也能保证低且稳定的推理延迟，确保大促期间服务不卡顿。\n- **运维灵活高效**：支持模型热加载与版本管理，配合 Python 后端自定义业务逻辑，团队可在不中断服务的情况下快速上线新算法策略。\n\nTriton Inference Server 通过标准化的推理服务和智能调度机制，将原本碎片化、低效的模型部署转变为高性能、易维护的生产级流水线。","https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Ftriton-inference-server_server_90c9ae56.png","triton-inference-server","Triton Inference Server","https:\u002F\u002Foss.gittoolsai.com\u002Favatars\u002Ftriton-inference-server_a54d6a79.jpg","",null,"https:\u002F\u002Fdeveloper.nvidia.com\u002Fnvidia-triton-inference-server","https:\u002F\u002Fgithub.com\u002Ftriton-inference-server",[81,85,89,93,97,101,105],{"name":82,"color":83,"percentage":84},"Python","#3572A5",57,{"name":86,"color":87,"percentage":88},"Shell","#89e051",21.6,{"name":90,"color":91,"percentage":92},"C++","#f34b7d",18.4,{"name":94,"color":95,"percentage":96},"CMake","#DA3434",1.4,{"name":98,"color":99,"percentage":100},"Java","#b07219",1.2,{"name":102,"color":103,"percentage":104},"Go Template","#00ADD8",0.4,{"name":106,"color":107,"percentage":108},"Dockerfile","#384d54",0,10584,1761,"2026-04-19T14:20:21","BSD-3-Clause","Linux, Windows","可选（支持 NVIDIA GPU、AWS Inferentia 或仅 CPU 模式）。若使用 NVIDIA GPU，需兼容 CUDA 的显卡（具体型号和显存取决于加载的模型），支持 TensorRT 等后端。","未说明（取决于模型大小和并发请求量）",{"notes":117,"python":118,"dependencies":119},"推荐使用 Docker 容器部署（如 nvcr.io\u002Fnvidia\u002Ftritonserver:26.03-py3）。支持多种硬件架构（NVIDIA GPU, x86\u002FARM CPU, AWS Inferentia）。具体后端框架（如 PyTorch, ONNX）的支持情况需参考后端 - 平台支持矩阵。可通过 Kubernetes (GCP, AWS) 或 FleetCommand 进行编排部署。","3.x (容器镜像基于 py3，如 26.03-py3)",[120,121,122,123,124,125,126],"Docker (推荐)","NVIDIA Container Toolkit","TensorRT","PyTorch","ONNX Runtime","OpenVINO","RAPIDS 
FIL",[16,14],[129,130,131,132,133,134,135],"inference","gpu","machine-learning","deep-learning","cloud","datacenter","edge","2026-03-27T02:49:30.150509","2026-04-20T07:06:04.120666",[139,144,149,154,158,163],{"id":140,"question_zh":141,"answer_zh":142,"source_url":143},44203,"如何在 NVIDIA Jetson 设备上构建和运行 Triton 推理服务器？","官方没有直接提供预构建的 Jetson Docker 镜像，但可以通过发布的 tarball 轻松构建。以下是一个基于 L4T TensorRT 的 Dockerfile 示例，它会下载特定版本的 Triton 并清理不需要的后端（仅保留 TensorRT\u002FPython）：\n\nFROM nvcr.io\u002Fnvidia\u002Fl4t-tensorrt:r8.4.1-runtime\n\nRUN apt-get update -y && \\\n    apt-get install -y curl libre2-dev libb64-dev && \\\n    rm -rf \u002Fvar\u002Flib\u002Fapt\u002Flists\u002F*\n\nWORKDIR \u002Fopt\u002Ftritonserver\n\nRUN \\\n    curl -L https:\u002F\u002Fgithub.com\u002Ftriton-inference-server\u002Fserver\u002Freleases\u002Fdownload\u002Fv2.24.0\u002Ftritonserver2.24.0-jetpack5.0.2.tgz | tar -xzf - && \\\n    rm -rf backends\u002Ftensorflow1 backends\u002Ftensorflow2 backends\u002Fonnxruntime backends\u002Fpytorch clients include qa test-util && \\\n    echo Done\n\nENV PATH=\"\u002Fopt\u002Ftritonserver\u002Fbin:${PATH}\"\n\nENTRYPOINT [\"\u002Fopt\u002Ftritonserver\u002Fbin\u002Ftritonserver\"]\nCMD [\"--help\"]\n\n请根据实际需求修改版本号和保留的后端。","https:\u002F\u002Fgithub.com\u002Ftriton-inference-server\u002Fserver\u002Fissues\u002F1468",{"id":145,"question_zh":146,"answer_zh":147,"source_url":148},44204,"在 Docker 中运行 Triton 客户端时，注册 CUDA 共享内存失败怎么办？","如果在使用 Docker 运行 Triton 客户端脚本时遇到无法注册 CUDA 共享内存的问题（即使交互式模式下正常），通常是因为缺少必要的进程命名空间隔离配置。除了设置 `--ipc host` 外，还必须添加 `--pid host` 标志。\n\n正确的 Docker 运行命令示例：\ndocker run --gpus all --ipc host --pid host --network host ... \n\n确保服务器和客户端容器都配置了这些标志，以便它们能正确访问共享内存区域。","https:\u002F\u002Fgithub.com\u002Ftriton-inference-server\u002Fserver\u002Fissues\u002F3429",{"id":150,"question_zh":151,"answer_zh":152,"source_url":153},44205,"遇到 \"Stub process is unhealthy and it will be restarted\" 错误导致服务重启如何解决？","该错误通常不是由共享内存引起的，而是容器内内存不足导致的。建议在执行推理时使用 `top` 或 `htop` 监控容器内部的内存使用情况。\n\n解决方案：\n1. 检查并增加分配给 Docker 容器的内存限制（在 Docker 设置中调整）。\n2. 注意：单纯增加 `shm-size` 或设置 `ipc=host` 可能无法解决此特定问题，如果根本原因是整体 RAM 不足而非共享内存段不足。","https:\u002F\u002Fgithub.com\u002Ftriton-inference-server\u002Fserver\u002Fissues\u002F3678",{"id":155,"question_zh":156,"answer_zh":157,"source_url":153},44206,"如何解决 Docker 容器间或容器与主机间的 IPC 共享问题以支持高性能应用？","对于需要加速进程间通信（如数据库或高性能计算应用）的场景，需要共享 IPC 机制。可以在 docker-compose 文件或 docker run 命令中添加 `ipc: host` 配置。\n\n配置示例 (docker-compose):\nservices:\n  triton-server:\n    ipc: host\n    # 其他配置...\n\n原理：IPC 命名空间提供了命名共享内存段、信号量和消息队列的隔离。使用 `host` 模式可以让容器直接使用主机的 IPC 资源，避免因为隔离导致的通信失败。",{"id":159,"question_zh":160,"answer_zh":161,"source_url":162},44207,"Triton 中 TorchScript 模型的推理速度比本地 Python 环境慢很多，可能的原因是什么？","虽然具体解决方案在提供的片段中被截断，但此类性能差异通常源于模型配置不当。常见排查方向包括：\n1. 检查 config.pbtxt 中的 `instance_group` 配置，确保 GPU 实例数量合理。\n2. 确认 `max_batch_size` 设置是否与实际操作匹配，过小的 batch size 可能导致 GPU 利用率低。\n3. 验证输入数据的维度（dims）和数据类型（data_type）是否与模型追踪（trace）时完全一致。\n4. 
比较本地 Python 环境与 Triton 容器中 PyTorch 及 CUDA 的版本是否一致。","https:\u002F\u002Fgithub.com\u002Ftriton-inference-server\u002Fserver\u002Fissues\u002F2836",{"id":164,"question_zh":165,"answer_zh":166,"source_url":153},44208,"在 Kubernetes (GKE) 上部署 Triton 时如何处理共享内存大小限制？","在 Kubernetes 环境中，默认的空目录（emptyDir）可能只有 64MB，这不足以支撑 Triton 的共享内存需求。需要在 Deployment YAML 中显式配置使用内存作为介质的 emptyDir。\n\n配置示例：\nvolumes:\n- name: dshm\n  emptyDir:\n    medium: Memory\n    sizeLimit: 2Gi  # 根据需要调整大小\n\n然后在容器挂载点中引用该 volume，并将其挂载到 `\u002Fdev\u002Fshm`，以确保有足够的共享内存空间。",[168,173,178,183,188,193,198,203,208,213,218,223,228,233,238,243,248,253,258,263],{"id":169,"version":170,"summary_zh":171,"released_at":172},351801,"v2.58.0","\u003C!-- TPRD-1381 -->\r\n# Triton Inference Server\r\n\r\nThe Triton Inference Server provides a cloud inferencing solution optimized for both CPUs and GPUs. The server provides an inference service via an HTTP or GRPC endpoint, allowing remote clients to request inferencing for any model being managed by the server. For edge deployments, Triton Server is also available as a shared library with an API that allows the full functionality of the server to be included directly in an application.\r\n\r\n\u003Cdetails>\r\n  \u003Csummary>\u003Ch2>New Features and Improvements\u003C\u002Fh2>\u003C\u002Fsummary>\r\n\r\n- Optional “execution_context_allocation_strategy” parameter in the TensorRT backend configuration allows selection of memory allocation behavior.\r\n- Support Tool calling functionality with Llama 3 and Mistral models in OpenAI frontend.\r\n- Improvements around memory allocation and various bug fixes.\r\n- GenAI-Perf now offers a new configuration file alongside the command line.\r\n- GenAI-Perf now collects GPU metrics from \u002Fmetrics endpoint exposed by DCGM Exporter.\r\n- GenAI-Perf supports new Power, Utilization, Ecc, Errors and PCie metrics.\r\n\r\n\u003C\u002Fdetails>\r\n\r\n\r\n\u003Cdetails>\r\n  \u003Csummary>\u003Ch2>Known Issues\u003C\u002Fh2>\u003C\u002Fsummary>\r\n\r\n* vLLM backend for 25.05 might be unstable with the vLLM V1 architecture. We recommend switching to V0 for this release, by setting `VLLM_USE_V1` environment variable to 0. However, users should be aware that vLLM's V0 API is affected by vulnerabilities.\r\n\r\n* vLLM containers include vllm version 0.8.4 which is affected by vulnerabilities.\r\n   Workarounds:\r\n   Prior to the fix, your options include:\r\n   - Do not expose the vLLM host to a network where any untrusted connections may reach the host.\r\n   - Ensure that only the other vLLM hosts are able to connect to the TCP port used for the XPUB socket. Note that port used is random.\r\n\r\n* The core Python binding may incur an additional D2H and H2D copy if the backend and frontend both specify device memory to be used for response tensors.\r\n\r\n* A segmentation fault related to DCGM and NSCQ may be encountered during server shutdown on NVSwitch systems. 
A possible workaround for this issue is to disable the collection of GPU metrics `tritonserver --allow-gpu-metrics false ...`\r\n\r\n* vLLM backend currently does not take advantage of the [vLLM v0.6](https:\u002F\u002Fblog.vllm.ai\u002F2024\u002F09\u002F05\u002Fperf-update.html) performance improvement when metrics are enabled.\r\n\r\n* When using TensorRT models, if auto-complete configuration is disabled and `is_non_linear_format_io:true` for [reformat-free tensors](https:\u002F\u002Fgithub.com\u002Ftriton-inference-server\u002Fserver\u002Fblob\u002Fr24.08\u002Fdocs\u002Fuser_guide\u002Fmodel_configuration.md#non-linear-io-formats) is not provided in the model configuration, the model may not load successfully.\r\n\r\n* When using Python models in [decoupled mode](https:\u002F\u002Fgithub.com\u002Ftriton-inference-server\u002Fpython_backend\u002Ftree\u002Fmain?tab=readme-ov-file#decoupled-mode), users need to ensure that the `ResponseSender` goes out of scope or is properly cleaned up before unloading the model to guarantee that the unloading process executes correctly.\r\n\r\n* Restart support was temporarily removed for Python models.\r\n\r\n* Triton Inference Server with vLLM backend currently does not support running vLLM models with tensor parallelism sizes greater than 1 and the default \"distributed_executor_backend\" setting when using explicit model control mode. In attempt to load a vllm model (tp > 1) in explicit mode, users could potentially  see failure at `initialize` step: `could not acquire lock for \u003C_io.BufferedWriter name='\u003Cstdout>'> at interpreter shutdown, possibly due to daemon threads`. For the default model control mode, after server shutdown, vllm related sub-processes are not killed. Related vllm issue: https:\u002F\u002Fgithub.com\u002Fvllm-project\u002Fvllm\u002Fissues\u002F6766 . Please specify  \"distributed_executor_backend\":\"ray\" in the `model.json` when deploying vllm models with tensor parallelism > 1.\r\n\r\n* When loading models with file override, multiple model configuration files are not supported. Users must  provide the model configuration by setting parameter `\"config\" : \"\u003CJSON>\"` instead of custom configuration file in the following format: `\"file:configs\u002F\u003Cmodel-config-name>.pbtxt\" : \"\u003Cbase64-encoded-file-content>\"`.\r\n\r\n* TensorRT-LLM [backend](https:\u002F\u002Fgithub.com\u002Ftriton-inference-server\u002Ftensorrtllm_backend) provides limited support of Triton extensions and features.\r\n\r\n* The TensorRT-LLM backend may core dump on server shutdown. This impacts server teardown only and will not impact inferencing.\r\n\r\n* The Java CAPI is known to have intermittent segfaults. \r\n\r\n* Some systems which implement `malloc()` may not release memory back to the operating system right away causing a false memory leak. This can be mitigated by using a different malloc implementation. `TCMalloc` and `jemalloc` are installed in the Triton container and can be [used by specifying the library in LD_PRELOAD](https:\u002F\u002Fgithub.com\u002Ftriton-inference-server\u002Fserver\u002Fblob\u002Fr25.01\u002Fdocs\u002Fuser_guide\u002Fmodel_management.m","2025-05-31T09:16:11",{"id":174,"version":175,"summary_zh":176,"released_at":177},351802,"v2.57.0","\u003C!-- TPRD-1381 -->\r\n# Triton Inference Server\r\n\r\nThe Triton Inference Server provides a cloud inferencing solution optimized for both CPUs and GPUs. 
The server provides an inference service via an HTTP or GRPC endpoint, allowing remote clients to request inferencing for any model being managed by the server. For edge deployments, Triton Server is also available as a shared library with an API that allows the full functionality of the server to be included directly in an application.\r\n\r\n\u003Cdetails>\r\n  \u003Csummary>\u003Ch2>New Features and Improvements\u003C\u002Fh2>\u003C\u002Fsummary>\r\n\r\n- Exposed gRPC infer thread count as a server option.\r\n- Improved server stability during the gRPC client cancellation.\r\n- Improved server stability in tracing mode.\r\n- Added BLS decoupled request cancellation in the Python Backend\r\n- GenAI-Perf now offers a new configuration file alongside the command line.\r\n- GenAI-Perf now supports the Huggingface TGI generated endpoint.\r\n- GenAI-Perf added a Token per second per user (TPS\u002Fuser) metric.\r\n- GenAI-Perf metric parsing speed was increased by 60%.\r\n\r\n\u003C\u002Fdetails>\r\n\r\n\r\n\u003Cdetails>\r\n  \u003Csummary>\u003Ch2>Known Issues\u003C\u002Fh2>\u003C\u002Fsummary>\r\n\r\n* vLLM backend for 25.04 might be unstable with the vLLM V1 architecture. We recommend switching to V0 for this release, by setting `VLLM_USE_V1` environment variable to 0. However, users should be aware that vLLM's V0 API is affected by vulnerabilities.\r\n\r\n* vLLM containers include vllm version 0.8.1 which is affected by new vulnerabilities.\r\n   Workarounds:\r\n   Prior to the fix, your options include:\r\n   - Do not expose the vLLM host to a network where any untrusted connections may reach the host.\r\n   - Ensure that only the other vLLM hosts are able to connect to the TCP port used for the XPUB socket. Note that port used is random.\r\n\r\n* The core Python binding may incur an additional D2H and H2D copy if the backend and frontend both specify device memory to be used for response tensors.\r\n\r\n* A segmentation fault related to DCGM and NSCQ may be encountered during server shutdown on NVSwitch systems. A possible workaround for this issue is to disable the collection of GPU metrics `tritonserver --allow-gpu-metrics false ...`\r\n\r\n* vLLM backend currently does not take advantage of the [vLLM v0.6](https:\u002F\u002Fblog.vllm.ai\u002F2024\u002F09\u002F05\u002Fperf-update.html) performance improvement when metrics are enabled.\r\n\r\n* When using TensorRT models, if auto-complete configuration is disabled and `is_non_linear_format_io:true` for [reformat-free tensors](https:\u002F\u002Fgithub.com\u002Ftriton-inference-server\u002Fserver\u002Fblob\u002Fr24.08\u002Fdocs\u002Fuser_guide\u002Fmodel_configuration.md#non-linear-io-formats) is not provided in the model configuration, the model may not load successfully.\r\n\r\n* When using Python models in [decoupled mode](https:\u002F\u002Fgithub.com\u002Ftriton-inference-server\u002Fpython_backend\u002Ftree\u002Fmain?tab=readme-ov-file#decoupled-mode), users need to ensure that the `ResponseSender` goes out of scope or is properly cleaned up before unloading the model to guarantee that the unloading process executes correctly.\r\n\r\n* Restart support was temporarily removed for Python models.\r\n\r\n* Triton Inference Server with vLLM backend currently does not support running vLLM models with tensor parallelism sizes greater than 1 and the default \"distributed_executor_backend\" setting when using explicit model control mode. 
In attempt to load a vllm model (tp > 1) in explicit mode, users could potentially  see failure at `initialize` step: `could not acquire lock for \u003C_io.BufferedWriter name='\u003Cstdout>'> at interpreter shutdown, possibly due to daemon threads`. For the default model control mode, after server shutdown, vllm related sub-processes are not killed. Related vllm issue: https:\u002F\u002Fgithub.com\u002Fvllm-project\u002Fvllm\u002Fissues\u002F6766 . Please specify  \"distributed_executor_backend\":\"ray\" in the `model.json` when deploying vllm models with tensor parallelism > 1.\r\n\r\n* When loading models with file override, multiple model configuration files are not supported. Users must  provide the model configuration by setting parameter `\"config\" : \"\u003CJSON>\"` instead of custom configuration file in the following format: `\"file:configs\u002F\u003Cmodel-config-name>.pbtxt\" : \"\u003Cbase64-encoded-file-content>\"`.\r\n\r\n* TensorRT-LLM [backend](https:\u002F\u002Fgithub.com\u002Ftriton-inference-server\u002Ftensorrtllm_backend) provides limited support of Triton extensions and features.\r\n\r\n* The TensorRT-LLM backend may core dump on server shutdown. This impacts server teardown only and will not impact inferencing.\r\n\r\n* The Java CAPI is known to have intermittent segfaults. \r\n\r\n* Some systems which implement `malloc()` may not release memory back to the operating system right away causing a false memory leak. This can be mitigated by using a different malloc implementation. `TCMalloc` and `jemalloc` are installed in the Triton container and can be [used by specifying the library in LD_PRELOAD](https:\u002F\u002Fgithub.com\u002Ftriton-inference-server\u002Fserver\u002Fblob\u002Fr25.01\u002Fdocs\u002Fuser_guide\u002Fmodel_management.md). NVIDIA recommends experimenting wit","2025-05-12T18:13:57",{"id":179,"version":180,"summary_zh":181,"released_at":182},351803,"v2.56.0","\u003C!-- TPRD-1195 -->\r\n# Triton Inference Server\r\n\r\nThe Triton Inference Server provides a cloud inferencing solution optimized for both CPUs and GPUs. The server provides an inference service via an HTTP or GRPC endpoint, allowing remote clients to request inferencing for any model being managed by the server. For edge deployments, Triton Server is also available as a shared library with an API that allows the full functionality of the server to be included directly in an application.\r\n\r\n\u003Cdetails>\r\n  \u003Csummary>\u003Ch2>New Features and Improvements\u003C\u002Fh2>\u003C\u002Fsummary>\r\n\r\n* The Tensorflow Backend has been deprecated starting in 25.03. The last release of Triton Inference Server with the Tensorflow Backend is 25.02. Users wishing to continue using the Tensorflow Backend in 25.03 and later can build the [Tensorflow Backend](https:\u002F\u002Fgithub.com\u002Ftriton-inference-server\u002Ftensorflow_backend?tab=readme-ov-file#build-the-tensorflow-backend) from source and install the result into the `\u002Fopt\u002Ftritonserver\u002Fbackends\u002F` directory. \r\n* The “XX.YY-tf2-python-py3” container will no longer be available starting in 25.03. See the Tensorflow Backend deprecation.\r\n* Added generate and generate_stream inference types to SageMaker server. 
Customers can choose which inference types - infer (default), generate or generate_stream using SAGEMAKER_TRITON_INFERENCE_TYPE environment variable during server launch.\r\n* In an effort to allow quick, on-demand metric retrieval for external load balancers such as the [Kubernetes Inference Gateway API](https:\u002F\u002Fgateway-api-inference-extension.sigs.k8s.io\u002F), Triton when used with TRT-LLM can include live KV-cache utilization and capacity metrics in the HTTP response header when processing inference requests.\r\n\r\n\u003C\u002Fdetails>\r\n\r\n\u003Cdetails>\r\n  \u003Csummary>\u003Ch2>Known Issues\u003C\u002Fh2>\u003C\u002Fsummary>\r\n\r\n* The core Python binding may incur an additional D2H and H2D copy if the backend and frontend both specify device memory to be used for response tensors.\r\n\r\n* A segmentation fault related to DCGM and NSCQ may be encountered during server shutdown on NVSwitch systems. A possible workaround for this issue is to disable the collection of GPU metrics `tritonserver --allow-gpu-metrics false ...`\r\n\r\n* vLLM backend currently does not take advantage of the [vLLM v0.6](https:\u002F\u002Fblog.vllm.ai\u002F2024\u002F09\u002F05\u002Fperf-update.html) performance improvement when metrics are enabled.\r\n\r\n* Incorrect results are known to occur when using TensorRT (TRT) Backend for inference using int8 data type for I\u002FO on the Blackwell GPU architecture.\r\n\r\n* When using TensorRT models, if auto-complete configuration is disabled and `is_non_linear_format_io:true` for [reformat-free tensors](https:\u002F\u002Fgithub.com\u002Ftriton-inference-server\u002Fserver\u002Fblob\u002Fr24.08\u002Fdocs\u002Fuser_guide\u002Fmodel_configuration.md#non-linear-io-formats) is not provided in the model configuration, the model may not load successfully.\r\n\r\n* When using Python models in [decoupled mode](https:\u002F\u002Fgithub.com\u002Ftriton-inference-server\u002Fpython_backend\u002Ftree\u002Fmain?tab=readme-ov-file#decoupled-mode), users need to ensure that the `ResponseSender` goes out of scope or is properly cleaned up before unloading the model to guarantee that the unloading process executes correctly.\r\n\r\n* Restart support was temporarily removed for Python models.\r\n\r\n* Triton Inference Server with vLLM backend currently does not support running vLLM models with tensor parallelism sizes greater than 1 and the default \"distributed_executor_backend\" setting when using explicit model control mode. In attempt to load a vllm model (tp > 1) in explicit mode, users could potentially  see failure at `initialize` step: `could not acquire lock for \u003C_io.BufferedWriter name='\u003Cstdout>'> at interpreter shutdown, possibly due to daemon threads`. For the default model control mode, after server shutdown, vllm related sub-processes are not killed. Related vllm issue: https:\u002F\u002Fgithub.com\u002Fvllm-project\u002Fvllm\u002Fissues\u002F6766 . Please specify  \"distributed_executor_backend\":\"ray\" in the `model.json` when deploying vllm models with tensor parallelism > 1.\r\n\r\n* When loading models with file override, multiple model configuration files are not supported. 
Users must  provide the model configuration by setting parameter `\"config\" : \"\u003CJSON>\"` instead of custom configuration file in the following format: `\"file:configs\u002F\u003Cmodel-config-name>.pbtxt\" : \"\u003Cbase64-encoded-file-content>\"`.\r\n\r\n* TensorRT-LLM [backend](https:\u002F\u002Fgithub.com\u002Ftriton-inference-server\u002Ftensorrtllm_backend) provides limited support of Triton extensions and features.\r\n\r\n* The TensorRT-LLM backend may core dump on server shutdown. This impacts server teardown only and will not impact inferencing.\r\n\r\n* The Java CAPI is known to have intermittent segfaults. \r\n\r\n* Some systems which implement `malloc()` may not release memory back to the operating system right away causing a false memory leak. This can be mitigated by using a different malloc implementation. `TCMalloc` and `jemalloc` are installed in the Triton container and can be [used by ","2025-04-07T19:30:21",{"id":184,"version":185,"summary_zh":186,"released_at":187},351804,"v2.55.0","# Triton Inference Server\r\n\r\nThe Triton Inference Server provides a cloud inferencing solution optimized for both CPUs and GPUs. The server provides an inference service via an HTTP or GRPC endpoint, allowing remote clients to request inferencing for any model being managed by the server. For edge deployments, Triton Server is also available as a shared library with an API that allows the full functionality of the server to be included directly in an application.\r\n\r\n\u003Cdetails>\r\n  \u003Csummary>\u003Ch2>New Features and Improvements\u003C\u002Fh2>\u003C\u002Fsummary>\r\n\r\n* Python backend now supports setting and retrieving [Inference Response Parameters](https:\u002F\u002Fgithub.com\u002Ftriton-inference-server\u002Fpython_backend\u002Ftree\u002Fr25.02#inference-response-parameters) on InferenceResponse objects on model.py.\r\n\r\n* Optimized the core Python binding architecture leading to improved OpenAI frontend performance.\r\n\r\n* Added dynamic sampling parameter handling, improving flexibility and consistency across vllm interactions. Added support for “guided_generation” request parameter for efficient constrained decoding workflows. \r\n\r\n* Improved Multi-Lora handling in TRTLLM GRPC Client `end_to_end_grpc_client.py`\r\n\r\n* GenAI-Perf added the ability to format output using Jinja2 templates.\r\n* GenAI-Perf telemetry now supports multiple metric endpoints.\r\n* GenAI-Perf now supports increased corpus size, 90x the previously supported size.\r\n* GenAI-Perf now supports keys without values as input.\r\n* GenAI-Perf fixed the OSL issue due to Performance Analyzer not removing the first 4 bytes from output.\r\n* GenAI-Perf added a chat template option for the TRT-LLM engine.\r\n* Performance Analyzer fixed TRITON_ENABLE_GPU compile definition bug.\r\n* Performance Analyzer bumped minimum required C++ version to C++20.\r\n* Performance Analyzer modified to disallow user attempts to use concurrency and warmup with the schedule flag.\r\n\u003C\u002Fdetails>\r\n\r\n\u003Cdetails>\r\n  \u003Csummary>\u003Ch2>Known Issues\u003C\u002Fh2>\u003C\u002Fsummary>\r\n\r\n* The core Python binding may incur an additional D2H and H2D copy if the backend and frontend both specify device memory to be used for response tensors.\r\n\r\n* A segmentation fault related to DCGM and NSCQ may be encountered during server shutdown on NVSwitch systems. 
A possible workaround for this issue is to disable the collection of GPU metrics `tritonserver --allow-gpu-metrics false ...`\r\n\r\n* vLLM backend currently does not take advantage of the [vLLM v0.6](https:\u002F\u002Fblog.vllm.ai\u002F2024\u002F09\u002F05\u002Fperf-update.html) performance improvement when metrics are enabled.\r\n\r\n* Incorrect results are known to occur when using TensorRT (TRT) Backend for inference using int8 data type for I\u002FO on the Blackwell GPU architecture.\r\n\r\n* When using TensorRT models, if auto-complete configuration is disabled and `is_non_linear_format_io:true` for [reformat-free tensors](https:\u002F\u002Fgithub.com\u002Ftriton-inference-server\u002Fserver\u002Fblob\u002Fr24.08\u002Fdocs\u002Fuser_guide\u002Fmodel_configuration.md#non-linear-io-formats) is not provided in the model configuration, the model may not load successfully.\r\n\r\n* When using Python models in [decoupled mode](https:\u002F\u002Fgithub.com\u002Ftriton-inference-server\u002Fpython_backend\u002Ftree\u002Fmain?tab=readme-ov-file#decoupled-mode), users need to ensure that the `ResponseSender` goes out of scope or is properly cleaned up before unloading the model to guarantee that the unloading process executes correctly.\r\n\r\n* Restart support was temporarily removed for Python models.\r\n\r\n* Triton Inference Server with vLLM backend currently does not support running vLLM models with tensor parallelism sizes greater than 1 and the default \"distributed_executor_backend\" setting when using explicit model control mode. In attempt to load a vllm model (tp > 1) in explicit mode, users could potentially  see failure at `initialize` step: `could not acquire lock for \u003C_io.BufferedWriter name='\u003Cstdout>'> at interpreter shutdown, possibly due to daemon threads`. For the default model control mode, after server shutdown, vllm related sub-processes are not killed. Related vllm issue: https:\u002F\u002Fgithub.com\u002Fvllm-project\u002Fvllm\u002Fissues\u002F6766 . Please specify  \"distributed_executor_backend\":\"ray\" in the `model.json` when deploying vllm models with tensor parallelism > 1.\r\n\r\n* When loading models with file override, multiple model configuration files are not supported. Users must  provide the model configuration by setting parameter `\"config\" : \"\u003CJSON>\"` instead of custom configuration file in the following format: `\"file:configs\u002F\u003Cmodel-config-name>.pbtxt\" : \"\u003Cbase64-encoded-file-content>\"`.\r\n\r\n* TensorRT-LLM [backend](https:\u002F\u002Fgithub.com\u002Ftriton-inference-server\u002Ftensorrtllm_backend) provides limited support of Triton extensions and features.\r\n\r\n* The TensorRT-LLM backend may core dump on server shutdown. This impacts server teardown only and will not impact inferencing.\r\n\r\n* The Java CAPI is known to have intermittent segfaults. \r\n\r\n* Some systems which implement `malloc()` may not release memory back to the operating system right away causing a false memory leak. Th","2025-02-26T19:55:19",{"id":189,"version":190,"summary_zh":191,"released_at":192},351805,"v2.54.0","# Triton Inference Server\r\n\r\nThe Triton Inference Server provides a cloud inferencing solution optimized for both CPUs and GPUs. The server provides an inference service via an HTTP or GRPC endpoint, allowing remote clients to request inferencing for any model being managed by the server. 
For edge deployments, Triton Server is also available as a shared library with an API that allows the full functionality of the server to be included directly in an application.\r\n\r\n\u003Cdetails>\r\n  \u003Csummary>\u003Ch2>New Features and Improvements\u003C\u002Fh2>\u003C\u002Fsummary>\r\n\r\n* Starting with the 25.01 release, Triton Inference Server supports Blackwell GPU architectures.\r\n\r\n* Starting from 25.01, the vLLM container shipped by Triton is NVIDIA optimized. Users who wish to use the public version of vLLM can continue to build a Triton-vLLM container on their end.\r\n\r\n* Fixed a bug when passing the correlation ID of string type to python_backend. Added datatype checks to correlation ID values.\r\n\r\n* vLLM backend can now take advantage of the vLLM v0.6 performance improvement by communicating with the vLLM engine via ZMQ.\r\n\r\n* GenAI-Perf now provides the exact input sequence length requested for synthetic text generation\r\n\r\n* GenAI-Perf supports the creation of a prefix pool to emulate system prompts via `--num-system-prompts` and `--system-prompt-length`\r\n\r\n* GenAI-Perf improved error visibility via returning more detailed errors when OpenAI frontends return an error or metric generation fails\r\n\r\n* GenAI-Perf reports time-to-second-token and request count in its metrics\r\n\r\n* GenAI-Perf allows the use of a custom tokenizer in its “compare” subcommand for comparing multiple profiles\r\n\r\n* GenAI-Perf natively supports `--request-count` for sending a specific number of requests and `--header` for sending a list of headers with every request\r\n\r\n* Model Analyzer functionality has been migrated to GenAI-Perf via the “analyze” subcommand, enabling the tool to sweep and find the optimal model configuration\r\n\r\n* A bytes appending bug was fixed in GenAI-Perf, resulting in more accurate output sequence lengths for Triton\r\n\r\n* _Update February 12th, 2025_: Triton Windows release now has CUDA context sharing support in the TensorRT Backend\r\n\u003C\u002Fdetails>\r\n\r\n\u003Cdetails>\r\n  \u003Csummary>\u003Ch2>Known Issues\u003C\u002Fh2>\u003C\u002Fsummary>\r\n\r\n* A segmentation fault related to DCGM and NSCQ may be encountered during server shutdown on NVSwitch systems. A possible workaround for this issue is to disable the collection of GPU metrics `tritonserver --allow-gpu-metrics false ...`\r\n\r\n* vLLM backend currently does not take advantage of the [vLLM v0.6](https:\u002F\u002Fblog.vllm.ai\u002F2024\u002F09\u002F05\u002Fperf-update.html) performance improvement when metrics are enabled.\r\n\r\n* Please note, that the vllm version provided in 25.01 container is 0.6.3.post1. 
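A quick way to confirm which vLLM build is actually installed is to print the package version from inside the Triton vLLM container; a minimal sketch, assuming it is run in the container's Python environment:

```python
# Minimal check of the vLLM version shipped in the container.
# See the caveat that follows: the reported string may drop the ".post1" suffix.
import vllm

print(vllm.__version__)
```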
Due to some issues with vllm library versioning, `vllm.__version__` displays `0.6.3`.\r\n\r\n* Incorrect results are known to occur when using TensorRT (TRT) Backend for inference using int8 data type for I\u002FO on the Blackwell GPU architecture.\r\n\r\n* When running Torch TRT models, the output may differ from running the same model on a previous release.\r\n\r\n* When using TensorRT models, if auto-complete configuration is disabled and `is_non_linear_format_io:true` for [reformat-free tensors](https:\u002F\u002Fgithub.com\u002Ftriton-inference-server\u002Fserver\u002Fblob\u002Fr24.08\u002Fdocs\u002Fuser_guide\u002Fmodel_configuration.md#non-linear-io-formats) is not provided in the model configuration, the model may not load successfully.\r\n\r\n* When using Python models in [decoupled mode](https:\u002F\u002Fgithub.com\u002Ftriton-inference-server\u002Fpython_backend\u002Ftree\u002Fmain?tab=readme-ov-file#decoupled-mode), users need to ensure that the `ResponseSender` goes out of scope or is properly cleaned up before unloading the model to guarantee that the unloading process executes correctly.\r\n\r\n* Restart support was temporarily removed for Python models.\r\n\r\n* Triton Inference Server with vLLM backend currently does not support running vLLM models with tensor parallelism sizes greater than 1 and the default \"distributed_executor_backend\" setting when using explicit model control mode. In attempt to load a vllm model (tp > 1) in explicit mode, users could potentially  see failure at `initialize` step: `could not acquire lock for \u003C_io.BufferedWriter name='\u003Cstdout>'> at interpreter shutdown, possibly due to daemon threads`. For the default model control mode, after server shutdown, vllm related sub-processes are not killed. Related vllm issue: https:\u002F\u002Fgithub.com\u002Fvllm-project\u002Fvllm\u002Fissues\u002F6766 . Please specify  \"distributed_executor_backend\":\"ray\" in the `model.json` when deploying vllm models with tensor parallelism > 1.\r\n\r\n* When loading models with file override, multiple model configuration files are not supported. Users must  provide the model configuration by setting parameter `\"config\" : \"\u003CJSON>\"` instead of custom configuration file in the following format: `\"file:configs\u002F\u003Cmodel-config-name>.pbtxt\" : \"\u003Cbase64-encoded-file-content>\"`.\r\n\r\n* TensorRT-LLM [back","2025-01-29T23:09:44",{"id":194,"version":195,"summary_zh":196,"released_at":197},351806,"v2.53.0","# Triton Inference Server\r\n\r\nThe Triton Inference Server provides a cloud inferencing solution optimized for both CPUs and GPUs. The server provides an inference service via an HTTP or GRPC endpoint, allowing remote clients to request inferencing for any model being managed by the server. 
For edge deployments, Triton Server is also available as a shared library with an API that allows the full functionality of the server to be included directly in an application.\r\n\r\n## New Features and Improvements\r\n\r\n* [vLLM backend health check](https:\u002F\u002Fgithub.com\u002Ftriton-inference-server\u002Fvllm_backend\u002Fblob\u002Fr24.12\u002Fdocs\u002Fhealth_check.md) may be optionally enabled which unloads the model if the vLLM engine health check failed.\r\n\r\n* vLLM backend supports sending [additional outputs](https:\u002F\u002Fgithub.com\u002Ftriton-inference-server\u002Fvllm_backend\u002Fblob\u002Fr24.12\u002Fdocs\u002Fadditional_outputs.md) from vLLM if requested.\r\n\r\n* Improved server stability during the gRPC client cancellation.\r\n\r\n* Perf Analyzer: Added trtllm multi node process support.\r\n\r\n* Windows executables and DLLs are signed by NVIDIA. This should remove the un-trusted software popup when starting Triton outside of administrator mode.\r\n\r\n* Triton on Windows supports long path notation for model repositories\r\n\r\n* Triton on Windows supports wide character encoding, UTF-16, for model repositories\r\n\r\n## Known Issues \r\n\r\n* To build the Llama 3.1 engine inside the 24.09-trtllm-python-py3 image, make sure to upgrade the transformer library to 4.43+ due to the bug in 4.43.x. One option to do so is to run `pip install -U transformers`. For more information, please refer to the discussion: https:\u002F\u002Fgithub.com\u002FNVIDIA\u002FTensorRT-LLM\u002Fissues\u002F2121\r\n\r\n* Triton vLLM container comes with the vLLM version, which has a known vulnerability: https:\u002F\u002Fgithub.com\u002Fadvisories\u002FGHSA-w2r7-9579-27hf. Note, that the affected code is not invoked at runtime, therefore the Triton vLLM container is not affected by this issue.  \r\n\r\n* When running Torch TRT models, the output may differ from running the same model on a previous release.\r\n\r\n* When using TensorRT models, if auto-complete configuration is disabled and is_non_linear_format_io:true for[ reformat-free tensors](https:\u002F\u002Fgithub.com\u002Ftriton-inference-server\u002Fserver\u002Fblob\u002Fr24.08\u002Fdocs\u002Fuser_guide\u002Fmodel_configuration.md#non-linear-io-formats) is not provided in the model configuration, the model may not load successfully.\r\n\r\n* When using Python models in[ decoupled mode](https:\u002F\u002Fgithub.com\u002Ftriton-inference-server\u002Fpython_backend\u002Ftree\u002Fmain?tab=readme-ov-file#decoupled-mode), users need to ensure that the ResponseSender goes out of scope or is properly cleaned up before unloading the model to guarantee that the unloading process executes correctly.\r\n\r\n* Restart support was temporarily removed for Python models.\r\n\r\n* Triton TensorRT-LLM Backend container image uses TensorRT-LLM version 0.16.0 and built out of [nvcr.io\u002Fnvidia\u002Ftritonserver:24.11-py3-min](http:\u002F\u002Fnvcr.io\u002Fnvidia\u002Ftritonserver:24.11-py3-min).  Please refer to the Triton TRT-LLM Container Support Matrix section in the [GitHub release note](https:\u002F\u002Fgithub.com\u002Ftriton-inference-server\u002Fserver\u002Freleases) for more details.\r\n\r\n* Triton Inference Server with vLLM backend currently does not support running vLLM models with tensor parallelism sizes greater than 1 and the default \"distributed_executor_backend\" setting when using explicit model control mode. 
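As the continuation of this note explains, the documented workaround is to set `"distributed_executor_backend":"ray"` in the model's `model.json`. Below is a minimal sketch of writing such a file; the model ID, tensor-parallel size, and repository path are placeholders for illustration, not values taken from these notes:

```python
# Sketch only: writes a model.json for a vLLM model deployed with tensor parallelism > 1,
# switching the executor backend to Ray as the known-issue workaround suggests.
import json

model_json = {
    "model": "facebook/opt-6.7b",           # hypothetical Hugging Face model ID
    "tensor_parallel_size": 2,               # > 1 is the case affected by this issue
    "distributed_executor_backend": "ray",   # workaround recommended in these notes
}

with open("model_repository/vllm_model/1/model.json", "w") as f:
    json.dump(model_json, f, indent=2)
```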
In an attempt to load a vLLM model (tp > 1) in explicit mode, users could potentially see a failure at the `initialize` step: `could not acquire lock for \u003C_io.BufferedWriter name='\u003Cstdout>'> at interpreter shutdown, possibly due to daemon threads`. For the default model control mode, after server shutdown, vLLM-related sub-processes are not killed. Related vLLM issue: https:\u002F\u002Fgithub.com\u002Fvllm-project\u002Fvllm\u002Fissues\u002F6766 . Please specify \"distributed_executor_backend\":\"ray\" in the `model.json` when deploying vLLM models with tensor parallelism > 1.\r\n\r\n* When loading models with file override, multiple model configuration files are not supported. Users must provide the model configuration by setting the parameter \"config\" : \"\u003CJSON>\" instead of a custom configuration file in the following format: \"file:configs\u002F\u003Cmodel-config-name>.pbtxt\" : \"\u003Cbase64-encoded-file-content>\".\r\n\r\n* Perf Analyzer no longer supports the --trace-file option.\r\n\r\n* TensorRT-LLM [backend](https:\u002F\u002Fgithub.com\u002Ftriton-inference-server\u002Ftensorrtllm_backend) provides limited support of Triton extensions and features.\r\n\r\n* The TensorRT-LLM backend may core dump on server shutdown. This impacts server teardown only and will not impact inferencing.\r\n\r\n* The Java CAPI is known to have intermittent segfaults.\r\n\r\n* Some systems which implement malloc() may not release memory back to the operating system right away, causing a false memory leak. This can be mitigated by using a different malloc implementation. Tcmalloc and jemalloc are installed in the Triton container and can be [used by specifying the library in LD_PRELOAD](https:\u002F\u002Fgithub.co","2024-12-23T21:20:20",{"id":199,"version":200,"summary_zh":201,"released_at":202},351807,"v2.52.0","# Triton Inference Server\r\n\r\nThe Triton Inference Server provides a cloud inferencing solution optimized for both CPUs and GPUs. The server provides an inference service via an HTTP or GRPC endpoint, allowing remote clients to request inferencing for any model being managed by the server. For edge deployments, Triton Server is also available as a shared library with an API that allows the full functionality of the server to be included directly in an application.\r\n\r\n## New Features and Improvements\r\n\r\n* [Conceptual Guides](https:\u002F\u002Fgithub.com\u002Ftriton-inference-server\u002Ftutorials\u002Ftree\u002Fr24.11\u002FConceptual_Guide) were enhanced with a comprehensive [tutorial](https:\u002F\u002Fgithub.com\u002Ftriton-inference-server\u002Ftutorials\u002Ftree\u002Fr24.11\u002FConceptual_Guide\u002FPart_8-semantic_caching) on Semantic Caching optimization for LLM workloads.\r\n\r\n* Triton Metrics:\r\n  * Added a new histogram metric \"Request to First Response Time\" to decoupled models. 
Enabled by setting `--metrics-config histogram_latencies=true` \u003Csup>[[user_guide]](https:\u002F\u002Fgithub.com\u002Ftriton-inference-server\u002Fserver\u002Fblob\u002Fr24.11\u002Fdocs\u002Fuser_guide\u002Fmetrics.md#histograms) \u003C\u002Fsup>.\r\n  * A new model configuration field `model_metrics` that allows overriding default buckets for histogram metric families.\r\n\r\n## Known Issues\r\n\r\n* TensorFlow backend may leak memory due to a known issue with the cuDNN library shipped with the container.\r\n\r\n* The latest GenAI-Perf package on pypi.org is version 0.0.9dev while the latest Triton SDK container (24.11) contains GenAI-Perf version 0.0.8.\r\n\r\n* Numpy 2.x is not currently supported for Python Backend models and may cause them to return empty tensors unxpectedly, please use Numpy 1.x until support is added. \r\n\r\n* Triton vLLM container comes with the vLLM version, which has a known vulnerability: https:\u002F\u002Fgithub.com\u002Fadvisories\u002FGHSA-w2r7-9579-27hf. Note, that the affected code is not invoked at runtime, therefore the Triton vLLM container is not affected by this issue.  \r\n\r\n* When running Torch TRT models, the output may differ from running the same model on a previous release.\r\n\r\n* When using TensorRT models, if auto-complete configuration is disabled and is_non_linear_format_io:true for[ reformat-free tensors](https:\u002F\u002Fgithub.com\u002Ftriton-inference-server\u002Fserver\u002Fblob\u002Fr24.08\u002Fdocs\u002Fuser_guide\u002Fmodel_configuration.md#non-linear-io-formats) is not provided in the model configuration, the model may not load successfully.\r\n\r\n* When using Python models in[ decoupled mode](https:\u002F\u002Fgithub.com\u002Ftriton-inference-server\u002Fpython_backend\u002Ftree\u002Fmain?tab=readme-ov-file#decoupled-mode), users need to ensure that the ResponseSender goes out of scope or is properly cleaned up before unloading the model to guarantee that the unloading process executes correctly.\r\n\r\n* Restart support was temporarily removed for Python models.\r\n\r\n* Triton TensorRT-LLM Backend container image uses TensorRT-LLM version 0.15.0 and built out of [nvcr.io\u002Fnvidia\u002Ftritonserver:24.10-py3-min](http:\u002F\u002Fnvcr.io\u002Fnvidia\u002Ftritonserver:24.10-py3-min).  Please refer to the Triton TRT-LLM Container Support Matrix section in the [GitHub release note](https:\u002F\u002Fgithub.com\u002Ftriton-inference-server\u002Fserver\u002Freleases) for more details.\r\n\r\n* Triton Inference Server with vLLM backend currently does not support running vLLM models with tensor parallelism sizes greater than 1 and the default \"distributed_executor_backend\" setting when using explicit model control mode. In attempt to load a vllm model (tp > 1) in explicit mode, users could potentially  see failure at `initialize` step: `could not acquire lock for \u003C_io.BufferedWriter name='\u003Cstdout>'> at interpreter shutdown, possibly due to daemon threads. For the default model control mode, after server shutdown, vllm related sub-processes are not killed. Related vllm issue: https:\u002F\u002Fgithub.com\u002Fvllm-project\u002Fvllm\u002Fissues\u002F6766 . Please specify  \"distributed_executor_backend\":\"ray\" in the `model.json` when deploying vllm models with tensor parallelism > 1.\r\n\r\n* When loading models with file override, multiple model configuration files are not supported. 
Users must provide the model configuration by setting the parameter \"config\" : \"\u003CJSON>\" instead of a custom configuration file in the following format: \"file:configs\u002F\u003Cmodel-config-name>.pbtxt\" : \"\u003Cbase64-encoded-file-content>\".\r\n\r\n* Perf Analyzer no longer supports the --trace-file option.\r\n\r\n* TensorRT-LLM [backend](https:\u002F\u002Fgithub.com\u002Ftriton-inference-server\u002Ftensorrtllm_backend) provides limited support of Triton extensions and features.\r\n\r\n* The TensorRT-LLM backend may core dump on server shutdown. This impacts server teardown only and will not impact inferencing.\r\n\r\n* The Java CAPI is known to have intermittent segfaults.\r\n\r\n* Some systems which implement malloc() may not release memory back to the operating system right away, causing a false memory leak. This can be mitigated by using a different malloc implementation. Tcmalloc and jemalloc are installed in the Triton container and can be [used by specifying the library in LD_PRELOAD](https:\u002F\u002Fgit","2024-11-26T23:13:19",{"id":204,"version":205,"summary_zh":206,"released_at":207},351808,"v2.51.0","# Triton Inference Server\r\n\r\nThe Triton Inference Server provides a cloud inferencing solution optimized for both CPUs and GPUs. The server provides an inference service via an HTTP or GRPC endpoint, allowing remote clients to request inferencing for any model being managed by the server. For edge deployments, Triton Server is also available as a shared library with an API that allows the full functionality of the server to be included directly in an application.\r\n\r\n## New Features and Improvements\r\n\r\n* Optimized vLLM performance with custom metrics.\r\n\r\n## Known Issues\r\n\r\n* Numpy 2.x is not currently supported for Python Backend models and may cause them to return empty tensors unexpectedly; please use Numpy 1.x until support is added.\r\n\r\n* To build the Llama 3.1 engine inside the 24.09-trtllm-python-py3 image, make sure to upgrade the transformers library to 4.43+ due to the bug in 4.43.x. One option to do so is to run `pip install -U transformers`. For more information, please refer to the discussion: https:\u002F\u002Fgithub.com\u002FNVIDIA\u002FTensorRT-LLM\u002Fissues\u002F2121\r\n\r\n* The Triton vLLM container comes with a vLLM version that has a known vulnerability: https:\u002F\u002Fgithub.com\u002Fadvisories\u002FGHSA-w2r7-9579-27hf. Note that the affected code is not invoked at runtime; therefore, the Triton vLLM container is not affected by this issue. 
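Returning to the file-override limitation noted above: a minimal sketch of loading a model with an inline configuration through the Python HTTP client. The server address, model name, and configuration contents are placeholders; the `config` argument carries the JSON string described in that note:

```python
# Sketch only: load a model under explicit model control while overriding its
# configuration with an inline JSON string instead of a separate config file.
import json
import tritonclient.http as httpclient

client = httpclient.InferenceServerClient(url="localhost:8000")  # placeholder address

# Placeholder configuration, serialized to the JSON string expected by `config`.
override = json.dumps({"backend": "onnxruntime", "max_batch_size": 8})

client.load_model("my_model", config=override)  # placeholder model name
```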
\r\n\r\n* When running Torch TRT models, the output may differ from running the same model on a previous release.\r\n\r\n* When using TensorRT models, if auto-complete configuration is disabled and is_non_linear_format_io:true for[ reformat-free tensors](https:\u002F\u002Fgithub.com\u002Ftriton-inference-server\u002Fserver\u002Fblob\u002Fr24.08\u002Fdocs\u002Fuser_guide\u002Fmodel_configuration.md#non-linear-io-formats) is not provided in the model configuration, the model may not load successfully.\r\n\r\n* When using Python models in[ decoupled mode](https:\u002F\u002Fgithub.com\u002Ftriton-inference-server\u002Fpython_backend\u002Ftree\u002Fmain?tab=readme-ov-file#decoupled-mode), users need to ensure that the ResponseSender goes out of scope or is properly cleaned up before unloading the model to guarantee that the unloading process executes correctly.\r\n\r\n* Restart support was temporarily removed for Python models.\r\n\r\n* Triton TensorRT-LLM Backend container image uses TensorRT-LLM version 0.14.0 and built out of [nvcr.io\u002Fnvidia\u002Ftritonserver:24.07-py3-min](http:\u002F\u002Fnvcr.io\u002Fnvidia\u002Ftritonserver:24.07-py3-min).  Please refer to the Triton TRT-LLM Container Support Matrix section in the [GitHub release note](https:\u002F\u002Fgithub.com\u002Ftriton-inference-server\u002Fserver\u002Freleases) for more details.\r\n\r\n* Triton Inference Server with vLLM backend currently does not support running vLLM models with tensor parallelism sizes greater than 1 and the default \"distributed_executor_backend\" setting when using explicit model control mode. In attempt to load a vllm model (tp > 1) in explicit mode, users could potentially  see failure at `initialize` step: `could not acquire lock for \u003C_io.BufferedWriter name='\u003Cstdout>'> at interpreter shutdown, possibly due to daemon threads. For the default model control mode, after server shutdown, vllm related sub-processes are not killed. Related vllm issue: https:\u002F\u002Fgithub.com\u002Fvllm-project\u002Fvllm\u002Fissues\u002F6766 . Please specify  \"distributed_executor_backend\":\"ray\" in the `model.json` when deploying vllm models with tensor parallelism > 1.\r\n\r\n* When loading models with file override, multiple model configuration files are not supported. Users must  provide the model configuration by setting parameter \"config\" : \"\u003CJSON>\" instead of custom configuration file in the following format:\"file:configs\u002F\u003Cmodel-config-name>.pbtxt\" : \"\u003Cbase64-encoded-file-content>\".\r\n\r\n* Perf Analyzer no longer supports --trace-file option.\r\n\r\n* TensorRT-LLM [backend](https:\u002F\u002Fgithub.com\u002Ftriton-inference-server\u002Ftensorrtllm_backend) provides limited support of Triton extensions and features.\r\n\r\n* The TensorRT-LLM backend may core dump on server shutdown. This impacts server teardown only and will not impact inferencing.\r\n\r\n* The Java CAPI is known to have intermittent segfaults. \r\n\r\n* Some systems which implement malloc() may not release memory back to the operating system right away causing a false memory leak. This can be mitigated by using a different malloc implementation. Tcmalloc and jemalloc are installed in the Triton container and can be [used by specifying the library in LD_PRELOAD](https:\u002F\u002Fgithub.com\u002Ftriton-inference-server\u002Fserver\u002Fblob\u002Fr22.12\u002Fdocs\u002Fuser_guide\u002Fmodel_management.md). 
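As one way to try this, the sketch below launches `tritonserver` with an alternative allocator preloaded; the shared-library path and model-repository location are assumptions that vary across images and architectures:

```python
# Sketch only: start tritonserver with tcmalloc preloaded via LD_PRELOAD.
# The .so path is an assumption; check where the allocator lives in your container.
import os
import subprocess

env = dict(os.environ)
env["LD_PRELOAD"] = "/usr/lib/x86_64-linux-gnu/libtcmalloc.so.4"  # hypothetical path

subprocess.run(
    ["tritonserver", "--model-repository=/models"],  # placeholder repository path
    env=env,
    check=True,
)
```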
NVIDIA recommends experimenting with both tcmalloc and jemalloc to determine which one works better for your use case.\r\n\r\n* Auto-complete may cause an increase in server start time. To avoid a start time increase, users can provide the full model configuration and launch the server with --disable-auto-complete-config.\r\n\r\n* Auto-complete does not support PyTorch models due to lack of metadata in the model. It can only verify that the number of inputs and the input names matches what is specified in the model configuration. Ther","2024-10-29T15:38:56",{"id":209,"version":210,"summary_zh":211,"released_at":212},351809,"v2.50.0","# Triton Inference Server\r\n\r\nThe Triton Inference Server provides a cloud inferencing solution optimized for both CPUs and GPUs. The server provides an inference service via an HTTP or GRPC endpoint, allowing remote clients to request inferencing for any model being managed by the server. For edge deployments, Triton Server is also available as a shared library with an API that allows the full functionality of the server to be included directly in an application.\r\n\r\n## New Features and Improvements\r\n\r\n* Our tutorials were updated with 2 extensive guides on constrained decoding implementation in TensorRT-LLM python backend and function\u002Ftool calling. Guides can be found [here](https:\u002F\u002Fgithub.com\u002Ftriton-inference-server\u002Ftutorials\u002Fblob\u002Fmain\u002FAI_Agents_Guide\u002FREADME.md)\r\n\r\n* Our tutorials were also updated for Kubernetes Multi-Node and Multi-Instance Scaling with Triton and TRT-LLM; they can be found [here](https:\u002F\u002Fgithub.com\u002Ftriton-inference-server\u002Ftutorials\u002Ftree\u002Fmain\u002FDeployment\u002FKubernetes).\r\n\r\n* vLLM backend now supports these additional metrics. For additional details, see [vllm_backend](https:\u002F\u002Fgithub.com\u002Ftriton-inference-server\u002Fvllm_backend\u002Ftree\u002Fr24.09?tab=readme-ov-file#triton-metrics).\r\n  -   vllm:e2e_request_latency_seconds\r\n  -   vllm:request_prompt_tokens\r\n  -   vllm:request_generation_tokens\r\n  -   vllm:request_params_best_of\r\n  -   vllm:request_params_n\r\n  \r\n## Known Issues\r\n  \r\n* To build the Llama 3.1 engine inside the 24.09-trtllm-python-py3 image, make sure to upgrade the transformer library to 4.43+ due to the bug in 4.43.x. One option to do so is to run `pip install -U transformers`. For more information, please refer to the discussion: https:\u002F\u002Fgithub.com\u002FNVIDIA\u002FTensorRT-LLM\u002Fissues\u002F2121.\r\n\r\n* Triton vLLM container comes with the vLLM version, which has a known vulnerability: https:\u002F\u002Fgithub.com\u002Fadvisories\u002FGHSA-w2r7-9579-27hf. Note, that the affected code is not invoked at runtime, therefore the Triton vLLM container is not affected by this issue.  
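The additional vLLM metrics listed in the New Features section above are exported on Triton's Prometheus metrics endpoint once metrics reporting is enabled; a minimal sketch of pulling just the vLLM series, assuming a locally running server with the default metrics port 8002:

```python
# Sketch only: scrape Triton's Prometheus endpoint and print the vLLM metric lines.
# Assumes a locally running server exposing metrics on the default port 8002.
import urllib.request

with urllib.request.urlopen("http://localhost:8002/metrics") as resp:
    body = resp.read().decode("utf-8")

for line in body.splitlines():
    if line.startswith("vllm:"):
        print(line)
```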
\r\n\r\n* When running Torch TRT models, the output may differ from running the same model on a previous release.\r\n\r\n* When using TensorRT models, if auto-complete configuration is disabled and is_non_linear_format_io:true for[ reformat-free tensors](https:\u002F\u002Fgithub.com\u002Ftriton-inference-server\u002Fserver\u002Fblob\u002Fr24.08\u002Fdocs\u002Fuser_guide\u002Fmodel_configuration.md#non-linear-io-formats) is not provided in the model configuration, the model may not load successfully.\r\n\r\n* When using Python models in[ decoupled mode](https:\u002F\u002Fgithub.com\u002Ftriton-inference-server\u002Fpython_backend\u002Ftree\u002Fmain?tab=readme-ov-file#decoupled-mode), users need to ensure that the ResponseSender goes out of scope or is properly cleaned up before unloading the model to guarantee that the unloading process executes correctly.\r\n\r\n* Restart support was temporarily removed for Python models.\r\n\r\n* Triton TensorRT-LLM Backend container image uses TensorRT-LLM version 0.13.0 and built out of [nvcr.io\u002Fnvidia\u002Ftritonserver:24.07-py3-min](http:\u002F\u002Fnvcr.io\u002Fnvidia\u002Ftritonserver:24.07-py3-min).  Please refer to the Triton TRT-LLM Container Support Matrix section in the [GitHub release note](https:\u002F\u002Fgithub.com\u002Ftriton-inference-server\u002Fserver\u002Freleases) for more details.\r\n\r\n* Triton Inference Server with vLLM backend currently does not support running vLLM models with tensor parallelism sizes greater than 1 and the default \"distributed_executor_backend\" setting when using explicit model control mode. In attempt to load a vllm model (tp > 1) in explicit mode, users could potentially  see failure at `initialize` step: `could not acquire lock for \u003C_io.BufferedWriter name='\u003Cstdout>'> at interpreter shutdown, possibly due to daemon threads. For the default model control mode, after server shutdown, vllm related sub-processes are not killed. Related vllm issue: https:\u002F\u002Fgithub.com\u002Fvllm-project\u002Fvllm\u002Fissues\u002F6766 . Please specify  \"distributed_executor_backend\":\"ray\" in the `model.json` when deploying vllm models with tensor parallelism > 1.\r\n\r\n* When loading models with file override, multiple model configuration files are not supported. Users must  provide the model configuration by setting parameter \"config\" : \"\u003CJSON>\" instead of custom configuration file in the following format:\"file:configs\u002F\u003Cmodel-config-name>.pbtxt\" : \"\u003Cbase64-encoded-file-content>\".\r\n\r\n* Perf Analyzer no longer supports --trace-file option.\r\n\r\n* TensorRT-LLM [backend](https:\u002F\u002Fgithub.com\u002Ftriton-inference-server\u002Ftensorrtllm_backend) provides limited support of Triton extensions and features.\r\n\r\n* The TensorRT-LLM backend may core dump on server shutdown. This impacts server teardown only and will not impact inferencing.\r\n\r\n* The Java CAPI is known to have intermittent segfaults.\r\n\r\n* Some systems which implement malloc() may not release memory back to the operating system right away causing a false memory leak. This can be mitigated by using a different malloc implementation. Tcmalloc and jemalloc are installed in the Triton container and can be [used by specifying t","2024-09-27T16:50:59",{"id":214,"version":215,"summary_zh":216,"released_at":217},351810,"v2.49.0","# Triton Inference Server\r\n\r\nThe Triton Inference Server provides a cloud inferencing solution optimized for both CPUs and GPUs. 
The server provides an inference service via an HTTP or GRPC endpoint, allowing remote clients to request inferencing for any model being managed by the server. For edge deployments, Triton Server is also available as a shared library with an API that allows the full functionality of the server to be included directly in an application.\r\n\r\n## New Features and Improvements\r\n\r\n* OpenAI-compatible embeddings and Hugging Face TEI re-ranker API-compatible rankings can now be profiled via GenAI-Perf.\r\n\r\n* GenAI-Perf can now receive multiple user-specified prompts via --input-file.\r\n\r\n* The request-rate for async requests have been updated in the OpenAI and HTTP clients to send requests at exactly that rate. Users submitting more requests than their models can handle can see increased latency.\r\n\r\n* The stabilization metric for Perf Analyzer has been updated due to these changes, so if latency does not stabilize for async models, a warning will be printed but Perf Analyzer will still complete.\r\n\r\n* Perf Analyzer will not validate any user-supplied inputs and outputs, returning an error if the model does not contain them.\r\n\r\n* Python backend now supports BF16 tensors via DLPack\r\n\r\n* vLLM backend now supports these reporting metrics. \r\n  -   vllm:prompt_tokens_total\r\n  -   vllm:generation_tokens_total\r\n  -   vllm:time_to_first_token_seconds\r\n\r\n  To enable the vLLM model's metrics reporting, add these lines to config.pbtxt: \r\n```\r\nparameters: {\r\n  key: \"REPORT_CUSTOM_METRICS\"\r\n  value: {\r\n    string_value:\"yes\"\r\n  }\r\n}\r\n```\r\n\r\n* TensorRT-LLM backend now supports specifying GPU device IDs per instance using the “gpu_device_ids” field.\r\n\r\n* After the model config is updated to load new model versions, any loaded model versions whose model files are unmodified will not be reloaded.\r\n\r\n## Known Issues\r\n\r\n* When running Torch TRT models, the output may differ from running the same model on a previous release. This issue is expected to be fixed on the next release.\r\n\r\n* When using TensorRT models, if auto-complete configuration is disabled and is_non_linear_format_io:true for[ reformat-free tensors](https:\u002F\u002Fgithub.com\u002Ftriton-inference-server\u002Fserver\u002Fblob\u002Fr24.08\u002Fdocs\u002Fuser_guide\u002Fmodel_configuration.md#non-linear-io-formats) is not provided in the model configuration, the model may not load successfully.\r\n\r\n* When using Python models in[ decoupled mode](https:\u002F\u002Fgithub.com\u002Ftriton-inference-server\u002Fpython_backend\u002Ftree\u002Fmain?tab=readme-ov-file#decoupled-mode), users need to ensure that the ResponseSender goes out of scope or is properly cleaned up before unloading the model to guarantee that the unloading process executes correctly.\r\n\r\n* Restart support was temporarily removed for Python models.\r\n\r\n* Triton TensorRT-LLM Backend container image uses TensorRT-LLM version 0.12.0 and built out of [nvcr.io\u002Fnvidia\u002Ftritonserver:24.07-py3-min](http:\u002F\u002Fnvcr.io\u002Fnvidia\u002Ftritonserver:24.07-py3-min).  Please refer to the Triton TRT-LLM Container Support Matrix section in the [GitHub release note](https:\u002F\u002Fgithub.com\u002Ftriton-inference-server\u002Fserver\u002Freleases) for more details.\r\n\r\n* Triton Inference Server with vLLM backend currently does not support running vLLM models with tensor parallelism sizes greater than 1 and the default \"distributed_executor_backend\" setting when using explicit model control mode. 
In attempt to load a vllm model (tp > 1) in explicit mode, users could potentially  see failure at `initialize` step: `could not acquire lock for \u003C_io.BufferedWriter name='\u003Cstdout>'> at interpreter shutdown, possibly due to daemon threads. For the default model control mode, after server shutdown, vllm related sub-processes are not killed. Related vllm issue: https:\u002F\u002Fgithub.com\u002Fvllm-project\u002Fvllm\u002Fissues\u002F6766 . Please specify  \"distributed_executor_backend\":\"ray\" in the `model.json` when deploying vllm models with tensor parallelism > 1.\r\n\r\n* When loading models with file override, multiple model configuration files are not supported. Users must  provide the model configuration by setting parameter \"config\" : \"\u003CJSON>\" instead of custom configuration file in the following format:\"file:configs\u002F\u003Cmodel-config-name>.pbtxt\" : \"\u003Cbase64-encoded-file-content>\".\r\n\r\n* Perf Analyzer no longer supports --trace-file option.\r\n\r\n* TensorRT-LLM [backend](https:\u002F\u002Fgithub.com\u002Ftriton-inference-server\u002Ftensorrtllm_backend) provides limited support of Triton extensions and features.\r\n\r\n* The TensorRT-LLM backend may core dump on server shutdown. This impacts server teardown only and will not impact inferencing.\r\n\r\n* The Java CAPI is known to have intermittent segfaults. \r\n \r\n* Some systems which implement malloc() may not release memory back to the operating system right away causing a false memory leak. This can be mitigated by using a different malloc implementation. Tcmalloc and jemalloc are installed in the Triton container and can be [used by ","2024-08-27T18:03:16",{"id":219,"version":220,"summary_zh":221,"released_at":222},351791,"v2.67.0","  \u003C!--\r\n  # 版权所有 © 2025–2026，NVIDIA公司。保留所有权利。\r\n  #\r\n  # 在遵守以下条件的前提下，允许以源代码和二进制形式进行再分发和使用，无论是否修改：\r\n  #  * 源代码的再分发必须保留上述版权声明、本条件列表以及以下免责声明。\r\n  #  * 二进制形式的再分发必须在随分发提供的文档和\u002F或其他材料中复制上述版权声明、本条件列表以及以下免责声明。\r\n  #  * 未经NVIDIA公司及其贡献者事先书面许可，不得使用NVIDIA公司的名称或其贡献者的名称来背书或推广由此软件派生的产品。\r\n  #\r\n  # 本软件由版权所有者“按原样”提供，不提供任何明示或默示的担保，包括但不限于对适销性和特定用途适用性的默示担保。在任何情况下，版权所有者或贡献者均不对任何直接、间接、偶然、特殊、示范性或后果性损害承担责任（包括但不限于替代品或服务的采购、使用损失、数据丢失、利润损失或业务中断），即使已被告知发生此类损害的可能性。任何责任均基于合同责任、严格责任或侵权责任（包括过失或其他原因），且无论如何均源于对本软件的使用，即便已被告知此类损害的可能性。\r\n  -->\r\n  # Triton 推理服务器\r\n\r\n  Triton 推理服务器提供了一种针对 CPU 和 GPU 优化的云端推理解决方案。该服务器通过 HTTP 或 gRPC 端点提供推理服务，使远程客户端能够请求由服务器管理的任何模型的推理。对于边缘部署，Triton 服务器还以共享库的形式提供 API，允许将服务器的全部功能直接集成到应用程序中。\n\n> [!重要提示]\n> - Triton 推理服务器可在 Jetson 硬件上使用，支持通过 SBSA（arm64）容器镜像提供。\r\n> - Triton 26.03 不会在 GitHub 上发布新的 Jetson 发布工件；对于 Jetson 平台，在适用的情况下，请使用 Triton 26.02 \u002F v2.66.0 中的软件包。\n\n  \u003Cdetails>\n    \u003Csummary>\u003Ch2>新特性与改进\u003C\u002Fh2>\u003C\u002Fsummary>\r\n\r\n  * 修复了 SageMaker 服务器以及 MLflow–Triton 部署 API 中的路径遍历漏洞。\r\n\r\n  * 增加了对 OpenAI 前端 LoRA 路径的验证。\r\n\r\n  * 对 SageMaker 和 Vertex AI 端点应用了 HTTP 限制，并改进了 Vertex AI 重定向处理。\r\n\r\n  * 重构了 vLLM 构建流程，改用上游容器镜像；更新了 TensorRT-LLM 构建，并切换到稳定版 API。\r\n\r\n  * 修复了在取消请求时并发流式响应中 `AddNextResponse` 的竞态条件。\r\n\r\n  * 修复了 ensemb","2026-03-27T16:47:26",{"id":224,"version":225,"summary_zh":226,"released_at":227},351792,"v2.66.0","  \u003C!--\r\n  # 版权所有 © 2025–2026，NVIDIA公司。保留所有权利。\r\n  #\r\n  # 在遵守以下条件的前提下，允许以源代码和二进制形式进行再分发和使用，无论是否修改：\r\n  #  * 源代码的再分发必须保留上述版权声明、本条件列表以及以下免责声明。\r\n  #  * 二进制形式的再分发必须在随分发提供的文档和\u002F或其他材料中复制上述版权声明、本条件列表以及以下免责声明。\r\n  #  * 未经事先书面许可，不得使用NVIDIA公司的名称及其贡献者的名称来背书或推广由此软件派生的产品。\r\n  #\r\n  # 
本软件由版权所有者“按原样”提供，不提供任何明示或暗示的担保，包括但不限于对适销性和特定用途适用性的暗示担保。在任何情况下，版权所有者或贡献者均不对任何直接、间接、偶然、特殊、示范性或后果性损害承担责任（包括但不限于替代品或服务的采购、使用损失、数据丢失、利润损失或业务中断），即使已被告知发生此类损害的可能性，亦不承担任何责任，无论是基于合同、严格责任还是侵权（包括疏忽或其他原因）的责任。\r\n  -->\r\n  \u003C!-- https:\u002F\u002Flinear.app\u002Fissue\u002FTRI-693 -->\r\n  # Triton 推理服务器\r\n\r\n  Triton 推理服务器提供了一种针对 CPU 和 GPU 优化的云端推理解决方案。该服务器通过 HTTP 或 gRPC 端点提供推理服务，使远程客户端能够请求由该服务器管理的任何模型的推理任务。对于边缘部署，Triton 服务器还以共享库的形式提供，其 API 允许将服务器的全部功能直接嵌入到应用程序中。\n\n> [!重要提示]\n> Triton 26.02 是 GitHub 上适用于 Jetson 平台设备的最后一个 Triton 发布版本。\n\n  \u003Cdetails>\n    \u003Csummary>\u003Ch2>新特性与改进\u003C\u002Fh2>\u003C\u002Fsummary>\r\n\r\n  * 修复并更新了跨仓库文档中的失效链接和锚点。\r\n\r\n  * 修复了 NVIDIA Triton 推理服务器文档中的页面链接错误。\r\n\r\n  * 修复了一个问题：模型输出大小未绑定可能导致资源耗尽和拒绝服务。（https:\u002F\u002Fgithub.com\u002Ftriton-inference-server\u002Fserver\u002Fpull\u002F8603）\r\n\r\n  * 修复了一个问题：格式错误的 HTTP 头部可能导致服务器崩溃。（https:\u002F\u002Fgithub.com\u002Ftriton-inference-server\u002Fserver\u002Fpull\u002F8637）\r\n\r\n\r\n  \u003C\u002Fdetails>\r\n\r\n\r\n  \u003Cdetails>\n    \u003Csummary>\u003Ch2>已知问题\u003C\u002Fh2>\u003C\u002Fsummary>\r\n\r\n   * 避免使用特定的 API\u002F流程（低级别 OCB、带有短写操作的 BIO_f_linebuffer、基于密码的 CMS 解密…","2026-03-02T18:04:29",{"id":229,"version":230,"summary_zh":231,"released_at":232},351793,"v2.65.0","  \u003C!--\r\n  # 版权所有 © 2025–2026，NVIDIA Corporation。保留所有权利。\r\n  #\r\n  # 在遵守以下条件的情况下，允许以源代码和二进制形式进行再分发和使用，无论是否修改：\r\n  #  * 源代码的再分发必须保留上述版权声明、本条件列表以及以下免责声明。\r\n  #  * 二进制形式的再分发必须在随分发提供的文档和\u002F或其他材料中复制上述版权声明、本条件列表以及以下免责声明。\r\n  #  * 未经 NVIDIA Corporation 及其贡献者事先书面许可，不得使用 NVIDIA Corporation 或其贡献者的名称来背书或推广由此软件派生的产品。\r\n  #\r\n  # 本软件由版权所有者“按原样”提供，不提供任何明示或暗示的担保，包括但不限于对适销性和特定用途适用性的暗示担保。在任何情况下，版权所有者或贡献者均不对任何直接、间接、偶然、特殊、示范性或后果性损害承担责任（包括但不限于替代品或服务的采购、使用损失、数据丢失、利润损失或业务中断），即使已被告知发生此类损害的可能性。\r\n  -->\r\n  \u003C!-- https:\u002F\u002Flinear.app\u002Fissue\u002FTRI-527 -->\r\n  # Triton 推理服务器\r\n\r\n  Triton 推理服务器提供针对 CPU 和 GPU 优化的云端推理解决方案。该服务器通过 HTTP 或 gRPC 端点提供推理服务，允许远程客户端为服务器管理的任何模型发起推理请求。对于边缘部署，Triton 服务器还以共享库的形式提供，配备 API，可将服务器的全部功能直接嵌入到应用程序中。\n\n> [!注意]\r\n> 对 Windows 的支持已被弃用，适用于 Windows 的最新构建资产可在版本 [2.51.0 \u002F 25.01](https:\u002F\u002Fgithub.com\u002Ftriton-inference-server\u002Fserver\u002Freleases\u002Ftag\u002Fv2.51.0) 中找到。\n\n  \u003Cdetails>\r\n    \u003Csummary>\u003Ch2>新特性与改进\u003C\u002Fh2>\u003C\u002Fsummary>\r\n\r\n  * 针对非 JSON 格式响应暴露了 HTTP 错误信息。\r\n\r\n  * 通过 pip 安装 tritonclient 时，Perf Analyzer 的依赖项变为可选。\r\n\r\n  * 在耦合模式（即非解耦模式）下运行模型时，新增了对 `nv_inference_first_response_histogram_ms` 指标的支持。\r\n\r\n  * 修复了一个问题：当 Python 后端存根进程崩溃时，无法被正确检测到，从而导致推理请求失败。\r\n\r\n  * 修复了一个问题：恶意 HTTP 请求可能会耗尽所有可用系统内存，进而导致进程崩溃或拒绝服务。\r\n\r\n  * 修复了一个问题 whe","2026-02-03T20:20:28",{"id":234,"version":235,"summary_zh":236,"released_at":237},351794,"v2.64.0","  \u003C!--\r\n  # 版权所有 © 2025，NVIDIA Corporation。保留所有权利。\r\n  #\r\n  # 依照以下条件，允许以源代码和二进制形式进行再分发和使用，无论是否修改：\r\n  #  * 源代码的再分发必须保留上述版权声明、本条件列表以及以下免责声明。\r\n  #  * 二进制形式的再分发必须在随分发提供的文档和\u002F或其他材料中复制上述版权声明、本条件列表以及以下免责声明。\r\n  #  * 未经 NVIDIA Corporation 及其贡献者事先书面许可，不得使用 NVIDIA Corporation 或其贡献者的名称来代言或推广由此软件衍生的产品。\r\n  #\r\n  # 本软件由版权所有者“按原样”提供，不提供任何明示或暗示的担保，包括但不限于适销性和特定用途适用性的暗示担保。在任何情况下，版权所有者或贡献者均不对任何直接、间接、偶然、特殊、示范性或后果性损害承担责任（包括但不限于替代品或服务的采购、使用损失、数据丢失、利润损失或业务中断），无论该等损害是基于合同、严格责任还是侵权（包括过失或其他原因）而产生的，只要该等损害源于对本软件的使用，即使已被告知可能发生此类损害。\r\n  -->\r\n  \u003C!-- https:\u002F\u002Flinear.app\u002Fissue\u002FTRI-417 -->\r\n  # Triton 推理服务器\r\n\r\n  Triton 推理服务器提供针对 CPU 和 GPU 优化的云端推理解决方案。该服务器通过 HTTP 或 gRPC 端点提供推理服务，允许远程客户端为服务器管理的任何模型发起推理请求。对于边缘部署，Triton 服务器还以共享库的形式提供，配备 API，可将服务器的全部功能直接集成到应用程序中。\n\n  \u003Cdetails>\n 
   \u003Csummary>\u003Ch2>新特性与改进\u003C\u002Fh2>\u003C\u002Fsummary>\n\n  * 修复了 Triton 服务器 Sagemaker 服务中的一个问题，该问题由于对模型列表的无保护访问导致竞争条件，进而可能引发服务器崩溃。\n\n  * 扩展了 Triton PyTorch 后端所包含的加速 PyTorch 库集合。\n\n  * 将 Triton 客户端的 Golang 依赖项升级至最新稳定版本，以解决先前版本中存在的已知问题。\n\n  * 与 OpenAI 兼容的前端已从测试版过渡到稳定版。\n\n  * 在与 OpenAI 兼容的 API 前端 `v1\u002Fcompletions` 端点中，为 TensorRT-LLM 和 Python 后端添加了 `echo` 请求参数。\n\n  * 为 TensorRT-LLM 后端启用了与 OpenAI 兼容的 API 前端多 LoRA 支持。\n\n  * 后端现在可以实现 the ","2025-12-24T02:01:09",{"id":239,"version":240,"summary_zh":241,"released_at":242},351795,"v2.63.0","\u003C!--\n# 版权所有 © 2025，NVIDIA Corporation。保留所有权利。\n#\n# 依照以下条件，允许以源代码和二进制形式进行再分发和使用，无论是否修改：\n#  * 源代码的再分发必须保留上述版权声明、本条件列表以及以下免责声明。\n#  * 二进制形式的再分发必须在随分发提供的文档和\u002F或其他材料中复制上述版权声明、本条件列表以及以下免责声明。\n#  * 未经事先书面许可，不得使用 NVIDIA CORPORATION 或其贡献者的名称来背书或推广由此软件派生的产品。\n#\n# 本软件由版权所有者“按原样”提供，不提供任何明示或暗示的担保，包括但不限于针对特定用途的适销性和适用性的暗示担保。在任何情况下，版权所有者或贡献者均不对任何直接、间接、附带、特殊、示范性或后果性损害（包括但不限于替代品或服务的采购、使用损失、数据丢失或利润损失，以及业务中断）承担责任，即使已被告知发生此类损害的可能性。根据任何责任理论，无论是合同责任、严格责任还是侵权责任（包括过失或其他原因），均不应因使用本软件而产生任何责任。\n-->\n  \u003C!-- TRI-277 -->\n# Triton 推理服务器\n\nTriton 推理服务器提供了一种针对 CPU 和 GPU 优化的云端推理解决方案。该服务器通过 HTTP 或 gRPC 端点提供推理服务，使远程客户端能够为服务器管理的任何模型发起推理请求。对于边缘部署，Triton 服务器还以共享库的形式提供 API，允许将服务器的全部功能直接嵌入到应用程序中。\n\n\u003Cdetails>\n  \u003Csummary>\u003Ch2>新特性与改进\u003C\u002Fh2>\u003C\u002Fsummary>\n\n* 在与 OpenAI 兼容的 API 服务器中，为 vLLM 后端启用了 `v1\u002Fembeddings` 端点。\n\n* 在与 OpenAI 兼容的 API 服务器中，为 TensorRT-LLM 和 Python 后端启用了 `echo` 参数。\n\n* 通过提供更具体且符合 OpenAI 规范的错误码，改进了与 OpenAI 兼容的 API 服务器中的错误处理机制。\n\n* 升级了 OpenAI 前端所使用的 starlette 版本。\n\n\n\n\u003C\u002Fdetails>\n\n\n\u003Cdetails>\n  \u003Csummary>\u003Ch2>已知问题\u003C\u002Fh2>\u003C\u002Fsummary>\n\n* 自 25.10 版本起，vLLM 后端默认使用 V1 引擎。您可能会在 logprobs 输出中看到无效字符，此问题已报告给 vLLM 团队。\n\n* PyTorch 后端支持 PyTorch 2.0，但存在限制：模型必须以序列化文件形式提供（即 [‘model.pt’](http:\u002F\u002Fmodel.pt)）。详情请参阅 Triton PyTorch 后端文档。\n\n* vLLM 的 v0 API 和 Ray 受到 v…的影响。","2025-11-26T01:26:44",{"id":244,"version":245,"summary_zh":246,"released_at":247},351796,"v2.62.0","  \u003C!-- TRI-154 -->\r\n# Triton 推理服务器\r\n\r\nTriton 推理服务器提供了一种针对 CPU 和 GPU 优化的云端推理解决方案。该服务器通过 HTTP 或 gRPC 端点提供推理服务，允许远程客户端为服务器管理的任何模型发起推理请求。对于边缘部署，Triton 服务器还以共享库的形式提供 API，使服务器的全部功能可以直接嵌入到应用程序中。\n\n\u003Cdetails>\n  \u003Csummary>\u003Ch2>新特性与改进\u003C\u002Fh2>\u003C\u002Fsummary>\n\n* 修复了由发送至 `\u002Fv2\u002Fmodels\u002F\u003Cmodel_name>\u002Finfer` 的特制请求消息导致的服务器崩溃问题。\n\n* 修复了因错误处理格式不正确的 HTTP 请求而导致的服务器崩溃问题。\n\n\n\u003C\u002Fdetails>\n\n\n\u003Cdetails>\n  \u003Csummary>\u003Ch2>已知问题\u003C\u002Fh2>\u003C\u002Fsummary>\n\n* Triton Python 包使用了过时的依赖包 `starlette` 版本。\n\n* 自 25.10 版本起，vLLM 后端默认使用 V1 引擎。用户可能会在 logprobs 输出中看到无效字符，此问题已报告给 vLLM 团队。\n\n* 在推理过程中启用 vLLM 指标会导致引擎崩溃。\n\n* PyTorch 后端支持 PyTorch 2.0，但存在限制：模型必须以序列化文件形式（即 ‘model.pt’）提供。详情请参阅 Triton PyTorch 后端文档。\n\n* vLLM 的 v0 API 和 Ray 受到漏洞影响。用户应根据自身架构采取相应的缓解措施，这些措施可能包括但不限于：\n  * 不要将 Ray 执行器和 vLLM 主机暴露于可能存在不受信任连接的网络中。\n  * 确保只有其他 vLLM 主机能够连接到用于 XPUB 套接字的 TCP 端口。请注意，该端口是随机分配的。\n\n* Perf Analyzer 已不再包含在发布的“客户端”归档中，可使用 `pip install perf-analyzer` 单独安装。\n\n* 在 AGX-Thor 或 DGX-Spark 系统上使用 Valgrind 或其他内存泄漏检测工具时，可能会看到归因于 NvRmGpuLibOpen 的内存泄漏。根本原因已被识别并在 CUDA 中修复。\n\n* Valgrind 或其他内存泄漏检测工具有时会报告与 DCGM 相关的泄漏。这些报告通常是间歇性的，重试后往往会消失。根本原因仍在调查中。\n\n* CuPy 在多线程环境中使用 CUDA 13 Device API 时会出现问题。在 CuPy 修复之前，请避免在多线程环境中使用 tritonclient cuda_shared_memory API。\n\n* TensorRT 校准缓存在某些情况下可能需要调整大小，这一点已在 IGX 平台上观察到。\n\n* 如果后端和前端都指定了使用设备内存来存储响应张量，核心 Python 绑定可能会产生额外的 D2H 和 H2D 数据拷贝。\n\n* 在 NVSwitch 系统上关闭服务器时，可能会遇到与 DCGM 和 NSCQ 相关的段错误。对此问题的一个可行 workaround 是禁用 GPU 指标的收集：`tritonserver 
--allow-gpu-metrics false`","2025-10-31T18:54:48",{"id":249,"version":250,"summary_zh":251,"released_at":252},351797,"v2.61.0","\u003C!-- TPRD-1607 -->\r\n# Triton 推理服务器\r\n\r\nTriton 推理服务器提供了一种针对 CPU 和 GPU 优化的云端推理解决方案。该服务器通过 HTTP 或 gRPC 端点提供推理服务，允许远程客户端为服务器管理的任何模型发起推理请求。对于边缘部署，Triton 服务器还以共享库的形式提供 API，使服务器的全部功能可以直接嵌入到应用程序中。\r\n\r\n\u003Cdetails>\r\n  \u003Csummary>\u003Ch2>新特性与改进\u003C\u002Fh2>\u003C\u002Fsummary>\r\n\r\n* OpenAI 前端 API 的静态密钥认证\r\n\r\n* 阻止从 OpenAI 前端加载 Triton 仓库之外的模型。\r\n\r\n\r\n\u003C\u002Fdetails>\r\n\r\n\r\n\u003Cdetails>\r\n  \u003Csummary>\u003Ch2>已知问题\u003C\u002Fh2>\u003C\u002Fsummary>\r\n\r\n* vLLM 的 v0 API 和 Ray 受到漏洞影响。用户应根据自身架构采取相应的缓解措施，这些措施可能包括但不限于以下几点：\r\n  * 不要将 Ray 执行器和 vLLM 主机暴露在可能存在不受信任连接的网络环境中。\r\n  * 确保只有其他 vLLM 主机能够连接到用于 XPUB 套接字的 TCP 端口。请注意，该端口是随机分配的。\r\n  \r\n* Perf Analyzer 已不再包含在发布的“客户端”归档中，可使用 `pip install perf-analyzer` 单独安装。\r\n\r\n* 在 AGX-Thor 或 DGX-Spark 系统上使用 Valgrind 或其他内存泄漏检测工具时，可能会看到归因于 NvRmGpuLibOpen 的内存泄漏。其根本原因已被识别并在 CUDA 中修复。\r\n\r\n* Valgrind 或其他内存泄漏检测工具有时会报告与 DCGM 相关的泄漏。此类报告通常是间歇性的，重试后往往会消失。目前仍在调查其根本原因。\r\n\r\n* CuPy 在多线程环境下使用 CUDA 13 设备 API 时存在问题。在 CuPy 修复此问题之前，请避免在多线程环境中使用 tritonclient cuda_shared_memory API。\r\n\r\n* 在某些情况下，TensorRT 校准缓存可能需要调整大小，这一点在 IGX 平台上已被观察到。\r\n\r\n* 如果后端和前端都指定了响应张量使用的设备内存，核心 Python 绑定可能会产生额外的 D2H 和 H2D 数据拷贝。\r\n\r\n* 在 NVSwitch 系统上关闭服务器时，可能会遇到与 DCGM 和 NSCQ 相关的段错误。针对此问题的一个可行 workaround 是禁用 GPU 指标收集：`tritonserver --allow-gpu-metrics false ...`\r\n\r\n* 使用 TensorRT 模型时，如果禁用了自动完成配置，并且未在模型配置中为 [无需格式转换的张量](https:\u002F\u002Fgithub.com\u002Ftriton-inference-server\u002Fserver\u002Fblob\u002Fr24.08\u002Fdocs\u002Fuser_guide\u002Fmodel_configuration.md#non-linear-io-formats) 提供 `is_non_linear_format_io:true`，则模型可能无法成功加载。\r\n\r\n* 当以 [解耦模式](https:\u002F\u002Fgithub.com\u002Ftriton-inference-server\u002Fpython_backend\u002Ftree\u002Fmain?tab=readme-ov-file#decoupled-mode) 使用 Python 模型时，用户需要确保 `ResponseSender` 超出作用域或被","2025-10-07T22:10:06",{"id":254,"version":255,"summary_zh":256,"released_at":257},351798,"v2.60.0","\u003C!-- TPRD-1607 -->\r\n# Triton 推理服务器\r\n\r\nTriton 推理服务器提供了一种针对 CPU 和 GPU 优化的云端推理解决方案。该服务器通过 HTTP 或 gRPC 端点提供推理服务，允许远程客户端为服务器管理的任何模型发起推理请求。对于边缘部署，Triton 服务器还以共享库的形式提供 API，使服务器的全部功能可以直接嵌入到应用程序中。\r\n\r\n\u003Cdetails>\r\n  \u003Csummary>\u003Ch2>新特性与改进\u003C\u002Fh2>\u003C\u002Fsummary>\r\n\r\n* 添加了对 CUDA 13 的支持。\r\n\r\n\u003C\u002Fdetails>\r\n\r\n\r\n\u003Cdetails>\r\n  \u003Csummary>\u003Ch2>已知问题\u003C\u002Fh2>\u003C\u002Fsummary>\r\n\r\n* Triton ONNX Runtime Backend 的构建使用了 [microsoft\u002Fonnxruntime\u002Fcommit\u002F1d1712fdaf](https:\u002F\u002Fgithub.com\u002Fmicrosoft\u002Fonnxruntime\u002Fcommit\u002F1d1712fdafb9e61b2d6d033c4433c1033395d7e7)，在 DGX Spark 硬件上可能存在一些限制，这些问题将在后续版本中得到解决。\r\n\r\n* CuPy 在多线程环境中使用 CUDA 13 Device API 时会出现问题。在 CuPy 修复此问题之前，请避免在多线程环境中使用 tritonclient cuda_shared_memory API。\r\n\r\n* 截至撰写本文时，CuPy 尚不支持 CUDA 13。在 CuPy 正式支持 CUDA 13 之前使用它可能会遇到问题，详情请参阅：https:\u002F\u002Fgithub.com\u002Ftriton-inference-server\u002Fserver\u002Ftree\u002Fr25.08\u002Fpython\u002Fopenai#pre-requisites。\r\n\r\n* 在某些情况下，TensorRT 校准缓存可能需要调整大小，这一点在 IGX 平台上已被观察到。\r\n\r\n* 如果后端和前端都指定了响应张量使用的设备内存，核心 Python 绑定可能会产生额外的 D2H 和 H2D 数据拷贝。\r\n\r\n* 在 NVSwitch 系统上关闭服务器时，可能会遇到与 DCGM 和 NSCQ 相关的段错误。对此问题的一个可能的 workaround 是禁用 GPU 指标收集：`tritonserver --allow-gpu-metrics false ...`。\r\n\r\n* 使用 TensorRT 模型时，如果禁用了自动完成配置，并且未在模型配置中为 [无格式转换张量](https:\u002F\u002Fgithub.com\u002Ftriton-inference-server\u002Fserver\u002Fblob\u002Fr24.08\u002Fdocs\u002Fuser_guide\u002Fmodel_configuration.md#non-linear-io-formats) 提供 
`is_non_linear_format_io:true`，则模型可能无法成功加载。\r\n\r\n* 当以 [解耦模式](https:\u002F\u002Fgithub.com\u002Ftriton-inference-server\u002Fpython_backend\u002Ftree\u002Fmain?tab=readme-ov-file#decoupled-mode) 使用 Python 模型时，用户需要确保 `ResponseSender` 在卸载模型之前超出作用域或被正确清理，以保证卸载过程能够正常执行。\r\n\r\n* 对于 Python 模型，重启支持曾被暂时移除。\r\n\r\n* 带有 vLLM 后端的 Triton 推理服务器目前不支持在显式模型控制模式下运行张量并行度大于 1 的 vLLM 模型，以及默认的“distributed_executor_backend”设置。尝试以显式模式加载张量并行度大于 1 的 vLLM 模型时，用户可能会在 `initialize` 步骤看到失败信息：“could not acquire lock for \u003C_i","2025-08-26T22:15:33",{"id":259,"version":260,"summary_zh":261,"released_at":262},351799,"v2.59.1","\u003C!-- TPRD-1647 -->\r\n# Triton 推理服务器\r\n\r\nTriton 推理服务器提供了一种针对 CPU 和 GPU 优化的云端推理解决方案。该服务器通过 HTTP 或 gRPC 端点提供推理服务，允许远程客户端为服务器管理的任何模型发起推理请求。对于边缘部署，Triton 服务器还以共享库的形式提供 API，使服务器的全部功能可以直接嵌入到应用程序中。\r\n\r\n\u003Cdetails>\r\n  \u003Csummary>\u003Ch2>新特性与改进\u003C\u002Fh2>\u003C\u002Fsummary>\r\n\r\n* 修复了 Triton 推理服务器中的漏洞。\r\n\r\n\u003C\u002Fdetails>\r\n\r\n\r\n\u003Cdetails>\r\n  \u003Csummary>\u003Ch2>已知问题\u003C\u002Fh2>\u003C\u002Fsummary>\r\n\r\n* 在 25.07 版本中未发布 Python wheels 包。\r\n\r\n* 在某些情况下，TensorRT 校准缓存可能需要调整大小，这一点在 IGX 平台上已被观察到。\r\n\r\n* 如果后端和前端都指定了使用设备内存来存储响应张量，核心 Python 绑定可能会引入额外的 D2H 和 H2D 数据拷贝。\r\n\r\n* 在 NVSwitch 系统上关闭服务器时，可能会遇到与 DCGM 和 NSCQ 相关的段错误。对此问题的一个可行 workaround 是禁用 GPU 指标收集：`tritonserver --allow-gpu-metrics false ...`。\r\n\r\n* 当启用指标时，vLLM 后端目前无法利用 [vLLM v0.6](https:\u002F\u002Fblog.vllm.ai\u002F2024\u002F09\u002F05\u002Fperf-update.html) 的性能提升。\r\n\r\n* 使用 TensorRT 模型时，如果禁用了自动完成配置，并且在模型配置中未为 [无格式转换张量](https:\u002F\u002Fgithub.com\u002Ftriton-inference-server\u002Fserver\u002Fblob\u002Fr24.08\u002Fdocs\u002Fuser_guide\u002Fmodel_configuration.md#non-linear-io-formats) 提供 `is_non_linear_format_io:true`，则模型可能无法成功加载。\r\n\r\n* 在以 [解耦模式](https:\u002F\u002Fgithub.com\u002Ftriton-inference-server\u002Fpython_backend\u002Ftree\u002Fmain?tab=readme-ov-file#decoupled-mode) 使用 Python 模型时，用户需要确保 `ResponseSender` 在卸载模型之前超出作用域或被正确清理，以保证卸载过程能够正常执行。\r\n\r\n* 对于 Python 模型，重启支持曾被临时移除。\r\n\r\n* 带有 vLLM 后端的 Triton 推理服务器目前不支持在显式模型控制模式下运行张量并行度大于 1 的 vLLM 模型，以及默认的“distributed_executor_backend”设置。尝试以显式模式加载张量并行度大于 1 的 vLLM 模型时，用户可能会在 `initialize` 步骤遇到失败：`could not acquire lock for \u003C_io.BufferedWriter name='\u003Cstdout>'> at interpreter shutdown, possibly due to daemon threads`。而在默认的模型控制模式下，服务器关闭后，vLLM 相关的子进程不会被终止。相关 vLLM 问题：https:\u002F\u002Fgithub.com\u002Fvllm-project\u002Fvllm\u002Fissues\u002F6766 。请在部署张量并行度大于 1 的 vLLM 模型时，在 `model.json` 中指定 `\"distributed_executor_backend\":\"ray\"`。\r\n\r\n* 加载模型时…","2025-07-29T21:50:01",{"id":264,"version":265,"summary_zh":266,"released_at":267},351800,"v2.59.0","\u003C!-- TPRD-1564 -->\r\n# Triton 推理服务器\r\n\r\nTriton 推理服务器提供了一种针对 CPU 和 GPU 优化的云端推理解决方案。该服务器通过 HTTP 或 gRPC 端点提供推理服务，允许远程客户端为服务器管理的任何模型发起推理请求。对于边缘部署，Triton 服务器还以共享库的形式提供 API，使服务器的全部功能可以直接嵌入到应用程序中。\r\n\r\n\u003Cdetails>\r\n  \u003Csummary>\u003Ch2>新特性与改进\u003C\u002Fh2>\u003C\u002Fsummary>\r\n\r\n* 通过提高最大吞吐量和降低延迟，在允许无序响应的场景下提升了集成模型的性能。\r\n\r\n\u003C\u002Fdetails>\r\n\r\n\r\n\u003Cdetails>\r\n  \u003Csummary>\u003Ch2>已知问题\u003C\u002Fh2>\u003C\u002Fsummary>\r\n\r\n* 在某些情况下，TensorRT 校准缓存可能需要调整大小，这一点在 IGX 平台上已被观察到。\r\n\r\n* 如果后端和前端都指定使用设备内存来存储响应张量，核心 Python 绑定可能会产生额外的 D2H 和 H2D 数据拷贝。\r\n\r\n* 在 NVSwitch 系统上关闭服务器时，可能会遇到与 DCGM 和 NSCQ 相关的段错误。对此问题的一个可行 workaround 是禁用 GPU 指标收集：`tritonserver --allow-gpu-metrics false ...`\r\n\r\n* 当启用指标时，vLLM 后端目前无法利用 [vLLM v0.6](https:\u002F\u002Fblog.vllm.ai\u002F2024\u002F09\u002F05\u002Fperf-update.html) 的性能提升。\r\n\r\n* 使用 TensorRT 
模型时，如果禁用了自动完成配置，并且在模型配置中未为[免格式转换张量](https:\u002F\u002Fgithub.com\u002Ftriton-inference-server\u002Fserver\u002Fblob\u002Fr24.08\u002Fdocs\u002Fuser_guide\u002Fmodel_configuration.md#non-linear-io-formats)指定 `is_non_linear_format_io:true`，则模型可能无法成功加载。\r\n\r\n* 在使用 Python 模型的[解耦模式](https:\u002F\u002Fgithub.com\u002Ftriton-inference-server\u002Fpython_backend\u002Ftree\u002Fmain?tab=readme-ov-file#decoupled-mode)时，用户需要确保 `ResponseSender` 在卸载模型之前超出作用域或被正确清理，以保证卸载过程能够正常执行。\r\n\r\n* 对于 Python 模型，重启支持曾被暂时移除。\r\n\r\n* 带有 vLLM 后端的 Triton 推理服务器目前不支持在显式模型控制模式下运行张量并行度大于 1 的 vLLM 模型，以及默认的“distributed_executor_backend”设置。尝试以显式模式加载张量并行度大于 1 的 vLLM 模型时，用户可能会在 `initialize` 步骤中遇到失败：`在解释器关闭时无法获取 \u003C_io.BufferedWriter name='\u003Cstdout>'> 的锁，可能是由于守护线程所致`。对于默认的模型控制模式，服务器关闭后，vLLM 相关的子进程不会被终止。相关 vLLM 问题：https:\u002F\u002Fgithub.com\u002Fvllm-project\u002Fvllm\u002Fissues\u002F6766 。请在部署张量并行度大于 1 的 vLLM 模型时，在 `model.json` 中指定 `\"distributed_executor_backend\":\"ray\"`。\r\n\r\n* 加载 m","2025-06-26T23:35:58"]
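One known issue recurs throughout these release notes: a Python-backend model running in decoupled mode must let its `ResponseSender` go out of scope (or be cleaned up explicitly) before the model is unloaded. Below is a minimal sketch of a decoupled `model.py` that sends its responses, marks the stream complete, and drops the sender inside `execute()`; the tensor names are placeholders:

```python
# Sketch only: decoupled-mode Python backend model that releases its ResponseSender
# inside execute(), so nothing keeps the sender alive when the model is unloaded.
# The input/output tensor names ("IN", "OUT") are placeholders.
import numpy as np
import triton_python_backend_utils as pb_utils


class TritonPythonModel:
    def execute(self, requests):
        for request in requests:
            sender = request.get_response_sender()

            data = pb_utils.get_input_tensor_by_name(request, "IN").as_numpy()
            out = pb_utils.Tensor("OUT", np.asarray(data, dtype=np.float32))
            sender.send(pb_utils.InferenceResponse(output_tensors=[out]))

            # Close the response stream and drop the last reference so the sender
            # goes out of scope before any subsequent unload.
            sender.send(flags=pb_utils.TRITONSERVER_RESPONSE_COMPLETE_FINAL)
            del sender

        # Decoupled models return None; all responses go through the sender.
        return None
```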