[{"data":1,"prerenderedAt":-1},["ShallowReactive",2],{"similar-ai-dynamo--nixl":3,"tool-ai-dynamo--nixl":61},[4,18,26,36,44,53],{"id":5,"name":6,"github_repo":7,"description_zh":8,"stars":9,"difficulty_score":10,"last_commit_at":11,"category_tags":12,"status":17},4358,"openclaw","openclaw\u002Fopenclaw","OpenClaw 是一款专为个人打造的本地化 AI 助手，旨在让你在自己的设备上拥有完全可控的智能伙伴。它打破了传统 AI 助手局限于特定网页或应用的束缚，能够直接接入你日常使用的各类通讯渠道，包括微信、WhatsApp、Telegram、Discord、iMessage 等数十种平台。无论你在哪个聊天软件中发送消息，OpenClaw 都能即时响应，甚至支持在 macOS、iOS 和 Android 设备上进行语音交互，并提供实时的画布渲染功能供你操控。\n\n这款工具主要解决了用户对数据隐私、响应速度以及“始终在线”体验的需求。通过将 AI 部署在本地，用户无需依赖云端服务即可享受快速、私密的智能辅助，真正实现了“你的数据，你做主”。其独特的技术亮点在于强大的网关架构，将控制平面与核心助手分离，确保跨平台通信的流畅性与扩展性。\n\nOpenClaw 非常适合希望构建个性化工作流的技术爱好者、开发者，以及注重隐私保护且不愿被单一生态绑定的普通用户。只要具备基础的终端操作能力（支持 macOS、Linux 及 Windows WSL2），即可通过简单的命令行引导完成部署。如果你渴望拥有一个懂你",349277,3,"2026-04-06T06:32:30",[13,14,15,16],"Agent","开发框架","图像","数据工具","ready",{"id":19,"name":20,"github_repo":21,"description_zh":22,"stars":23,"difficulty_score":10,"last_commit_at":24,"category_tags":25,"status":17},3808,"stable-diffusion-webui","AUTOMATIC1111\u002Fstable-diffusion-webui","stable-diffusion-webui 是一个基于 Gradio 构建的网页版操作界面，旨在让用户能够轻松地在本地运行和使用强大的 Stable Diffusion 图像生成模型。它解决了原始模型依赖命令行、操作门槛高且功能分散的痛点，将复杂的 AI 绘图流程整合进一个直观易用的图形化平台。\n\n无论是希望快速上手的普通创作者、需要精细控制画面细节的设计师，还是想要深入探索模型潜力的开发者与研究人员，都能从中获益。其核心亮点在于极高的功能丰富度：不仅支持文生图、图生图、局部重绘（Inpainting）和外绘（Outpainting）等基础模式，还独创了注意力机制调整、提示词矩阵、负向提示词以及“高清修复”等高级功能。此外，它内置了 GFPGAN 和 CodeFormer 等人脸修复工具，支持多种神经网络放大算法，并允许用户通过插件系统无限扩展能力。即使是显存有限的设备，stable-diffusion-webui 也提供了相应的优化选项，让高质量的 AI 艺术创作变得触手可及。",162132,"2026-04-05T11:01:52",[14,15,13],{"id":27,"name":28,"github_repo":29,"description_zh":30,"stars":31,"difficulty_score":32,"last_commit_at":33,"category_tags":34,"status":17},1381,"everything-claude-code","affaan-m\u002Feverything-claude-code","everything-claude-code 是一套专为 AI 编程助手（如 Claude Code、Codex、Cursor 等）打造的高性能优化系统。它不仅仅是一组配置文件，而是一个经过长期实战打磨的完整框架，旨在解决 AI 
代理在实际开发中面临的效率低下、记忆丢失、安全隐患及缺乏持续学习能力等核心痛点。\n\n通过引入技能模块化、直觉增强、记忆持久化机制以及内置的安全扫描功能，everything-claude-code 能显著提升 AI 在复杂任务中的表现，帮助开发者构建更稳定、更智能的生产级 AI 代理。其独特的“研究优先”开发理念和针对 Token 消耗的优化策略，使得模型响应更快、成本更低，同时有效防御潜在的攻击向量。\n\n这套工具特别适合软件开发者、AI 研究人员以及希望深度定制 AI 工作流的技术团队使用。无论您是在构建大型代码库，还是需要 AI 协助进行安全审计与自动化测试，everything-claude-code 都能提供强大的底层支持。作为一个曾荣获 Anthropic 黑客大奖的开源项目，它融合了多语言支持与丰富的实战钩子（hooks），让 AI 真正成长为懂上",160015,2,"2026-04-18T11:30:52",[14,13,35],"语言模型",{"id":37,"name":38,"github_repo":39,"description_zh":40,"stars":41,"difficulty_score":32,"last_commit_at":42,"category_tags":43,"status":17},2271,"ComfyUI","Comfy-Org\u002FComfyUI","ComfyUI 是一款功能强大且高度模块化的视觉 AI 引擎，专为设计和执行复杂的 Stable Diffusion 图像生成流程而打造。它摒弃了传统的代码编写模式，采用直观的节点式流程图界面，让用户通过连接不同的功能模块即可构建个性化的生成管线。\n\n这一设计巧妙解决了高级 AI 绘图工作流配置复杂、灵活性不足的痛点。用户无需具备编程背景，也能自由组合模型、调整参数并实时预览效果，轻松实现从基础文生图到多步骤高清修复等各类复杂任务。ComfyUI 拥有极佳的兼容性，不仅支持 Windows、macOS 和 Linux 全平台，还广泛适配 NVIDIA、AMD、Intel 及苹果 Silicon 等多种硬件架构，并率先支持 SDXL、Flux、SD3 等前沿模型。\n\n无论是希望深入探索算法潜力的研究人员和开发者，还是追求极致创作自由度的设计师与资深 AI 绘画爱好者，ComfyUI 都能提供强大的支持。其独特的模块化架构允许社区不断扩展新功能，使其成为当前最灵活、生态最丰富的开源扩散模型工具之一，帮助用户将创意高效转化为现实。",109154,"2026-04-18T11:18:24",[14,15,13],{"id":45,"name":46,"github_repo":47,"description_zh":48,"stars":49,"difficulty_score":32,"last_commit_at":50,"category_tags":51,"status":17},6121,"gemini-cli","google-gemini\u002Fgemini-cli","gemini-cli 是一款由谷歌推出的开源 AI 命令行工具，它将强大的 Gemini 大模型能力直接集成到用户的终端环境中。对于习惯在命令行工作的开发者而言，它提供了一条从输入提示词到获取模型响应的最短路径，无需切换窗口即可享受智能辅助。\n\n这款工具主要解决了开发过程中频繁上下文切换的痛点，让用户能在熟悉的终端界面内直接完成代码理解、生成、调试以及自动化运维任务。无论是查询大型代码库、根据草图生成应用，还是执行复杂的 Git 操作，gemini-cli 都能通过自然语言指令高效处理。\n\n它特别适合广大软件工程师、DevOps 人员及技术研究人员使用。其核心亮点包括支持高达 100 万 token 的超长上下文窗口，具备出色的逻辑推理能力；内置 Google 搜索、文件操作及 Shell 命令执行等实用工具；更独特的是，它支持 MCP（模型上下文协议），允许用户灵活扩展自定义集成，连接如图像生成等外部能力。此外，个人谷歌账号即可享受免费的额度支持，且项目基于 Apache 2.0 
协议完全开源，是提升终端工作效率的理想助手。",100752,"2026-04-10T01:20:03",[52,13,15,14],"插件",{"id":54,"name":55,"github_repo":56,"description_zh":57,"stars":58,"difficulty_score":32,"last_commit_at":59,"category_tags":60,"status":17},4721,"markitdown","microsoft\u002Fmarkitdown","MarkItDown 是一款由微软 AutoGen 团队打造的轻量级 Python 工具，专为将各类文件高效转换为 Markdown 格式而设计。它支持 PDF、Word、Excel、PPT、图片（含 OCR）、音频（含语音转录）、HTML 乃至 YouTube 链接等多种格式的解析，能够精准提取文档中的标题、列表、表格和链接等关键结构信息。\n\n在人工智能应用日益普及的今天，大语言模型（LLM）虽擅长处理文本，却难以直接读取复杂的二进制办公文档。MarkItDown 恰好解决了这一痛点，它将非结构化或半结构化的文件转化为模型“原生理解”且 Token 效率极高的 Markdown 格式，成为连接本地文件与 AI 分析 pipeline 的理想桥梁。此外，它还提供了 MCP（模型上下文协议）服务器，可无缝集成到 Claude Desktop 等 LLM 应用中。\n\n这款工具特别适合开发者、数据科学家及 AI 研究人员使用，尤其是那些需要构建文档检索增强生成（RAG）系统、进行批量文本分析或希望让 AI 助手直接“阅读”本地文件的用户。虽然生成的内容也具备一定可读性，但其核心优势在于为机器",93400,"2026-04-06T19:52:38",[52,14],{"id":62,"github_repo":63,"name":64,"description_en":65,"description_zh":66,"ai_summary_zh":66,"readme_en":67,"readme_zh":68,"quickstart_zh":69,"use_case_zh":70,"hero_image_url":71,"owner_login":72,"owner_name":73,"owner_avatar_url":74,"owner_bio":75,"owner_company":76,"owner_location":76,"owner_email":76,"owner_twitter":76,"owner_website":77,"owner_url":78,"languages":79,"stars":112,"forks":113,"last_commit_at":114,"license":115,"difficulty_score":10,"env_os":116,"env_gpu":117,"env_ram":118,"env_deps":119,"category_tags":131,"github_topics":76,"view_count":32,"oss_zip_url":76,"oss_zip_packed_at":76,"status":17,"created_at":132,"updated_at":133,"faqs":134,"releases":163},9079,"ai-dynamo\u002Fnixl","nixl","NVIDIA Inference Xfer Library (NIXL)","NIXL（NVIDIA Inference Xfer Library）是英伟达推出的一款开源库，专为加速 AI 推理框架中的点对点通信而设计。在大规模模型推理场景下，数据在不同节点、不同内存层级（如 CPU 与 GPU）或存储系统间的传输往往成为性能瓶颈，NIXL 正是为了解决这一痛点而生。它通过模块化的插件架构，统一抽象了各类内存和存储后端，让开发者能够高效地管理数据流动，从而显著提升如 NVIDIA Dynamo 等推理系统的整体吞吐量。\n\n这款工具主要面向从事 AI 基础设施开发的工程师和研究人员，特别是那些需要优化分布式推理性能、处理海量键值缓存（KV Cache）迁移的技术团队。NIXL 的独特亮点在于其高度的灵活性与兼容性：它不仅支持多种后端插件扩展，还深度集成了 UCX 高性能通信库及 GDRCopy 技术以挖掘极致带宽，同时提供了便捷的 Python API 和详细的基准测试工具（如 NIXLBench），帮助用户快速验证和优化传输策略。目前，NIXL 
专注于 Linux 环境，旨在为构建下一代高效能 AI 推理服务提供坚实的数据传输基石。","\u003C!--\nSPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.\nSPDX-License-Identifier: Apache-2.0\n-->\n\n# NVIDIA Inference Xfer Library (NIXL)\n\nNVIDIA Inference Xfer Library (NIXL) is targeted for accelerating point to point communications in AI inference frameworks such as NVIDIA Dynamo, while providing an abstraction over various types of memory (e.g., CPU and GPU) and storage (e.g., file, block and object store) through a modular plug-in architecture.\n\n[![License](https:\u002F\u002Fimg.shields.io\u002Fbadge\u002FLicense-Apache_2.0-blue.svg)](https:\u002F\u002Fopensource.org\u002Flicenses\u002FApache-2.0)\n[![GitHub Release](https:\u002F\u002Fimg.shields.io\u002Fgithub\u002Fv\u002Frelease\u002Fai-dynamo\u002Fnixl)](https:\u002F\u002Fgithub.com\u002Fai-dynamo\u002Fnixl\u002Freleases\u002Flatest)\n\n## Documentation and Resources\n\n* [NIXL overview](https:\u002F\u002Fgithub.com\u002Fai-dynamo\u002Fnixl\u002Fblob\u002Fmain\u002Fdocs\u002Fnixl.md) - Core concepts\u002Farchitecture overview (`docs\u002Fnixl.md`)\n\n* [Python API](https:\u002F\u002Fgithub.com\u002Fai-dynamo\u002Fnixl\u002Fblob\u002Fmain\u002Fdocs\u002Fpython_api.md) - Python API usage and examples (`docs\u002Fpython_api.md`)\n\n* [Backend guide](https:\u002F\u002Fgithub.com\u002Fai-dynamo\u002Fnixl\u002Fblob\u002Fmain\u002Fdocs\u002FBackendGuide.md) - Backend\u002Fplugin development guide (`docs\u002FBackendGuide.md`)\n\n* [Telemetry](https:\u002F\u002Fgithub.com\u002Fai-dynamo\u002Fnixl\u002Fblob\u002Fmain\u002Fdocs\u002Ftelemetry.md) - Observability and telemetry details (`docs\u002Ftelemetry.md`)\n\n* [Doxygen guide](https:\u002F\u002Fgithub.com\u002Fai-dynamo\u002Fnixl\u002Fblob\u002Fmain\u002Fdocs\u002Fdoxygen\u002Fnixl_doxygen.md) - API\u002Fclass diagrams overview (`docs\u002Fdoxygen\u002Fnixl_doxygen.md`)\n\n* [Doxygen 
images](https:\u002F\u002Fgithub.com\u002Fai-dynamo\u002Fnixl\u002Ftree\u002Fmain\u002Fdocs\u002Fdoxygen) - Diagram assets (`docs\u002Fdoxygen\u002F`)\n\n* [NIXLBench docs](https:\u002F\u002Fgithub.com\u002Fai-dynamo\u002Fnixl\u002Fblob\u002Fmain\u002Fbenchmark\u002Fnixlbench\u002FREADME.md) - Benchmark usage guide (`benchmark\u002Fnixlbench\u002FREADME.md`)\n\n* [KVBench docs](https:\u002F\u002Fgithub.com\u002Fai-dynamo\u002Fnixl\u002Ftree\u002Fmain\u002Fbenchmark\u002Fkvbench\u002Fdocs) - KVBench workflows and tutorials (`benchmark\u002Fkvbench\u002Fdocs\u002F`)\n\n## Supported Platforms\nNIXL is supported on a Linux environment only. It is tested on Ubuntu (22.04\u002F24.04) and Fedora. macOS and Windows are not currently supported; use a Linux host or container\u002FVM.\n\n## Pre-build Distributions\n### PyPI Wheel\n\nThe nixl python API and libraries, including UCX, are available directly through PyPI.\nFor example, if you have a GPU running on a Linux host, container, or VM, you can do the following install:\n\nIt can be installed for CUDA 12 with:\n\n```\npip install nixl[cu12]\n```\n\nFor CUDA 13 with:\n\n```\npip install nixl[cu13]\n```\n\nFor backwards compatibility, `pip install nixl` installs automatically `nixl[cu12]`, continuing to work seamlessly for CUDA 12 users without requiring changes to downstream project dependencies.\n\nIf both `nixl-cu12` and `nixl-cu13` are installed at the same time in an environment, `nixl-cu13` takes precedence.\n\n## Prerequisites for source build (Linux)\n### Ubuntu:\n\n`$ sudo apt install build-essential cmake pkg-config`\n\n### Fedora:\n\n`$ sudo dnf install gcc-c++ cmake pkg-config`\n\n### Python\n\n`$ pip3 install meson ninja pybind11 tomlkit`\n\n### UCX\n\nNIXL was tested with UCX version 1.20.x.\n\n[GDRCopy](https:\u002F\u002Fgithub.com\u002FNVIDIA\u002Fgdrcopy) is available on Github and is necessary for maximum performance, but UCX and NIXL will work without it.\n\n```\n$ git clone 
https:\u002F\u002Fgithub.com\u002Fopenucx\u002Fucx.git\n$ cd ucx\n$ git checkout v1.20.x\n$ .\u002Fautogen.sh\n$ .\u002Fcontrib\u002Fconfigure-release-mt       \\\n    --enable-shared                    \\\n    --disable-static                   \\\n    --disable-doxygen-doc              \\\n    --enable-optimizations             \\\n    --enable-cma                       \\\n    --enable-devel-headers             \\\n    --with-cuda=\u003Ccuda install>         \\\n    --with-verbs                       \\\n    --with-dm                          \\\n    --with-gdrcopy=\u003Cgdrcopy install>\n$ make -j\n$ make -j install-strip\n$ ldconfig\n```\n\n### ETCD (Optional)\nNIXL can use ETCD for metadata distribution and coordination between nodes in distributed environments. To use ETCD with NIXL:\n#### ETCD Server and Client\n ```\n$ sudo apt install etcd etcd-server etcd-client\n\n# Or use Docker\n$ docker run -d -p 2379:2379 quay.io\u002Fcoreos\u002Fetcd:v3.5.1\n```\n\n#### ETCD CPP API\nInstalled from https:\u002F\u002Fgithub.com\u002Fetcd-cpp-apiv3\u002Fetcd-cpp-apiv3\n\n```\n$ sudo apt install libgrpc-dev libgrpc++-dev libprotobuf-dev protobuf-compiler-grpc\n$ sudo apt install libcpprest-dev\n$ git clone https:\u002F\u002Fgithub.com\u002Fetcd-cpp-apiv3\u002Fetcd-cpp-apiv3.git\n$ cd etcd-cpp-apiv3\n$ mkdir build && cd build\n$ cmake ..\n$ make -j$(nproc) && make install\n```\n\n### Additional plugins\n\nSome plugins may have additional build requirements, see them here:\n\n- [Mooncake](src\u002Fplugins\u002Fmooncake\u002FREADME.md)\n- [POSIX](src\u002Fplugins\u002Fposix\u002FREADME.md)\n- [GDS](src\u002Fplugins\u002Fcuda_gds\u002FREADME.md)\n\n## Getting started\n### Build & install\n\n```\n$ meson setup \u003Cname_of_build_dir>\n$ cd \u003Cname_of_build_dir>\n$ ninja\n$ ninja install\n```\n\n### Build Options\n\n#### Release build (default)\n\n```bash\n$ meson setup \u003Cname_of_build_dir>\n```\n\n#### Debug build\n\n```bash\n$ meson setup \u003Cname_of_build_dir> 
--buildtype=debug\n```\n\n#### NIXL-specific build options\n\n```bash\n# Example with custom options\n$ meson setup \u003Cname_of_build_dir> \\\n    -Dbuild_docs=true \\           # Build Doxygen documentation\n    -Ducx_path=\u002Fpath\u002Fto\u002Fucx \\     # Custom UCX installation path\n    -Dinstall_headers=true \\      # Install development headers\n    -Ddisable_gds_backend=false   # Enable GDS backend\n```\n\nCommon build options:\n- `build_docs`: Build Doxygen documentation (default: false)\n- `ucx_path`: Path to UCX installation (default: system path)\n- `install_headers`: Install development headers (default: true)\n- `disable_gds_backend`: Disable GDS backend (default: false)\n- `cudapath_inc`, `cudapath_lib`: Custom CUDA paths\n- `static_plugins`: Comma-separated list of plugins to build statically\n- `enable_plugins`: Comma-separated list of plugins to build (e.g. `-Denable_plugins=UCX,POSIX`). Cannot be used with `disable_plugins`.\n- `disable_plugins`: Comma-separated list of plugins to exclude (e.g. `-Ddisable_plugins=GDS`). Cannot be used with `enable_plugins`.\n\n#### Environment Variables\n\nThere are a few environment variables that can be set to configure the build:\n- `NIXL_NO_STUBS_FALLBACK`: If not set or 0, build NIXL stub library if the library build fails\n\n### Building Documentation\n\nIf you have Doxygen installed, you can build the documentation:\n\n```bash\n# Configure with documentation enabled\n$ meson setup \u003Cname_of_build_dir> -Dbuild_docs=true\n$ cd \u003Cname_of_build_dir>\n$ ninja\n\n# Documentation will be generated in \u003Cname_of_build_dir>\u002Fhtml\n# After installation (ninja install), documentation will be available in \u003Cprefix>\u002Fshare\u002Fdoc\u002Fnixl\u002F\n```\n\n### Python Interface\n\nNIXL provides Python bindings through pybind11. 
For detailed Python API documentation, see [docs\u002Fpython_api.md](docs\u002Fpython_api.md).\n\nThe preferred way to install the Python bindings is through pip from PyPI:\n\n```bash\npip install nixl[cu12]\n```\n\nOr for CUDA 13 with:\n\n```bash\npip install nixl[cu13]\n```\n\n#### Installation from source\n\nPrerequisites:\n\n- `uv`: https:\u002F\u002Fdocs.astral.sh\u002Fuv\u002Fgetting-started\u002Finstallation\u002F\n- `tomlkit`: https:\u002F\u002Fpypi.org\u002Fproject\u002Ftomlkit\u002F\n- `PyTorch`: https:\u002F\u002Fpytorch.org\u002Fget-started\u002Flocally\u002F\n\n`uv` is always required *even if* you have another kind of Python virtual environment manager or if you are using a system-wide Python installation without using a virtual environment.\n\nExample with `uv` Python virtual environment:\n\n```\ncurl -LsSf https:\u002F\u002Fastral.sh\u002Fuv\u002Finstall.sh | sh\nexport PATH=\"$HOME\u002F.local\u002Fbin:${PATH}\"\n\nuv venv .venv --python 3.12\nsource .venv\u002Fbin\u002Factivate\nuv pip install tomlkit\n```\n\nExample with python-virtualenv:\n\n```\ncurl -LsSf https:\u002F\u002Fastral.sh\u002Fuv\u002Finstall.sh | sh\nexport PATH=\"$HOME\u002F.local\u002Fbin:${PATH}\"\n\npython3 -m venv .venv\nsource .venv\u002Fbin\u002Factivate\npip install tomlkit\n```\n\nExample with system-wide Python installation without using a virtual environment:\n\n```\ncurl -LsSf https:\u002F\u002Fastral.sh\u002Fuv\u002Finstall.sh | sh\nexport PATH=\"$HOME\u002F.local\u002Fbin:${PATH}\"\n\npip install tomlkit\n```\n\nThen install PyTorch following the instructions on the PyTorch website: https:\u002F\u002Fpytorch.org\u002Fget-started\u002Flocally\u002F\n\nAfter installing the prerequisites, you can build and install the NIXL binaries and the Python bindings from source. You have to:\n\n1. Build NIXL binaries and install them\n2. Build and install the CUDA platform-specific package (`nixl-cu12` or `nixl-cu13`)\n3. 
Build and install the `nixl` meta-package\n\n**For CUDA 12:**\n\n```\npip install .\nmeson setup build\nninja -C build install\npip install build\u002Fsrc\u002Fbindings\u002Fpython\u002Fnixl-meta\u002Fnixl-*-py3-none-any.whl\n```\n\n**For CUDA 13:**\n\n```\npip install .\n.\u002Fcontrib\u002Ftomlutil.py --wheel-name nixl-cu13 pyproject.toml\nmeson setup build\nninja -C build install\npip install build\u002Fsrc\u002Fbindings\u002Fpython\u002Fnixl-meta\u002Fnixl-*-py3-none-any.whl\n```\n\nTo check if the installation is successful, you can run the following command:\n\n```\npython3 -c \"import nixl; agent = nixl.nixl_agent('agent1')\"\n```\n\nwhich should print:\n\n```\n2026-01-08 13:36:27 NIXL INFO    _api.py:363 Backend UCX was instantiated\n2026-01-08 13:36:27 NIXL INFO    _api.py:253 Initialized NIXL agent: agent1\n```\n\nYou can also run a complete Python example to test the installation:\n\n```\npython3 examples\u002Fpython\u002Fexpanded_two_peers.py --mode=target --use_cuda=true --ip=127.0.0.1 --port=4242 &\nsleep 5\npython3 examples\u002Fpython\u002Fexpanded_two_peers.py --mode=initiator --use_cuda=true --ip=127.0.0.1 --port=4242\n```\n\nFor more Python examples, see [examples\u002Fpython\u002F](examples\u002Fpython\u002F).\n\n### Rust Bindings\n#### Build\n- Use `-Drust=true` meson option to build rust bindings.\n- Use `--buildtype=debug` for a debug build (default is release).\n- Or build manually:\n    ```bash\n    $ cargo build --release\n    ```\n#### Install\nThe bindings will be installed under `nixl-sys` in the configured installation prefix.\nCan be done using ninja, from project build directory:\n```bash\n$ ninja install\n```\n\n#### Test\n```\n# Rust bindings tests\n$ cargo test\n```\n\nUse in your project by adding to `Cargo.toml`:\n```toml\n[dependencies]\nnixl-sys = { path = \"path\u002Fto\u002Fnixl\u002Fbindings\u002Frust\" }\n```\n\n### Other build options\nSee [contrib\u002FREADME.md](contrib\u002FREADME.md) for more build options.\n\n### 
Building Docker container\nTo build the docker container, first clone the current repository. Also make sure you are able to pull docker images to your machine before attempting to build the container.\n\nRun the following from the root folder of the cloned NIXL repository:\n```\n$ .\u002Fcontrib\u002Fbuild-container.sh\n```\n\nBy default, the container is built with Ubuntu 24.04. To build a container for Ubuntu 22.04 use the --os option as follows:\n```\n$ .\u002Fcontrib\u002Fbuild-container.sh --os ubuntu22\n```\n\nTo see all the options supported by the container use:\n```\n$ .\u002Fcontrib\u002Fbuild-container.sh -h\n```\n\nThe container also includes a prebuilt python wheel in \u002Fworkspace\u002Fdist if required for installing\u002Fdistributing. Also, the wheel can be built with a separate script (see below).\n\n### Building the python wheel\nThe contrib folder also includes a script to build the python wheel with the UCX dependencies. Note, that UCX and other NIXL dependencies are required to be installed.\n```\n$ .\u002Fcontrib\u002Fbuild-wheel.sh\n```\n\n## Running with ETCD\nNIXL can use ETCD for metadata exchange between distributed nodes. This is especially useful in containerized or cloud-native environments.\n\n### Environment Setup\nTo use ETCD with NIXL, set the following environment variables:\n\n```bash\n# Set ETCD endpoints (required) - replace localhost with the hostname of the etcd server\nexport NIXL_ETCD_ENDPOINTS=\"http:\u002F\u002Flocalhost:2379\"\n\n# Set ETCD namespace (optional, defaults to \u002Fnixl\u002Fagents)\nexport NIXL_ETCD_NAMESPACE=\"\u002Fnixl\u002Fagents\"\n```\n\n### Running the ETCD Example\nNIXL includes an example demonstrating metadata exchange and data transfer using ETCD:\n\n```bash\n# Start an ETCD server if not already running\n# For example:\n# docker run -d -p 2379:2379 quay.io\u002Fcoreos\u002Fetcd:v3.5.1\n\n# Set the ETCD env variables as above\n\n# Run the example. 
The two agents in the example will exchange metadata through ETCD\n# and perform data transfers\n.\u002F\u003Cnixl_build_path>\u002Fexamples\u002Fnixl_etcd_example\n```\n\n### nixlbench Benchmark\nFor more comprehensive testing, the nixlbench benchmarking tool supports ETCD for worker coordination:\n\n```bash\n# Build nixlbench (see benchmark\u002Fnixlbench\u002FREADME.md for details)\ncd benchmark\u002Fnixlbench\nmeson setup build && cd build && ninja\n\n# Run benchmark with ETCD\n.\u002Fnixlbench --etcd-endpoints http:\u002F\u002Flocalhost:2379 --backend UCX --initiator_seg_type VRAM\n```\n\n## Code Examples\n\n* [C++ examples](https:\u002F\u002Fgithub.com\u002Fai-dynamo\u002Fnixl\u002Ftree\u002Fmain\u002Fexamples\u002Fcpp)\n\n* [Python examples](https:\u002F\u002Fgithub.com\u002Fai-dynamo\u002Fnixl\u002Ftree\u002Fmain\u002Fexamples\u002Fpython)\n\n## Contributing\n\nFor contribution guidelines, see [CONTRIBUTING.md](https:\u002F\u002Fgithub.com\u002Fai-dynamo\u002Fnixl\u002Fblob\u002Fmain\u002FCONTRIBUTING.md) (`CONTRIBUTING.md`).\n\n## Third-Party Components\n\nThis project will download and install additional third-party open source software projects. Review the license terms of these open source projects before use.\n","\u003C!--\nSPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. 
All rights reserved.\nSPDX-License-Identifier: Apache-2.0\n-->\n\n# NVIDIA 推理传输库 (NIXL)\n\nNVIDIA 推理传输库 (NIXL) 旨在加速 NVIDIA Dynamo 等 AI 推理框架中的点对点通信，同时通过模块化的插件架构为多种类型的内存（例如 CPU 和 GPU）以及存储（例如文件、块和对象存储）提供抽象层。\n\n[![许可证](https:\u002F\u002Fimg.shields.io\u002Fbadge\u002FLicense-Apache_2.0-blue.svg)](https:\u002F\u002Fopensource.org\u002Flicenses\u002FApache-2.0)\n[![GitHub 发布](https:\u002F\u002Fimg.shields.io\u002Fgithub\u002Fv\u002Frelease\u002Fai-dynamo\u002Fnixl)](https:\u002F\u002Fgithub.com\u002Fai-dynamo\u002Fnixl\u002Freleases\u002Flatest)\n\n## 文档与资源\n\n* [NIXL 概述](https:\u002F\u002Fgithub.com\u002Fai-dynamo\u002Fnixl\u002Fblob\u002Fmain\u002Fdocs\u002Fnixl.md) - 核心概念\u002F架构概述 (`docs\u002Fnixl.md`)\n\n* [Python API](https:\u002F\u002Fgithub.com\u002Fai-dynamo\u002Fnixl\u002Fblob\u002Fmain\u002Fdocs\u002Fpython_api.md) - Python API 的使用方法及示例 (`docs\u002Fpython_api.md`)\n\n* [后端指南](https:\u002F\u002Fgithub.com\u002Fai-dynamo\u002Fnixl\u002Fblob\u002Fmain\u002Fdocs\u002FBackendGuide.md) - 后端\u002F插件开发指南 (`docs\u002FBackendGuide.md`)\n\n* [遥测](https:\u002F\u002Fgithub.com\u002Fai-dynamo\u002Fnixl\u002Fblob\u002Fmain\u002Fdocs\u002Ftelemetry.md) - 可观测性与遥测细节 (`docs\u002Ftelemetry.md`)\n\n* [Doxygen 指南](https:\u002F\u002Fgithub.com\u002Fai-dynamo\u002Fnixl\u002Fblob\u002Fmain\u002Fdocs\u002Fdoxygen\u002Fnixl_doxygen.md) - API\u002F类图概览 (`docs\u002Fdoxygen\u002Fnixl_doxygen.md`)\n\n* [Doxygen 图片](https:\u002F\u002Fgithub.com\u002Fai-dynamo\u002Fnixl\u002Ftree\u002Fmain\u002Fdocs\u002Fdoxygen) - 图表资源 (`docs\u002Fdoxygen\u002F`)\n\n* [NIXLBench 文档](https:\u002F\u002Fgithub.com\u002Fai-dynamo\u002Fnixl\u002Fblob\u002Fmain\u002Fbenchmark\u002Fnixlbench\u002FREADME.md) - 基准测试使用指南 (`benchmark\u002Fnixlbench\u002FREADME.md`)\n\n* [KVBench 文档](https:\u002F\u002Fgithub.com\u002Fai-dynamo\u002Fnixl\u002Ftree\u002Fmain\u002Fbenchmark\u002Fkvbench\u002Fdocs) - KVBench 工作流与教程 (`benchmark\u002Fkvbench\u002Fdocs\u002F`)\n\n## 支持的平台\nNIXL 目前仅支持 Linux 环境。已在 Ubuntu (22.04\u002F24.04) 和 
Fedora 上进行过测试。macOS 和 Windows 当前不支持；请使用 Linux 主机或容器\u002F虚拟机。\n\n## 预编译发行版\n### PyPI Wheel\n\nnixl 的 Python API 和库（包括 UCX）可以直接通过 PyPI 获得。\n例如，如果您在 Linux 主机、容器或虚拟机上运行 GPU，可以执行以下安装命令：\n\n对于 CUDA 12，可执行：\n```\npip install nixl[cu12]\n```\n\n对于 CUDA 13，可执行：\n```\npip install nixl[cu13]\n```\n\n为保持向后兼容性，`pip install nixl` 将自动安装 `nixl[cu12]`，继续无缝兼容 CUDA 12 用户，而无需更改下游项目的依赖项。\n\n如果环境中同时安装了 `nixl-cu12` 和 `nixl-cu13`，则优先使用 `nixl-cu13`。\n\n## 源码构建的先决条件（Linux）\n### Ubuntu:\n\n`$ sudo apt install build-essential cmake pkg-config`\n\n### Fedora:\n\n`$ sudo dnf install gcc-c++ cmake pkg-config`\n\n### Python\n\n`$ pip3 install meson ninja pybind11 tomlkit`\n\n### UCX\n\nNIXL 已在 UCX 1.20.x 版本上进行了测试。\n\n[GDRCopy](https:\u002F\u002Fgithub.com\u002FNVIDIA\u002Fgdrcopy) 在 GitHub 上可用，对于实现最佳性能至关重要，但即使没有 GDRCopy，UCX 和 NIXL 也能正常工作。\n\n```\n$ git clone https:\u002F\u002Fgithub.com\u002Fopenucx\u002Fucx.git\n$ cd ucx\n$ git checkout v1.20.x\n$ .\u002Fautogen.sh\n$ .\u002Fcontrib\u002Fconfigure-release-mt       \\\n    --enable-shared                    \\\n    --disable-static                   \\\n    --disable-doxygen-doc              \\\n    --enable-optimizations             \\\n    --enable-cma                       \\\n    --enable-devel-headers             \\\n    --with-cuda=\u003Ccuda 安装路径>         \\\n    --with-verbs                       \\\n    --with-dm                          \\\n    --with-gdrcopy=\u003Cgdrcopy 安装路径>\n$ make -j\n$ make -j install-strip\n$ ldconfig\n```\n\n### ETCD（可选）\nNIXL 可以使用 ETCD 在分布式环境中进行元数据分发和节点间协调。要将 ETCD 与 NIXL 结合使用：\n#### ETCD 服务器和客户端\n ```\n$ sudo apt install etcd etcd-server etcd-client\n\n# 或者使用 Docker\n$ docker run -d -p 2379:2379 quay.io\u002Fcoreos\u002Fetcd:v3.5.1\n```\n\n#### ETCD CPP API\n从 https:\u002F\u002Fgithub.com\u002Fetcd-cpp-apiv3\u002Fetcd-cpp-apiv3 安装。\n\n```\n$ sudo apt install libgrpc-dev libgrpc++-dev libprotobuf-dev protobuf-compiler-grpc\n$ sudo apt install libcpprest-dev\n$ git clone 
https:\u002F\u002Fgithub.com\u002Fetcd-cpp-apiv3\u002Fetcd-cpp-apiv3.git\n$ cd etcd-cpp-apiv3\n$ mkdir build && cd build\n$ cmake ..\n$ make -j$(nproc) && make install\n```\n\n### 其他插件\n\n部分插件可能有额外的构建要求，请参阅以下内容：\n\n- [Mooncake](src\u002Fplugins\u002Fmooncake\u002FREADME.md)\n- [POSIX](src\u002Fplugins\u002Fposix\u002FREADME.md)\n- [GDS](src\u002Fplugins\u002Fcuda_gds\u002FREADME.md)\n\n## 开始使用\n### 构建与安装\n\n```\n$ meson setup \u003C构建目录名称>\n$ cd \u003C构建目录名称>\n$ ninja\n$ ninja install\n```\n\n### 构建选项\n\n#### 发布版本（默认）\n\n```bash\n$ meson setup \u003C构建目录名称>\n```\n\n#### 调试版本\n\n```bash\n$ meson setup \u003C构建目录名称> --buildtype=debug\n```\n\n#### NIXL 特定的构建选项\n\n```bash\n# 示例：自定义选项\n$ meson setup \u003C构建目录名称> \\\n    -Dbuild_docs=true \\           # 构建 Doxygen 文档\n    -Ducx_path=\u002Fpath\u002Fto\u002Fucx \\     # 自定义 UCX 安装路径\n    -Dinstall_headers=true \\      # 安装开发头文件\n    -Ddisable_gds_backend=false   # 启用 GDS 后端\n```\n\n常见构建选项：\n- `build_docs`: 构建 Doxygen 文档（默认：false）\n- `ucx_path`: UCX 安装路径（默认：系统路径）\n- `install_headers`: 安装开发头文件（默认：true）\n- `disable_gds_backend`: 禁用 GDS 后端（默认：false）\n- `cudapath_inc`, `cudapath_lib`: 自定义 CUDA 路径\n- `static_plugins`: 以逗号分隔的静态构建插件列表\n- `enable_plugins`: 以逗号分隔的启用插件列表（例如 `-Denable_plugins=UCX,POSIX`）。不能与 `disable_plugins` 同时使用。\n- `disable_plugins`: 以逗号分隔的禁用插件列表（例如 `-Ddisable_plugins=GDS`）。不能与 `enable_plugins` 同时使用。\n\n#### 环境变量\n\n有几个环境变量可用于配置构建：\n- `NIXL_NO_STUBS_FALLBACK`: 如果未设置或设为 0，则在库构建失败时构建 NIXL 存根库。\n\n### 构建文档\n\n如果您已安装 Doxygen，可以构建文档：\n\n```bash\n# 配置时启用文档生成\n$ meson setup \u003C构建目录名称> -Dbuild_docs=true\n$ cd \u003C构建目录名称>\n$ ninja\n\n# 文档将生成在 \u003C构建目录名称>\u002Fhtml 中\n# 安装完成后（ninja install），文档将在 \u003Cprefix>\u002Fshare\u002Fdoc\u002Fnixl\u002F 中可用\n```\n\n### Python 接口\n\nNIXL 通过 pybind11 提供了 Python 绑定。有关详细的 Python API 文档，请参阅 [docs\u002Fpython_api.md](docs\u002Fpython_api.md)。\n\n安装 Python 绑定的首选方式是从 PyPI 使用 pip：\n\n```bash\npip install nixl[cu12]\n```\n\n或者对于 CUDA 13 版本：\n\n```bash\npip install 
nixl[cu13]\n```\n\n#### 从源码安装\n\n先决条件：\n\n- `uv`：https:\u002F\u002Fdocs.astral.sh\u002Fuv\u002Fgetting-started\u002Finstallation\u002F\n- `tomlkit`：https:\u002F\u002Fpypi.org\u002Fproject\u002Ftomlkit\u002F\n- `PyTorch`：https:\u002F\u002Fpytorch.org\u002Fget-started\u002Flocally\u002F\n\n即使您使用其他类型的 Python 虚拟环境管理器，或直接使用系统范围的 Python 安装而不使用虚拟环境，`uv` 仍然是必需的。\n\n使用 `uv` 创建 Python 虚拟环境的示例：\n\n```\ncurl -LsSf https:\u002F\u002Fastral.sh\u002Fuv\u002Finstall.sh | sh\nexport PATH=\"$HOME\u002F.local\u002Fbin:${PATH}\"\n\nuv venv .venv --python 3.12\nsource .venv\u002Fbin\u002Factivate\nuv pip install tomlkit\n```\n\n使用 python-virtualenv 的示例：\n\n```\ncurl -LsSf https:\u002F\u002Fastral.sh\u002Fuv\u002Finstall.sh | sh\nexport PATH=\"$HOME\u002F.local\u002Fbin:${PATH}\"\n\npython3 -m venv .venv\nsource .venv\u002Fbin\u002Factivate\npip install tomlkit\n```\n\n不使用虚拟环境而直接使用系统范围 Python 安装的示例：\n\n```\ncurl -LsSf https:\u002F\u002Fastral.sh\u002Fuv\u002Finstall.sh | sh\nexport PATH=\"$HOME\u002F.local\u002Fbin:${PATH}\"\n\npip install tomlkit\n```\n\n然后按照 PyTorch 官网上的说明安装 PyTorch：https:\u002F\u002Fpytorch.org\u002Fget-started\u002Flocally\u002F\n\n安装完先决条件后，您可以从源码构建并安装 NIXL 二进制文件和 Python 绑定。您需要执行以下步骤：\n\n1. 构建并安装 NIXL 二进制文件。\n2. 构建并安装特定于 CUDA 平台的包（`nixl-cu12` 或 `nixl-cu13`）。\n3. 
构建并安装 `nixl` 元包。\n\n**对于 CUDA 12：**\n\n```\npip install .\nmeson setup build\nninja -C build install\npip install build\u002Fsrc\u002Fbindings\u002Fpython\u002Fnixl-meta\u002Fnixl-*-py3-none-any.whl\n```\n\n**对于 CUDA 13：**\n\n```\npip install .\n.\u002Fcontrib\u002Ftomlutil.py --wheel-name nixl-cu13 pyproject.toml\nmeson setup build\nninja -C build install\npip install build\u002Fsrc\u002Fbindings\u002Fpython\u002Fnixl-meta\u002Fnixl-*-py3-none-any.whl\n```\n\n要检查安装是否成功，可以运行以下命令：\n\n```\npython3 -c \"import nixl; agent = nixl.nixl_agent('agent1')\"\n```\n\n如果安装成功，应输出：\n\n```\n2026-01-08 13:36:27 NIXL INFO    _api.py:363 Backend UCX was instantiated\n2026-01-08 13:36:27 NIXL INFO    _api.py:253 Initialized NIXL agent: agent1\n```\n\n您还可以运行一个完整的 Python 示例来测试安装：\n\n```\npython3 examples\u002Fpython\u002Fexpanded_two_peers.py --mode=target --use_cuda=true --ip=127.0.0.1 --port=4242 &\nsleep 5\npython3 examples\u002Fpython\u002Fexpanded_two_peers.py --mode=initiator --use_cuda=true --ip=127.0.0.1 --port=4242\n```\n\n更多 Python 示例请参阅 [examples\u002Fpython\u002F](examples\u002Fpython\u002F)。\n\n### Rust 绑定\n#### 构建\n- 使用 `-Drust=true` meson 选项来构建 Rust 绑定。\n- 使用 `--buildtype=debug` 进行调试构建（默认为 release）。\n- 或者手动构建：\n    ```bash\n    $ cargo build --release\n    ```\n#### 安装\n绑定将被安装到配置的安装前缀下的 `nixl-sys` 目录中。\n可以通过 ninja 从项目构建目录完成安装：\n```bash\n$ ninja install\n```\n\n#### 测试\n```\n# Rust 绑定测试\n$ cargo test\n```\n\n在您的项目中使用时，只需在 `Cargo.toml` 中添加：\n```toml\n[dependencies]\nnixl-sys = { path = \"path\u002Fto\u002Fnixl\u002Fbindings\u002Frust\" }\n```\n\n### 其他构建选项\n更多构建选项请参阅 [contrib\u002FREADME.md](contrib\u002FREADME.md)。\n\n### 构建 Docker 容器\n要构建 Docker 容器，首先克隆当前仓库。此外，在尝试构建容器之前，请确保您能够拉取 Docker 镜像到本地机器。\n\n在克隆的 NIXL 仓库根目录下运行以下命令：\n```\n$ .\u002Fcontrib\u002Fbuild-container.sh\n```\n\n默认情况下，容器基于 Ubuntu 24.04 构建。若要构建适用于 Ubuntu 22.04 的容器，请使用 `--os` 选项，如下所示：\n```\n$ .\u002Fcontrib\u002Fbuild-container.sh --os ubuntu22\n```\n\n要查看容器支持的所有选项，可以运行：\n```\n$ 
.\u002Fcontrib\u002Fbuild-container.sh -h\n```\n\n容器还包含预构建的 Python wheel 文件，位于 `\u002Fworkspace\u002Fdist` 目录下，便于安装或分发。此外，也可以使用单独的脚本构建该 wheel 文件（见下文）。\n\n### 构建 Python wheel 文件\n`contrib` 文件夹中还包含一个用于构建带有 UCX 依赖项的 Python wheel 文件的脚本。请注意，UCX 和其他 NIXL 依赖项必须已安装。\n```\n$ .\u002Fcontrib\u002Fbuild-wheel.sh\n```\n\n## 使用 ETCD 运行\nNIXL 可以使用 ETCD 在分布式节点之间交换元数据。这在容器化或云原生环境中尤为有用。\n\n### 环境设置\n要将 ETCD 与 NIXL 结合使用，需设置以下环境变量：\n\n```bash\n# 设置 ETCD 端点（必填）——将 localhost 替换为 ETCD 服务器的主机名\nexport NIXL_ETCD_ENDPOINTS=\"http:\u002F\u002Flocalhost:2379\"\n\n# 设置 ETCD 命名空间（可选，默认为 \u002Fnixl\u002Fagents）\nexport NIXL_ETCD_NAMESPACE=\"\u002Fnixl\u002Fagents\"\n```\n\n### 运行 ETCD 示例\nNIXL 包含一个演示使用 ETCD 进行元数据交换和数据传输的示例：\n\n```bash\n# 如果尚未运行 ETCD 服务器，请启动一个\n# 例如：\n# docker run -d -p 2379:2379 quay.io\u002Fcoreos\u002Fetcd:v3.5.1\n\n# 按照上述方法设置 ETCD 环境变量\n\n# 运行示例。示例中的两个代理将通过 ETCD 交换元数据，并执行数据传输\n.\u002F\u003Cnixl_build_path>\u002Fexamples\u002Fnixl_etcd_example\n```\n\n### nixlbench 基准测试\n为了进行更全面的测试，nixlbench 基准测试工具支持使用 ETCD 进行工作节点协调：\n\n```bash\n# 构建 nixlbench（详情请参阅 benchmark\u002Fnixlbench\u002FREADME.md）\ncd benchmark\u002Fnixlbench\nmeson setup build && cd build && ninja\n\n# 使用 ETCD 运行基准测试\n.\u002Fnixlbench --etcd-endpoints http:\u002F\u002Flocalhost:2379 --backend UCX --initiator_seg_type VRAM\n```\n\n## 代码示例\n\n* [C++ 示例](https:\u002F\u002Fgithub.com\u002Fai-dynamo\u002Fnixl\u002Ftree\u002Fmain\u002Fexamples\u002Fcpp)\n\n* [Python 示例](https:\u002F\u002Fgithub.com\u002Fai-dynamo\u002Fnixl\u002Ftree\u002Fmain\u002Fexamples\u002Fpython)\n\n## 贡献指南\n\n有关贡献指南，请参阅 [CONTRIBUTING.md](https:\u002F\u002Fgithub.com\u002Fai-dynamo\u002Fnixl\u002Fblob\u002Fmain\u002FCONTRIBUTING.md)（`CONTRIBUTING.md`）。\n\n## 第三方组件\n\n本项目会下载并安装额外的第三方开源软件项目。在使用这些开源项目之前，请务必查看其许可证条款。","# NIXL 快速上手指南\n\nNVIDIA Inference Xfer Library (NIXL) 是一个用于加速 AI 推理框架中点对点通信的库，支持多种内存（CPU\u002FGPU）和存储后端的抽象。\n\n## 1. 
环境准备\n\n### 系统要求\n- **操作系统**：仅支持 Linux（推荐 Ubuntu 22.04\u002F24.04 或 Fedora）。\n- **硬件**：需要 NVIDIA GPU 环境。\n- **不支持**：macOS 和 Windows。\n\n### 前置依赖安装\n\n**Ubuntu:**\n```bash\nsudo apt install build-essential cmake pkg-config\n```\n\n**Fedora:**\n```bash\nsudo dnf install gcc-c++ cmake pkg-config\n```\n\n**Python 工具链:**\n```bash\npip3 install meson ninja pybind11 tomlkit\n```\n\n**可选依赖 (ETCD - 用于分布式元数据协调):**\n```bash\n# 使用 apt 安装\nsudo apt install etcd etcd-server etcd-client\n\n# 或使用 Docker 运行\ndocker run -d -p 2379:2379 quay.io\u002Fcoreos\u002Fetcd:v3.5.1\n```\n\n> **注意**：若需从源码构建高性能版本，还需手动编译安装 UCX (v1.20.x) 和 GDRCopy，具体步骤请参考官方文档。大多数用户建议直接使用 PyPI 预编译包。\n\n## 2. 安装步骤\n\n### 方式一：通过 PyPI 安装（推荐）\n\n这是最简单的方式，已包含 UCX 等核心依赖。\n\n**对于 CUDA 12 用户:**\n```bash\npip install nixl[cu12]\n```\n\n**对于 CUDA 13 用户:**\n```bash\npip install nixl[cu13]\n```\n\n> **提示**：直接运行 `pip install nixl` 默认会安装 CUDA 12 版本以保持向后兼容。如果环境中同时存在 cu12 和 cu13 包，cu13 优先。\n\n### 方式二：从源码构建\n\n如果你需要自定义插件或开发后端，可从源码构建。\n\n1. **配置与编译:**\n```bash\nmeson setup \u003Cbuild_dir>\ncd \u003Cbuild_dir>\nninja\nsudo ninja install\n```\n\n2. **常用构建选项示例:**\n```bash\nmeson setup \u003Cbuild_dir> \\\n    -Dbuild_docs=true \\\n    -Ducx_path=\u002Fpath\u002Fto\u002Fucx \\\n    -Denable_plugins=UCX,POSIX\n```\n\n3. **安装 Python 绑定 (源码方式):**\n需先安装 `uv` 和 `tomlkit`，然后执行：\n```bash\n# CUDA 12 示例\npip install .\nmeson setup build\nninja -C build install\npip install build\u002Fsrc\u002Fbindings\u002Fpython\u002Fnixl-meta\u002Fnixl-*-py3-none-any.whl\n```\n\n## 3. 
基本使用\n\n### 验证安装\n运行以下命令检查是否安装成功：\n```bash\npython3 -c \"import nixl; agent = nixl.nixl_agent('agent1')\"\n```\n成功输出示例：\n```text\n2026-01-08 13:36:27 NIXL INFO    _api.py:363 Backend UCX was instantiated\n2026-01-08 13:36:27 NIXL INFO    _api.py:253 Initialized NIXL agent: agent1\n```\n\n### 简单通信示例\nNIXL 支持多进程\u002F多节点间的点对点数据传输。以下是一个简单的双节点测试示例（需在支持 CUDA 的环境中运行）：\n\n**终端 1 (启动目标节点):**\n```bash\npython3 examples\u002Fpython\u002Fexpanded_two_peers.py --mode=target --use_cuda=true --ip=127.0.0.1 --port=4242 &\n```\n\n**等待 5 秒后，在终端 2 (启动发起节点):**\n```bash\nsleep 5\npython3 examples\u002Fpython\u002Fexpanded_two_peers.py --mode=initiator --use_cuda=true --ip=127.0.0.1 --port=4242\n```\n\n### 分布式环境配置 (可选)\n如果在集群或容器化环境中使用 ETCD 进行元数据交换，需设置以下环境变量：\n\n```bash\nexport NIXL_ETCD_ENDPOINTS=\"http:\u002F\u002F\u003Cetcd-server-ip>:2379\"\nexport NIXL_ETCD_NAMESPACE=\"\u002Fnixl\u002Fagents\"\n```\n\n更多高级用法（如 Rust 绑定、自定义后端开发）请参考项目 `docs\u002F` 目录下的详细文档。","某大型电商公司正在构建基于 NVIDIA Dynamo 的实时推荐系统，需要在多节点 GPU 集群间高速传输海量用户行为特征数据（KV Cache）。\n\n### 没有 nixl 时\n- **内存拷贝繁琐**：开发人员需手动编写代码在 CPU 内存与 GPU 显存间搬运数据，不仅代码冗余，还极易因指针错误导致服务崩溃。\n- **通信延迟高企**：缺乏针对 AI 推理优化的点对点传输机制，跨节点数据同步耗时过长，导致用户刷新页面时推荐结果加载明显卡顿。\n- **存储适配困难**：面对文件系统、块存储和对象存储等多种后端，团队不得不为每种存储类型重复开发适配接口，维护成本极高。\n- **资源调度僵化**：无法统一抽象不同硬件层的内存资源，导致集群中部分 GPU 因等待数据传输而空闲，整体算力利用率不足 60%。\n\n### 使用 nixl 后\n- **零拷贝加速**：nixl 直接提供统一的内存抽象层，自动处理 CPU 与 GPU 间的数据流转，消除了手动拷贝需求，代码量减少 40% 且稳定性大幅提升。\n- **毫秒级传输**：依托底层 UCX 优化及点对点通信加速，跨节点特征数据同步延迟降低至毫秒级，实现了推荐结果的“秒开”体验。\n- **插件化扩展**：通过 nixl 的模块化插件架构，团队仅需配置即可无缝切换文件、块或对象存储，新存储后端的接入时间从数周缩短至数小时。\n- **算力满负荷运行**：高效的数据流转机制消除了计算等待瓶颈，集群 GPU 利用率攀升至 90% 以上，显著降低了单位请求的硬件成本。\n\nnixl 
通过统一内存抽象与加速点对点通信，将复杂的分布式数据搬运转化为透明高效的基础设施，让研发团队能专注于核心算法迭代。","https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Fai-dynamo_nixl_663a8596.png","ai-dynamo","Dynamo","https:\u002F\u002Foss.gittoolsai.com\u002Favatars\u002Fai-dynamo_06011070.png","",null,"https:\u002F\u002Fdeveloper.nvidia.com\u002Fdynamo","https:\u002F\u002Fgithub.com\u002Fai-dynamo",[80,84,88,92,96,100,104,108],{"name":81,"color":82,"percentage":83},"C++","#f34b7d",73.1,{"name":85,"color":86,"percentage":87},"Python","#3572A5",8.1,{"name":89,"color":90,"percentage":91},"Rust","#dea584",7,{"name":93,"color":94,"percentage":95},"Meson","#007800",4.7,{"name":97,"color":98,"percentage":99},"Shell","#89e051",3.1,{"name":101,"color":102,"percentage":103},"Cuda","#3A4E3A",2.6,{"name":105,"color":106,"percentage":107},"Dockerfile","#384d54",0.8,{"name":109,"color":110,"percentage":111},"C","#555555",0.6,989,292,"2026-04-18T04:59:41","NOASSERTION","Linux","需要 NVIDIA GPU（用于 CUDA 加速），支持 CUDA 12 或 CUDA 13，需安装 GDRCopy 以获得最佳性能（可选但推荐）","未说明",{"notes":120,"python":121,"dependencies":122},"仅支持 Linux 环境（测试于 Ubuntu 22.04\u002F24.04 和 Fedora），不支持 macOS 和 Windows。可通过 PyPI 直接安装预编译包（nixl[cu12] 或 nixl[cu13]）。若从源码构建，需手动编译 UCX 及可选的 ETCD C++ API。支持通过 Docker 容器部署。Rust 绑定可选构建。","3.12 (示例中使用，建议版本)",[123,124,125,126,127,128,129,130],"UCX 1.20.x","PyTorch","meson","ninja","pybind11","tomlkit","etcd (可选，用于分布式元数据协调)","GDRCopy (可选，用于高性能)",[14],"2026-03-27T02:49:30.150509","2026-04-18T22:34:13.115315",[135,140,145,150,155,159],{"id":136,"question_zh":137,"answer_zh":138,"source_url":139},40739,"如何在 GB200 系列环境中启用 NIXL 的 MNNVL 通信以提升性能？","在 vLLM 中目前默认不支持 kv_cache 的 MNNVL。要启用它，需要应用特定的补丁（参考 PR #33540），并在预填充（prefill）和解码（decode）阶段同时设置以下参数：\n1. 启动参数添加 `--enable-sleep-mode`。\n2. 
设置环境变量 `VLLM_CUDA_FABRIC=y`。\n配置成功后，通过 `UCX_PROTO_INFO=y` 查看日志，应能看到 `cuda_ipc\u002Fcuda` 被用于 zero-copy 传输，从而显著提升吞吐量并稳定首字延迟（TTFT）。","https:\u002F\u002Fgithub.com\u002Fai-dynamo\u002Fnixl\u002Fissues\u002F1240",{"id":141,"question_zh":142,"answer_zh":143,"source_url":144},40740,"使用 UCX 后端配合 cuda_ipc 进行 NVLink 通信时出现段错误（Segmentation fault）或回退到低带宽模式怎么办？","这通常是由于容器权限不足导致无法创建或交换 IPC 句柄，UCX 因此静默回退到 rc_mlx5 模式。解决方法是确保 Docker 容器拥有足够的权限，建议在运行容器时添加以下参数：\n- `--ipc host`\n- `--privileged`\n- `--cap-add IPC_LOCK`\n- `--cap-add SYS_PTRACE`\n- `--shm-size` 设置为足够大（如 128GB）\n- `--ulimit memlock=-1`\n如果移除 rc 传输选项后报错 OOM，也证实了是资源创建权限问题。","https:\u002F\u002Fgithub.com\u002Fai-dynamo\u002Fnixl\u002Fissues\u002F1042",{"id":146,"question_zh":147,"answer_zh":148,"source_url":149},40741,"发送端传输很快但接收端收到通知（notification）延迟很高（如 2 秒）的原因是什么？","该问题通常不是 NIXL 本身的缺陷，而是上层框架（如 SGLang）的性能瓶颈或 JIT 编译开销导致的。排查建议如下：\n1. 检查是否使用了固定的输入长度和页面大小（如 page_size=64），不固定的输入长度可能导致内核分配的 JIT 抖动。\n2. 尝试使用 NIXL 的 \"prepped\" 传输模式来处理大量分段，这可能有助于减少开销。\n3. 
确认是否受限于特定模型规模或多卡环境下的预填充时间。","https:\u002F\u002Fgithub.com\u002Fai-dynamo\u002Fnixl\u002Fissues\u002F182",{"id":151,"question_zh":152,"answer_zh":153,"source_url":154},40742,"NIXL 是否支持除 CUDA 12.8 以外的其他 CUDA 版本（如 12.4, 12.5）？","官方 Wheel 包主要针对 CUDA 12.8 构建。对于其他版本（如 CUDA 12.4），目前建议用户从源码构建（build from source）以确保兼容性。社区正在讨论支持动态链接或构建多个版本的 Wheel，但在正式支持前，源码编译是使用非默认 CUDA 版本的可靠方法。","https:\u002F\u002Fgithub.com\u002Fai-dynamo\u002Fnixl\u002Fissues\u002F158",{"id":156,"question_zh":157,"answer_zh":158,"source_url":144},40743,"如何验证 NIXL 是否正确使用了高速传输路径（如 cuda_ipc 或 NVLink）而不是 TCP？","可以通过设置环境变量 `UCX_PROTO_INFO=y` 来启用 UCX 的详细协议信息输出。运行程序后，检查日志中的 `ucp_context` 配置表：\n- 如果看到 `cuda_ipc\u002Fcuda` 对应 `zero-copy` 或 `remote memory read\u002Fwrite`，说明高速路径已启用。\n- 如果看到 `tcp` 或被强制回退到 `rc_mlx5` 且带宽较低（约 20GB\u002Fs），则说明配置有误或权限不足。\n日志中应明确显示传输类型，确保没有意外回退到低效路径。",{"id":160,"question_zh":161,"answer_zh":162,"source_url":149},40744,"在传输大量小分段（segments）时性能不佳，有什么优化建议？","当分段数量非常多时（例如使用 SGLang 默认的 page_size=1），NIXL 的管理开销会增加。优化建议包括：\n1. 增大页面大小（如设置为 64），以减少需要传输的分段总数。\n2. 使用 NIXL 支持的 \"prepped\" 传输模式，该模式专为处理大量分段进行了优化。\n3. 
保持输入长度固定，避免因动态形状导致的内核即时编译（JIT）开销和抖动。",[164,169,174,179,184,189,194,199,204,209,214,219,224,229,234,239,244,249,254,259],{"id":165,"version":166,"summary_zh":167,"released_at":168},324238,"v1.0.1","# 1.0.1\n\n## 摘要\nNVIDIA® NIXL 1.0.1 版本是一个针对性的维护版本，重点修复了 NIXL-EP 的稳定性问题、提升了 libfabric 传输层的可靠性，并在 UCX、Python Wheel 和 Docker 环境中改进了构建与打包流程。\n\n## NIXL-EP 修复\n- **修复销毁流程**：修正了 NIXL-EP 中资源清理和销毁顺序的问题，避免在关闭过程中出现崩溃和资源泄漏（#1452）。\n- **修复弹性扩容时的信号缓冲区损坏问题**：解决了 NIXL-EP 在弹性扩容期间新节点加入时可能出现的信号缓冲区损坏问题，确保在拓扑变化时缓冲区状态保持正确（#1453）。\n\n## Libfabric 修复\n- **修复传输句柄重新发布时的通知覆盖问题**：修复了 libfabric 后端的一个问题，即在传输句柄重新发布时，更新的通知消息会被忽略，导致重新发布的传输始终使用初始准备时的原始通知（#1482、#1433）。\n- **修复端点线程安全性问题**：在 libfabric 后端的所有端点访问中添加了适当的互斥锁保护，以满足 `FI_THREAD_COMPLETION` 的线程安全要求，防止并发 I\u002FO 操作中出现潜在的竞争条件（#1483、#1457）。\n\n## 构建与打包\n- **在 Python Wheel 构建中启用 UCX EP 支持**：将 UCX 端点支持添加到 Python Wheel 构建中，从而为通过 pip 安装的部署启用 NIXL-EP 功能（#1440）。\n- **在 UCX 构建中禁用 gdrcopy**：为避免在 gdrcopy 不可用或不需要的环境中出现链接冲突，已在 UCX 构建中禁用 gdrcopy（#1436）。\n- **修复 Abseil 版本冲突**：解决了 NIXL 构建和 Docker 镜像中可能引发链接错误或运行时符号不匹配的 Abseil 版本冲突问题（#1432）。\n- **升级 RDMA 内存检查使用的 UCX 版本**：将用于 RDMA 内存检查的 UCX 版本更新至最新支持的发行版，以保持一致（#1445）。\n- **将 PyTorch 版本固定为 2.11**：为实现可重复构建和兼容性，将 PyTorch 依赖版本固定为 2.11（#1471）。\n- **添加 pkg-config 安装**：在构建环境中添加了缺失的 `pkg-config` 安装，修复了在极简容器镜像中出现的构建失败问题（#1450）。\n- **修复依赖问题**：移除了模块初始化阶段对 PyTorch 版本的严格检查，以提升兼容性；同时统一了 UCX 代码库检出行为，确保始终使用配置的 UCX 基准版本（#1488）。\n\n完整变更日志：[1.0.0...1.0.1](https:\u002F\u002Fgithub.com\u002Fai-dynamo\u002Fnixl\u002Fcompare\u002F1.0.0...1.0.1)\n\n","2026-04-14T19:56:20",{"id":170,"version":171,"summary_zh":172,"released_at":173},324239,"v1.0.0","# 1.0.0\n\n## 摘要\n\nNVIDIA® NIXL 发布 1.0.0 版本，标志着 API 稳定性和生产就绪性迈出了重要一步。此版本通过移除旧版 V1 实现并统一命名规范，最终完成了 Device API V2 的过渡，确立了一个简洁、稳定的 1.0 API 接口。同时，全新的两阶段配置框架取代了以往的临时性环境与 API 配置处理方式，为所有 NIXL 组件提供了一致且可扩展的运行时配置模型。\n\nNIXL 1.0.0 还在网络、存储和后端基础设施方面带来了显著改进。Libfabric 后端新增了 NUMA 感知的通道选择功能，可在多插槽系统上实现拓扑优化的传输；POSIX 插件则引入了基于引擎的 IO 队列机制，以提升文件系统的并发能力。云存储的成熟度也进一步提高，支持 Azure Blob 连接字符串和 CA 证书包，修正了 S3 CRT 
分片阈值问题，并扩展了对象存储的功能测试。在核心传输路径上，描述符迭代和传输请求创建流程得到了优化，遥测开销降低，UCCL 批量传输的处理也更加简化，从而提升了生产环境中的吞吐量并降低了延迟。\n\n## 主要特性\n\n- **Device API V2 完成定型：** 移除了旧版 Device API V1 实现，并完成了所有面向 V2 的清理工作，包括在整个代码库中将 `MemoryView` 统一为 `MemView`。此举确立了 Device API V2 作为 NIXL 1.0 唯一且稳定的设备编程接口。（#1342、#1337、#1376）\n- **配置框架：** 引入了一个全面的两阶段配置系统，涵盖环境驱动（第一阶段）和 API 驱动（第二阶段）的设置。该框架取代了以往的临时性配置方式，提供了一种结构化、可扩展的运行时配置管理模型。（#1301、#1346）\n- **Libfabric NUMA 感知通道选择：** 在 Libfabric 后端为 `DRAM_SEG` 内存类型添加了 NUMA 感知的通道选择策略。这一功能可通过将内存访问与最近的可用网络通道对齐，实现在多插槽系统上的拓扑优化传输选择。（#1302）\n- **Libfabric 控制通道移除：** 从 Libfabric 后端移除了遗留的控制通道，简化了通道管理架构并降低了资源开销。（#1386）\n- **POSIX 基于引擎的 IO 队列：** 在 POSIX 插件中增加了基于引擎的 IO 队列支持，从而改善了基于文件系统的存储工作流的并发性和调度行为。（#1051）\n- **Azure Blob 存储增强：** 扩展了 Azure Blob 存储插件，支持连接字符串认证及可配置的 CA 证书包设置，提升了在企业级和空气隔离环境中的部署灵活性。（#1351、#1329）\n- **Rust 运行时库解析：** 通过 `libnixl_capi.so` 引入了基于 `dlopen` 的 `nixl-sys` 占位库，使 Rust 应用程序能够在运行时解析 NIXL，而无需在构建时进行链接。这些占位库使用 `dlopen`\u002F`dlsym` 来 l","2026-03-13T06:56:44",{"id":175,"version":176,"summary_zh":177,"released_at":178},324240,"0.10.1","# 0.10.1\n\n## 概述\nNVIDIA® NIXL 发布版本 0.10.1 是一个针对性的维护版本，重点改进了 Rust 绑定、Python 打包以及测试基础设施。\n\n## Rust 绑定\n- **运行时库解析**：引入了 `libnixl_capi.so` 共享库，该库导出了 `nixl_capi_*` C 符号。这使得下游使用 `nixl-sys` Rust crate 的项目可以在运行时加载 NIXL，而无需在构建时进行链接。NIXL 存根已从调用时直接中止的方式改写为使用 `dlopen`\u002F`dlsym` 延迟转发到实际实现（#1358）。\n- **构建检查**：在尝试链接之前，在 `build.rs` 中添加了对 `libnixl.so` 是否存在的检查。这确保了当 `nixl-sys` 作为 Git 依赖项使用时，如果仅提供了头文件而未安装库文件，回退到存根机制能够正常工作（#1358）。\n\n## Python 打包与测试\n- **精确版本锁定**：在 `nixl` 元包中，将 Python CUDA 特定依赖项（`nixl-cu12` 和 `nixl-cu13`）锁定为精确版本。这可以防止版本不匹配，并在安装元包时确保环境的一致性（#1354）。\n- **测试基础设施**：增加了对使用预先创建的虚拟环境运行 Python 测试的支持（同时支持标准 `pip` 和 `uv pip`），从而提升了 CI 的灵活性和本地开发工作流的效率（#1353）。\n\n完整变更日志：[0.10.0...0.10.1](https:\u002F\u002Fgithub.com\u002Fai-dynamo\u002Fnixl\u002Fcompare\u002F0.10.0...0.10.1)","2026-03-03T20:13:44",{"id":180,"version":181,"summary_zh":182,"released_at":183},324241,"0.10.0","## 摘要\n\nNVIDIA® NIXL 0.10.0 版本在云存储、网络和设备集成方面实现了重大突破，进一步强化了 NIXL 的硬件无关设计。此版本引入了对 
AWS Neuron 设备的全面支持，使开发者能够在异构计算环境中以一致的性能和统一的 API 进行开发。与此同时，NIXL 0.10.0 还通过新增 Azure Blob Storage 插件、S3 CRT 客户端集成以及分层对象存储架构，扩展了存储能力，该架构旨在支持即将推出的 S3 RDMA 加速功能。\n\n此外，该版本还首次推出了完整的 Device API V2，覆盖主机端、后端和设备端实现，从而简化集成并提升可移植性。网络方面的改进包括对 Slingshot\u002FCXI 提供程序的支持，以及多项 Libfabric 性能优化，例如 CQ 批量读取和改进的线程模型，以提高并发性和吞吐量。\n\n## 主要特性\n\n- **Azure Blob Storage 插件：** 引入 Azure Blob Storage 支持的初始实现，作为 NIXL 的又一个云对象存储后端。该插件使用 Azure C++ SDK 实现对象分段存储，并通过 Microsoft Entra ID 进行基于 OAuth 的身份验证。它为现有的 S3 后端提供了一种功能性替代方案，并包含集成测试、nixlbench 基准测试支持以及用于验证的 CI 基础设施。（#1233）\n- **S3 CRT 客户端支持：** 新增对 AWS Common Runtime (CRT) S3 客户端的支持，能够实现更高的 S3 传输吞吐量，并自动启用多部分上传\u002F下载的并行处理。（#1127）\n- **分层对象存储架构：** 将对象存储插件重构为基于继承的模块化客户端与引擎架构。这一设计实现了清晰的关注点分离，支持标准 S3、基于 CRT 的 S3，以及未来加速型（RDMA）S3 后端的占位符，并允许厂商特定的扩展。（#1247）\n- **Device API V2：** 对 NIXL Device API 进行了全面重新设计，为整个堆栈提供了全新的编程模型：\n  - API 定义和接口更新。（#1229）\n  - 核心主机端实现。（#1230）\n  - UCX 后端实现。（#1245）\n  - GPU 和 UCX 设备端实现。（#1255）\n- **Libfabric Slingshot\u002FCXI 支持：** 在 Libfabric 中新增对 HPE Slingshot\u002FCXI 提供程序的支持，包括 `FI_MR_ENDPOINT` 内存注册模式的处理。这使得 NIXL 能够在 HPC 环境中常见的 Slingshot 互连架构上运行。（#1242）\n- **AWS Neuron 设备支持：** 新增对 AWS Neuron 设备（Trainium\u002FInferentia）的 OFI（OpenFabrics Interfaces）支持，从而实现基于 Neuron 加速器的 NIXL 网络通信。（#1258）\n- **双许可更新：** 更新了 GitHub 仓库中的许可信息，以反映双重许可（Apache 2.0 + MIT，适用于 DeepEP 衍生代码）。\n\n## API 变更\n\n- **[设备] Device API V2：** Device API 已经被重新设计，引入了新的接口，用于……","2026-02-18T01:38:33",{"id":185,"version":186,"summary_zh":187,"released_at":188},324242,"0.9.0","## 摘要\nNVIDIA® NIXL 0.9.0 版本带来了显著的新功能和性能提升。主要亮点包括：引入了用于优化集体通信的 **UCCL 后端**、新增支持 Prometheus 的 **遥测插件框架**，以及演示专家级并行调度的 **NIXL-EP 示例**。此外，该版本还增加了对 **Python 3.14** 的支持，启用了 **Libfabric 共享内存** 的节点内传输，并针对核心请求处理进行了重要的性能优化。\n\n此版本包含一些破坏性变更，特别是移除了对 Python 3.9 的支持。\n\n### 主要特性\n*   **UCCL 后端集成**：新增对 UCCL P2P 后端的支持，可通过 RDMA 实现高效的 GPU 内存传输。（#895）\n* **遥测插件框架**：引入了一个可扩展的遥测插件系统，允许用户自定义指标导出器。\n* **Prometheus 导出器**：新增一个用于将指标导出到 Prometheus 的插件。（#1091）\n* **循环缓冲区导出器**：一个用于高性能循环缓冲区遥测的插件。（#1088）\n* 
**插件管理器**：支持加载和管理遥测插件的基础架构。（#1070）\n* **NIXL-EP（弹性示例）**：引入了 `examples\u002Fdevice\u002Fep`，这是一个全面的示例，展示了使用 NIXL 设备 API 进行专家级并行调度和组合操作。其中包括改进的元数据获取功能和 CI 集成。（#1043、#1132、#1104、#1077）\n  * **启用 CUDA IPC NVLINK 后端**：NIXL-EP 现在启用了 CUDA IPC NVLINK 后端，以提升节点内的 GPU 通信性能。（#1099）\n* **Libfabric 共享内存支持**：在 Libfabric 后端中，为 NVLink 节点内传输启用了共享内存提供者 (`shm`)。这通过利用 NVLink 而无需网络传输，从而提升了本地 GPU 到 GPU 通信的性能。（#1076）\n\n### 破坏性变更\n* **移除对 Python 3.9 的支持**：已移除对 Python 3.9 的支持。目前支持的 Python 版本为 3.10 至 3.14。（#1071）\n\n### API 变更\n* **[Python] 支持 Python 3.14**：正式添加对 Python 3.14 的支持。（#1071）\n* **[Python] 显式导出 API**：Python API 现在使用 `__all__` 显式导出，以控制公共命名空间并实现更整洁的导入。（#1062）\n* **[Rust] 自定义后端参数**：在 Rust 绑定中新增了传递自定义后端参数的支持。（#900）\n* **[Rust] 描述符序列化**：为 `RegDescList` 和 `XferDescList` 添加了 `Serde` 序列化支持。（#829）\n* **[Rust] 索引支持**：为描述符列表新增了 `Index`\u002F`IndexMut` 以及 `get`\u002F`get_mut` 方法。（#1003）\n\n### 增强功能\n**性能**\n* **[核心] 请求处理优化**：实施了请求处理优化，以降低大量小消息批次的开销。（#1009）\n* **[UCX] 松弛排序**：默认设置 `UCX_IB_PCI_RELAXED_ORDERING=try`，以在支持的情况下提升 PCI 性能。（#1012）\n* **[Libfabric] EFA ","2026-01-21T19:17:11",{"id":190,"version":191,"summary_zh":192,"released_at":193},324243,"0.8.0","## 摘要\n\nNVIDIA® NIXL 0.8.0 版本带来了显著的性能提升、多项重大新功能以及重要的依赖项更新。主要亮点包括：针对 UCX 后端中大批次工作负载的大幅优化、引入基于 Linux AIO 的全新 POSIX 后端以实现高性能存储 I\u002FO，以及为 Libfabric 后端提供直接 CUDA 内存注册支持。\n\n该版本包含一些破坏性变更，例如移除了旧版多对象 UCX 后端，并将 Libfabric 的最低要求版本进行了更新。此外，还新增了对 Python 3.13 的支持，并将默认构建类型更改为 `Release`，以便开箱即用时获得最佳性能。\n\n### 主要特性与改进\n\n*   **UCX 对大批次工作负载的性能优化：** UCX 后端的请求处理机制经过全面重构，以降低开销。对于包含大量小消息（\u003C1KB）的大批次工作负载（如 `sglang` 等使用分页注意力机制的 LLM 推理引擎），这一改进可显著降低延迟并缩短首个 token 的生成时间（TTFT）。内部基准测试显示，在这些场景下，`nixlbench` 的性能提升了约 50%，而 `sglang` 的 TTFT 则缩短了约 20%。(#982)\n*   **POSIX 后端的 Linux AIO 插件：** POSIX 后端现在可在可用时利用 Linux 异步 I\u002FO (`AIO`) API，从而为本地存储的异步数据传输提供高性能接口。内部基准测试表明，当读取数据量超过 100 KB 时，读取吞吐量有所提升。(#885)\n*   **Libfabric 的 CUDA 内存注册：** Libfabric 后端现可通过 `fi_mr_regattr` 直接注册 CUDA 内存区域，从而支持扩展的内存注册属性及优化的 RDMA 行为。(#960)\n*   **Python：** 新增对 Python 3.13 的支持。(#994)\n\n### 
破坏性变更\n\n*   **移除 UCX 多对象后端：** 旧版多对象（UCX_MO）后端已被移除。用户应迁移到主 UCX 后端，该后端现已集成多设备支持。(#898)\n*   **Libfabric 最低版本提升：** 为支持新特性，Libfabric 的最低要求版本已提高至 **v1.21.0**。(#961)\n*   **默认构建类型更改为 `Release`：** 从源码构建时，默认构建类型现为 `Release`，而非 `Debug`。此举确保默认构建在性能上得到优化。(#869)\n\n### API 变更\n\n*   **[Rust]：** Rust 接口中新增了用于描述符管理的 `RegDescList` 和 `XferDescList` API。(#828)\n*   **[Python]：** 移除了 Python API 中已过时且未使用的代码。(#985)\n\n### 功能增强\n\n*   **[构建]：** 构建系统现会搜索由 `NIXL_PREFIX` 环境变量指定的路径中的库文件，从而更便于链接自定义构建。(#998)\n\n### Bug 修复\n\n*   **[核心]：** 通过改进错误处理机制，Socket 上的元数据交换得到了优化。(#999","2025-11-20T02:03:38",{"id":195,"version":196,"summary_zh":197,"released_at":198},324244,"0.7.1","## 摘要\nNVIDIA® NIXL 0.7.1 版是一个维护版本，引入了对 CUDA 13.0 的 Python 打包支持，改进了 Device API 的接口，并修复了 Libfabric 后端中的关键问题。\n\n*   **CUDA 12 和 CUDA 13 的 Python Wheels：**\n    本版本提供了官方支持的 CUDA 13 Python wheels，将打包拆分为一个 `nixl` 元包以及针对不同平台的包 `nixl-cu12` 和 `nixl-cu13`。更多信息请参见下方的“Python 打包变更”部分。\n*   **Device API：单线程支持多个队列对：**\n    单线程应用程序现在可以从一个线程驱动多个 QP，从而在无需创建多个代理或多个线程的情况下实现更高的性能。\n*   **Device API：改进了异步 API 处理：**\n    Device API 插件的 `post` 函数已更改为返回 `NIXL_IN_PROG`，以表示操作已提交但尚未完成，从而提高了异步调用的可预测性和性能。\n\n## Python 打包变更\nNIXL 现在使用 PyPI 上的 `nixl` 元包以及针对 CUDA 12 (`nixl-cu12`) 和 CUDA 13 (`nixl-cu13`) 的特定平台 wheels 进行打包。（#915、#954、#956、#966）\n\n### PyPI 用户\n\n现在可以通过 `pip install nixl[cu12]` 或 `pip install nixl[cu13]` 从 PyPI 安装所需的 wheel。\n\n为保持向后兼容性，`pip install nixl` 会自动安装 `nixl[cu12]`，继续无缝适用于 CUDA 12 用户，而无需更改下游项目的依赖项。\n\nCUDA 13 用户必须使用 `pip install nixl[cu13]` 来安装 NIXL。\n\n如果环境中同时安装了 `nixl-cu12` 和 `nixl-cu13`，则 `nixl-cu13` 将优先生效。\n\n### 从源代码安装 Python 包\n\n通过 `pip install .` 从源代码进行的安装现在需要额外的步骤来构建和安装 `nixl` 元包：\n\n在 CUDA 12 环境下：\n\n```bash\npip install .\npip install meson meson-python pybind11 
tomlkit\n.\u002Fcontrib\u002Ftomlutil.py --wheel-name nixl-cu13 pyproject.toml\nmeson setup build\nninja -C build\npip install build\u002Fsrc\u002Fbindings\u002Fpython\u002Fnixl-meta\u002Fnixl-*-py3-none-any.whl\n```\n\n有关完整的 Docker 示例，请参阅 [此处](https:\u002F\u002Fgithub.com\u002Fai-dynamo\u002Fnixl\u002Fblob\u002F4d6cdcd203aae581030b2908140ad966b7a5d4b5\u002Fbenchmark\u002Fnixlbench\u002Fcontrib\u002FDockerfile#L208)。\n\n## API 变更\n*   [Device API] 异步 `post` 函数现在返回 `NIXL_IN_PROG`，以表示操作正在执行中，从而为非阻塞调用提供更清晰的状态信息（#911）。\n*   [Device API] 在后端添加了对 `worker_id` 选择的支持，允许通过单个线程驱动多个 QP 进行更精细的性能调优（#938）。\n\n## Bug 修复\n*   [Libfabric] 修复了偏移量计算方面的问题……","2025-11-06T19:06:36",{"id":200,"version":201,"summary_zh":202,"released_at":203},324245,"0.7.0","## 摘要\nNVIDIA® NIXL 0.7.0 版本引入了全新的 GUSLI 存储插件，新增对 CUDA 13.0 工具包的构建支持，并对 Libfabric、UCX 和 GPUNetIO 后端进行了重要改进。\n\n*   **GUSLI 存储插件：**\n    引入基于 [NVIDIA GUSLI](https:\u002F\u002Fgithub.com\u002FNVIDIA\u002FGUSLI) 的新型存储后端，用于实现对闪存存储的高性能访问。\n*   **CUDA 13 构建支持：**\n    现已支持使用 CUDA 13.0 从源代码构建 NIXL 及 `nixlbench` 套件，确保与最新驱动程序和库的兼容性。带有 CUDA 13.0 支持的官方二进制包计划在后续版本中发布。\n\n## 新特性\n*   引入 GUSLI 存储插件，用于高吞吐量、低延迟的 I\u002FO 操作 (#887)。\n\n## 改进\n*   [构建] 添加了使用 CUDA 13 从源代码构建 NIXL 和 `nixlbench` 的支持 (#820)。\n*   [Libfabric] 增加了对 AWS 上非 GDR 实例的支持，并解决了 Python wheel 的兼容性问题 (#901, #937)。\n*   [设备 API] 改进了对 MoE 工作负载中 GPU 发起的 UCX 传输的支持，提供了更易用的 API 和更低的延迟 (#815)。\n\n## Bug 修复\n*   [Libfabric] 修复了多项稳定性问题，包括拓扑初始化中的双重释放错误、断开连接时的资源清理问题，以及异构节点中不对称链路配置的处理问题 (#839, #860, #926)。\n*   [Libfabric] 通过在资源不可用时手动推进完成队列，并为失败操作添加重试逻辑，提升了系统的鲁棒性 (#856, #859)。\n*   [Libfabric] 修复了 EFA 设备发现功能，使其能够正确识别设备 ID (#876)。\n\n## 依赖项\n*   [GPUNetIO] 将 GPUNetIO 后端升级至 DOCA 3.1 Verbs 库 (#733)。\n\n## 绑定\n*   Python wheel 不再捆绑 `libfabric`、`libefa` 和 `libhwloc`，因为这些库已在许多 AWS 基础镜像中提供，从而避免了版本不匹配的问题 (#937)。\n*   新增了一个用于远程存储操作的 Python 示例 (#841)。\n\n## 基准测试\n*   为新的 GUSLI 后端添加了 `nixlbench` 支持 (#897, #929)。\n*   通过使 `etcd` 对于存储后端变为可选，并修复密钥冲突逻辑，提升了 `nixlbench` 的灵活性 (#862, #878)。\n*   修复了 `nixlbench` POSIX 
基准测试中的 API 参数不匹配问题 (#880)。\n\n## 构建与测试基础设施\n*   扩展了 CI 覆盖范围，启用了 DGX 系统上的测试，并为 `nixlbench` 添加了 GPU 特有的测试 (#834, #780)。\n*   解决了 CUDA 13 基础容器镜像上 `libibverbs-dev` 包安装失败的问题 (#889)。\n*   改进了构建系统，以避免在未找到 CUDA 或 UCX 依赖项时构建 gtest (#925)。\n*   添加了后端选择选项，便于调试不同的传输后端 (#822)。\n*   优化了发布版打包流程，以正确管理测试和示例的包含 (#872, #896)。\n\n**完整变更日志**: https:\u002F\u002Fgithub.com\u002Fai-dynamo\u002Fnixl\u002Fcompare\u002F0.6.1...0.7.0","2025-10-24T19:05:35",{"id":205,"version":206,"summary_zh":207,"released_at":208},324246,"0.6.1","## 摘要\nNVIDIA® NIXL 0.6.1 版本引入了适用于 AWS 高性能网络的新 Libfabric 插件，扩展了语言绑定支持，并提升了系统稳定性。\n\n*   **适用于 AWS 的 Libfabric 网络：**\n    引入了一个新的 Libfabric 插件，以利用 AWS Elastic Fabric Adapter (EFA) 实现高性能、低延迟的通信。\n*   **语言绑定：**\n    Rust 绑定现已支持代理配置和 `getXferTelemetry` API，从而提升 Rust 应用程序的集成能力和可观测性。\n*   **性能与稳定性：**\n    解决了多个关键稳定性问题，进一步增强了系统的健壮性。\n\n## 新特性\n*   引入了一个新的 Libfabric 插件，该插件具备拓扑感知能力，可支持 AWS EFA 设备（#784、#801、#802、#809、#817、#826、#831、#833、#850、#852、#866、#867、#868）。\n*   添加了一个代理配置标志，用于在每个代理的基础上启用或禁用遥测数据采集（#764）。\n*   使元数据监听器中的 ETCD 监视超时时间可配置，以避免在高负载情况下发生超时（#766）。\n*   在对象存储插件中新增了 `ca_bundle` 选项，以兼容使用自签名证书的 S3 兼容存储服务（#806）。\n\n## 错误修复\n*   [核心] 修复了断开连接时的关键“使用已释放内存”错误，该错误会导致请求在未正确管理所有权的情况下被删除（#782）。\n*   [监听器] 防止了 etcd 客户端在短时间内连续接收到多次元数据更新时发生的崩溃（#765）。\n*   [遥测] 解决了遥测框架中的若干小问题（#750）。\n\n## 绑定\n*   [Rust] 添加了 ThreadSync 和 AgentConfig 的绑定（#824）。\n*   [Rust] 在 Rust 绑定中公开了 `getXferTelemetry` API，用于获取每项请求的性能指标（#823）。\n\n## 依赖项\n*   UCX 依赖项现为可选，允许在不使用 UCX 的情况下构建 NIXL（#825）。\n*   将 `nixlbench` 容器中的 DOCA 依赖版本升级至 3.1（#760）。\n\n## 基准测试\n*   通过确保 Python 虚拟环境正确激活，修复了 `nixlbench` 容器运行时的问题（#848）。\n*   修复了 NVSHMEM 工作进程中的函数签名不匹配问题（#786）。\n\n## 构建与测试基础设施\n*   改进了 `nixlbench` 构建脚本中的 CUDA 检测机制，并添加了回退方案（#777）。\n\n**完整变更日志**：https:\u002F\u002Fgithub.com\u002Fai-dynamo\u002Fnixl\u002Fcompare\u002F0.6.0...0.6.1","2025-10-09T02:42:54",{"id":210,"version":211,"summary_zh":212,"released_at":213},324247,"0.6.0","## 摘要\nNVIDIA® NIXL 0.6.0 版本引入了针对功能强大的 GPU 启动操作 Device API 的预览版 API 
支持，扩展了按请求粒度的遥测指标，并改进了语言绑定支持。此版本实现了 GPU 原生工作流的关键路线图项目，同时提升了可观测性、性能和稳定性。\n\n*   **面向 GPU 密集型工作流的 Device API：**\n    引入了 GPU 端 Device API 的预览版 API 支持，使应用程序能够直接从 GPU 内核发起数据传输。这显著降低了 GPU 原生工作负载中的主机与 GPU 同步开销。\n*   **细粒度性能遥测：**\n    遥测框架通过新增 API 得到扩展，可直接从客户端应用程序获取基于每个请求的详细性能指标，从而实现更精细的性能分析和更便捷的调试。\n*   **扩展的语言绑定：**\n    Python 和 Rust 绑定得到了大幅增强，新增对核心 API 的支持，并在 Python 中引入了更为符合语言习惯的面向对象构造。\n*   **性能与稳定性：**\n    在描述符列表处理方面实现了性能提升，增强了插件性能，并修复了多个稳定性问题，从而提高了整体鲁棒性。\n\n## 新特性\n*   引入了 NIXL Device API 的预览版 API 支持，允许直接从 GPU 内核创建和触发传输请求（#704、#705、#749、#720）。\n*   添加了 `getXferTelemetry` API，用于获取单个传输的详细指标（#702）。\n\n## 性能优化\n*   改进了未完全排序的描述符列表的 `populate` 方法性能（#729）。\n*   为 HF3FS 插件添加了内存池支持，以提升性能（#695）。\n\n## Bug 修复\n*   [核心] 修复了因使用已释放资源而导致的崩溃问题，该问题发生在未正确管理所有权的情况下删除请求时（#782、#783）。\n*   [核心] 提供了针对 ARM 构建中 Meson CUDA 检测失败的临时解决方案，避免 CUDA 支持不完整的情况（#743）。\n*   [UCX] 修正了 `checkXfer` 函数中的错误处理逻辑（#690）。\n*   [绑定] Python Wheel 打包现在会排除 `libcuda*` 和 `libcufile`，以避免捆绑系统库（#745）。\n*   [Mooncake 插件] 修复了未能正确过滤 `DOWN` 网络接口的问题（#406）。\n*   [Mooncake 插件] 修复了通知消息销毁时出现的问题（#739）。\n\n## API 变更\n*   从存储后端 API 中移除了 `progress` 方法及其进度指示器（#701）。\n*   从用户侧的 `nixlDescList` 中移除了 `sorted` 标志（#731）。\n*   从 `nixlDescList` 中移除了未使用的 `has_overlaps` 和 `overlaps` 方法（#718）。\n*   增强了插件管理器，减少了创建新插件所需的样板代码量（#622）。\n\n## 绑定\n*   Rust 绑定现已包含 `get_local_partial_md`、`send_local_partial_md`、`query_xfer_backend`、`make_xfer_req` 和 `prepare_xfer_dlist` 等函数（#684、#693）。\n*   为传输和描述符列表添加了符合 Python 语言习惯的对象。","2025-09-18T18:03:34",{"id":215,"version":216,"summary_zh":217,"released_at":218},324248,"0.5.1","## Summary\r\nNVIDIA® NIXL Release 0.5.1 delivers major improvements in performance and scalability, enhances Rust language support, and rolls out key infrastructure for performance analysis. 
This release successfully delivers on roadmap items including a new core threadpool and telemetry export capabilities.\r\n\r\n*   **Performance and Scalability:**\r\n    Introduces the UCX backend threadpool for parallel posting of large requests, reducing latency by up to 30-87% in production workloads.\r\n*   **Telemetry Infrastructure:**\r\n    Introduces a telemetry framework that records and exports detailed transfer metrics, providing insights for performance analysis.\r\n*   **Enhanced Rust Integration:**\r\n    Rust support has been significantly expanded with full build and CI integration using Meson, CI-based testing, and the addition of new API bindings.\r\n*   **Stability and Resiliency:**\r\n    Fixes multiple critical stability issues that may cause crashes, and improves error handling and cleanup.\r\n\r\n## New Features\r\n*   Added a telemetry framework to track and export detailed transfer data (#562).\r\n*   The `queryMem` API is now available in the Rust bindings (#620).\r\n*   The Mooncake plugin now supports the `notify` operation (#493, #696).\r\n*   Added `nixlbench` support for the GDS_MT backend (#671).\r\n\r\n## Performance\r\n*   Introduced a threadpool to the UCX backend for parallel posting of large requests, significantly improving performance and scalability (#573, #606).\r\n\r\n## Observability\r\n*   Improved logging throughout the Python codebase (#633).\r\n*   `nixlbench` now measures tail latency and latency breakdown for performance analysis (#591).\r\n\r\n## Bugfixes\r\n*   Reworked metadata listener connection logic to fix a stack corruption bug (#681).\r\n*   Fixed a race condition in notification handling (#649).\r\n*   Improved error handling for canceled transfers (#677) and agent metadata cleanup on failure (#509).\r\n*   Fixed a bug in the GDS plugin related to `CUfileDescr_t` initialization (#721).\r\n*   Corrected `nixlbench` implementation for `--num_files` and `READ` mode (#635).\r\n*   Resolved a build regression in 
the Mooncake plugin (#710).\r\n\r\n## Dependencies\r\n*   UCX dependency upgraded to 1.19.0 (#673).\r\n*   GDRCopy dependency upgraded to 2.5.1 (#712).\r\n\r\n## Build and Test Infrastructure\r\n*   Enabled the build, installation, and testing of Rust bindings with Meson and CI (#290).\r\n*   Added a new CI check to detect excessively large pull requests (#627).\r\n*   Unified the container builder to support both `NIXL` and `NIXLBench` (#567).\r\n*   Implemented timeouts for CI test steps to prevent hangs (#660).\r\n*   Improved CI test scripting to avoid TCP port collisions (#568).\r\n*   Simplified the project's Docker build process (#604) and fixed build failures on Ubuntu 22.04 (#653).\r\n\r\n**Full Changelog**: https:\u002F\u002Fgithub.com\u002Fai-dynamo\u002Fnixl\u002Fcompare\u002F0.5.0...0.5.1","2025-08-28T08:04:27",{"id":220,"version":221,"summary_zh":222,"released_at":223},324249,"0.5.0","## Summary\r\nNVIDIA® NIXL Release 0.5.0 introduces new API capabilities, major improvements to the build and performance infrastructure, and several bug fixes.\r\n\r\n*   **New Capabilities:**\r\n    New `queryMem` API for inspecting memory registrations. New Telemetry API for tracking transfer time. Updated HF3FS backend to support zero-copy using shared memory to improve performance.\r\n\r\n*   **Stronger Infrastructure:**\r\n    Significantly improved the build and testing infrastructure. Build portability was increased with fixes for RHEL8, and CI stability has been hardened. Smaller wheels and robust plugin dependency loading improve deployment.\r\n\r\n## New Features\r\n*  Added `queryMem` API to query properties of registered memory\r\n*  Added telemetry infrastructure that is used to track transfer time for performance analysis. 
\r\n*  Added a configurable checksum toggle for the Object (S3) plugin\r\n*  Expanded `nixlbench` to support the Object (S3) storage plugin\r\n\r\n## Infrastructure and Performance Improvements\r\n*  The HF3FS backend now uses shared memory to significantly improve performance by avoiding memory copy.\r\n*  Added a comprehensive Backend Developer Guide to the documentation\r\n*  Improved `nixlbench` with page-aligned memory allocation for better performance\r\n*  Enabled running `nixlbench` tests in parallel using a shared etcd server\r\n*  Unified and simplified the Python wheel building process\r\n*  Improved python logging project-wide\r\n*  Reduced the size of Python wheels\r\n*  Added `etcd` gtests to the CI pipeline\r\n*  Added gtest infrastructure for plugins\r\n*  Fixed various build issues for `etcd-cpp-api`, Meson, and RHEL8 environments\r\n*  Fixed an issue in `registerMem` for the Object plugin\r\n*  Fixed the Rust cargo package definition\r\n\r\n**Full Changelog**: https:\u002F\u002Fgithub.com\u002Fai-dynamo\u002Fnixl\u002Fcompare\u002F0.4.1...0.5.0","2025-08-06T21:26:07",{"id":225,"version":226,"summary_zh":227,"released_at":228},324250,"0.4.1","## Summary\r\nNVIDIA® NIXL Release 0.4.1 delivers improvements in resiliency, and developer experience.\r\n* Improved resiliency:\r\nAdvanced error handling is enabled in the UCX backend, ensuring that send requests are always completed even in case of remote failure.\r\n* Stronger infrastructure:\r\nAdded NIXL plugins into the packaged python wheel, and improved NIXL plugins loader, to facilitate usage of NIXL installed as a Python package.\r\n\r\n## Developer Enhancements\r\n* Enabled advanced error handling in UCX backend\r\n* Added NIXL plugins to the wheel\r\n* Improved NIXL plugins loader\r\n* Enhanced resource management in UCX backend\r\n* Extended contributing and coding style guidelines\r\n\r\n## Testing\r\n* Enabled CI running on GPU\r\n* Extended gtest covering Agent class 
functionality\r\n\r\n**Full Changelog**: https:\u002F\u002Fgithub.com\u002Fai-dynamo\u002Fnixl\u002Fcompare\u002F0.4.0...0.4.1","2025-07-24T05:08:36",{"id":230,"version":231,"summary_zh":232,"released_at":233},324251,"0.4.0","## Summary\r\nNVIDIA® NIXL Release 0.4.0 delivers key improvements in performance, portability, and developer experience.\r\n* New capabilities:\r\nIntroduced two new storage plugins: GDS (GPUDirect Storage) multi-threaded and AWS S3 (Amazon Simple Storage Service). We also integrated Custom Traffic Performance Test (CTPerf) into our benchmarks to measure KV Cache Transfers over network.\r\n\r\n* Stronger infrastructure:\r\nImproved the existing backends UCX, GDS, POSIX, GPUNetIO, boosting performance and extending functionality. Enhanced Python wheels by adding ARM 64 support and enabling compression.\r\n\r\n* Improved testing:\r\nExpanded `nixlbench` to support Mooncake and HF3FS backends. Integrated clang-format stage into CI, enhanced logging in various CI stages.\r\n\r\n## New Features\r\n* Added GDS multi-threaded storage plugin\r\n* Added OBJ storage plugin for AWS S3 generic objects\r\n* Integrated CTPerf into `kvbench`\r\n* Expanded `nixlbench` to support Mooncake and HF3FS backends\r\n\r\n## Developer Enhancements\r\n* Improved performance of UCX plugin for EFA\r\n* Fixed Dockerfile to support setups with different versions of CUDA drivers\r\n* Added missing UCX plugins to enable CUDA and RDMACM support in UCX backend\r\n* Compressed wheels after patching to reduce the size of the wheels\r\n* Added support for building python wheels on ARM 64\r\n* Decreased the default number of GDS batches created to facilitate creating multiple NIXL agents\r\n* Fixed GPUNetIO plugin usage by `nixlbench`\r\n* Improved memory type detection in UCX backend\r\n* Enabled reposting requests using POSIX backend\r\n* Added support for self notification in UCX backend\r\n* Upgraded ssl dependency for wheel\r\n* Fixed python bindings to handle unsorted descriptor 
lists\r\n* Improved NIXL logging for better debugging\r\n* Enhanced resource management and error handling in UCX backend\r\n \r\n## Testing\r\n* Added clang-format CI stage to improve the quality of the code\r\n* Removes automatic retry on AWS job failures, adds GitHub metadata tags for traceability and better debugging\r\n* Renamed CI Docker images to explicitly show OS coverage in our test matrix\r\n* Improved logging in AWS EFA tests for better visibility and debugging\r\n* Added Jenkins job for `nixlbench` container builds\r\n* Reduced gtest wall time improving error handling tests\r\n* Fixed multi-threaded gtest output\r\n* Enabled building with stub API in Rust bindings\r\n* Fixed C++ examples to enable cmake build\r\n\r\n**Full Changelog**: https:\u002F\u002Fgithub.com\u002Fai-dynamo\u002Fnixl\u002Fcompare\u002F0.3.1...0.4.0","2025-07-10T17:34:30",{"id":235,"version":236,"summary_zh":237,"released_at":238},324252,"0.3.1","## Summary\r\nNVIDIA® NIXL Release 0.3.1 delivers key improvements in **performance**, **portability**, and **developer experience**.\r\n\r\n- **New capabilities**:  \r\n  Introduced GDAKI (GPUDirect Async Kernel-Initiated) transfers using DOCA RDMA (our Data-Center-on-a-Chip framework with Remote Direct Memory Access). 
We also added smarter runtime defaults and enhanced Python and Rust bindings for streamlined LLM runtime development.\r\n\r\n- **Stronger infrastructure**:  \r\n  Refactored the UCX backend for improved performance and clarity, added support for AArch64 and manylinux (Python's portable binary distribution standard), and cleaned up the API structure for better maintainability.\r\n\r\n- **Improved testing**:  \r\n  Integrated **Blossom-CI** (our custom continuous integration pipeline), expanded multi-threaded and RDMA test coverage, and enhanced logging and diagnostics for faster issue resolution.\r\n\r\n## New Features\r\n\r\n- Added GDAKI support integrating DOCA GPUNetIO and RDMA in stream mode  \r\n- Enabled listener-side invalidation messaging and socket-based transfers  \r\n- Expanded `nixlbench` to configure GDS batch and pool sizes  \r\n- Auto-detection of NIXL library path based on host CPU architecture  \r\n- Enabled VMM CUDA memory allocation in `nixlbench`  \r\n- Ensured that CPU is used as default device when CUDA is disabled  \r\n- Improved Python bindings with NumPy array support and performance tuning  \r\n- Enhanced Rust bindings with usability improvements and feature support  \r\n\r\n## Developer Enhancements\r\n\r\n- Refactored UCX backend including RMA rails configuration and worker logic  \r\n- Improved UCX progress engine granularity for better performance  \r\n- Fixed memory registration checks in UCX memory operations  \r\n- Simplified and corrected UCP AM header usage  \r\n- Optimized backend serialization to include only supported options  \r\n- Introduced `Dockerfile.manylinux` for Python packaging compatibility  \r\n- Added build and containerization support for AArch64  \r\n- Cleaned up UCX plugin notification list handling  \r\n- Updated Rust and C++ APIs for better structure and public access clarity  \r\n- Fixed missing headers in 3fs plugin and ensured build consistency  \r\n- Standardized naming conventions in benchmarking tools  
\r\n- Expanded contributor roles and ownership metadata  \r\n- Streamlined dependencies and build options for GDS and IB devices  \r\n\r\n## Testing & Performance\r\n\r\n- Enabled multi-threaded GTest transfers for concurrency testing  \r\n- Integrated `blossom-ci` for more robust CI builds and EFA test debugging  \r\n- Increased CI timeouts and polling intervals for more reliable AWS tests  \r\n- Improved logging in plugin tests for better visibility and debugging  \r\n- Added support for ARM builds and cross-platform validation  \r\n- Fixed UCX build issues with IB dependencies and container options  \r\n- Ensured accurate attribution updates and release version tagging \r\n\r\n**Full Changelog**: https:\u002F\u002Fgithub.com\u002Fai-dynamo\u002Fnixl\u002Fcompare\u002F0.3.0...0.3.1","2025-07-01T17:26:45",{"id":240,"version":241,"summary_zh":242,"released_at":243},324253,"0.3.0","## What's Changed\r\n### New Features\r\n\r\n* **New Backends & Plugins**:\r\n\r\n  * Added 3FS backend support\r\n  * Added POSIX plugin support in `nixlbench`\r\n\r\n* **Architecture Support**:\r\n\r\n  * Enabled ARM (aarch64) build support for `nixl-sys`\r\n\r\n* **Cost & Performance Tools**:\r\n\r\n  * Implemented cost estimation and exposed a Python API\r\n  * Added NIXL KV Cache Benchmark to simulate KV Cache Transfers through NIXL\r\n\r\n* **Synchronization Improvements**:\r\n\r\n  * Introduced a new synchronization read-write locking mode\r\n  * UCX\\_MO now sends completion notifications in `postXfer`\r\n  * Included missing UCX backend options\r\n\r\n\r\n### Developer Enhancements\r\n\r\n* **UCX Improvements**:\r\n\r\n  * Refactored plugin glue logic and removed busy-polling in progress thread\r\n  * Enabled multi-worker support\r\n  * Improved error handling, debug logging, and support for multi-GPU and CUDA IPC\r\n  * Added configuration options for error handling and UCX endpoint estimation\r\n  * Set CUDA context for remote MD operations\r\n  * Improved UCX backend 
read\u002Fwrite operation handling\r\n\r\n* **Build & Packaging**:\r\n\r\n  * Added crate metadata and updated Python dependencies\r\n  * Fixed build issues on RedHat8, Manylinux, and 3FS plugin\r\n  * Removed unused packages and improved consistency in naming\r\n\r\n* **Code Quality & Maintenance**:\r\n\r\n  * Marked internal methods with `const`\r\n  * Updated CODEOWNERS, attribution files, and third-party notices\r\n  * Ignored crate files in pre-commit checks\r\n\r\n* **Rust Bindings**:\r\n\r\n  * Improved Rust API with missing functions, reordering, and better test coverage\r\n  * Fixed build and example issues\r\n\r\n* **Agent & Metadata Handling**:\r\n\r\n  * Improved ETCD metadata fetch and invalidation using watchers\r\n  * Updated agent API to require labels\r\n  * Removed unsafe move semantics and applied socket timeout handling\r\n\r\n\r\n### Testing & Performance\r\n\r\n* **CI & Infrastructure**:\r\n\r\n  * Added AWS EFA testing infrastructure\r\n  * Enabled container builds with UCX from source\r\n\r\n* **Stability & Reliability**:\r\n\r\n  * Applied timeout to polling thread and agent socket connections\r\n  * Fixed memory leaks and double-destroy issues in batch I\u002FO\r\n  * Resolved multiple build and runtime bugs for better test coverage and platform support\r\n \r\n### Known Issues\r\n  * Arm builds for GDS plugin, nixl, nixlbench aren't supported correctly for this release - #273, #414\r\n\r\n**Full Changelog**: https:\u002F\u002Fgithub.com\u002Fai-dynamo\u002Fnixl\u002Fcompare\u002F0.2.1...0.3.0","2025-06-02T21:17:36",{"id":245,"version":246,"summary_zh":247,"released_at":248},324254,"0.2.1","## What's Changed\r\n\r\n### New Features\r\n- Added **Mooncake backend** to support the Mooncake Transfer Engine.\r\n- **ARM (aarch64)** build support for broader platform compatibility.\r\n- Added **POSIX plugin** with support for both AIO and io_uring.\r\n- Enhanced **NIXLBench** with improved barrier handling, environment persistence, and graceful 
termination.\r\n- Enabled **multi-threaded test execution** in `nixl_test`.\r\n- Enhanced **UCX backend** with `cuda-ipc` support and optimized request completion notifications.\r\n\r\n### Developer Enhancements\r\n- Unified and improved **logging infrastructure** across components.\r\n- Applied modern C++ practices: `using` declarations, smart pointers, and cleaner memory management.\r\n- Improved Python usability: clarified GDS dependencies, updated examples, and streamlined test behavior.\r\n- Documentation updates: plugin requirements, ETCD hostname usage, PyPI references, and ownership metadata.\r\n\r\n### Testing & Performance\r\n- Extended unit and integration test coverage using `pytest` and `gtest`.\r\n- Improved CI workflows: system info logging, updated runner labels, and test configuration controls.\r\n- Resolved performance and stability issues:\r\n  - Fixed memory leak in UCX `genNotif()`\r\n  - Addressed descriptor list and statistic calculation bugs in NIXLBench\r\n  - Minor communication reliability fixes in the Agent\r\n\r\n**Full Changelog**: https:\u002F\u002Fgithub.com\u002Fai-dynamo\u002Fnixl\u002Fcompare\u002F0.2.0...0.2.1","2025-05-22T18:14:31",{"id":250,"version":251,"summary_zh":252,"released_at":253},324255,"0.2.0","## What's Changed\r\n\r\n### New Features\r\n- Metadata exchange APIs\r\n  - API for exchanging metadata through listener thread\r\n  - API for getting\u002Fsending partial agent metadata\r\n  - New APIs can exchange metadata in peer-to-peer mode (sockets) or central metadata server (ETCD)\r\n  - See Doxygen documentation for new APIs, and README for ETCD instructions\r\n- Improved transfer preparation backend API for better performance with storage backends\r\n- Basic C and Rust bindings\r\n- Thread safe mode that can be enabled through agent config\r\n- Introducing nixlbench - our NIXL performance benchmarking suite\r\n  - 
https:\u002F\u002Fgithub.com\u002Fai-dynamo\u002Fnixl\u002Fblob\u002Fmain\u002Fbenchmark\u002Fnixlbench\u002FREADME.md\r\n\r\n### Developer Tools\r\n- Logging infrastructure to enable new developers to use NIXL logging macros\r\n- Debug mode infrastructure to enable new features that will only run in debug mode\r\n- Integrate Abseil for thread and logging features\r\n\r\n### Testing and Performance\r\n- AWS EFA Testing with latest version of UCX\r\n- Gtest infrastructure and mock backend for unit testing\r\n- Various Bug fixes, optimizations and documentation clarification\r\n\r\n**Full Changelog**: https:\u002F\u002Fgithub.com\u002Fai-dynamo\u002Fnixl\u002Fcompare\u002F0.1.1...0.2.0","2025-05-01T14:09:11",{"id":255,"version":256,"summary_zh":257,"released_at":258},324256,"0.1.1","## What's Changed\r\n\r\n* Auto register-deregister by @mkhazraee in #56 and #55 and #128 \r\n* Multi-Object (MO) UCX backend implementation to support multi-GPU buffers in a single transfer request by @artpol84 in #58 \r\n* Return bytes for notifications in Python API by @tstamler in #109\r\n* Add typing and backend selection to Python API by @tstamler in #86\r\n* Discover plugins in directory by @aranadive in #82\r\n* Allow dynamic linking of plugins in wheel by @tstamler in #94\r\n* Update dockerfiles and use cuda-dl-base image by @aranadive in #106 and @nv-anants in #115\r\n* Add Doxygen for c++ and python APIs by @vvenkates27 in #32 and @tstamler in #127\r\n* Improved examples for GDS by @vvenkates27 in #114, and initial Pytest implementation by @tstamler in #100\r\n* Switch to C++17 by @aranadive in #104\r\n* Add code of conduct for project by @saturley-hall in #102\r\n* Several bug fixes, performance improvements and clarifications in the documentation\r\n* Improved tests throughout, for pytest, agent, and GDS\r\n\r\n**Full Changelog**: 
https:\u002F\u002Fgithub.com\u002Fai-dynamo\u002Fnixl\u002Fcompare\u002F0.1.0...0.1.1","2025-04-11T22:44:31",{"id":260,"version":261,"summary_zh":262,"released_at":263},324257,"0.1.0","**NVIDIA Inference Xfer Library (NIXL)** is targeted for accelerating point to point communications in AI inference frameworks such as NVIDIA Dynamo, while providing an abstraction over various types of memory (e.g., CPU and GPU) and storage (e.g., file, block and object store) through a modular plug-in architecture.\r\n\r\n### NIXL features\r\n#### Core Transfer Capabilities\r\n* Accelerated point-to-point communications for AI inference workloads\r\n* Efficient zero-copy memory transfer between CPU and GPU\r\n* Direct GPU-to-GPU transfer with RDMA support via UCX\r\n* Asynchronous transfer operations with completion notifications\r\n#### Memory Abstraction\r\n* Unified interface across heterogeneous memory types (DRAM, VRAM)\r\n* Transparent handling of different storage backends (file, block, object)\r\n* Intelligent buffer management for optimal data placement\r\n#### Metadata Management\r\n* Lightweight serialization system for transfer descriptors\r\n* Cross-platform memory region exchange\r\n* Zero-overhead lookup mechanisms for registered memory\r\n#### Plugin Architecture\r\n* Modular design enabling custom communication backends\r\n* UCX and GPU Direct Storage plugins for high-performance networking and storage\r\n* Extensible framework for various types of backends to perform data transfer","2025-03-18T14:55:17"]