[{"data":1,"prerenderedAt":-1},["ShallowReactive",2],{"similar-InternLM--lmdeploy":3,"tool-InternLM--lmdeploy":61},[4,18,26,36,44,53],{"id":5,"name":6,"github_repo":7,"description_zh":8,"stars":9,"difficulty_score":10,"last_commit_at":11,"category_tags":12,"status":17},4358,"openclaw","openclaw\u002Fopenclaw","OpenClaw 是一款专为个人打造的本地化 AI 助手，旨在让你在自己的设备上拥有完全可控的智能伙伴。它打破了传统 AI 助手局限于特定网页或应用的束缚，能够直接接入你日常使用的各类通讯渠道，包括微信、WhatsApp、Telegram、Discord、iMessage 等数十种平台。无论你在哪个聊天软件中发送消息，OpenClaw 都能即时响应，甚至支持在 macOS、iOS 和 Android 设备上进行语音交互，并提供实时的画布渲染功能供你操控。\n\n这款工具主要解决了用户对数据隐私、响应速度以及“始终在线”体验的需求。通过将 AI 部署在本地，用户无需依赖云端服务即可享受快速、私密的智能辅助，真正实现了“你的数据，你做主”。其独特的技术亮点在于强大的网关架构，将控制平面与核心助手分离，确保跨平台通信的流畅性与扩展性。\n\nOpenClaw 非常适合希望构建个性化工作流的技术爱好者、开发者，以及注重隐私保护且不愿被单一生态绑定的普通用户。只要具备基础的终端操作能力（支持 macOS、Linux 及 Windows WSL2），即可通过简单的命令行引导完成部署。如果你渴望拥有一个懂你",349277,3,"2026-04-06T06:32:30",[13,14,15,16],"Agent","开发框架","图像","数据工具","ready",{"id":19,"name":20,"github_repo":21,"description_zh":22,"stars":23,"difficulty_score":10,"last_commit_at":24,"category_tags":25,"status":17},3808,"stable-diffusion-webui","AUTOMATIC1111\u002Fstable-diffusion-webui","stable-diffusion-webui 是一个基于 Gradio 构建的网页版操作界面，旨在让用户能够轻松地在本地运行和使用强大的 Stable Diffusion 图像生成模型。它解决了原始模型依赖命令行、操作门槛高且功能分散的痛点，将复杂的 AI 绘图流程整合进一个直观易用的图形化平台。\n\n无论是希望快速上手的普通创作者、需要精细控制画面细节的设计师，还是想要深入探索模型潜力的开发者与研究人员，都能从中获益。其核心亮点在于极高的功能丰富度：不仅支持文生图、图生图、局部重绘（Inpainting）和外绘（Outpainting）等基础模式，还独创了注意力机制调整、提示词矩阵、负向提示词以及“高清修复”等高级功能。此外，它内置了 GFPGAN 和 CodeFormer 等人脸修复工具，支持多种神经网络放大算法，并允许用户通过插件系统无限扩展能力。即使是显存有限的设备，stable-diffusion-webui 也提供了相应的优化选项，让高质量的 AI 艺术创作变得触手可及。",162132,"2026-04-05T11:01:52",[14,15,13],{"id":27,"name":28,"github_repo":29,"description_zh":30,"stars":31,"difficulty_score":32,"last_commit_at":33,"category_tags":34,"status":17},1381,"everything-claude-code","affaan-m\u002Feverything-claude-code","everything-claude-code 是一套专为 AI 编程助手（如 Claude Code、Codex、Cursor 等）打造的高性能优化系统。它不仅仅是一组配置文件，而是一个经过长期实战打磨的完整框架，旨在解决 AI 
代理在实际开发中面临的效率低下、记忆丢失、安全隐患及缺乏持续学习能力等核心痛点。\n\n通过引入技能模块化、直觉增强、记忆持久化机制以及内置的安全扫描功能，everything-claude-code 能显著提升 AI 在复杂任务中的表现，帮助开发者构建更稳定、更智能的生产级 AI 代理。其独特的“研究优先”开发理念和针对 Token 消耗的优化策略，使得模型响应更快、成本更低，同时有效防御潜在的攻击向量。\n\n这套工具特别适合软件开发者、AI 研究人员以及希望深度定制 AI 工作流的技术团队使用。无论您是在构建大型代码库，还是需要 AI 协助进行安全审计与自动化测试，everything-claude-code 都能提供强大的底层支持。作为一个曾荣获 Anthropic 黑客大奖的开源项目，它融合了多语言支持与丰富的实战钩子（hooks），让 AI 真正成长为懂上",158594,2,"2026-04-16T23:34:05",[14,13,35],"语言模型",{"id":37,"name":38,"github_repo":39,"description_zh":40,"stars":41,"difficulty_score":32,"last_commit_at":42,"category_tags":43,"status":17},2271,"ComfyUI","Comfy-Org\u002FComfyUI","ComfyUI 是一款功能强大且高度模块化的视觉 AI 引擎，专为设计和执行复杂的 Stable Diffusion 图像生成流程而打造。它摒弃了传统的代码编写模式，采用直观的节点式流程图界面，让用户通过连接不同的功能模块即可构建个性化的生成管线。\n\n这一设计巧妙解决了高级 AI 绘图工作流配置复杂、灵活性不足的痛点。用户无需具备编程背景，也能自由组合模型、调整参数并实时预览效果，轻松实现从基础文生图到多步骤高清修复等各类复杂任务。ComfyUI 拥有极佳的兼容性，不仅支持 Windows、macOS 和 Linux 全平台，还广泛适配 NVIDIA、AMD、Intel 及苹果 Silicon 等多种硬件架构，并率先支持 SDXL、Flux、SD3 等前沿模型。\n\n无论是希望深入探索算法潜力的研究人员和开发者，还是追求极致创作自由度的设计师与资深 AI 绘画爱好者，ComfyUI 都能提供强大的支持。其独特的模块化架构允许社区不断扩展新功能，使其成为当前最灵活、生态最丰富的开源扩散模型工具之一，帮助用户将创意高效转化为现实。",108322,"2026-04-10T11:39:34",[14,15,13],{"id":45,"name":46,"github_repo":47,"description_zh":48,"stars":49,"difficulty_score":32,"last_commit_at":50,"category_tags":51,"status":17},6121,"gemini-cli","google-gemini\u002Fgemini-cli","gemini-cli 是一款由谷歌推出的开源 AI 命令行工具，它将强大的 Gemini 大模型能力直接集成到用户的终端环境中。对于习惯在命令行工作的开发者而言，它提供了一条从输入提示词到获取模型响应的最短路径，无需切换窗口即可享受智能辅助。\n\n这款工具主要解决了开发过程中频繁上下文切换的痛点，让用户能在熟悉的终端界面内直接完成代码理解、生成、调试以及自动化运维任务。无论是查询大型代码库、根据草图生成应用，还是执行复杂的 Git 操作，gemini-cli 都能通过自然语言指令高效处理。\n\n它特别适合广大软件工程师、DevOps 人员及技术研究人员使用。其核心亮点包括支持高达 100 万 token 的超长上下文窗口，具备出色的逻辑推理能力；内置 Google 搜索、文件操作及 Shell 命令执行等实用工具；更独特的是，它支持 MCP（模型上下文协议），允许用户灵活扩展自定义集成，连接如图像生成等外部能力。此外，个人谷歌账号即可享受免费的额度支持，且项目基于 Apache 2.0 
协议完全开源，是提升终端工作效率的理想助手。",100752,"2026-04-10T01:20:03",[52,13,15,14],"插件",{"id":54,"name":55,"github_repo":56,"description_zh":57,"stars":58,"difficulty_score":32,"last_commit_at":59,"category_tags":60,"status":17},4721,"markitdown","microsoft\u002Fmarkitdown","MarkItDown 是一款由微软 AutoGen 团队打造的轻量级 Python 工具，专为将各类文件高效转换为 Markdown 格式而设计。它支持 PDF、Word、Excel、PPT、图片（含 OCR）、音频（含语音转录）、HTML 乃至 YouTube 链接等多种格式的解析，能够精准提取文档中的标题、列表、表格和链接等关键结构信息。\n\n在人工智能应用日益普及的今天，大语言模型（LLM）虽擅长处理文本，却难以直接读取复杂的二进制办公文档。MarkItDown 恰好解决了这一痛点，它将非结构化或半结构化的文件转化为模型“原生理解”且 Token 效率极高的 Markdown 格式，成为连接本地文件与 AI 分析 pipeline 的理想桥梁。此外，它还提供了 MCP（模型上下文协议）服务器，可无缝集成到 Claude Desktop 等 LLM 应用中。\n\n这款工具特别适合开发者、数据科学家及 AI 研究人员使用，尤其是那些需要构建文档检索增强生成（RAG）系统、进行批量文本分析或希望让 AI 助手直接“阅读”本地文件的用户。虽然生成的内容也具备一定可读性，但其核心优势在于为机器",93400,"2026-04-06T19:52:38",[52,14],{"id":62,"github_repo":63,"name":64,"description_en":65,"description_zh":66,"ai_summary_zh":66,"readme_en":67,"readme_zh":68,"quickstart_zh":69,"use_case_zh":70,"hero_image_url":71,"owner_login":72,"owner_name":72,"owner_avatar_url":73,"owner_bio":74,"owner_company":75,"owner_location":75,"owner_email":76,"owner_twitter":77,"owner_website":78,"owner_url":79,"languages":80,"stars":112,"forks":113,"last_commit_at":114,"license":115,"difficulty_score":10,"env_os":116,"env_gpu":117,"env_ram":118,"env_deps":119,"category_tags":125,"github_topics":126,"view_count":32,"oss_zip_url":75,"oss_zip_packed_at":75,"status":17,"created_at":138,"updated_at":139,"faqs":140,"releases":170},8284,"InternLM\u002Flmdeploy","lmdeploy","LMDeploy is a toolkit for compressing, deploying, and serving LLMs.","LMDeploy 是一套专为大语言模型（LLM）打造的高效工具箱，核心功能涵盖模型压缩、部署与服务化。它主要解决了大模型在实际应用中面临的显存占用高、推理速度慢以及部署流程复杂等痛点，让庞大的模型能在有限的硬件资源上流畅运行。\n\n无论是希望快速搭建本地对话服务的开发者，还是致力于模型性能优化的研究人员，亦或是需要集成多模态能力的工程师，都能从 LMDeploy 中获益。其独特技术亮点在于自研的 TurboMind 推理引擎，支持 4-bit 权重量化、FP8 混合专家模型（MoE）优化以及 CUDA 图加速，显著提升了推理吞吐量。此外，它不仅全面适配 NVIDIA GPU，还率先支持华为昇腾（Ascend）平台，并能无缝集成 DeepSeek、Qwen、Llama 等主流前沿模型及多模态视觉语言模型。通过简洁的接口，LMDeploy 
帮助用户轻松实现从模型量化到高性能服务发布的全流程，是构建高效 AI 应用的得力助手。","\u003Cdiv align=\"center\">\n  \u003Cimg src=\"docs\u002Fen\u002F_static\u002Fimage\u002Flmdeploy-logo.svg\" width=\"450\"\u002F>\n\n[![PyPI](https:\u002F\u002Fimg.shields.io\u002Fpypi\u002Fv\u002Flmdeploy)](https:\u002F\u002Fpypi.org\u002Fproject\u002Flmdeploy)\n![PyPI - Downloads](https:\u002F\u002Fimg.shields.io\u002Fpypi\u002Fdm\u002Flmdeploy)\n[![license](https:\u002F\u002Fimg.shields.io\u002Fgithub\u002Flicense\u002FInternLM\u002Flmdeploy.svg)](https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Ftree\u002Fmain\u002FLICENSE)\n[![issue resolution](https:\u002F\u002Fimg.shields.io\u002Fgithub\u002Fissues-closed-raw\u002FInternLM\u002Flmdeploy)](https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fissues)\n[![open issues](https:\u002F\u002Fimg.shields.io\u002Fgithub\u002Fissues-raw\u002FInternLM\u002Flmdeploy)](https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fissues)\n\n[📘Documentation](https:\u002F\u002Flmdeploy.readthedocs.io\u002Fen\u002Flatest\u002F) |\n[🛠️Quick Start](https:\u002F\u002Flmdeploy.readthedocs.io\u002Fen\u002Flatest\u002Fget_started\u002Fget_started.html) |\n[🤔Reporting Issues](https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fissues\u002Fnew\u002Fchoose)\n\nEnglish | [简体中文](README_zh-CN.md) | [日本語](README_ja.md)\n\n👋 join us on [![Static Badge](https:\u002F\u002Fimg.shields.io\u002Fbadge\u002F-grey?style=social&logo=wechat&label=WeChat)](https:\u002F\u002Fcdn.vansin.top\u002Finternlm\u002Flmdeploy.jpg)\n[![Static Badge](https:\u002F\u002Fimg.shields.io\u002Fbadge\u002F-grey?style=social&logo=twitter&label=Twitter)](https:\u002F\u002Ftwitter.com\u002Fintern_lm)\n[![Static Badge](https:\u002F\u002Fimg.shields.io\u002Fbadge\u002F-grey?style=social&logo=discord&label=Discord)](https:\u002F\u002Fdiscord.gg\u002Fxa29JuW87d)\n\n\u003C\u002Fdiv>\n\n______________________________________________________________________\n\n## Latest News 🎉\n\n\u003Cdetails 
open>\n\u003Csummary>\u003Cb>2026\u003C\u002Fb>\u003C\u002Fsummary>\n\n- \\[2026\u002F04\\] The LMDeploy project on PyPI has reached its storage quota, so pre-built wheels for new releases cannot be uploaded for the time being. You can download packages from the [GitHub Releases](https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Freleases) page or install from source instead. We will update this notice when wheel uploads to PyPI resume. Affected versions: >=0.12.2\n- \\[2026\u002F02\\] Support [Qwen3.5](https:\u002F\u002Fhuggingface.co\u002Fcollections\u002FQwen\u002Fqwen35)\n- \\[2026\u002F02\\] Support [vllm-project\u002Fllm-compressor](https:\u002F\u002Fgithub.com\u002Fvllm-project\u002Fllm-compressor) 4bit symmetric\u002Fasymmetric quantization. Refer [here](.\u002Fdocs\u002Fen\u002Fquantization\u002Fllm_compressor.md) for detailed guide\n\n\u003C\u002Fdetails>\n\n\u003Cdetails close>\n\u003Csummary>\u003Cb>2025\u003C\u002Fb>\u003C\u002Fsummary>\n\n- \\[2025\u002F09\\] TurboMind supports MXFP4 on NVIDIA GPUs starting from V100, achieving 1.5x the performmance of vLLM on H800 for openai gpt-oss models!\n- \\[2025\u002F06\\] Comprehensive inference optimization for FP8 MoE Models\n- \\[2025\u002F06\\] DeepSeek PD Disaggregation deployment is now supported through integration with [DLSlime](https:\u002F\u002Fgithub.com\u002FDeepLink-org\u002FDLSlime) and [Mooncake](https:\u002F\u002Fgithub.com\u002Fkvcache-ai\u002FMooncake). 
Huge thanks to both teams!\n- \\[2025\u002F04\\] Enhance DeepSeek inference performance by integration deepseek-ai techniques: FlashMLA, DeepGemm, DeepEP, MicroBatch and eplb\n- \\[2025\u002F01\\] Support DeepSeek V3 and R1\n\n\u003C\u002Fdetails>\n\n\u003Cdetails close>\n\u003Csummary>\u003Cb>2024\u003C\u002Fb>\u003C\u002Fsummary>\n\n- \\[2024\u002F11\\] Support Mono-InternVL with PyTorch engine\n- \\[2024\u002F10\\] PyTorchEngine supports graph mode on ascend platform, doubling the inference speed\n- \\[2024\u002F09\\] LMDeploy PyTorchEngine adds support for [Huawei Ascend](.\u002Fdocs\u002Fen\u002Fget_started\u002Fascend\u002Fget_started.md). See supported models [here](docs\u002Fen\u002Fsupported_models\u002Fsupported_models.md)\n- \\[2024\u002F09\\] LMDeploy PyTorchEngine achieves 1.3x faster on Llama3-8B inference by introducing CUDA graph\n- \\[2024\u002F08\\] LMDeploy is integrated into [modelscope\u002Fswift](https:\u002F\u002Fgithub.com\u002Fmodelscope\u002Fswift) as the default accelerator for VLMs inference\n- \\[2024\u002F07\\] Support Llama3.1 8B, 70B and its TOOLS CALLING\n- \\[2024\u002F07\\] Support [InternVL2](docs\u002Fen\u002Fmulti_modal\u002Finternvl.md) full-series models, [InternLM-XComposer2.5](docs\u002Fen\u002Fmulti_modal\u002Fxcomposer2d5.md) and [function call](docs\u002Fen\u002Fllm\u002Fapi_server_tools.md) of InternLM2.5\n- \\[2024\u002F06\\] PyTorch engine support DeepSeek-V2 and several VLMs, such as CogVLM2, Mini-InternVL, LlaVA-Next\n- \\[2024\u002F05\\] Balance vision model when deploying VLMs with multiple GPUs\n- \\[2024\u002F05\\] Support 4-bits weight-only quantization and inference on VLMs, such as InternVL v1.5, LLaVa, InternLMXComposer2\n- \\[2024\u002F04\\] Support Llama3 and more VLMs, such as InternVL v1.1, v1.2, MiniGemini, InternLMXComposer2.\n- \\[2024\u002F04\\] TurboMind adds online int8\u002Fint4 KV cache quantization and inference for all supported devices. 
Refer [here](docs\u002Fen\u002Fquantization\u002Fkv_quant.md) for detailed guide\n- \\[2024\u002F04\\] TurboMind latest upgrade boosts GQA, rocketing the [internlm2-20b](https:\u002F\u002Fhuggingface.co\u002Finternlm\u002Finternlm2-20b) model inference to 16+ RPS, about 1.8x faster than vLLM.\n- \\[2024\u002F04\\] Support Qwen1.5-MOE and dbrx.\n- \\[2024\u002F03\\] Support DeepSeek-VL offline inference pipeline and serving.\n- \\[2024\u002F03\\] Support VLM offline inference pipeline and serving.\n- \\[2024\u002F02\\] Support Qwen 1.5, Gemma, Mistral, Mixtral, Deepseek-MOE and so on.\n- \\[2024\u002F01\\] [OpenAOE](https:\u002F\u002Fgithub.com\u002FInternLM\u002FOpenAOE) seamless integration with [LMDeploy Serving Service](docs\u002Fen\u002Fllm\u002Fapi_server.md).\n- \\[2024\u002F01\\] Support for multi-model, multi-machine, multi-card inference services. For usage instructions, please refer to [here](docs\u002Fen\u002Fllm\u002Fproxy_server.md)\n- \\[2024\u002F01\\] Support [PyTorch inference engine](.\u002Fdocs\u002Fen\u002Finference\u002Fpytorch.md), developed entirely in Python, helping to lower the barriers for developers and enable  rapid experimentation with new features and technologies.\n\n\u003C\u002Fdetails>\n\n\u003Cdetails close>\n\u003Csummary>\u003Cb>2023\u003C\u002Fb>\u003C\u002Fsummary>\n\n- \\[2023\u002F12\\] Turbomind supports multimodal input.\n- \\[2023\u002F11\\] Turbomind supports loading hf model directly. Click [here](docs\u002Fen\u002Finference\u002Fload_hf.md) for details.\n- \\[2023\u002F11\\] TurboMind major upgrades, including: Paged Attention, faster attention kernels without sequence length limitation, 2x faster KV8 kernels, Split-K decoding (Flash Decoding), and W4A16 inference for sm_75\n- \\[2023\u002F09\\] TurboMind supports Qwen-14B\n- \\[2023\u002F09\\] TurboMind supports InternLM-20B\n- \\[2023\u002F09\\] TurboMind supports all features of Code Llama: code completion, infilling, chat \u002F instruct, and python specialist. 
Click [here](.\u002Fdocs\u002Fen\u002Fllm\u002Fcodellama.md) for deployment guide\n- \\[2023\u002F09\\] TurboMind supports Baichuan2-7B\n- \\[2023\u002F08\\] TurboMind supports flash-attention2.\n- \\[2023\u002F08\\] TurboMind supports Qwen-7B, dynamic NTK-RoPE scaling and dynamic logN scaling\n- \\[2023\u002F08\\] TurboMind supports Windows (tp=1)\n- \\[2023\u002F08\\] TurboMind supports 4-bit inference, 2.4x faster than FP16, the fastest open-source implementation. Check [this](docs\u002Fen\u002Fquantization\u002Fw4a16.md) guide for detailed info\n- \\[2023\u002F08\\] LMDeploy has launched on the [HuggingFace Hub](https:\u002F\u002Fhuggingface.co\u002Flmdeploy), providing ready-to-use 4-bit models.\n- \\[2023\u002F08\\] LMDeploy supports 4-bit quantization using the [AWQ](https:\u002F\u002Farxiv.org\u002Fabs\u002F2306.00978) algorithm.\n- \\[2023\u002F07\\] TurboMind supports Llama-2 70B with GQA.\n- \\[2023\u002F07\\] TurboMind supports Llama-2 7B\u002F13B.\n- \\[2023\u002F07\\] TurboMind supports tensor-parallel inference of InternLM.\n\n\u003C\u002Fdetails>\n\n______________________________________________________________________\n\n# Introduction\n\nLMDeploy is a toolkit for compressing, deploying, and serving LLM, developed by the [MMRazor](https:\u002F\u002Fgithub.com\u002Fopen-mmlab\u002Fmmrazor) and [MMDeploy](https:\u002F\u002Fgithub.com\u002Fopen-mmlab\u002Fmmdeploy) teams. It has the following core features:\n\n- **Efficient Inference**: LMDeploy delivers up to 1.8x higher request throughput than vLLM, by introducing key features like persistent batch(a.k.a. continuous batching), blocked KV cache, dynamic split&fuse, tensor parallelism, high-performance CUDA kernels and so on.\n\n- **Effective Quantization**: LMDeploy supports weight-only and k\u002Fv quantization, and the 4-bit inference performance is 2.4x higher than FP16. 
The quantization quality has been confirmed via OpenCompass evaluation.\n\n- **Effortless Distribution Server**: Leveraging the request distribution service, LMDeploy facilitates an easy and efficient deployment of multi-model services across multiple machines and cards.\n\n- **Excellent Compatibility**: LMDeploy supports [KV Cache Quant](docs\u002Fen\u002Fquantization\u002Fkv_quant.md), [AWQ](docs\u002Fen\u002Fquantization\u002Fw4a16.md) and [Automatic Prefix Caching](docs\u002Fen\u002Finference\u002Fturbomind_config.md) to be used simultaneously.\n\n# Performance\n\n![v0 1 0-benchmark](https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002FInternLM_lmdeploy_readme_9b9b13fa758d.png)\n\n# Supported Models\n\n\u003Ctable>\n\u003Ctbody>\n\u003Ctr align=\"center\" valign=\"middle\">\n\u003Ctd>\n  \u003Cb>LLMs\u003C\u002Fb>\n\u003C\u002Ftd>\n\u003Ctd>\n  \u003Cb>VLMs\u003C\u002Fb>\n\u003C\u002Ftd>\n\u003Ctr valign=\"top\">\n\u003Ctd align=\"left\" valign=\"top\">\n\u003Cul>\n  \u003Cli>Llama (7B - 65B)\u003C\u002Fli>\n  \u003Cli>Llama2 (7B - 70B)\u003C\u002Fli>\n  \u003Cli>Llama3 (8B, 70B)\u003C\u002Fli>\n  \u003Cli>Llama3.1 (8B, 70B)\u003C\u002Fli>\n  \u003Cli>Llama3.2 (1B, 3B)\u003C\u002Fli>\n  \u003Cli>InternLM (7B - 20B)\u003C\u002Fli>\n  \u003Cli>InternLM2 (7B - 20B)\u003C\u002Fli>\n  \u003Cli>InternLM3 (8B)\u003C\u002Fli>\n  \u003Cli>InternLM2.5 (7B)\u003C\u002Fli>\n  \u003Cli>Qwen (1.8B - 72B)\u003C\u002Fli>\n  \u003Cli>Qwen1.5 (0.5B - 110B)\u003C\u002Fli>\n  \u003Cli>Qwen1.5 - MoE (0.5B - 72B)\u003C\u002Fli>\n  \u003Cli>Qwen2 (0.5B - 72B)\u003C\u002Fli>\n  \u003Cli>Qwen2-MoE (57BA14B)\u003C\u002Fli>\n  \u003Cli>Qwen2.5 (0.5B - 32B)\u003C\u002Fli>\n  \u003Cli>Qwen3, Qwen3-MoE\u003C\u002Fli>\n  \u003Cli>Qwen3-Next(80B)\u003C\u002Fli>\n  \u003Cli>Baichuan (7B)\u003C\u002Fli>\n  \u003Cli>Baichuan2 (7B-13B)\u003C\u002Fli>\n  \u003Cli>Code Llama (7B - 34B)\u003C\u002Fli>\n  \u003Cli>ChatGLM2 (6B)\u003C\u002Fli>\n  \u003Cli>GLM-4 (9B)\u003C\u002Fli>\n  
\u003Cli>GLM-4-0414 (9B, 32B)\u003C\u002Fli>\n  \u003Cli>CodeGeeX4 (9B)\u003C\u002Fli>\n  \u003Cli>YI (6B-34B)\u003C\u002Fli>\n  \u003Cli>Mistral (7B)\u003C\u002Fli>\n  \u003Cli>DeepSeek-MoE (16B)\u003C\u002Fli>\n  \u003Cli>DeepSeek-V2 (16B, 236B)\u003C\u002Fli>\n  \u003Cli>DeepSeek-V2.5 (236B)\u003C\u002Fli>\n  \u003Cli>DeepSeek-V3 (685B)\u003C\u002Fli>\n  \u003Cli>DeepSeek-V3.2 (685B)\u003C\u002Fli>\n  \u003Cli>Mixtral (8x7B, 8x22B)\u003C\u002Fli>\n  \u003Cli>Gemma (2B - 7B)\u003C\u002Fli>\n  \u003Cli>StarCoder2 (3B - 15B)\u003C\u002Fli>\n  \u003Cli>Phi-3-mini (3.8B)\u003C\u002Fli>\n  \u003Cli>Phi-3.5-mini (3.8B)\u003C\u002Fli>\n  \u003Cli>Phi-3.5-MoE (16x3.8B)\u003C\u002Fli>\n  \u003Cli>Phi-4-mini (3.8B)\u003C\u002Fli>\n  \u003Cli>MiniCPM3 (4B)\u003C\u002Fli>\n  \u003Cli>SDAR (1.7B-30B)\u003C\u002Fli>\n  \u003Cli>gpt-oss (20B, 120B)\u003C\u002Fli>\n  \u003Cli>GLM-4.7-Flash (30B)\u003C\u002Fli>\n  \u003Cli>GLM-5 (754B)\u003C\u002Fli>\n\u003C\u002Ful>\n\u003C\u002Ftd>\n\u003Ctd>\n\u003Cul>\n  \u003Cli>LLaVA(1.5,1.6) (7B-34B)\u003C\u002Fli>\n  \u003Cli>InternLM-XComposer2 (7B, 4khd-7B)\u003C\u002Fli>\n  \u003Cli>InternLM-XComposer2.5 (7B)\u003C\u002Fli>\n  \u003Cli>Qwen-VL (7B)\u003C\u002Fli>\n  \u003Cli>Qwen2-VL (2B, 7B, 72B)\u003C\u002Fli>\n  \u003Cli>Qwen2.5-VL (3B, 7B, 72B)\u003C\u002Fli>\n  \u003Cli>Qwen3-VL (2B - 235B)\u003C\u002Fli>\n  \u003Cli>Qwen3.5 (0.8B - 397B)\u003C\u002Fli>\n  \u003Cli>DeepSeek-VL (7B)\u003C\u002Fli>\n  \u003Cli>DeepSeek-VL2 (3B, 16B, 27B)\u003C\u002Fli>\n  \u003Cli>InternVL-Chat (v1.1-v1.5)\u003C\u002Fli>\n  \u003Cli>InternVL2 (1B-76B)\u003C\u002Fli>\n  \u003Cli>InternVL2.5(MPO) (1B-78B)\u003C\u002Fli>\n  \u003Cli>InternVL3 (1B-78B)\u003C\u002Fli>\n  \u003Cli>InternVL3.5 (1B-241BA28B)\u003C\u002Fli>\n  \u003Cli>Intern-S1 (241B)\u003C\u002Fli>\n  \u003Cli>Intern-S1-mini (8.3B)\u003C\u002Fli>\n  \u003Cli>Intern-S1-Pro (1TB)\u003C\u002Fli>\n  \u003Cli>Mono-InternVL (2B)\u003C\u002Fli>\n  \u003Cli>ChemVLM (8B-26B)\u003C\u002Fli>\n  
\u003Cli>CogVLM-Chat (17B)\u003C\u002Fli>\n  \u003Cli>CogVLM2-Chat (19B)\u003C\u002Fli>\n  \u003Cli>MiniCPM-Llama3-V-2_5\u003C\u002Fli>\n  \u003Cli>MiniCPM-V-2_6\u003C\u002Fli>\n  \u003Cli>Phi-3-vision (4.2B)\u003C\u002Fli>\n  \u003Cli>Phi-3.5-vision (4.2B)\u003C\u002Fli>\n  \u003Cli>GLM-4V (9B)\u003C\u002Fli>\n  \u003Cli>GLM-4.1V-Thinking (9B)\u003C\u002Fli>\n  \u003Cli>Llama3.2-vision (11B, 90B)\u003C\u002Fli>\n  \u003Cli>Molmo (7B-D,72B)\u003C\u002Fli>\n  \u003Cli>Gemma3 (1B - 27B)\u003C\u002Fli>\n  \u003Cli>Llama4 (Scout, Maverick)\u003C\u002Fli>\n\u003C\u002Ful>\n\u003C\u002Ftd>\n\u003C\u002Ftr>\n\u003C\u002Ftbody>\n\u003C\u002Ftable>\n\nLMDeploy has developed two inference engines - [TurboMind](.\u002Fdocs\u002Fen\u002Finference\u002Fturbomind.md) and [PyTorch](.\u002Fdocs\u002Fen\u002Finference\u002Fpytorch.md), each with a different focus. The former strives for ultimate optimization of inference performance, while the latter, developed purely in Python, aims to decrease the barriers for developers.\n\nThey differ in the types of supported models and the inference data type. Please refer to [this table](.\u002Fdocs\u002Fen\u002Fsupported_models\u002Fsupported_models.md) for each engine's capability and choose the proper one that best fits your actual needs.\n\n# Quick Start [![Open In Colab](https:\u002F\u002Fcolab.research.google.com\u002Fassets\u002Fcolab-badge.svg)](https:\u002F\u002Fcolab.research.google.com\u002Fdrive\u002F1Dh-YlSwg78ZO3AlleO441NF_QP2shs95#scrollTo=YALmXnwCG1pQ)\n\n## Installation\n\nIt is recommended installing lmdeploy using pip in a conda environment (python 3.10 - 3.13):\n\n```shell\nconda create -n lmdeploy python=3.12 -y\nconda activate lmdeploy\npip install lmdeploy\n```\n\nSince v0.3.0, the default prebuilt package is compiled on **CUDA 12**. 
Starting from v0.10.2, LMDeploy no longer supports CUDA 11 series.\n\nIf you are using a GeForce RTX 50 series graphics card, please install the LMDeploy prebuilt package compiled with **CUDA 12.8** as follows:\n\n```shell\nexport LMDEPLOY_VERSION=0.12.3\nexport PYTHON_VERSION=312\npip install https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Freleases\u002Fdownload\u002Fv${LMDEPLOY_VERSION}\u002Flmdeploy-${LMDEPLOY_VERSION}+cu128-cp${PYTHON_VERSION}-cp${PYTHON_VERSION}-manylinux2014_x86_64.whl --extra-index-url https:\u002F\u002Fdownload.pytorch.org\u002Fwhl\u002Fcu128\n```\n\n## Offline Batch Inference\n\n```python\nimport lmdeploy\nwith lmdeploy.pipeline(\"internlm\u002Finternlm3-8b-instruct\") as pipe:\n    response = pipe([\"Hi, pls intro yourself\", \"Shanghai is\"])\n    print(response)\n```\n\n> \\[!NOTE\\]\n> By default, LMDeploy downloads model from HuggingFace. If you would like to use models from ModelScope, please install ModelScope by `pip install modelscope` and set the environment variable:\n>\n> `export LMDEPLOY_USE_MODELSCOPE=True`\n>\n> If you would like to use models from openMind Hub, please install openMind Hub by `pip install openmind_hub` and set the environment variable:\n>\n> `export LMDEPLOY_USE_OPENMIND_HUB=True`\n\nFor more information about inference pipeline, please refer to [here](docs\u002Fen\u002Fllm\u002Fpipeline.md).\n\n# Tutorials\n\nPlease review [getting_started](docs\u002Fen\u002Fget_started\u002Fget_started.md) section for the basic usage of LMDeploy.\n\nFor detailed user guides and advanced guides, please refer to our [tutorials](https:\u002F\u002Flmdeploy.readthedocs.io\u002Fen\u002Flatest\u002F):\n\n- User Guide\n  - [LLM Inference pipeline](docs\u002Fen\u002Fllm\u002Fpipeline.md) [![Open In Colab](https:\u002F\u002Fcolab.research.google.com\u002Fassets\u002Fcolab-badge.svg)](https:\u002F\u002Fcolab.research.google.com\u002Fdrive\u002F1Dh-YlSwg78ZO3AlleO441NF_QP2shs95#scrollTo=YALmXnwCG1pQ)\n  - [VLM Inference 
pipeline](docs\u002Fen\u002Fmulti_modal\u002Fvl_pipeline.md) [![Open In Colab](https:\u002F\u002Fcolab.research.google.com\u002Fassets\u002Fcolab-badge.svg)](https:\u002F\u002Fcolab.research.google.com\u002Fdrive\u002F1nKLfnPeDA3p-FMNw2NhI-KOpk7-nlNjF?usp=sharing)\n  - [LLM Serving](docs\u002Fen\u002Fllm\u002Fapi_server.md)\n  - [VLM Serving](docs\u002Fen\u002Fmulti_modal\u002Fapi_server_vl.md)\n  - [Quantization](docs\u002Fen\u002Fquantization)\n- Advance Guide\n  - [Inference Engine - TurboMind](docs\u002Fen\u002Finference\u002Fturbomind.md)\n  - [Inference Engine - PyTorch](docs\u002Fen\u002Finference\u002Fpytorch.md)\n  - [Customize chat templates](docs\u002Fen\u002Fadvance\u002Fchat_template.md)\n  - [Add a new model](docs\u002Fen\u002Fadvance\u002Fpytorch_new_model.md)\n  - gemm tuning\n  - [Long context inference](docs\u002Fen\u002Fadvance\u002Flong_context.md)\n  - [Multi-model inference service](docs\u002Fen\u002Fllm\u002Fproxy_server.md)\n\n# Third-party projects\n\n- Deploying LLMs offline on the NVIDIA Jetson platform by LMDeploy: [LMDeploy-Jetson](https:\u002F\u002Fgithub.com\u002FBestAnHongjun\u002FLMDeploy-Jetson)\n\n- Example project for deploying LLMs using LMDeploy and BentoML: [BentoLMDeploy](https:\u002F\u002Fgithub.com\u002Fbentoml\u002FBentoLMDeploy)\n\n# Contributing\n\nWe appreciate all contributions to LMDeploy. 
Please refer to [CONTRIBUTING.md](.github\u002FCONTRIBUTING.md) for the contributing guideline.\n\n# Acknowledgement\n\n- [FasterTransformer](https:\u002F\u002Fgithub.com\u002FNVIDIA\u002FFasterTransformer)\n- [llm-awq](https:\u002F\u002Fgithub.com\u002Fmit-han-lab\u002Fllm-awq)\n- [vLLM](https:\u002F\u002Fgithub.com\u002Fvllm-project\u002Fvllm)\n- [DeepSpeed-MII](https:\u002F\u002Fgithub.com\u002Fmicrosoft\u002FDeepSpeed-MII)\n\n# Citation\n\n```bibtex\n@misc{2023lmdeploy,\n    title={LMDeploy: A Toolkit for Compressing, Deploying, and Serving LLM},\n    author={LMDeploy Contributors},\n    howpublished = {\\url{https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy}},\n    year={2023}\n}\n```\n\n```bibtex\n@article{zhang2025efficient,\n  title={Efficient Mixed-Precision Large Language Model Inference with TurboMind},\n  author={Zhang, Li and Jiang, Youhe and He, Guoliang and Chen, Xin and Lv, Han and Yao, Qian and Fu, Fangcheng and Chen, Kai},\n  journal={arXiv preprint arXiv:2508.15601},\n  year={2025}\n}\n```\n\n# License\n\nThis project is released under the [Apache 2.0 license](LICENSE).\n","\u003Cdiv align=\"center\">\n  \u003Cimg src=\"docs\u002Fen\u002F_static\u002Fimage\u002Flmdeploy-logo.svg\" width=\"450\"\u002F>\n\n[![PyPI](https:\u002F\u002Fimg.shields.io\u002Fpypi\u002Fv\u002Flmdeploy)](https:\u002F\u002Fpypi.org\u002Fproject\u002Flmdeploy)\n![PyPI - Downloads](https:\u002F\u002Fimg.shields.io\u002Fpypi\u002Fdm\u002Flmdeploy)\n[![license](https:\u002F\u002Fimg.shields.io\u002Fgithub\u002Flicense\u002FInternLM\u002Flmdeploy.svg)](https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Ftree\u002Fmain\u002FLICENSE)\n[![issue resolution](https:\u002F\u002Fimg.shields.io\u002Fgithub\u002Fissues-closed-raw\u002FInternLM\u002Flmdeploy)](https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fissues)\n[![open 
issues](https:\u002F\u002Fimg.shields.io\u002Fgithub\u002Fissues-raw\u002FInternLM\u002Flmdeploy)](https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fissues)\n\n[📘文档](https:\u002F\u002Flmdeploy.readthedocs.io\u002Fen\u002Flatest\u002F) |\n[🛠️快速入门](https:\u002F\u002Flmdeploy.readthedocs.io\u002Fen\u002Flatest\u002Fget_started\u002Fget_started.html) |\n[🤔提交问题](https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fissues\u002Fnew\u002Fchoose)\n\nEnglish | [简体中文](README_zh-CN.md) | [日本語](README_ja.md)\n\n👋 加入我们：[![静态徽章](https:\u002F\u002Fimg.shields.io\u002Fbadge\u002F-grey?style=social&logo=wechat&label=WeChat)](https:\u002F\u002Fcdn.vansin.top\u002Finternlm\u002Flmdeploy.jpg)\n[![静态徽章](https:\u002F\u002Fimg.shields.io\u002Fbadge\u002F-grey?style=social&logo=twitter&label=Twitter)](https:\u002F\u002Ftwitter.com\u002Fintern_lm)\n[![静态徽章](https:\u002F\u002Fimg.shields.io\u002Fbadge\u002F-grey?style=social&logo=discord&label=Discord)](https:\u002F\u002Fdiscord.gg\u002Fxa29JuW87d)\n\n\u003C\u002Fdiv>\n\n______________________________________________________________________\n\n## 最新消息 🎉\n\n\u003Cdetails open>\n\u003Csummary>\u003Cb>2026\u003C\u002Fb>\u003C\u002Fsummary>\n\n- \\[2026\u002F04\\] LMDeploy 在 PyPI 上的项目已达到存储配额，因此暂时无法上传新版本的预构建 wheel。您可以从 [GitHub Releases](https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Freleases) 页面下载软件包，或直接从源码安装。待 PyPI 的 wheel 上传恢复后，我们将更新此通知。受影响的版本：>=0.12.2\n- \\[2026\u002F02\\] 支持 [Qwen3.5](https:\u002F\u002Fhuggingface.co\u002Fcollections\u002FQwen\u002Fqwen35)\n- \\[2026\u002F02\\] 支持 [vllm-project\u002Fllm-compressor](https:\u002F\u002Fgithub.com\u002Fvllm-project\u002Fllm-compressor) 的 4bit 对称\u002F非对称量化。详细指南请参阅 [这里](.\u002Fdocs\u002Fen\u002Fquantization\u002Fllm_compressor.md)\n\n\u003C\u002Fdetails>\n\n\u003Cdetails close>\n\u003Csummary>\u003Cb>2025\u003C\u002Fb>\u003C\u002Fsummary>\n\n- \\[2025\u002F09\\] TurboMind 自 V100 起支持 NVIDIA GPU 上的 MXFP4，对于 OpenAI GPT-OSS 模型，性能是 vLLM 在 H800 上的 1.5 倍！\n- 
\\[2025\u002F06\\] 针对 FP8 MoE 模型进行全面推理优化\n- \\[2025\u002F06\\] 现可通过与 [DLSlime](https:\u002F\u002Fgithub.com\u002FDeepLink-org\u002FDLSlime) 和 [Mooncake](https:\u002F\u002Fgithub.com\u002Fkvcache-ai\u002FMooncake) 集成，支持 DeepSeek PD 分离式部署。非常感谢这两个团队！\n- \\[2025\u002F04\\] 通过集成 deepseek-ai 技术（FlashMLA、DeepGemm、DeepEP、MicroBatch 和 eplb），提升 DeepSeek 推理性能\n- \\[2025\u002F01\\] 支持 DeepSeek V3 和 R1\n\n\u003C\u002Fdetails>\n\n\u003Cdetails close>\n\u003Csummary>\u003Cb>2024\u003C\u002Fb>\u003C\u002Fsummary>\n\n- \\[2024\u002F11\\] 使用 PyTorch 引擎支持 Mono-InternVL\n- \\[2024\u002F10\\] PyTorchEngine 在 ascend 平台上支持图模式，使推理速度翻倍\n- \\[2024\u002F09\\] LMDeploy PyTorchEngine 新增对 [华为 Ascend](.\u002Fdocs\u002Fen\u002Fget_started\u002Fascend\u002Fget_started.md) 的支持。支持的模型请见 [这里](docs\u002Fen\u002Fsupported_models\u002Fsupported_models.md)\n- \\[2024\u002F09\\] LMDeploy PyTorchEngine 通过引入 CUDA 图，在 Llama3-8B 推理上实现了 1.3 倍的加速\n- \\[2024\u002F08\\] LMDeploy 已集成到 [modelscope\u002Fswift](https:\u002F\u002Fgithub.com\u002Fmodelscope\u002Fswift)，作为 VLM 推理的默认加速器\n- \\[2024\u002F07\\] 支持 Llama3.1 8B、70B 及其工具调用功能\n- \\[2024\u002F07\\] 支持 [InternVL2](docs\u002Fen\u002Fmulti_modal\u002Finternvl.md) 全系列模型、[InternLM-XComposer2.5](docs\u002Fen\u002Fmulti_modal\u002Fxcomposer2d5.md) 以及 InternLM2.5 的 [函数调用](docs\u002Fen\u002Fllm\u002Fapi_server_tools.md)功能\n- \\[2024\u002F06\\] PyTorch 引擎支持 DeepSeek-V2 和多种 VLM，如 CogVLM2、Mini-InternVL、LlaVA-Next\n- \\[2024\u002F05\\] 在使用多 GPU 部署 VLM 时平衡视觉模型\n- \\[2024\u002F05\\] 支持 VLM 的 4-bit 权重量化及推理，例如 InternVL v1.5、LLaVa、InternLMXComposer2\n- \\[2024\u002F04\\] 支持 Llama3 及更多 VLM，如 InternVL v1.1、v1.2、MiniGemini、InternLMXComposer2。\n- \\[2024\u002F04\\] TurboMind 新增所有支持设备的在线 int8\u002Fint4 KV 缓存量化与推理。详细指南请参阅 [这里](docs\u002Fen\u002Fquantization\u002Fkv_quant.md)\n- \\[2024\u002F04\\] TurboMind 最新升级提升了 GQA 性能，使 [internlm2-20b](https:\u002F\u002Fhuggingface.co\u002Finternlm\u002Finternlm2-20b) 模型的推理速度达到 16+ RPS，比 vLLM 快约 1.8 倍。\n- \\[2024\u002F04\\] 支持 Qwen1.5-MOE 和 dbrx。\n- 
\\[2024\u002F03\\] 支持 DeepSeek-VL 的离线推理流程及服务部署。\n- \\[2024\u002F03\\] 支持 VLM 的离线推理流程及服务部署。\n- \\[2024\u002F02\\] 支持 Qwen 1.5、Gemma、Mistral、Mixtral、Deepseek-MOE 等模型。\n- \\[2024\u002F01\\] [OpenAOE](https:\u002F\u002Fgithub.com\u002FInternLM\u002FOpenAOE) 与 [LMDeploy Serving Service](docs\u002Fen\u002Fllm\u002Fapi_server.md) 实现无缝集成。\n- \\[2024\u002F01\\] 支持多模型、多机器、多卡的推理服务。使用说明请参阅 [这里](docs\u002Fen\u002Fllm\u002Fproxy_server.md)\n- \\[2024\u002F01\\] 支持完全用 Python 开发的 [PyTorch 推理引擎](.\u002Fdocs\u002Fen\u002Finference\u002Fpytorch.md)，有助于降低开发者门槛，并实现对新功能和技术的快速实验。\n\n\u003C\u002Fdetails>\n\n\u003Cdetails close>\n\u003Csummary>\u003Cb>2023\u003C\u002Fb>\u003C\u002Fsummary>\n\n- \\[2023\u002F12\\] Turbomind 支持多模态输入。\n- \\[2023\u002F11\\] Turbomind 支持直接加载 hf 模型。详情请点击 [这里](docs\u002Fen\u002Finference\u002Fload_hf.md)。\n- \\[2023\u002F11\\] TurboMind 进行了重大升级，包括：分页注意力、无序列长度限制的更快注意力内核、KV8 内核速度提升 2 倍、Split-K 解码（闪速解码）以及针对 sm_75 的 W4A16 推理。\n- \\[2023\u002F09\\] TurboMind 支持 Qwen-14B\n- \\[2023\u002F09\\] TurboMind 支持 InternLM-20B\n- \\[2023\u002F09\\] TurboMind 支持 Code Llama 的所有功能：代码补全、代码填充、聊天\u002F指令模式以及 Python 专业模式。部署指南请参阅 [这里](.\u002Fdocs\u002Fen\u002Fllm\u002Fcodellama.md)\n- \\[2023\u002F09\\] TurboMind 支持 Baichuan2-7B\n- \\[2023\u002F08\\] TurboMind 支持 flash-attention2。\n- \\[2023\u002F08\\] TurboMind 支持 Qwen-7B、动态 NTK-RoPE 缩放和动态 logN 缩放\n- \\[2023\u002F08\\] TurboMind 支持 Windows (tp=1)\n- \\[2023\u002F08\\] TurboMind 支持 4-bit 推理，速度比 FP16 快 2.4 倍，是目前最快的开源实现。详细信息请参阅 [这篇](docs\u002Fen\u002Fquantization\u002Fw4a16.md) 指南\n- \\[2023\u002F08\\] LMDeploy 已在 [HuggingFace Hub](https:\u002F\u002Fhuggingface.co\u002Flmdeploy) 上线，提供开箱即用的 4-bit 模型。\n- \\[2023\u002F08\\] LMDeploy 支持使用 [AWQ](https:\u002F\u002Farxiv.org\u002Fabs\u002F2306.00978) 算法进行 4-bit 量化。\n- \\[2023\u002F07\\] TurboMind 支持带有 GQA 的 Llama-2 70B。\n- \\[2023\u002F07\\] TurboMind 支持 Llama-2 7B\u002F13B。\n- \\[2023\u002F07\\] TurboMind 支持 InternLM 
的张量并行推理。\n\n\u003C\u002Fdetails>\n\n______________________________________________________________________\n\n# 简介\n\nLMDeploy 是一个用于压缩、部署和推理服务大语言模型的工具包，由 [MMRazor](https:\u002F\u002Fgithub.com\u002Fopen-mmlab\u002Fmmrazor) 和 [MMDeploy](https:\u002F\u002Fgithub.com\u002Fopen-mmlab\u002Fmmdeploy) 团队共同开发。它具备以下核心特性：\n\n- **高效推理**：通过引入持久化批处理（即连续批处理）、分块 KV 缓存、动态拆分与融合、张量并行、高性能 CUDA 核函数等关键技术，LMDeploy 的请求吞吐率最高可达 vLLM 的 1.8 倍。\n\n- **高效量化**：LMDeploy 支持权重量化和键值对（k\u002Fv）量化，其中 4 位精度的推理性能是 FP16 的 2.4 倍。量化质量已通过 OpenCompass 评测得到验证。\n\n- **便捷的分布式部署**：借助请求分发服务，LMDeploy 能够轻松高效地在多台机器、多张 GPU 上部署多模型服务。\n\n- **出色的兼容性**：LMDeploy 支持同时使用 [KV 缓存量化](docs\u002Fen\u002Fquantization\u002Fkv_quant.md)、[AWQ](docs\u002Fen\u002Fquantization\u002Fw4a16.md) 和 [自动前缀缓存](docs\u002Fen\u002Finference\u002Fturbomind_config.md)。\n\n# 性能\n\n![v0 1 0-benchmark](https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002FInternLM_lmdeploy_readme_9b9b13fa758d.png)\n\n# 支持的模型\n\n\u003Ctable>\n\u003Ctbody>\n\u003Ctr align=\"center\" valign=\"middle\">\n\u003Ctd>\n  \u003Cb>大语言模型\u003C\u002Fb>\n\u003C\u002Ftd>\n\u003Ctd>\n  \u003Cb>多模态模型\u003C\u002Fb>\n\u003C\u002Ftd>\n\u003Ctr valign=\"top\">\n\u003Ctd align=\"left\" valign=\"top\">\n\u003Cul>\n  \u003Cli>Llama (7B - 65B)\u003C\u002Fli>\n  \u003Cli>Llama2 (7B - 70B)\u003C\u002Fli>\n  \u003Cli>Llama3 (8B, 70B)\u003C\u002Fli>\n  \u003Cli>Llama3.1 (8B, 70B)\u003C\u002Fli>\n  \u003Cli>Llama3.2 (1B, 3B)\u003C\u002Fli>\n  \u003Cli>InternLM (7B - 20B)\u003C\u002Fli>\n  \u003Cli>InternLM2 (7B - 20B)\u003C\u002Fli>\n  \u003Cli>InternLM3 (8B)\u003C\u002Fli>\n  \u003Cli>InternLM2.5 (7B)\u003C\u002Fli>\n  \u003Cli>Qwen (1.8B - 72B)\u003C\u002Fli>\n  \u003Cli>Qwen1.5 (0.5B - 110B)\u003C\u002Fli>\n  \u003Cli>Qwen1.5 - MoE (0.5B - 72B)\u003C\u002Fli>\n  \u003Cli>Qwen2 (0.5B - 72B)\u003C\u002Fli>\n  \u003Cli>Qwen2-MoE (57BA14B)\u003C\u002Fli>\n  \u003Cli>Qwen2.5 (0.5B - 32B)\u003C\u002Fli>\n  \u003Cli>Qwen3, Qwen3-MoE\u003C\u002Fli>\n  \u003Cli>Qwen3-Next(80B)\u003C\u002Fli>\n  
\u003Cli>Baichuan (7B)\u003C\u002Fli>\n  \u003Cli>Baichuan2 (7B-13B)\u003C\u002Fli>\n  \u003Cli>Code Llama (7B - 34B)\u003C\u002Fli>\n  \u003Cli>ChatGLM2 (6B)\u003C\u002Fli>\n  \u003Cli>GLM-4 (9B)\u003C\u002Fli>\n  \u003Cli>GLM-4-0414 (9B, 32B)\u003C\u002Fli>\n  \u003Cli>CodeGeeX4 (9B)\u003C\u002Fli>\n  \u003Cli>YI (6B-34B)\u003C\u002Fli>\n  \u003Cli>Mistral (7B)\u003C\u002Fli>\n  \u003Cli>DeepSeek-MoE (16B)\u003C\u002Fli>\n  \u003Cli>DeepSeek-V2 (16B, 236B)\u003C\u002Fli>\n  \u003Cli>DeepSeek-V2.5 (236B)\u003C\u002Fli>\n  \u003Cli>DeepSeek-V3 (685B)\u003C\u002Fli>\n  \u003Cli>DeepSeek-V3.2 (685B)\u003C\u002Fli>\n  \u003Cli>Mixtral (8x7B, 8x22B)\u003C\u002Fli>\n  \u003Cli>Gemma (2B - 7B)\u003C\u002Fli>\n  \u003Cli>StarCoder2 (3B - 15B)\u003C\u002Fli>\n  \u003Cli>Phi-3-mini (3.8B)\u003C\u002Fli>\n  \u003Cli>Phi-3.5-mini (3.8B)\u003C\u002Fli>\n  \u003Cli>Phi-3.5-MoE (16x3.8B)\u003C\u002Fli>\n  \u003Cli>Phi-4-mini (3.8B)\u003C\u002Fli>\n  \u003Cli>MiniCPM3 (4B)\u003C\u002Fli>\n  \u003Cli>SDAR (1.7B-30B)\u003C\u002Fli>\n  \u003Cli>gpt-oss (20B, 120B)\u003C\u002Fli>\n  \u003Cli>GLM-4.7-Flash (30B)\u003C\u002Fli>\n  \u003Cli>GLM-5 (754B)\u003C\u002Fli>\n\u003C\u002Ful>\n\u003C\u002Ftd>\n\u003Ctd>\n\u003Cul>\n  \u003Cli>LLaVA(1.5,1.6) (7B-34B)\u003C\u002Fli>\n  \u003Cli>InternLM-XComposer2 (7B, 4khd-7B)\u003C\u002Fli>\n  \u003Cli>InternLM-XComposer2.5 (7B)\u003C\u002Fli>\n  \u003Cli>Qwen-VL (7B)\u003C\u002Fli>\n  \u003Cli>Qwen2-VL (2B, 7B, 72B)\u003C\u002Fli>\n  \u003Cli>Qwen2.5-VL (3B, 7B, 72B)\u003C\u002Fli>\n  \u003Cli>Qwen3-VL (2B - 235B)\u003C\u002Fli>\n  \u003Cli>Qwen3.5 (0.8B - 397B)\u003C\u002Fli>\n  \u003Cli>DeepSeek-VL (7B)\u003C\u002Fli>\n  \u003Cli>DeepSeek-VL2 (3B, 16B, 27B)\u003C\u002Fli>\n  \u003Cli>InternVL-Chat (v1.1-v1.5)\u003C\u002Fli>\n  \u003Cli>InternVL2 (1B-76B)\u003C\u002Fli>\n  \u003Cli>InternVL2.5(MPO) (1B-78B)\u003C\u002Fli>\n  \u003Cli>InternVL3 (1B-78B)\u003C\u002Fli>\n  \u003Cli>InternVL3.5 (1B-241BA28B)\u003C\u002Fli>\n  \u003Cli>Intern-S1 
(241B)\u003C\u002Fli>\n  \u003Cli>Intern-S1-mini (8.3B)\u003C\u002Fli>\n  \u003Cli>Intern-S1-Pro (1TB)\u003C\u002Fli>\n  \u003Cli>Mono-InternVL (2B)\u003C\u002Fli>\n  \u003Cli>ChemVLM (8B-26B)\u003C\u002Fli>\n  \u003Cli>CogVLM-Chat (17B)\u003C\u002Fli>\n  \u003Cli>CogVLM2-Chat (19B)\u003C\u002Fli>\n  \u003Cli>MiniCPM-Llama3-V-2_5\u003C\u002Fli>\n  \u003Cli>MiniCPM-V-2_6\u003C\u002Fli>\n  \u003Cli>Phi-3-vision (4.2B)\u003C\u002Fli>\n  \u003Cli>Phi-3.5-vision (4.2B)\u003C\u002Fli>\n  \u003Cli>GLM-4V (9B)\u003C\u002Fli>\n  \u003Cli>GLM-4.1V-Thinking (9B)\u003C\u002Fli>\n  \u003Cli>Llama3.2-vision (11B, 90B)\u003C\u002Fli>\n  \u003Cli>Molmo (7B-D,72B)\u003C\u002Fli>\n  \u003Cli>Gemma3 (1B - 27B)\u003C\u002Fli>\n  \u003Cli>Llama4 (Scout, Maverick)\u003C\u002Fli>\n\u003C\u002Ful>\n\u003C\u002Ftd>\n\u003C\u002Ftr>\n\u003C\u002Ftbody>\n\u003C\u002Ftable>\n\nLMDeploy 开发了两种推理引擎——[TurboMind](.\u002Fdocs\u002Fen\u002Finference\u002Fturbomind.md) 和 [PyTorch](.\u002Fdocs\u002Fen\u002Finference\u002Fpytorch.md)，它们各有侧重。前者致力于实现极致的推理性能优化，而后者则完全基于 Python 实现，旨在降低开发者的使用门槛。\n\n两者在支持的模型类型和推理数据类型上有所不同。请参考 [这张表格](.\u002Fdocs\u002Fen\u002Fsupported_models\u002Fsupported_models.md)，了解每种引擎的具体能力，并根据实际需求选择合适的引擎。\n\n# 快速入门 [![Open In Colab](https:\u002F\u002Fcolab.research.google.com\u002Fassets\u002Fcolab-badge.svg)](https:\u002F\u002Fcolab.research.google.com\u002Fdrive\u002F1Dh-YlSwg78ZO3AlleO441NF_QP2shs95#scrollTo=YALmXnwCG1pQ)\n\n## 安装\n\n建议在 Conda 环境中使用 pip 安装 lmdeploy（Python 3.10 - 3.13）：\n\n```shell\nconda create -n lmdeploy python=3.12 -y\nconda activate lmdeploy\npip install lmdeploy\n```\n\n自 v0.3.0 起，默认预编译包是在 **CUDA 12** 上编译的。从 v0.10.2 开始，LMDeploy 不再支持 CUDA 11 系列。\n\n如果您使用的是 GeForce RTX 50 系列显卡，请按照以下步骤安装使用 **CUDA 12.8** 编译的预编译包：\n\n```shell\nexport LMDEPLOY_VERSION=0.12.3\nexport PYTHON_VERSION=312\npip install 
https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Freleases\u002Fdownload\u002Fv${LMDEPLOY_VERSION}\u002Flmdeploy-${LMDEPLOY_VERSION}+cu128-cp${PYTHON_VERSION}-cp${PYTHON_VERSION}-manylinux2014_x86_64.whl --extra-index-url https:\u002F\u002Fdownload.pytorch.org\u002Fwhl\u002Fcu128\n```\n\n## 离线批量推理\n\n```python\nimport lmdeploy\nwith lmdeploy.pipeline(\"internlm\u002Finternlm3-8b-instruct\") as pipe:\n    response = pipe([\"你好，请自我介绍一下\", \"上海是\"])\n    print(response)\n```\n\n> \\[!NOTE\\]\n> 默认情况下，LMDeploy 会从 HuggingFace 下载模型。如果您希望使用 ModelScope 上的模型，请先通过 `pip install modelscope` 安装 ModelScope，并设置环境变量：\n>\n> `export LMDEPLOY_USE_MODELSCOPE=True`\n>\n> 如果您希望使用 openMind Hub 上的模型，请先通过 `pip install openmind_hub` 安装 openMind Hub，并设置环境变量：\n>\n> `export LMDEPLOY_USE_OPENMIND_HUB=True`\n\n有关推理管道的更多信息，请参阅 [此处](docs\u002Fen\u002Fllm\u002Fpipeline.md)。\n\n# 教程\n\n请查看 [getting_started](docs\u002Fen\u002Fget_started\u002Fget_started.md) 部分，了解 LMDeploy 的基本用法。\n\n有关详细的用户指南和高级指南，请参阅我们的 [教程](https:\u002F\u002Flmdeploy.readthedocs.io\u002Fen\u002Flatest\u002F)：\n\n- 用户指南\n  - [LLM 推理流水线](docs\u002Fen\u002Fllm\u002Fpipeline.md) [![在 Colab 中打开](https:\u002F\u002Fcolab.research.google.com\u002Fassets\u002Fcolab-badge.svg)](https:\u002F\u002Fcolab.research.google.com\u002Fdrive\u002F1Dh-YlSwg78ZO3AlleO441NF_QP2shs95#scrollTo=YALmXnwCG1pQ)\n  - [VLM 推理流水线](docs\u002Fen\u002Fmulti_modal\u002Fvl_pipeline.md) [![在 Colab 中打开](https:\u002F\u002Fcolab.research.google.com\u002Fassets\u002Fcolab-badge.svg)](https:\u002F\u002Fcolab.research.google.com\u002Fdrive\u002F1nKLfnPeDA3p-FMNw2NhI-KOpk7-nlNjF?usp=sharing)\n  - [LLM 服务](docs\u002Fen\u002Fllm\u002Fapi_server.md)\n  - [VLM 服务](docs\u002Fen\u002Fmulti_modal\u002Fapi_server_vl.md)\n  - [量化](docs\u002Fen\u002Fquantization)\n- 高级指南\n  - [推理引擎 - TurboMind](docs\u002Fen\u002Finference\u002Fturbomind.md)\n  - [推理引擎 - PyTorch](docs\u002Fen\u002Finference\u002Fpytorch.md)\n  - [自定义聊天模板](docs\u002Fen\u002Fadvance\u002Fchat_template.md)\n  - 
[添加新模型](docs\u002Fen\u002Fadvance\u002Fpytorch_new_model.md)\n  - gemm 调优\n  - [长上下文推理](docs\u002Fen\u002Fadvance\u002Flong_context.md)\n  - [多模型推理服务](docs\u002Fen\u002Fllm\u002Fproxy_server.md)\n\n# 第三方项目\n\n- 使用 LMDeploy 在 NVIDIA Jetson 平台上离线部署 LLM：[LMDeploy-Jetson](https:\u002F\u002Fgithub.com\u002FBestAnHongjun\u002FLMDeploy-Jetson)\n\n- 使用 LMDeploy 和 BentoML 部署 LLM 的示例项目：[BentoLMDeploy](https:\u002F\u002Fgithub.com\u002Fbentoml\u002FBentoLMDeploy)\n\n# 贡献\n\n我们非常感谢对 LMDeploy 的所有贡献。请参阅 [CONTRIBUTING.md](.github\u002FCONTRIBUTING.md) 以获取贡献指南。\n\n# 致谢\n\n- [FasterTransformer](https:\u002F\u002Fgithub.com\u002FNVIDIA\u002FFasterTransformer)\n- [llm-awq](https:\u002F\u002Fgithub.com\u002Fmit-han-lab\u002Fllm-awq)\n- [vLLM](https:\u002F\u002Fgithub.com\u002Fvllm-project\u002Fvllm)\n- [DeepSpeed-MII](https:\u002F\u002Fgithub.com\u002Fmicrosoft\u002FDeepSpeed-MII)\n\n# 引用\n\n```bibtex\n@misc{2023lmdeploy,\n    title={LMDeploy：用于压缩、部署和提供 LLM 服务的工具包},\n    author={LMDeploy 贡献者},\n    howpublished = {\\url{https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy}},\n    year={2023}\n}\n```\n\n```bibtex\n@article{zhang2025efficient,\n  title={使用 TurboMind 进行高效的混合精度大型语言模型推理},\n  author={Zhang, Li and Jiang, Youhe and He, Guoliang and Chen, Xin and Lv, Han and Yao, Qian and Fu, Fangcheng and Chen, Kai},\n  journal={arXiv 预印本 arXiv:2508.15601},\n  year={2025}\n}\n```\n\n# 许可证\n\n本项目采用 [Apache 2.0 许可证](LICENSE) 发布。","# LMDeploy 快速上手指南\n\nLMDeploy 是由 MMRazor 和 MMDeploy 团队开发的大语言模型（LLM）压缩、部署和服务工具包。它以高效的推理性能（请求吞吐率最高可达 vLLM 的 1.8 倍）、优秀的量化支持（4-bit 推理性能是 FP16 的 2.4 倍）以及便捷的分布式服务部署而著称。\n\n## 1. 
环境准备\n\n在开始之前，请确保您的系统满足以下要求：\n\n*   **操作系统**: Linux (推荐 Ubuntu 18.04\u002F20.04\u002F22.04) 或 Windows (仅支持 tp=1)。\n*   **Python 版本**: 3.10 - 3.13。\n*   **GPU 驱动**: 已安装兼容的 NVIDIA 驱动。\n*   **CUDA 版本**: \n    *   LMDeploy v0.3.0 及以上默认基于 **CUDA 12** 编译。\n    *   **注意**: 从 v0.10.2 版本开始，不再支持 CUDA 11 系列。\n    *   如果您使用的是 GeForce RTX 50 系列显卡，请务必安装对应的预构建包。\n*   **硬件架构**: 支持 NVIDIA GPU (Volta 架构及以上，如 V100, A100, H800, RTX 30\u002F40\u002F50 系列等) 以及华为昇腾 (Ascend) 平台。\n\n## 2. 安装步骤\n\n推荐使用 `conda` 创建独立的虚拟环境进行安装，以避免依赖冲突。\n\n### 步骤一：创建并激活环境\n\n```bash\nconda create -n lmdeploy python=3.12 -y\nconda activate lmdeploy\n```\n\n### 步骤二：安装 LMDeploy\n\n使用 pip 直接安装最新稳定版：\n\n```bash\npip install lmdeploy\n```\n\n> **国内加速建议**：\n> 如果下载速度较慢，建议使用国内镜像源（如清华源）：\n> ```bash\n> pip install lmdeploy -i https:\u002F\u002Fpypi.tuna.tsinghua.edu.cn\u002Fsimple\n> ```\n\n> **特殊情况说明**：\n> 若遇到 PyPI 存储配额限制导致无法下载特定版本（>=0.12.2）的 wheel 包，请前往 [GitHub Releases](https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Freleases) 页面下载对应版本的 `.whl` 文件手动安装，或选择从源码安装。\n\n## 3. 
基本使用\n\nLMDeploy 提供了命令行工具和 Python API 两种主要使用方式。以下展示最快速的命令行推理示例。\n\n### 场景一：一键启动交互式对话\n\n假设您想要运行一个 Hugging Face 上的模型（例如 `internlm\u002Finternlm2_5-7b-chat`），只需一条命令即可自动下载模型并启动对话服务。\n\n```bash\nlmdeploy chat internlm\u002Finternlm2_5-7b-chat\n```\n\n*   该命令会自动拉取模型权重。\n*   启动后，您可以在终端直接与模型进行多轮对话。\n*   支持自动识别模型类型并加载相应的配置。\n\n### 场景二：启动本地 API 服务\n\n如果您希望通过 HTTP API 调用模型（兼容 OpenAI 接口格式），可以使用 `serve` 命令：\n\n```bash\nlmdeploy serve api_server internlm\u002Finternlm2_5-7b-chat --server-port 23333\n```\n\n启动成功后，您可以通过 curl 或 Python 请求接口：\n\n```bash\ncurl http:\u002F\u002Flocalhost:23333\u002Fv1\u002Fchat\u002Fcompletions \\\n  -H \"Content-Type: application\u002Fjson\" \\\n  -d '{\n    \"model\": \"internlm2_5-7b-chat\",\n    \"messages\": [{\"role\": \"user\", \"content\": \"你好，请介绍一下你自己\"}]\n  }'\n```\n\n### 场景三：Python 代码快速推理\n\n您也可以直接在 Python 脚本中调用 LMDeploy 进行推理：\n\n```python\nfrom lmdeploy import pipeline\n\n# 初始化推理管道\npipe = pipeline(\"internlm\u002Finternlm2_5-7b-chat\")\n\n# 执行推理\nresponse = pipe(\"你好，LMDeploy 是什么？\")\nprint(response)\n```\n\n---\n\n**提示**：LMDeploy 包含 **TurboMind** 和 **PyTorch** 两个推理引擎。默认情况下会自动选择最优引擎（通常为 TurboMind 以获得最佳性能）。如需指定引擎或进行高级量化配置，请参考官方详细文档。","某初创团队希望在单张消费级显卡上部署 70B 参数的 Llama3.1 模型，以构建低成本的智能客服系统。\n\n### 没有 lmdeploy 时\n- **显存爆满无法启动**：70B 模型以 FP16 精度加载需要超过 140GB 显存，远超单卡上限，导致服务根本无法运行。\n- **推理延迟过高**：即使强行使用多卡并行或 CPU 卸载，首字生成延迟也高达数秒，用户等待体验极差。\n- **吞吐量瓶颈明显**：在高并发场景下，请求排队严重，每秒处理令牌数（TPS）极低，无法满足实时对话需求。\n- **部署流程繁琐**：需要手动配置复杂的量化脚本和推理后端，调试环境依赖耗费大量开发时间。\n\n### 使用 lmdeploy 后\n- **单卡轻松运行大模型**：利用 lmdeploy 的 4bit 权重量化技术，将 70B 模型压缩至约 40GB 以内，成功在单张高端消费级显卡上启动。\n- **延迟降低至毫秒级**：借助 TurboMind 推理引擎和 CUDA Graph 优化，首字延迟从数秒缩短至几百毫秒，对话流畅自然。\n- **并发性能显著提升**：通过高效的显存管理和批处理策略，高并发下的吞吐量提升数倍，稳定支撑多人同时在线。\n- **一行命令完成部署**：提供标准化的命令行工具和 API 服务接口，无需编写底层代码即可快速搭建生产级服务。\n\nlmdeploy 
通过极致的量化压缩与推理加速，让大模型在有限硬件资源下实现了低成本、高性能的落地应用。","https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002FInternLM_lmdeploy_9b9b13fa.png","InternLM","https:\u002F\u002Foss.gittoolsai.com\u002Favatars\u002FInternLM_bc4eb14c.png","",null,"internlm@pjlab.org.cn","intern_lm","https:\u002F\u002Fchat.intern-ai.org.cn\u002F","https:\u002F\u002Fgithub.com\u002FInternLM",[81,85,89,93,97,101,105,109],{"name":82,"color":83,"percentage":84},"Python","#3572A5",67.1,{"name":86,"color":87,"percentage":88},"C++","#f34b7d",19.5,{"name":90,"color":91,"percentage":92},"Cuda","#3A4E3A",12.3,{"name":94,"color":95,"percentage":96},"CMake","#DA3434",0.7,{"name":98,"color":99,"percentage":100},"Shell","#89e051",0.2,{"name":102,"color":103,"percentage":104},"PowerShell","#012456",0.1,{"name":106,"color":107,"percentage":108},"Dockerfile","#384d54",0,{"name":110,"color":111,"percentage":108},"C","#555555",7779,685,"2026-04-16T11:17:15","Apache-2.0","Linux, Windows","NVIDIA GPU 必需 (TurboMind 引擎); 支持从 V100 到 RTX 50 系列; 显存需求取决于模型大小 (例如 4-bit 量化可降低需求); CUDA 12+ (v0.3.0+ 默认，不再支持 CUDA 11); 另支持华为 Ascend (PyTorch 引擎)","未说明 (取决于模型大小及是否使用量化)",{"notes":120,"python":121,"dependencies":122},"1. 推荐使用 conda 创建环境安装。2. v0.10.2 起不再支持 CUDA 11 系列，默认编译基于 CUDA 12。3. 若使用 GeForce RTX 50 系列显卡，需安装特定的预构建包。4. 提供 TurboMind (C++\u002FCUDA, 高性能) 和 PyTorch (纯 Python, 易开发) 两种推理引擎。5. 支持 4-bit 权重量化及 KV Cache 量化以节省显存并提升速度。6. 
PyPI 存储配额已满 (>=0.12.2 版本)，新版本的预构建 wheel 暂时无法上传，需从 GitHub Releases 下载或源码安装。","3.10 - 3.13",[64,123,124],"torch","transformers",[14,35],[127,128,129,130,131,132,133,134,135,136,137],"cuda-kernels","deepspeed","fastertransformer","llm-inference","turbomind","internlm","llama","llm","codellama","llama2","llama3","2026-03-27T02:49:30.150509","2026-04-17T08:23:43.182065",[141,146,151,156,160,165],{"id":142,"question_zh":143,"answer_zh":144,"source_url":145},37091,"如何在 LMDeploy 中部署和推理 DeepSeek-V3 模型？","LMDeploy v0.7.2 及以上版本已支持在 H800\u002FH100\u002FH20 平台上部署 DeepSeek-V3。\n\n**离线推理示例：**\n```python\nfrom lmdeploy import pipeline, PytorchEngineConfig\npipe = pipeline(\"deepseek-ai\u002FDeepSeek-V3-FP8\", backend_config=PytorchEngineConfig(tp=8))\nmessages_list = [[{\"role\": \"user\", \"content\": \"Who are you?\"}]]\noutput = pipe(messages_list)\nprint(output)\n```\n\n**在线服务启动命令：**\n```bash\nlmdeploy serve api_server deepseek-ai\u002FDeepSeek-V3-FP8 --tp 8 --backend pytorch\n```\n\n**客户端调用示例：**\n```python\nfrom openai import OpenAI\nclient = OpenAI(api_key='YOUR_API_KEY', base_url=\"http:\u002F\u002F0.0.0.0:23333\u002Fv1\")\nresponse = client.chat.completions.create(model=\"model_name\", messages=[{\"role\": \"user\", \"content\": \"Hello\"}])\n```","https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fissues\u002F2960",{"id":147,"question_zh":148,"answer_zh":149,"source_url":150},37092,"在 V100 显卡上运行遇到异常或错误时如何解决？","如果在 V100 显卡上遇到问题，可以尝试设置以下环境变量来调整异常处理级别：\n```shell\nexport TM_ANOMALY_HANDLER=level=2,inf=65504,nan=0\n```\n这有助于处理特定的浮点异常问题。","https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fissues\u002F2771",{"id":152,"question_zh":153,"answer_zh":154,"source_url":155},37093,"使用 InternVL 等多模态模型时，加载报错提示需要安装 flash_attn 怎么办？","对于推理过程，LMDeploy 的 Turbomind 后端并不实际使用 flash_attn（仅用于通过 Transformers 加载视觉模型时的配置检查）。\n解决方法是修改模型目录下的 `config.json` 文件，将 `\"attn_implementation\": \"flash_attention_2\"` 改为 `\"attn_implementation\": \"eager\"`，即可移除对 flash_attn 
的依赖并正常加载模型。","https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fissues\u002F1495",{"id":157,"question_zh":158,"answer_zh":159,"source_url":155},37094,"如何在 LMDeploy API 服务中进行多轮对话或少样本提示（Few-shot Prompting）？","可以通过构建包含历史对话的消息列表来实现少样本提示或多轮对话。在使用 `openai` 客户端或 `lmdeploy.client` 时，`messages` 参数应包含交替的 user 和 assistant 角色内容。\n\n示例结构：\n```python\nmessages = [\n    {\n      \"role\": \"user\",\n      \"content\": [{\"type\": \"text\", \"text\": \"问题文本\"}, {\"type\": \"image_url\", \"image_url\": {\"url\": \"图片链接\"}}]\n    },\n    {\n      \"role\": \"assistant\",\n      \"content\": [{\"type\": \"text\", \"text\": \"之前的回答\"}]\n    },\n    {\n      \"role\": \"user\",\n      \"content\": [{\"type\": \"text\", \"text\": \"当前问题\"}]\n    }\n]\n```\n确保按顺序传递完整的上下文列表给 API。",{"id":161,"question_zh":162,"answer_zh":163,"source_url":164},37095,"LMDeploy 是否支持华为昇腾（Ascend）系列芯片？具体支持哪些型号？","目前 LMDeploy 主要适配华为 Atlas A2 系列（对应芯片型号通常为 910B 系列，如 910B1, 910B2, 910B2C 等）。可以通过 `npu-smi info` 查看芯片号确认。\n注意：910A 系列（如 910ProB 属于 910A 系列升级版）目前暂不支持，因为算子特性主要针对 Atlas A2 优化。未来可能会考虑支持 Atlas 300I Duo 等其他型号。","https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fissues\u002F2585",{"id":166,"question_zh":167,"answer_zh":168,"source_url":169},37096,"遇到 'session finished, reason error' 或其他不明原因的运行时错误该如何排查？","如果遇到此类会话错误且无法确定原因，最有效的解决方法是创建一个全新的干净 Python 环境重新安装 LMDeploy 及其依赖。\n很多此类错误是由旧环境中依赖包版本冲突或缓存污染导致的。重建环境通常能解决大部分不明原因的报错。如果问题依旧，请提供 `pip list` 输出和当前的 git commit ID 以便进一步复现。","https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fissues\u002F3280",[171,176,181,186,191,196,201,206,211,216,221,226,231,236,241,246,251,256,261,266],{"id":172,"version":173,"summary_zh":174,"released_at":175},297550,"v0.12.3","\u003C!-- 使用 .github\u002Frelease.yml 中的配置在 main 分支生成的发布说明 -->\n\n## 变更内容\n### 🚀 功能\n* 支持视频输入，由 @CUHKSZzxy 在 https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F4360 中实现\n* 新特性：在 TurboMind 中全面实现压缩张量 gs32 的支持，由 @lapy 在 
https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F4429 中实现\n* 草案：更新模型参数，由 @CUHKSZzxy 在 https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F4452 中实现\n### 💥 优化\n* 支持 Qwen3.5 在 Volta 上运行，由 @grimoire 在 https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F4405 中实现\n* 优化 Qwen3.5，由 @lzhangzz 在 https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F4434 中实现\n* 内置 mrope，由 @grimoire 在 https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F4393 中实现\n* 删除 Ray 远程函数的返回值，由 @grimoire 在 https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F4422 中实现\n* 支持 recurrent-gdr 和 causal-conv1d-update 中的 cache_seqlen，由 @grimoire 在 https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F4417 中实现\n* 安全的 Ray API，由 @CUHKSZzxy 在 https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F4455 中实现\n* 为 Qwen3-vl-moe 模型添加 R3，由 @lvhan028 在 https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F4457 中实现\n* 对齐 lmdeploy 中的 rope 初始化，由 @RangiLyu 在 https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F4466 中实现\n* 将 tilelang 设为仅限 Linux 的依赖项（与 triton 类似），由 @Copilot 在 https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F4469 中实现\n* 在缓存初始化之前准备分块索引，由 @grimoire 在 https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F4458 中实现\n* 统一 rope 的设备，由 @CUHKSZzxy 在 https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F4467 中实现\n* 自定义处理器参数，由 @CUHKSZzxy 在 https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F4472 中实现\n* 当 proxy_url 未设置时，分配顺序的 api_server 端口，由 @lvhan028 在 https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F4416 中实现\n* 禁用 fla intracard_backend，由 @grimoire 在 https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F4482 中实现\n* 【修复】【新特性】修复使用外部 pg bundle 时的工作器排序问题，并支持 update_params 的持久化缓冲区，由 @CyCle1024 在 
https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F4397 中实现\n* 简化 InternS1 Pro 的代码，由 @CUHKSZzxy 在 https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F4480 中实现\n### 🐞 错误修复\n* 修复 transformers>5 版本下的 test_hf_overrides 测试，由 @grimoire 在 https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F4418 中实现\n* 修复 Qwen3.5 的 PyTorch 多模态推理，由 @CUHKSZzxy 在 https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F4430 中实现\n* 修复 `generate` 端点，由 @CUHKSZzxy 在 https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F4432 中实现\n* 使 Intern-S1-Pro 兼容 Transformers 5.0 及以上版本，由 @lvhan028 在 https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F4435 中实现\n* 修复多轮对话，由 @CUHKSZzxy 在 https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F4438 中实现\n* 修复(async_engine)：通过 shield 和 SafeRunException，使 safe_run 的取消清理更加可靠，由 @lvhan028 在 https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F4439 中实现\n* 释放状态缓存，由 @CUHKSZzxy 在 https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F4462 中实现\n* 为 Qwen3Coder 工具调用（Qwen3.5）拆分工具调用参数 JSON，由 @lapy 在 https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F443 中实现","2026-04-08T03:37:26",{"id":177,"version":178,"summary_zh":179,"released_at":180},297551,"v0.12.2","\u003C!-- 使用 .github\u002Frelease.yml 中的配置在 main 分支生成的发布说明 -->\n\n## 变更内容\n### 🚀 功能特性\n* 支持 glm5，由 @grimoire 在 https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F4355 中实现\n* Qwen\u002FInternlm\u002FLlama 密集\u002FMoE 模型 fp8 量化在线推理，由 @43758726 在 https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F4324 中实现\n* Qwen3.5，由 @grimoire 在 https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F4351 中实现\n* GLM-4.7-Flash Turbomind 支持，由 @lapy 在 https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F4362 中实现\n* 为 qwen3.5 支持路由器重放及忽略量化层功能，由 @RunningLeon 在 
https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F4394 中实现\n* 【功能】为 Qwen3.5 模型（密集 + MoE）添加 Turbomind 支持，由 @lapy 在 https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F4389 中实现\n* 支持重复 n-gram logits 处理器，由 @grimoire 在 https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F4288 中实现\n### 💥 优化改进\n* Turbomind 端兼容 transformers 5.0，由 @lvhan028 在 https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F4304 中实现\n* 为 qwen 和 internlm 模型支持 fp32 头部，由 @RunningLeon 在 https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F4160 中实现\n* 减少 MLA kv 缓存内存占用，由 @lzhangzz 在 https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F4373 中实现\n* 添加 recurrent_gated_delta_rule 内核，由 @grimoire 在 https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F4376 中实现\n* 【昇腾】适配 s1-pro dp*tp+ep，由 @yao-fengchen 在 https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F4380 中实现\n* 支持 glm4.7 与 mtp 配合使用，由 @RunningLeon 在 https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F4346 中实现\n* 加快 MLA 内核运行速度，由 @lzhangzz 在 https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F4391 中实现\n* 注意力内核自注册及解耦调度，由 @lzhangzz 在 https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F4396 中实现\n### 🐞 Bug 修复\n* 修复：将 RepetitionPenaltyKernel 中的调试日志级别由 ERROR 调整为 DEBUG，由 @murray-macdonald 在 https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F4363 中实现\n* 修复 internvl awq 模型的量化配置解析问题，由 @RunningLeon 在 https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F4369 中实现\n* 修复 XGrammar 位掩码初始化问题，并在 generate 方法中为 gen_config 添加空值检查，由 @windreamer 在 https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F4349 中实现\n* 修复会话关闭逻辑，由 @lvhan028 在 https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F4370 中实现\n* 修复授权问题，由 @lvhan028 在 https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F4338 中实现\n* 
修复一些小问题，并为 Pipeline 提供测试用例，由 @windreamer 在 https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F4365 中实现\n* 修复 dllm mask 在 set_step 时的问题，由 @grimoire 在 https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F4278 中实现\n* 修复 transformers>=5 版本下的模型问题，由 @grimoire 在 https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F4381 中实现\n* 修复请求中断时的异常问题，由 @lvhan028 在 https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F4403 中实现\n* 修复 v100 显卡上 qwen3.5-0.8b 推理崩溃问题，由 @lvhan028 在 https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F4420 中实现\n### 🌐 其他\n* ci(lint)：跳过 Python 维基页面中不稳定且容易失败的死链测试，由 @windreamer 在 https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F4357 中实现\n* 修复 fa3","2026-03-18T03:13:55",{"id":182,"version":183,"summary_zh":184,"released_at":185},297552,"v0.12.1","\u003C!-- 使用 .github\u002Frelease.yml 中的配置在 main 分支生成的发布说明 -->\n\n## 变更内容\n### 🚀 功能\n* 支持 glm-4.7-flash，由 @RunningLeon 在 https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F4320 中实现\n* [ascend]支持 ep，由 @yao-fengchen 在 https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F3696 中实现\n### 💥 优化\n* 修复 transformers v5 的旋转位置编码，由 @grimoire 在 https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F4303 中实现\n* 改进指标日志记录，由 @CUHKSZzxy 在 https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F4297 中实现\n* 支持在量化配置中忽略层，适用于 qwen3 模型，由 @RunningLeon 在 https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F4293 中实现\n* 添加自定义 noaux 内核，由 @grimoire 在 https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F4345 中实现\n* 修复 qwen3vl 在 transformers5 下的问题，由 @grimoire 在 https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F4348 中实现\n### 🐞 Bug 修复\n* 修复工具调用解析器的流式游标问题，由 @lvhan028 在 https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F4333 中实现\n* 修复 TP 模式下引导解码的数据竞争问题，由 @lzhangzz 在 
https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F4341 中实现\n* 进行 fa3 检查，由 @grimoire 在 https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F4340 中实现\n* 修复时间序列预处理问题，由 @CUHKSZzxy 在 https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F4339 中实现\n* 修复 Attention 算子中负 KV 序列长度错误，由 @jinminxi104 在 https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F4316 中实现\n* 修复 qwen3-vl-moe 的长上下文问题，由 @grimoire 在 https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F4342 中实现\n* 修复：将量化归一化移至 CPU，而非使用过时的 q_linear 引用进行 smooth_quant，由 @Mr-Neutr0n 在 https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F4352 中实现\n* 更新 noaux-kernel 检查，由 @grimoire 在 https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F4358 中实现\n### 🌐 其他\n* 将 INPUT_CUDA_VERSION 更改为 12.6.2，由 @lvhan028 在 https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F4322 中实现\n* 在 llm_compressor.md 中添加 Qwen3-8B 的精度评估，由 @43758726 在 https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F4319 中实现\n* [ci]重构 ete 测试用例，由 @zhulinJulia24 在 https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F4274 中实现\n* 为 interns1_pro 设置别名 interns1_1，由 @lvhan028 在 https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F4334 中实现\n* 构建（docker）：当使用 cu13 时跳过 FA2，由 @windreamer 在 https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F4356 中实现\n* 将版本号提升至 v0.12.1，由 @lvhan028 在 https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F4350 中实现\n\n## 新贡献者\n* @Mr-Neutr0n 在 https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F4352 中完成了首次贡献\n\n**完整变更日志**: https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fcompare\u002Fv0.12.0...v0.12.1","2026-02-13T09:02:12",{"id":187,"version":188,"summary_zh":189,"released_at":190},297553,"v0.12.0","\u003C!-- 使用 .github\u002Frelease.yml 中的配置在 main 分支生成的发布说明 -->\n\n## 变更内容\n### 🚀 功能\n* 添加 
Gloo 通信到 turbomind，由 @irexyc 在 https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F3362 中实现\n* [功能] 在 TurboMind 中支持 llm-compressor AWQ 模型，由 @43758726 在 https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F4290 中实现\n* 为 gpt oss 提供路由器重放功能，由 @RunningLeon 在 https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F4298 中实现\n* 在 TurboMind 中支持 llm-compressor 对称量化模型推理，由 @43758726 在 https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F4305 中实现\n* 支持 Intern-S1-Pro，由 @CUHKSZzxy 在 https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F4318 中实现\n### 💥 改进\n* 为 CUDA IPC 通信器配置最大 CTAs 和 NVLS 使用量，由 @lzhangzz 在 https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F4227 中实现\n* 改进所有会话的中止机制，由 @lvhan028 在 https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F4215 中实现\n* 实现 Moe Reduce 内核，由 @grimoire 在 https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F4228 中实现\n* 重构注意力机制，由 @grimoire 在 https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F4238 中实现\n* 优化异常抛出和错误处理流程，由 @grimoire 在 https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F4236 中实现\n* [AsyncEngine 重构 1\u002FN] 定义 MultimodalProcessor 以处理多模态数据，由 @lvhan028 在 https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F4250 中实现\n* [AsyncEngine 重构 2\u002FN] 移除聊天模板中的弃用内容，由 @lvhan028 在 https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F4252 中实现\n* 可配置的 uvicorn 超时时间，由 @CUHKSZzxy 在 https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F4255 中实现\n* 适配 dlsime v0.0.2，由 @JimyMa 在 https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F4242 中实现\n* [修复] 修复量化校准数据集问题，由 @43758726 在 https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F4256 中实现\n* lmdeploy 支持并行嵌入，由 @Tsundoku958 在 https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F4192 中实现\n* 重构 turbomind 引擎，由 @lzhangzz 
在 https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F4223 中实现\n* 重构 Engine 和 ModelAgent 的交互，由 @grimoire 在 https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F4265 中实现\n* 支持睡眠和销毁 deepep 缓冲区，由 @RunningLeon 在 https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F4246 中实现\n* 添加 Yarn 截断功能，由 @grimoire 在 https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F4301 中实现\n* [AsyncEngine 重构 3\u002FN] 引入 Session 和 SessionManager，由 @lvhan028 在 https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F4253 中实现\n* 添加关于 NCCL 2.27 内存泄漏的警告，由 @lzhangzz 在 https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F4313 中实现\n### 🐞 错误修复\n* 修复 fope cos\u002Fsin 系数的设备类型问题，由 @CUHKSZzxy 在 https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F4240 中实现\n* 修复 include_stop_str_in_output 与 output_logits 异常相关的问题，由 @windreamer 在 https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F4244 中实现\n* 修复 logit softcapping 为 None 的问题，由 @grimoire 在 https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F4247 中实现\n* 修复前缀缓存的性能退化问题，由 @lzhangzz 在 https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F4270 中实现\n* 将 FP8 模型的 float16 权重转换为 bfloat16，由 @lv","2026-02-04T06:28:25",{"id":192,"version":193,"summary_zh":194,"released_at":195},297554,"v0.11.1","\u003C!-- 使用 main 分支 .github\u002Frelease.yml 中的配置生成的发布说明 -->\n\n## 变更内容\n### 🚀 功能\n* [ascend] 支持 dptp，由 @tangzhiyi11 在 https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F4218 中实现\n* 支持 Deepseek v32，由 @grimoire 在 https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F4026 中实现\n### 💥 优化\n* 由 @CUHKSZzxy 在 https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F4178 中改进指标\n* 由 @grimoire 在 https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F4157 中为占位输入预留块\n* 为 Qwen3-VL 添加视觉 ID，由 @CUHKSZzxy 在 
https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F4183 中实现\n* 【增强】：在请求被取消时返回路由专家，由 @RunningLeon 在 https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F4197 中实现\n* 为 Qwen3-VL 添加多模态处理器参数，由 @CUHKSZzxy 在 https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F4196 中实现\n* 支持 v1\u002Fchat\u002Fcompletions 中的 chat_template_kwargs，由 @lvhan028 在 https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F4201 中实现\n* 由 @grimoire 在 https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F4163 中重构调度器和 engine.py\n* 由 @grimoire 在 https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F4204 中更新 dp 超时时间\n* 由 @CUHKSZzxy 在 https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F4207 中改进 Qwen3-VL\n### 🐞 Bug 修复\n* 【修复】：按查询长度拆分路由专家，由 @RunningLeon 在 https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F4180 中实现\n* 【Maca】修复 Ray 和内存同步问题，由 @wanfengcxz 在 https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F4164 中实现\n* 在预填充阶段构建块 Trie 并添加命中率，由 @RunningLeon 在 https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F4184 中实现\n* 由 @CUHKSZzxy 在 https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F4191 中修复 fope 问题\n* 由 @lvhan028 在 https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F4188 中修复多处理器导致的 Hugging Face 模块读写冲突\n* 由 @windreamer 在 https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F4185 中进行一些小修复\n* 由 @lvhan028 在 https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F4202 中修复调用 torch.load() 时的不安全反序列化问题\n* 由 @CUHKSZzxy 在 https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F4200 中修复处理器参数问题\n* 由 @lvhan028 在 https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F4217 中移除 get_model_config，以避免 RPC 调用中出现 pickle hf_config 错误\n* 由 @grimoire 在 https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F4212 中修复量化 
scale-fmt 问题\n* 由 @RunningLeon 在 https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F4222 中修复 mix return_logprobs 的请求问题\n* 由 @grimoire 在 https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F4229 中修复 fillkv quant8 问题\n* 由 @grimoire 在 https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F4230 中修复 scale-fmt 问题\n### 📚 文档\n* 【文档】：为 VLMEvalKit 添加指南，由 @CUHKSZzxy 在 https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F4156 中实现\n### 🌐 其他\n* 由 @CUHKSZzxy 在 https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F4166 中添加 FA3\n* 由 @littlegy 在 https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F4161 中添加分布式测试用例\n* 由 @littlegy 在 https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F4181 中添加生成测试\n* 【ci】由 @zhulinJulia24 在 https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F4230 中添加 MLLM 评估","2025-12-24T13:27:06",{"id":197,"version":198,"summary_zh":199,"released_at":200},297555,"v0.11.0","\u003C!-- 使用 .github\u002Frelease.yml 中的配置在 main 分支生成的发布说明 -->\n\n## 变更内容\n### 🚀 功能\n* 由 @lvhan028 在 https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F4092 中添加了 `\u002Fabort_request` 端点\n* Qwen3 next，由 @grimoire 在 https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F4039 中实现\n* 支持 Qwen3-VL，由 @CUHKSZzxy 在 https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F4093 中实现\n* 支持使用展平的 bucket 张量同步权重，由 @RunningLeon 在 https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F4109 中实现\n* 支持用于 MoE 模型的分组路由器，由 @RunningLeon 在 https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F4120 中实现\n* 【功能】：返回路由后的专家以供复用，由 @RunningLeon 在 https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F4090 中实现\n* 支持上下文并行，由 @irexyc 在 https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F3951 中实现\n* fope，由 @grimoire 在 
https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F4043 中实现\n* 【功能】：支持推测解码，由 @RunningLeon 在 https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F3945 中实现\n* MoE 的 bf16 ep，由 @grimoire 在 https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F4144 中实现\n### 💥 改进\n* 增大垃圾回收阈值，由 @grimoire 在 https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F4076 中实现\n* 从 `EngineOutput` 中移除 `num_tokens` 字段，由 @lvhan028 在 https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F4088 中实现\n* 还原对 `vocab_size` 的掩码处理，由 @lvhan028 在 https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F4089 中实现\n* 功能：在 `response_format` 中添加对 `json_object` 的支持，由 @windreamer 在 https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F4080 中实现\n* 支持向 `\u002Fgenerate` 端点传入图像数据，由 @irexyc 在 https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F4086 中实现\n* 【修复】：在 RL 训练中将所有 `RayEngineWorker` actor 创建在节点 0 上，由 @CyCle1024 在 https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F4107 中实现\n* 优化 TurboMind 后端的 `sleep level=1`，由 @irexyc 在 https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F4074 中实现\n* 【功能】：启用 Ascend 的 `update_params`，由 @CyCle1024 在 https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F4111 中实现\n* 增强请求检查器，由 @lvhan028 在 https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F4104 中实现\n* 重构 DP 和 TP，由 @grimoire 在 https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F4004 中实现\n* 修复内核数值误差，由 @grimoire 在 https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F4133 中实现\n* 释放 Ray 的 `put` 操作，由 @grimoire 在 https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F4137 中实现\n* 在调整大小时减少专家缓存，由 @RunningLeon 在 https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F4138 中实现\n* 支持消息中文本与图像的交错，由 @lvhan028 在 
https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F4141 中实现\n* 优化 RMS 归一化，由 @grimoire 在 https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F4153 中实现\n* 修复淘汰策略，由 @Tsundoku958 在 https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F4127 中实现\n### 🐞 错误修复\n* 修复类型提示，由 @grimoire 在 https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F4078 中实现\n* 修复输入分割问题，由 @RunningLeon 在 https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F4083 中实现\n* 补充缺失的 `update_model_meta` 函数调用，由 @jinminxi104 在 https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F4099 中实现\n* 修复加载 VL 模型时 PyTorch 后端的 `update_params` 问题。","2025-12-04T06:20:38",{"id":202,"version":203,"summary_zh":204,"released_at":205},297556,"v0.10.2","\u003C!-- 发布说明由 .github\u002Frelease.yml 配置在 main 分支上生成 -->\n\n## 变更内容\n### 🚀 新特性\n* 添加 \u002Fgenerate API，由 @irexyc 在 https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F4019 中实现\n* 为 TurboMind 引入基于 xgrammar 的引导解码功能，由 @windreamer 在 https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F3965 中实现\n* 为 PyTorch Engine 重新实现基于 xgrammar 的引导解码功能，由 @windreamer 在 https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F4028 中实现\n### 💥 性能优化\n* [ascend] 支持 aclgraph，由 @yao-fengchen 在 https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F4063 中实现\n* 利用推理引擎和异步引擎之间的增量输出提升性能，由 @lvhan028 在 https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F4054 中实现\n* 优化多项式采样算法，由 @grimoire 在 https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F4056 中实现\n### 🐞 Bug 修复\n* zmqrpc 仅支持 localhost，由 @grimoire 在 https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F4017 中修复\n* 修复 dp+tp 热身阶段的 bug，由 @Tsundoku958 在 https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F3991 中修复\n* 修复 dllm 长上下文问题，由 @grimoire 在 
https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F4012 中修复\n* 修复 GPT-OSS 流式工具调用解析问题，由 @QwertyJack 在 https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F4023 中修复\n* 将资源释放逻辑从异步引擎移至推理引擎，由 @lvhan028 在 https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F4041 中实现\n* 修复引导解码时的分词器解析 bug，由 @windreamer 在 https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F4044 中修复\n* 修复工具调用及多模态输入中消息内容字段的处理问题，由 @QwertyJack 在 https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F4029 中修复\n* 为 kimi-k2 构建器修复问题，由 @CUHKSZzxy 在 https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F4069 中实现\n* 跳过不必要的采样并修正随机偏移量，由 @grimoire 在 https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F4068 中修复\n* 修复当 ignore_special_tokens 为 False 时 stop_token_string 重复的问题，由 @irexyc 在 https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F4077 中修复\n### 🌐 其他\n* 停止对 CUDA 11.8 构建的支持，将 CI\u002FCD 升级至 CUDA 12.6\u002F12.8，由 @windreamer 在 https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F4013 中实现\n* 移除 profile_generation.py 及其测试用例，由 @lvhan028 在 https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F4027 中实现\n* [ci] 将评估流程重构为 API 评估，并新增 h800 评估工作流，由 @zhulinJulia24 在 https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F4008 中实现\n* 为 NVIDIA Jetson 添加 Docker 镜像，由 @windreamer 在 https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F3834 中实现\n* [ci] 将 API 评估测试重构为 LLM 判官评估，由 @littlegy 在 https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F4046 中实现\n* 检查彩色日志记录功能，由 @grimoire 在 https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F4060 中实现\n* 使用 HLE 和 LCB 数据集更新 API 测试，由 @littlegy 在 https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F4061 中实现\n* 更新 ascend 相关依赖要求，由 @yao-fengchen 在 https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F4066 中实现\n* 
将版本号升级至 v0.10.2，由 @lvhan028 在 https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F4062 中实现\n\n\n**完整变更日志**: https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fcompare","2025-10-28T11:32:42",{"id":207,"version":208,"summary_zh":209,"released_at":210},297557,"v0.10.1","\u003C!-- 使用 .github\u002Frelease.yml 中的配置在 main 分支生成的发布说明 -->\n\n## 变更内容\n### 🚀 功能\n* 添加 ROCm 支持：由 @Vivicai1005 在 https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F3925 中提供了 AMD GPU 的安装指南和 FlashAttention 兼容性。\n* 支持 gpt-oss 基本输出，由 @irexyc 在 https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F3956 中实现。\n* 添加 FP8*(B)F16 GEMM，由 @lzhangzz 在 https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F3960 中完成。\n* 支持 GLM-4.5，由 @CUHKSZzxy 在 https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F3863 中实现。\n* 【重构】：在构建引擎时移除分词器，由 @RunningLeon 在 https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F3978 中完成。\n* 支持 InternVL3.5-Flash，由 @CUHKSZzxy 在 https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F3952 中实现。\n* 支持 gpt-oss 的函数调用和推理功能，用于 \u002Fv1\u002Fchat\u002Fcompletions 接口，由 @irexyc 在 https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F3962 中完成。\n* 支持在输出中返回 stop_str，由 @lvhan028 在 https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F3984 中实现。\n* 支持 SDAR，由 @grimoire 在 https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F3922 中实现。\n### 💥 优化\n* 指定在 GeForce RTX 50 系列上的安装方法，由 @lvhan028 在 https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F3947 中完成。\n* 将 PR-3708 的更改合入，以返回 token_id，由 @lvhan028 在 https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F3976 中完成。\n* 优化 AsyncEngine 的生成方法，由 @shell-nlp 在 https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F3982 中完成。\n* 当 TP 引擎空闲时使用阻塞同步，由 @lzhangzz 在 https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F3974 
中完成。\n* 在依赖项中添加 openai_harmony，由 @irexyc 在 https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F4006 中完成。\n### 🐞 Bug 修复\n* 修复与 triton3.4.0 相关的 bug，由 @grimoire 在 https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F3946 中完成。\n* 修复 longrope 问题，由 @grimoire 在 https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F3968 中完成。\n* 修复 xtuner 中 tm rl 的使用问题，由 @irexyc 在 https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F3912 中完成。\n* 在服务 VLM 模型时禁用前缀缓存，由 @lvhan028 在 https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F3990 中完成。\n* 移除 NCCL_LAUNCH_MODE，由 @irexyc 在 https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F3994 中完成。\n* 如果请求 include_stop_str_in_output，则返回最后一个 token 的 logprobs、logits 和 last_hidden_states，由 @lvhan028 在 https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F4000 中完成。\n* 【修复】在使用 PyTorch 引擎时，chat cli 中的 device 参数问题，由 @CyCle1024 在 https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F3999 中完成。\n* 修复 internvl 相关问题，由 @CUHKSZzxy 在 https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F3997 中完成。\n* 修复 SequenceManager::Erase 中未返回的迭代器问题，由 @irexyc 在 https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F4001 中完成。\n* 修复没有预热的 cuGraph 问题，由 @grimoire 在 https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F4005 中完成。\n* 修复 internvl flash 长上下文准确率问题，由 @CUHKSZzxy 在 https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F4003 中完成。\n### 🌐 其他\n* 【ci】更新每日测试用例，由 @zhulinJulia24 在 https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F3944 中完成。\n* 【maca】将 kv 布局从 pagedattn 更改为 flashattn，由 @yuchiwang 在 https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F39 中完成。","2025-09-26T02:45:30",{"id":212,"version":213,"summary_zh":214,"released_at":215},297558,"v0.10.0","\u003C!-- 使用 .github\u002Frelease.yml 中的配置在 main 分支生成的发布说明 -->\n\n## 变更内容\n### 
🚀 功能\n* 支持为 TurboMind 卸载权重和 KV 缓存，由 @irexyc 在 https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F3798 中实现\n* 添加 PPU 后端支持，由 @guozixu2001 在 https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F3807 中实现\n* 添加 TurboMind 指标，由 @lvhan028 在 https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F3811 中实现\n* PytorchEngine 支持 gpt-oss bf16，由 @grimoire 在 https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F3820 中实现\n* 为 PT 引擎支持睡眠\u002F唤醒功能，由 @irexyc 在 https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F3687 中实现\n* [ascend] 在 A3 上运行 intern-s1，由 @yao-fengchen 在 https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F3831 中实现\n* 初步支持 TurboMind 的 gpt-oss，由 @lzhangzz 在 https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F3839 中实现\n* 支持 GLM-4-0414 和 GLM-4.1V，由 @CUHKSZzxy 在 https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F3846 中实现\n* 支持 internvl3.5，由 @lvhan028 在 https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F3886 中实现\n* 更新 TurboMind 通信库，由 @lzhangzz 在 https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F3736 中实现\n* TurboMind GEMM 库支持 MXFP4，由 @lzhangzz 在 https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F3927 中实现\n* 为 sm70 和 sm75 分配 MXFP4 权重转换任务，由 @lzhangzz 在 https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F3937 中实现\n### 💥 改进\n* 修复：CLI 服务中的 TurboMind 后端配置，由 @PeymanRM 在 https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F3784 中修复\n* 移除已弃用的代码，由 @lvhan028 在 https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F3759 中完成\n* 重构 FP8 MoE GEMM，由 @lzhangzz 在 https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F3795 中完成\n* 修复构建 RoPE 参数的问题，由 @grimoire 在 https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F3760 中修复\n* 针对 head_dim=128 优化 RMSNorm，由 @grimoire 在 
https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F3814 中完成\n* 简化 GEMM 接口，由 @lzhangzz 在 https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F3818 中实现\n* 优化 create_model_inputs 和 schedule_decoding，由 @grimoire 在 https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F3766 中完成\n* 添加远程日志；优化前向锁，由 @grimoire 在 https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F3737 中完成\n* 支持 DeepGemm 新 API，由 @grimoire 在 https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F3827 中实现\n* 移除使用 Gradio 提供的服务，由 @lvhan028 在 https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F3829 中完成\n* 从 api_server 中弃用交互模式，由 @lvhan028 在 https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F3830 中完成\n* 构建（Docker）：尝试优化 Docker，由 @windreamer 在 https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F3779 中完成\n* 制作通用 chat.py 替换各引擎的版本，由 @lvhan028 在 https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F3836 中完成\n* Ray MP 引擎后端，由 @grimoire 在 https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F3790 中实现\n* 【特性】支持使用带有 bundle 的外部 Ray PG，由 @CyCle1024 在 https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F3850 中实现\n* 移除 PT 引擎中未使用的代码，由 @grimoire 在 https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F3858 中完成\n* 支持 logprobs，由 @grimoire 在 https:\u002F\u002F","2025-09-09T05:05:09",{"id":217,"version":218,"summary_zh":219,"released_at":220},297559,"v0.9.2.post1","\u003C!-- 发布说明由 .github\u002Frelease.yml 中的配置生成，针对 dev-0.9.2post1 分支 -->\r\n\r\n## 变更内容\r\n\r\n* 由 @lvhan028 在 https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F3848 中修复了 turbomind 引擎的 interns1 LLM 映射问题\r\n* 由 @lvhan028 在 https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F3849 中将版本号升级至 v0.9.2.post1\r\n\r\n\r\n**完整变更日志**: 
https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fcompare\u002Fv0.9.2...v0.9.2.post1","2025-08-19T09:44:41",{"id":222,"version":223,"summary_zh":224,"released_at":225},297560,"v0.9.2","\u003C!-- Release notes generated using configuration in .github\u002Frelease.yml at main -->\r\n\r\n## What's Changed\r\n### 🚀 Features\r\n* [Feature] metrics support by @CUHKSZzxy in https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F3534\r\n* Relax FP8 TP requirement by @lzhangzz in https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F3697\r\n* FA3 by @zhaochaoxing in https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F3623\r\n* support qwen2\u002F2.5-vl in turbomind by @irexyc in https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F3744\r\n* feat: add pytorch_engine_qwen2_5vl_sm120 by @kolmogorov-quyet in https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F3750\r\n* Internvl pt by @RunningLeon in https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F3765\r\n* Improve internvl for turbomind engine by @lvhan028 in https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F3769\r\n### 💥 Improvements\r\n* Refactor linear by @grimoire in https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F3653\r\n* remove python3.8 support and add python3.13 support by @lvhan028 in https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F3638\r\n* refactor vl inputs split by @grimoire in https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F3699\r\n* [Fix]: Replace mutable default with default_factory for scheduler_stats by @ConvolutedDog in https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F3730\r\n* Fix the logic of calculating max_new_tokens and determining finish_reason by @lvhan028 in https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F3727\r\n* Override HF 
config.json via CLI by @CUHKSZzxy in https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F3722\r\n* feat(build): Integrate and build turbomind backend directly in setup.py by @windreamer in https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F3726\r\n* Generate the benchmark output filename with given arguments by @lvhan028 in https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F3740\r\n* Make loading llm without vlm as an option by @grimoire in https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F3745\r\n### 🐞 Bug fixes\r\n* add ray to ascend requirements by @sigma-plus in https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F3713\r\n* fix accessing undefined attribute `seq_aux` of deepseek-r1-0528 by @lvhan028 in https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F3728\r\n* [Fix]: Avoid quantize qk norm for qwen3 dense models by @taishan1994 in https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F3733\r\n* fix py313 env creation failed when building lmdeploy-builder image by @lvhan028 in https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F3739\r\n* [Fix]: kernel meta retrieval for SM7X does not work by @xiaoajie738 in https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F3746\r\n* limit max_session_len by @grimoire in https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F3751\r\n* fix internvl norm by @grimoire in https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F3756\r\n* support qwen3 moe yarn and vlm hf_overrides by @grimoire in https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F3757\r\n* [PD Disaggregation] fix double unshelf by @JimyMa in https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F3762\r\n* fix(build): fix version parse regex to support post-release versions by @windreamer in 
https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F3764\r\n* adapt transformers>=v4.52.0 to loading qwen2.5-vl with turbomind by @irexyc in https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F3771\r\n* fix chat template with tool call by @RunningLeon in https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F3773\r\n* fix vl nothink mode by @RunningLeon in https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F3776\r\n### 📚 Documentations\r\n* update reward model docs by @CUHKSZzxy in https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F3721\r\n### 🌐 Other\r\n* update twomicrobatch by @SHshenhao in https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F3651\r\n* [CI]: Upgrade to py310 for ut by @RunningLeon in https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F3718\r\n* [ci] update dailytest environment and scripts by @zhulinJulia24 in https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F3716\r\n* Preliminary Blackwell (sm_120a, RTX 50 series) support by @lzhangzz in https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F3701\r\n* [ci] add fp8 evaluation workflow by @zhulinJulia24 in https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F3729\r\n* Add VRAM bandwidth utilization stat to attention test by @lzhangzz in https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F3731\r\n* doc: fix dead links to MindX DL to recover CI. 
by @windreamer in https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F3741\r\n* fix free cache in MPEngine branch by @JimyMa in https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F3670\r\n* fix: make RelWithDebInfo default cmake build type by @windreamer in https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F3774\r\n* bump version to v0.9.2 by @lvhan028 in https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F3770\r\n\r\n## New Contributors\r\n* @sigma-plus made their first contribution in https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F3713\r\n* @ConvolutedDog made their first contribution in https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F3730\r\n* @windreamer made their first contribution in https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F3726\r\n* @taishan1994 made their first contribution in https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F3733\r\n* @xiaoajie738 made their first contribution in https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpul","2025-07-26T10:00:25",{"id":227,"version":228,"summary_zh":229,"released_at":230},297561,"v0.9.1","\u003C!-- Release notes generated using configuration in .github\u002Frelease.yml at main -->\r\n\r\n## What's Changed\r\n### 🚀 Features\r\n* feature: enable tool_call and reasoning_content parsing for qwen3 by @ywx217 in https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F3615\r\n* Support Mooncake migration backend for PD disaggregation by @Risc-lt in https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F3620\r\n* Support load fused moe weights by @RunningLeon in https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F3672\r\n* Seperate api_server and pytorch engine into different processors by @grimoire in 
https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F3627\r\n* add reward model api by @CUHKSZzxy in https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F3665\r\n### 💥 Improvements\r\n* [ascend]import patch at initiazing time by @JackWeiw in https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F3662\r\n* [ascend]use custon transdata in python kernel by @JackWeiw in https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F3671\r\n* move import transformers in patch by @grimoire in https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F3660\r\n* set ray envs by @grimoire in https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F3643\r\n* raise ImportError when enable ep and not install dlblas by @zhaochaoxing in https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F3636\r\n* Reduce sampling memory usage by @lzhangzz in https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F3666\r\n### 🐞 Bug fixes\r\n* fix dockerfile by @lvhan028 in https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F3657\r\n* Fix top-p only sampling with padded vocab size by @lzhangzz in https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F3661\r\n* fix pt engine stop & cancel by @irexyc in https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F3681\r\n* Fix convert bf16 to numpy by @RunningLeon in https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F3686\r\n* disable torch.compile in cuda graph runner by @grimoire in https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F3691\r\n* fix reward model api by @CUHKSZzxy in https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F3703\r\n### 📚 Documentations\r\n* add reward model documents by @CUHKSZzxy in https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F3706\r\n### 🌐 Other\r\n* upgrade 
torch and triton by @grimoire in https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F3677\r\n* support do_preprocess=False for chat.completions by @irexyc in https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F3645\r\n* [ci] change flash atten installation in pr test by @zhulinJulia24 in https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F3688\r\n* fix profile_throughput.py by @irexyc in https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F3692\r\n* fix profile_generation.py by @irexyc in https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F3707\r\n* update dlblas version in dockerfile by @CUHKSZzxy in https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F3711\r\n* bump version to v0.9.1 by @lvhan028 in https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F3685\r\n\r\n## New Contributors\r\n* @ywx217 made their first contribution in https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F3615\r\n* @Risc-lt made their first contribution in https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F3620\r\n\r\n**Full Changelog**: https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fcompare\u002Fv0.9.0...v0.9.1","2025-07-04T10:05:31",{"id":232,"version":233,"summary_zh":234,"released_at":235},297562,"v0.9.0","\u003C!-- Release notes generated using configuration in .github\u002Frelease.yml at main -->\r\n\r\n## What's Changed\r\n### 🚀 Features\r\n* LMDeploy Distserve by @JimyMa in https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F3304\r\n* allow api server terminated through requests from clients by @RunningLeon in https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F3533\r\n* support update params for pytorch backend from api server by @irexyc in https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F3535\r\n* support eplb for Qwen3-MoE by 
@zhaochaoxing in https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F3582\r\n* support update params for turbomind backend by @irexyc in https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F3566\r\n* Quantize Qwen3 MoE bf16 model to fp8 model at runtime by @grimoire in https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F3631\r\n* [Feat]: Support internvl3-8b-hf by @RunningLeon in https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F3633\r\n* Add FP8 MoE for turbomind by @lzhangzz in https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F3601\r\n### 💥 Improvements\r\n* reduce ray memory usage by @grimoire in https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F3487\r\n* use dlblas by @zhaochaoxing in https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F3489\r\n* internlm3 dense fp8 by @CUHKSZzxy in https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F3527\r\n* random pad input ids by @grimoire in https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F3530\r\n* ray nsys profile support by @grimoire in https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F3448\r\n* update blockedfp8 scale name by @CUHKSZzxy in https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F3532\r\n* start engine loop on server startup event by @grimoire in https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F3523\r\n* update two microbatch by @SHshenhao in https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F3540\r\n* [ascend]set transdata dynamic shape true by @JackWeiw in https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F3531\r\n* ray safe exit by @grimoire in https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F3545\r\n* support update params with dp=1 for pytorch engine by @irexyc in 
https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F3562\r\n* Skip dp dummy input forward by @grimoire in https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F3552\r\n* Unclock mutual exclusivity of argument:  `tool-call-parser` and `reasoning-parser` by @jingyibo123 in https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F3550\r\n* perform torch.cuda.empty_cache() after conversion by @bltcn in https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F3570\r\n* pipeline warmup by @irexyc in https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F3548\r\n* Launch multiple api servers for dp > 1 by @RunningLeon in https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F3414\r\n* support awq for Qwen2.5-VL  by @RunningLeon in https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F3559\r\n* support qwen3 \u002Fthink & \u002Fno_think & enable_thinking parameter by @BUJIDAOVS in https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F3564\r\n* Eplb by @zhaochaoxing in https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F3572\r\n* Update benchmark by @lvhan028 in https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F3578\r\n* block output when prefetch next forward inputs. by @grimoire in https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F3573\r\n* support both eplb and microbatch simultaneously by @zhaochaoxing in https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F3591\r\n* Add log_file and set loglevel in launch_servers by @RunningLeon in https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F3596\r\n* 1. 
add migration flow control by @JimyMa in https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F3599\r\n* sampling on the tokenizer's vocab by @grimoire in https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F3604\r\n* update deepgemm version by @grimoire in https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F3606\r\n* [Ascend] set default distrbuted backend as ray for ascend device by @JackWeiw in https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F3603\r\n* Blocked fp8 tma by @grimoire in https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F3470\r\n* [PDDisaggreagtion] Async migration by @JimyMa in https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F3610\r\n* move dp loop to model agent by @grimoire in https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F3598\r\n* update some logs of proxy_server and pt engine by @lvhan028 in https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F3621\r\n* improve loading model performance by shuffling the weight files by @irexyc in https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F3625\r\n* add benchmark scripts about pipeline api and inference engines according to the config file by @lvhan028 in https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F3622\r\n### 🐞 Bug fixes\r\n* [ascend] fix recompile on different rank by @jinminxi104 in https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F3513\r\n* fix attention sm86 by @grimoire in https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F3519\r\n* fix stopwords kv cache by @grimoire in https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F3494\r\n* [bug fix] fix PD Disaggregation in DSV3 by @JimyMa in https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F3547\r\n* fix proxy server heart beat by @irexyc in 
https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F3543\r\n* fix dp>1 tp=1 ep=1 by @grimoire in https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F3555\r\n* fix mixtral on new transf","2025-06-19T02:27:57",{"id":237,"version":238,"summary_zh":239,"released_at":240},297563,"v0.8.0","\u003C!-- Release notes generated using configuration in .github\u002Frelease.yml at main -->\r\n\r\n## What's Changed\r\n### 🚀 Features\r\n* Torch dp support by @grimoire in https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F3207\r\n* Add deep gemm with tma pre allocated by @AllentDan in https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F3287\r\n* Add mixed DP + TP by @lzhangzz in https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F3229\r\n* Add Qwen3 and Qwen3MoE by @lzhangzz in https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F3305\r\n* [ascend] support multi nodes on ascend device by @tangzhiyi11 in https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F3260\r\n* [Feature] support qwen3 and qwen3-moe for pytorch engine by @CUHKSZzxy in https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F3315\r\n* [ascend]support deepseekv2 by @yao-fengchen in https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F3206\r\n* add deepep by @zhaochaoxing in https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F3313\r\n* support ascend w8a8 graph_mode by @yao-fengchen in https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F3267\r\n* support all2all ep by @zhaochaoxing in https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F3370\r\n* optimize ep in decoding stage by @zhaochaoxing in https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F3383\r\n* Warmup deepgemm by @grimoire in https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F3387\r\n* 
support Llama4 by @grimoire in https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F3408\r\n* add twomicrobatch support by @SHshenhao in https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F3381\r\n* Support phi4 mini by @RunningLeon in https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F3467\r\n* [Dlinfer][Ascend] support 310P by @JackWeiw in https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F3484\r\n* support qwen3 fp8 by @CUHKSZzxy in https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F3505\r\n### 💥 Improvements\r\n* Add spaces_between_special_tokens to \u002Fv1\u002Finteractive and make compatible with empty text by @AllentDan in https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F3283\r\n* add env var to control timeout by @CUHKSZzxy in https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F3291\r\n* refactor attn param by @irexyc in https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F3164\r\n* Verbose log by @grimoire in https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F3329\r\n* optimize mla, remove load `v` by @grimoire in https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F3334\r\n* support dp decoding with cudagraph by @grimoire in https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F3311\r\n* optimize quant-fp8 kernel by @grimoire in https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F3345\r\n* refactor dlinfer rope by @yao-fengchen in https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F3326\r\n* enable qwenvl2.5 graph mode on ascend by @jinminxi104 in https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F3367\r\n* Add AIOHTTP_TIMEOUT env var for proxy server by @AllentDan in https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F3355\r\n* disable sync batch on dp 
eager mode by @grimoire in https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F3382\r\n* fix for deepgemm update by @grimoire in https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F3380\r\n* Add string before hash tokens in blocktrie by @RunningLeon in https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F3386\r\n* optimize moe get sorted idx by @grimoire in https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F3356\r\n* use half\u002Fbf16 lm_head output by @irexyc in https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F3213\r\n* remove ep eager check by @grimoire in https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F3392\r\n* Optimize ascend moe by @yao-fengchen in https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F3364\r\n* optimize fp8 moe kernel by @grimoire in https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F3419\r\n* ray async forward execute by @grimoire in https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F3443\r\n* map internvl3 chat template to builtin chat template internvl2_5 by @lvhan028 in https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F3450\r\n* Refactor turbomind (low-level abstractions) by @lzhangzz in https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F3423\r\n* remove barely used code to improve maintenance by @lvhan028 in https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F3462\r\n* optimize sm80 long context by @grimoire in https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F3465\r\n* move partial_json_parser from ’serve.txt‘ to ‘runtime.txt‘ by @lvhan028 in https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F3493\r\n* support qwen3-dense models awq quantization by @lvhan028 in https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F3503\r\n* 
Optimize MoE gate for Qwen3 by @lzhangzz in https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F3500\r\n* Pass num_tokens_per_iter and max_prefill_iters params through in `lmdeploy serve api_server` by @josephrocca in https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F3504\r\n* [Dlinfer][Ascend] Optimize performance of 310P device by @JackWeiw in https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F3486\r\n* optimize longcontext decoding by @grimoire in https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F3510\r\n* Support min_p in openai completions_v1 by @josephrocca in https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F3506\r\n### 🐞 Bug fixes\r\n* fix activation grid oversize by @grimoire in https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F3282\r\n* Set ensure_ascii=False ","2025-05-04T03:17:23",{"id":242,"version":243,"summary_zh":244,"released_at":245},297564,"v0.7.3","\u003C!-- Release notes generated using configuration in .github\u002Frelease.yml at dev -->\r\n\r\n## What's Changed\r\n### 🚀 Features\r\n* Add Qwen3 and Qwen3MoE by @lzhangzz in https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F3305\r\n* [Feature] support qwen3 and qwen3-moe for pytorch engine by @CUHKSZzxy in https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F3315\r\n* [ascend]support deepseekv2 by @yao-fengchen in https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F3206\r\n* support ascend w8a8 graph_mode by @yao-fengchen in https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F3267\r\n* support Llama4 by @grimoire in https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F3408\r\n### 💥 Improvements\r\n* Add spaces_between_special_tokens to \u002Fv1\u002Finteractive and make compatible with empty text by @AllentDan in 
https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F3283\r\n* add env var to control timeout by @CUHKSZzxy in https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F3291\r\n* optimize mla, remove load `v` by @grimoire in https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F3334\r\n* refactor dlinfer rope by @yao-fengchen in https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F3326\r\n* enable qwenvl2.5 graph mode on ascend by @jinminxi104 in https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F3367\r\n* Optimize ascend moe by @yao-fengchen in https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F3364\r\n* find port by @grimoire in https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F3429\r\n### 🐞 Bug fixes\r\n* fix activation grid oversize by @grimoire in https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F3282\r\n* Set ensure_ascii=False for tool calling by @AllentDan in https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F3295\r\n* add `v` check by @grimoire in https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F3307\r\n* Fix Qwen3MoE config parsing by @lzhangzz in https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F3336\r\n* Fix finish reasons by @AllentDan in https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F3338\r\n* remove think_end_token_id in streaming content by @AllentDan in https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F3327\r\n* Fix the finish_reason by @AllentDan in https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F3350\r\n* support List[dict] prompt input without do_preprocess by @irexyc in https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F3385\r\n* fix tensor dispatch in dynamo by @wanfengcxz in 
https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F3417\r\n### 📚 Documentations\r\n* update ascend doc by @yao-fengchen in https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F3420\r\n### 🌐 Other\r\n* bump version to v0.7.2.post1 by @lvhan028 in https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F3298\r\n* Optimize internvit by @caikun-pjlab in https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F3316\r\n* bump version to v0.7.3 by @lvhan028 in https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F3416\r\n\r\n## New Contributors\r\n* @wanfengcxz made their first contribution in https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F3417\r\n* @caikun-pjlab made their first contribution in https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F3316\r\n\r\n**Full Changelog**: https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fcompare\u002Fv0.7.2...v0.7.3","2025-04-14T10:04:08",{"id":247,"version":248,"summary_zh":249,"released_at":250},297565,"v0.7.2.post1","\u003C!-- Release notes generated using configuration in .github\u002Frelease.yml at main -->\r\n\r\n## What's Changed\r\n### 💥 Improvements\r\n* Add spaces_between_special_tokens to \u002Fv1\u002Finteractive and make compatible with empty text by @AllentDan in https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F3283\r\n* add env var to control timeout by @CUHKSZzxy in https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F3291\r\n### 🐞 Bug fixes\r\n* fix activation grid oversize by @grimoire in https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F3282\r\n* Set ensure_ascii=False for tool calling by @AllentDan in https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F3295\r\n### 🌐 Other\r\n* bump version to v0.7.2.post1 by @lvhan028 in 
https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F3298\r\n\r\n\r\n**Full Changelog**: https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fcompare\u002Fv0.7.2...v0.7.2.post1","2025-03-21T06:38:34",{"id":252,"version":253,"summary_zh":254,"released_at":255},297566,"v0.7.2","\u003C!-- Release notes generated using configuration in .github\u002Frelease.yml at main -->\r\n\r\n## What's Changed\r\n### 🚀 Features\r\n* [Feature] support qwen2.5-vl for pytorch engine by @CUHKSZzxy in https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F3194\r\n* Support reward models by @lvhan028 in https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F3192\r\n* Add collective communication kernels by @lzhangzz in https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F3163\r\n* PytorchEngine multi-node support v2 by @grimoire in https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F3147\r\n* Add flash mla by @AllentDan in https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F3218\r\n* Add gemma3 implementation by @AllentDan in https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F3272\r\n### 💥 Improvements\r\n* remove update badwords by @grimoire in https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F3183\r\n* defaullt executor ray by @grimoire in https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F3210\r\n* change ascend&camb default_batch_size to 256 by @jinminxi104 in https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F3251\r\n* Tool reasoning parsers and streaming function call by @AllentDan in https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F3198\r\n* remove torchelastic flag by @grimoire in https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F3242\r\n* disable flashmla warning on sm\u003C90 by @grimoire in 
https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F3271\r\n### 🐞 Bug fixes\r\n* Fix missing cli chat option by @lzhangzz in https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F3209\r\n* [ascend] fix multi-card distributed inference failures by @tangzhiyi11 in https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F3215\r\n* fix for small cache-max-entry-count by @grimoire in https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F3221\r\n* [dlinfer] fix glm-4v graph mode on ascend by @jinminxi104 in https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F3235\r\n* fix qwen2.5 pytorch engine dtype error on NPU by @tcye in https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F3247\r\n* [Fix] failed to update the tokenizer's eos_token_id into stop_word list by @lvhan028 in https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F3257\r\n* fix dsv3 gate scaling by @grimoire in https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F3263\r\n* Fix the bug for reading dict error by @GxjGit in https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F3196\r\n* Fix get ppl by @lvhan028 in https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F3268\r\n### 📚 Documentations\r\n* Specifiy lmdeploy version in benchmark guide  by @lyj0309 in https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F3216\r\n* [ascend] add Ascend docker image by @jinminxi104 in https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F3239\r\n### 🌐 Other\r\n* [ci] testcase refactoring by @zhulinJulia24 in https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F3151\r\n* [ci] add testcase for native communicator by @zhulinJulia24 in https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F3217\r\n* [ci] add volc evaluation testcase by @zhulinJulia24 in 
https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F3240\r\n* [ci] remove v100 testconfig by @zhulinJulia24 in https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F3253\r\n* add rdma dependencies into docker file by @CUHKSZzxy in https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F3262\r\n* docs: update ascend docs for docker running by @CyCle1024 in https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F3266\r\n* bump version to v0.7.2 by @lvhan028 in https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F3252\r\n\r\n## New Contributors\r\n* @lyj0309 made their first contribution in https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F3216\r\n* @tcye made their first contribution in https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F3247\r\n\r\n**Full Changelog**: https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fcompare\u002Fv0.7.1...v0.7.2","2025-03-19T08:36:30",{"id":257,"version":258,"summary_zh":259,"released_at":260},297567,"v0.7.1","\u003C!-- Release notes generated using configuration in .github\u002Frelease.yml at main -->\r\n\r\n## What's Changed\r\n### 🚀 Features\r\n* support release pipeline by @irexyc in https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F3069\r\n* [feature] add dlinfer w8a8 support. by @Reinerzhou in https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F2988\r\n* [maca] support deepseekv2 for maca backend. 
by @Reinerzhou in https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F2918\r\n* [Feature] support deepseek-vl2 for pytorch engine by @CUHKSZzxy in https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F3149\r\n### 💥 Improvements\r\n* use weights iterator while loading by @RunningLeon in https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F2886\r\n* Add deepseek-r1 chat template by @AllentDan in https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F3072\r\n* Update tokenizer by @lvhan028 in https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F3061\r\n* Set max concurrent requests by @AllentDan in https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F2961\r\n* remove logitswarper by @grimoire in https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F3109\r\n* Update benchmark script and user guide by @lvhan028 in https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F3110\r\n* support eos_token list in turbomind by @irexyc in https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F3044\r\n* Use aiohttp inside proxy server && add --disable-cache-status argument by @AllentDan in https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F3020\r\n* Update runtime package dependencies by @zgjja in https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F3142\r\n* Make turbomind support embedding inputs on GPU by @chengyuma in https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F3177\r\n### 🐞 Bug fixes\r\n* [dlinfer] fix ascend qwen2_vl graph_mode by @yao-fengchen in https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F3045\r\n* fix error in interactive api by @lvhan028 in https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F3074\r\n* fix sliding window mgr by @grimoire in 
https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F3068\r\n* More arguments in api_client, update docstrings by @AllentDan in https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F3077\r\n* Add system role to deepseek chat template by @AllentDan in https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F3031\r\n* Fix xcomposer2d5 by @irexyc in https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F3087\r\n* fix user guide about cogvlm deployment by @lvhan028 in https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F3088\r\n* fix postional argument by @lvhan028 in https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F3086\r\n* Fix UT of deepseek chat template by @lvhan028 in https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F3125\r\n* Fix internvl2.5 error after eviction by @grimoire in https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F3122\r\n* Fix cogvlm and phi3vision by @RunningLeon in https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F3137\r\n* [fix] fix vl gradio, use pipeline api and remove interactive chat by @irexyc in https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F3136\r\n* fix the issue that stop_token may be less than defined in model.py by @irexyc in https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F3148\r\n* fix typing by @lz1998 in https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F3153\r\n* fix min length penalty by @irexyc in https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F3150\r\n* fix default temperature value by @irexyc in https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F3166\r\n* Use pad_token_id as image_token_id for vl models by @RunningLeon in https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F3158\r\n* Fix tool call prompt for InternLM and 
Qwen by @AllentDan in https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F3156\r\n* Update qwen2.py by @GxjGit in https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F3174\r\n* fix temperature=0 by @grimoire in https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F3176\r\n* fix blocked fp8 moe by @grimoire in https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F3181\r\n* fix deepseekv2 has no attribute use_mla error by @CUHKSZzxy in https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F3188\r\n* fix unstoppable chat by @lvhan028 in https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F3189\r\n### 🌐 Other\r\n* [ci] add internlm3 into testcase by @zhulinJulia24 in https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F3038\r\n* add internlm3 to supported models by @lvhan028 in https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F3041\r\n* update pre-commit config by @lvhan028 in https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F2683\r\n* [maca] add cudagraph support on maca backend. 
by @Reinerzhou in https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F2834\r\n* bump version to v0.7.0.post1 by @lvhan028 in https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F3076\r\n* bump version to v0.7.0.post2 by @lvhan028 in https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F3094\r\n* [Fix] fix the URL judgment problem in Windows by @Lychee-acaca in https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F3103\r\n* bump version to v0.7.0.post3 by @lvhan028 in https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F3115\r\n* [ci] fix some fail in daily testcase by @zhulinJulia24 in https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F3134\r\n* Bump version to v0.7.1 by @lvhan028 in https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F3178\r\n\r\n## New Contributors\r\n* @Lychee-acaca made their first contribution in https:\u002F\u002Fgithub.com\u002FI","2025-02-27T02:19:22",{"id":262,"version":263,"summary_zh":264,"released_at":265},297568,"v0.7.0.post3","\u003C!-- Release notes generated using configuration in .github\u002Frelease.yml at main -->\r\n\r\n## What's Changed\r\n### 💥 Improvements\r\n* Set max concurrent requests by @AllentDan in https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F2961\r\n* remove logitswarper by @grimoire in https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F3109\r\n### 🐞 Bug fixes\r\n* fix user guide about cogvlm deployment by @lvhan028 in https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F3088\r\n* fix postional argument by @lvhan028 in https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F3086\r\n### 🌐 Other\r\n* [Fix] fix the URL judgment problem in Windows by @Lychee-acaca in https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F3103\r\n* bump version to v0.7.0.post3 by @lvhan028 in 
https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F3115\r\n\r\n## New Contributors\r\n* @Lychee-acaca made their first contribution in https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F3103\r\n\r\n**Full Changelog**: https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fcompare\u002Fv0.7.0.post2...v0.7.0.post3","2025-02-10T06:00:46",{"id":267,"version":268,"summary_zh":269,"released_at":270},297569,"v0.7.0.post2","\u003C!-- Release notes generated using configuration in .github\u002Frelease.yml at main -->\r\n\r\n## What's Changed\r\n### 💥 Improvements\r\n* Add deepseek-r1 chat template by @AllentDan in https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F3072\r\n* Update tokenizer by @lvhan028 in https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F3061\r\n### 🐞 Bug fixes\r\n* Add system role to deepseek chat template by @AllentDan in https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F3031\r\n* Fix xcomposer2d5 by @irexyc in https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F3087\r\n### 🌐 Other\r\n* bump version to v0.7.0.post2 by @lvhan028 in https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fpull\u002F3094\r\n\r\n\r\n**Full Changelog**: https:\u002F\u002Fgithub.com\u002FInternLM\u002Flmdeploy\u002Fcompare\u002Fv0.7.0.post1...v0.7.0.post2","2025-01-27T15:57:36"]