[{"data":1,"prerenderedAt":-1},["ShallowReactive",2],{"similar-mit-han-lab--duo-attention":3,"tool-mit-han-lab--duo-attention":64},[4,17,27,35,43,56],{"id":5,"name":6,"github_repo":7,"description_zh":8,"stars":9,"difficulty_score":10,"last_commit_at":11,"category_tags":12,"status":16},3808,"stable-diffusion-webui","AUTOMATIC1111\u002Fstable-diffusion-webui","stable-diffusion-webui 是一个基于 Gradio 构建的网页版操作界面，旨在让用户能够轻松地在本地运行和使用强大的 Stable Diffusion 图像生成模型。它解决了原始模型依赖命令行、操作门槛高且功能分散的痛点，将复杂的 AI 绘图流程整合进一个直观易用的图形化平台。\n\n无论是希望快速上手的普通创作者、需要精细控制画面细节的设计师，还是想要深入探索模型潜力的开发者与研究人员，都能从中获益。其核心亮点在于极高的功能丰富度：不仅支持文生图、图生图、局部重绘（Inpainting）和外绘（Outpainting）等基础模式，还独创了注意力机制调整、提示词矩阵、负向提示词以及“高清修复”等高级功能。此外，它内置了 GFPGAN 和 CodeFormer 等人脸修复工具，支持多种神经网络放大算法，并允许用户通过插件系统无限扩展能力。即使是显存有限的设备，stable-diffusion-webui 也提供了相应的优化选项，让高质量的 AI 艺术创作变得触手可及。",162132,3,"2026-04-05T11:01:52",[13,14,15],"开发框架","图像","Agent","ready",{"id":18,"name":19,"github_repo":20,"description_zh":21,"stars":22,"difficulty_score":23,"last_commit_at":24,"category_tags":25,"status":16},1381,"everything-claude-code","affaan-m\u002Feverything-claude-code","everything-claude-code 是一套专为 AI 编程助手（如 Claude Code、Codex、Cursor 等）打造的高性能优化系统。它不仅仅是一组配置文件，而是一个经过长期实战打磨的完整框架，旨在解决 AI 代理在实际开发中面临的效率低下、记忆丢失、安全隐患及缺乏持续学习能力等核心痛点。\n\n通过引入技能模块化、直觉增强、记忆持久化机制以及内置的安全扫描功能，everything-claude-code 能显著提升 AI 在复杂任务中的表现，帮助开发者构建更稳定、更智能的生产级 AI 代理。其独特的“研究优先”开发理念和针对 Token 消耗的优化策略，使得模型响应更快、成本更低，同时有效防御潜在的攻击向量。\n\n这套工具特别适合软件开发者、AI 研究人员以及希望深度定制 AI 工作流的技术团队使用。无论您是在构建大型代码库，还是需要 AI 协助进行安全审计与自动化测试，everything-claude-code 都能提供强大的底层支持。作为一个曾荣获 Anthropic 黑客大奖的开源项目，它融合了多语言支持与丰富的实战钩子（hooks），让 AI 真正成长为懂上",138956,2,"2026-04-05T11:33:21",[13,15,26],"语言模型",{"id":28,"name":29,"github_repo":30,"description_zh":31,"stars":32,"difficulty_score":23,"last_commit_at":33,"category_tags":34,"status":16},2271,"ComfyUI","Comfy-Org\u002FComfyUI","ComfyUI 是一款功能强大且高度模块化的视觉 AI 引擎，专为设计和执行复杂的 Stable Diffusion 图像生成流程而打造。它摒弃了传统的代码编写模式，采用直观的节点式流程图界面，让用户通过连接不同的功能模块即可构建个性化的生成管线。\n\n这一设计巧妙解决了高级 AI 
绘图工作流配置复杂、灵活性不足的痛点。用户无需具备编程背景，也能自由组合模型、调整参数并实时预览效果，轻松实现从基础文生图到多步骤高清修复等各类复杂任务。ComfyUI 拥有极佳的兼容性，不仅支持 Windows、macOS 和 Linux 全平台，还广泛适配 NVIDIA、AMD、Intel 及苹果 Silicon 等多种硬件架构，并率先支持 SDXL、Flux、SD3 等前沿模型。\n\n无论是希望深入探索算法潜力的研究人员和开发者，还是追求极致创作自由度的设计师与资深 AI 绘画爱好者，ComfyUI 都能提供强大的支持。其独特的模块化架构允许社区不断扩展新功能，使其成为当前最灵活、生态最丰富的开源扩散模型工具之一，帮助用户将创意高效转化为现实。",107662,"2026-04-03T11:11:01",[13,14,15],{"id":36,"name":37,"github_repo":38,"description_zh":39,"stars":40,"difficulty_score":23,"last_commit_at":41,"category_tags":42,"status":16},3704,"NextChat","ChatGPTNextWeb\u002FNextChat","NextChat 是一款轻量且极速的 AI 助手，旨在为用户提供流畅、跨平台的大模型交互体验。它完美解决了用户在多设备间切换时难以保持对话连续性，以及面对众多 AI 模型不知如何统一管理的痛点。无论是日常办公、学习辅助还是创意激发，NextChat 都能让用户随时随地通过网页、iOS、Android、Windows、MacOS 或 Linux 端无缝接入智能服务。\n\n这款工具非常适合普通用户、学生、职场人士以及需要私有化部署的企业团队使用。对于开发者而言，它也提供了便捷的自托管方案，支持一键部署到 Vercel 或 Zeabur 等平台。\n\nNextChat 的核心亮点在于其广泛的模型兼容性，原生支持 Claude、DeepSeek、GPT-4 及 Gemini Pro 等主流大模型，让用户在一个界面即可自由切换不同 AI 能力。此外，它还率先支持 MCP（Model Context Protocol）协议，增强了上下文处理能力。针对企业用户，NextChat 提供专业版解决方案，具备品牌定制、细粒度权限控制、内部知识库整合及安全审计等功能，满足公司对数据隐私和个性化管理的高标准要求。",87618,"2026-04-05T07:20:52",[13,26],{"id":44,"name":45,"github_repo":46,"description_zh":47,"stars":48,"difficulty_score":23,"last_commit_at":49,"category_tags":50,"status":16},2268,"ML-For-Beginners","microsoft\u002FML-For-Beginners","ML-For-Beginners 是由微软推出的一套系统化机器学习入门课程，旨在帮助零基础用户轻松掌握经典机器学习知识。这套课程将学习路径规划为 12 周，包含 26 节精炼课程和 52 道配套测验，内容涵盖从基础概念到实际应用的完整流程，有效解决了初学者面对庞大知识体系时无从下手、缺乏结构化指导的痛点。\n\n无论是希望转型的开发者、需要补充算法背景的研究人员，还是对人工智能充满好奇的普通爱好者，都能从中受益。课程不仅提供了清晰的理论讲解，还强调动手实践，让用户在循序渐进中建立扎实的技能基础。其独特的亮点在于强大的多语言支持，通过自动化机制提供了包括简体中文在内的 50 多种语言版本，极大地降低了全球不同背景用户的学习门槛。此外，项目采用开源协作模式，社区活跃且内容持续更新，确保学习者能获取前沿且准确的技术资讯。如果你正寻找一条清晰、友好且专业的机器学习入门之路，ML-For-Beginners 将是理想的起点。",84991,"2026-04-05T10:45:23",[14,51,52,53,15,54,26,13,55],"数据工具","视频","插件","其他","音频",{"id":57,"name":58,"github_repo":59,"description_zh":60,"stars":61,"difficulty_score":10,"last_commit_at":62,"category_tags":63,"status":16},3128,"ragflow","infiniflow\u002Fragflow","RAGFlow 
是一款领先的开源检索增强生成（RAG）引擎，旨在为大语言模型构建更精准、可靠的上下文层。它巧妙地将前沿的 RAG 技术与智能体（Agent）能力相结合，不仅支持从各类文档中高效提取知识，还能让模型基于这些知识进行逻辑推理和任务执行。\n\n在大模型应用中，幻觉问题和知识滞后是常见痛点。RAGFlow 通过深度解析复杂文档结构（如表格、图表及混合排版），显著提升了信息检索的准确度，从而有效减少模型“胡编乱造”的现象，确保回答既有据可依又具备时效性。其内置的智能体机制更进一步，使系统不仅能回答问题，还能自主规划步骤解决复杂问题。\n\n这款工具特别适合开发者、企业技术团队以及 AI 研究人员使用。无论是希望快速搭建私有知识库问答系统，还是致力于探索大模型在垂直领域落地的创新者，都能从中受益。RAGFlow 提供了可视化的工作流编排界面和灵活的 API 接口，既降低了非算法背景用户的上手门槛，也满足了专业开发者对系统深度定制的需求。作为基于 Apache 2.0 协议开源的项目，它正成为连接通用大模型与行业专有知识之间的重要桥梁。",77062,"2026-04-04T04:44:48",[15,14,13,26,54],{"id":65,"github_repo":66,"name":67,"description_en":68,"description_zh":69,"ai_summary_zh":70,"readme_en":71,"readme_zh":72,"quickstart_zh":73,"use_case_zh":74,"hero_image_url":75,"owner_login":76,"owner_name":77,"owner_avatar_url":78,"owner_bio":79,"owner_company":80,"owner_location":80,"owner_email":80,"owner_twitter":81,"owner_website":82,"owner_url":83,"languages":84,"stars":93,"forks":94,"last_commit_at":95,"license":96,"difficulty_score":97,"env_os":98,"env_gpu":99,"env_ram":100,"env_deps":101,"category_tags":115,"github_topics":80,"view_count":23,"oss_zip_url":80,"oss_zip_packed_at":80,"status":16,"created_at":116,"updated_at":117,"faqs":118,"releases":119},3203,"mit-han-lab\u002Fduo-attention","duo-attention","[ICLR 2025] DuoAttention: Efficient Long-Context LLM Inference with Retrieval and Streaming Heads","duo-attention 是一款专为提升长上下文大语言模型（LLM）推理效率而设计的开源框架。它主要解决了在处理超长文本时，传统方法因缓存所有注意力状态而导致显存占用过高、推理延迟大的痛点。现有方案往往在压缩缓存时会损害模型的长文理解能力，而 duo-attention 巧妙地平衡了效率与性能。\n\n其核心技术亮点在于发现并非所有注意力头都需要全量缓存。duo-attention 将注意力头智能分为两类：负责捕捉长距离依赖的“检索头”和仅关注近期信息的“流式头”。系统仅对关键的检索头保留完整缓存，而对流式头采用轻量级的定长缓存策略。这种差异化处理无需牺牲模型的长上下文能力，即可将显存占用降低最高 2.55 倍，解码速度提升最高 2.18 倍。值得注意的是，结合量化技术，它甚至能让单张 A100 显卡支持 Llama-3-8B 模型处理长达 330 万 token 的上下文。\n\n该工具非常适合 AI 研究人员、大模型开发者以及需要部署长文本应用的技术团队使用。如果你正在探索如何让大模型更高效地处理书籍、长文档或复杂代码库，duo-attention 提供了一个经过 ICLR 2025 验","duo-attention 是一款专为提升长上下文大语言模型（LLM）推理效率而设计的开源框架。它主要解决了在处理超长文本时，传统方法因缓存所有注意力状态而导致显存占用过高、推理延迟大的痛点。现有方案往往在压缩缓存时会损害模型的长文理解能力，而 duo-attention 
巧妙地平衡了效率与性能。\n\n其核心技术亮点在于发现并非所有注意力头都需要全量缓存。duo-attention 将注意力头智能分为两类：负责捕捉长距离依赖的“检索头”和仅关注近期信息的“流式头”。系统仅对关键的检索头保留完整缓存，而对流式头采用轻量级的定长缓存策略。这种差异化处理无需牺牲模型的长上下文能力，即可将显存占用降低最高 2.55 倍，解码速度提升最高 2.18 倍。值得注意的是，结合量化技术，它甚至能让单张 A100 显卡支持 Llama-3-8B 模型处理长达 330 万 token 的上下文。\n\n该工具非常适合 AI 研究人员、大模型开发者以及需要部署长文本应用的技术团队使用。如果你正在探索如何让大模型更高效地处理书籍、长文档或复杂代码库，duo-attention 提供了一个经过 ICLR 2025 验证的高效解决方案。","# DuoAttention: Efficient Long-Context LLM Inference with Retrieval and Streaming Heads\n[[paper](https:\u002F\u002Farxiv.org\u002Fabs\u002F2410.10819)] [[slides](figures\u002FDuoAttention.pdf)]\n\n![method1](https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Fmit-han-lab_duo-attention_readme_2c701b8ab49f.jpg)\n![method2](https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Fmit-han-lab_duo-attention_readme_38ec197f8f52.jpg)\n\n## Demo\n\nhttps:\u002F\u002Fgithub.com\u002Fuser-attachments\u002Fassets\u002Fb372882b-bf14-4c89-a610-22724d91a415\n\n## TL;DR\nWe significantly reduce both pre-filling and decoding memory and latency for long-context LLMs without sacrificing their long-context abilities.\n\n## Abstract\nDeploying long-context large language models (LLMs) is essential but poses significant computational and memory challenges.\nCaching all Key and Value (KV) states across all attention heads consumes substantial memory.\nExisting KV cache pruning methods either damage the long-context capabilities of LLMs or offer only limited efficiency improvements.\nIn this paper, we identify that only a fraction of attention heads, a.k.a, Retrieval Heads, are critical for processing long contexts and require full attention across all tokens.\nIn contrast, all other heads, which primarily focus on recent tokens and attention sinks, referred to as Streaming Heads, do not require full attention.\nBased on this insight, we introduce DuoAttention, a framework that only applies a full KV cache to retrieval heads while using a light-weight, constant-length KV cache for streaming heads, which reduces both 
LLM's decoding and pre-filling memory and latency without compromising its long-context abilities.\nDuoAttention uses a lightweight, optimization-based algorithm with synthetic data to identify retrieval heads accurately.\nOur method significantly reduces long-context inference memory by up to 2.55x for MHA and 1.67x for GQA models while speeding up decoding by up to 2.18x and 1.50x and accelerating pre-filling by up to 1.73x and 1.63x for MHA and GQA models, respectively, with minimal accuracy loss compared to full attention.\nNotably, combined with quantization, DuoAttention enables Llama-3-8B decoding with 3.3 million context length on a single A100 GPU.\n\n## Installation and Usage\n\n### Environment Setup\n\n#### Training and Evaluation Environment\n\n```bash\nconda create -yn duo python=3.10\nconda activate duo\n\nconda install -y git\nconda install -y nvidia\u002Flabel\u002Fcuda-12.4.0::cuda-toolkit\nconda install -y nvidia::cuda-cudart-dev\nconda install -y pytorch torchvision torchaudio pytorch-cuda=12.4 -c pytorch -c nvidia\n\npip install transformers==4.45.2 accelerate sentencepiece datasets wandb zstandard matplotlib huggingface_hub==0.25.2\npip install tensor_parallel==2.0.0\n\npip install ninja packaging\npip install flash-attn==2.6.3 --no-build-isolation\n\n# LongBench evaluation\npip install seaborn rouge_score einops pandas\n\npip install flashinfer -i https:\u002F\u002Fflashinfer.ai\u002Fwhl\u002Fcu121\u002Ftorch2.4\u002F\n\n# Install DuoAttention\npip install -e .\n\n# Install Block Sparse Streaming Attention\ngit clone https:\u002F\u002Fgithub.com\u002Fmit-han-lab\u002FBlock-Sparse-Attention\ncd Block-Sparse-Attention\npython setup.py install\n```\n\n\n#### Demo Environment\n```bash\nconda create -yn duo_demo python=3.10\nconda activate duo_demo\n\n# Install DuoAttention\npip install -e .\n\nconda install -y git\nconda install -y nvidia\u002Flabel\u002Fcuda-12.4.0::cuda-toolkit\nconda install -y nvidia::cuda-cudart-dev\n\n# Install QServe\ngit 
clone https:\u002F\u002Fgithub.com\u002Fmit-han-lab\u002Fqserve\ncd qserve\npip install -e .\npip install ninja packaging\npip install flash-attn==2.4.1 --no-build-isolation\ncd kernels\npython setup.py install\n\n# Install FlashInfer\npip install flashinfer -i https:\u002F\u002Fflashinfer.ai\u002Fwhl\u002Fcu121\u002Ftorch2.3\u002F\npip install tensor_parallel\n```\n\n### Dataset\nTo download the dataset:\n\n```bash\nmkdir -p datasets\ncd datasets\n\nwget https:\u002F\u002Fhuggingface.co\u002Fdatasets\u002Ftogethercomputer\u002FLong-Data-Collections\u002Fresolve\u002Fmain\u002Ffine-tune\u002Fbooksum.jsonl.zst\n```\n\n### Model\nTo download models supported by DuoAttention:\n```bash\nmkdir -p models\ncd models\n\n# Models that DuoAttention currently supports for evaluation\nhuggingface-cli download togethercomputer\u002FLlama-2-7B-32K-Instruct --local-dir Llama-2-7B-32K-Instruct\nhuggingface-cli download gradientai\u002FLlama-3-8B-Instruct-Gradient-1048k --local-dir Llama-3-8B-Instruct-Gradient-1048k\nhuggingface-cli download gradientai\u002FLlama-3-8B-Instruct-Gradient-4194k --local-dir Llama-3-8B-Instruct-Gradient-4194k\nhuggingface-cli download mistralai\u002FMistral-7B-Instruct-v0.2 --local-dir Mistral-7B-Instruct-v0.2\nhuggingface-cli download mistralai\u002FMistral-7B-Instruct-v0.3 --local-dir Mistral-7B-Instruct-v0.3\n\n#  W8A8KV4 models using SmoothQuant and QServe for demo purposes\nhuggingface-cli download mit-han-lab\u002FLlama-3-8B-Instruct-Gradient-1048k-w8a8kv4-per-channel --local-dir Llama-3-8B-Instruct-Gradient-1048k-w8a8kv4-per-channel\nhuggingface-cli download mit-han-lab\u002FLlama-3-8B-Instruct-Gradient-4194k-w8a8kv4-per-channel --local-dir Llama-3-8B-Instruct-Gradient-4194k-w8a8kv4-per-channel\n```\n\n## Quick Start for DuoAttention\nWe offer a simple one-click patch to enable DuoAttention optimization on HuggingFace models, including Llama and Mistral. 
Pretrained retrieval head patterns for six long-context models are available in the `attn_patterns` directory: `Llama-2-7B-32K-Instruct`, `Llama-3-8B-Instruct-Gradient-1048k`, `Llama-3-8B-Instruct-Gradient-4194k`, `Mistral-7B-Instruct-v0.2`, `Mistral-7B-Instruct-v0.3`, and `Meta-Llama-3.1-8B-Instruct`. If you'd like to train your own retrieval head patterns, you can use the training script provided in the scripts directory. Below is an example of how to enable DuoAttention on the `Llama-3-8B-Instruct-Gradient-1048k` model.\n\n\n```python\nfrom duo_attn.utils import load_attn_pattern, sparsify_attention_heads\nfrom duo_attn.patch import enable_duo_attention_eval\nimport transformers\nimport torch\n\n# Load the model\nmodel = transformers.AutoModelForCausalLM.from_pretrained(\n    \"models\u002FLlama-3-8B-Instruct-Gradient-1048k\",\n    torch_dtype=torch.bfloat16,\n    low_cpu_mem_usage=True,\n    attn_implementation=\"eager\",\n)\n\n# Load the attention pattern\nattn_heads, sink_size, recent_size = load_attn_pattern(\n    \"attn_patterns\u002FLlama-3-8B-Instruct-Gradient-1048k\u002Flr=0.02-reg=0.05-ctx=1000_32000-multi_passkey10\"\n)\n\n# Sparsify attention heads\nattn_heads, sparsity = sparsify_attention_heads(attn_heads, sparsity=0.5)\n\n# Enable DuoAttention\nenable_duo_attention_eval(\n    model,\n    attn_heads,\n    sink_size=64,\n    recent_size=256,\n)\n\n# Move model to GPU\nmodel = model.cuda()\n\n# Ready for inference!\n```\n\n## Demo\nAfter setting up the environment, you can run the following script to execute the W8A8KV4 with DuoAttention demo on the `Llama-3-8B-Instruct-Gradient-4194k` model. The demo is designed to run on a single A100 GPU and supports a context length of up to 3.3 million tokens.\n\n```bash\nbash scripts\u002Frun_demo.sh\n```\n\n## Results \n\n### Retrieval Head Identification\nAfter preparing the dataset and models, you can run the training script to identify the retrieval heads. 
For the models we evaluated, the corresponding attention patterns are available in the `attn_patterns` directory.\n\n```bash\nbash scripts\u002Frun_train.sh\n```\n\n### Needle in a Haystack (NIAH)\nDuoAttention provides comparable accuracy as full attention on the Needle-in-a-Haystack benchmark using 25% full attention ratio on the MHA model and 50% full attention ratio on the GQA model.\n\n```bash\nbash scripts\u002Frun_niah.sh\n```\n\n![niah](https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Fmit-han-lab_duo-attention_readme_8bae986001e0.jpg)\n\n### LongBench\n\n```bash\nbash scripts\u002Frun_longbench.sh\n```\n\nDuoAttention provides better KV budget and accuracy trade-off on LongBench benchmarks.\n\n![longbench](https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Fmit-han-lab_duo-attention_readme_08603755a46d.jpg)\n\n### Efficiency\n\n```bash\nbash scripts\u002Frun_efficiency.sh\n```\n\n- Per-token decoding latency and memory usage of DuoAttention compared to full attention across varying context sizes. DuoAttention uses a 25% retrieval head ratio for Llama-2-7B (MHA) and 50% for Llama-3-8B (GQA). DuoAttention achieves up to 2.45× memory reduction for MHA and 1.65× for GQA models, along with up to 2.13× latency reduction for MHA and 1.5× for GQA models. These reductions approach the inverse of the retrieval head ratios as context length increases. Out-of-memory (OOM) results are linearly extrapolated from measured data.\n\n![efficiency_decoding](https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Fmit-han-lab_duo-attention_readme_08a5422499fe.jpg)\n\n- Pre-filling latency and memory usage of DuoAttention compared to full attention across varying\npre-filling chunk sizes. DuoAttention uses a 25% retrieval head ratio for Llama-2-7B (MHA), pre-filling a context of 100K tokens, and a 50% ratio for Llama-3-8B (GQA), pre-filling a context of 320K tokens. 
As the pre-filling chunk size decreases, DuoAttention achieves up to 1.73× latency reduction for MHA and 1.63× for GQA models, with memory reductions up to 2.38× for MHA and 1.53× for GQA models.\n\n![efficiency_prefilling](https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Fmit-han-lab_duo-attention_readme_5a35a0ec89f2.jpg)\n\n- DuoAttention’s decoding memory and latency vs. KV budget with a fixed context length. Memory and latency are reduced linearly when the ratio of retrieval heads is reduced. DuoAttention\nachieves up to 2.55× memory reduction for MHA and 1.67× for GQA models, along with up to 2.18× latency reduction for MHA and 1.50× for GQA models.\n\n![efficiency_curve](https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Fmit-han-lab_duo-attention_readme_6846e2fce8db.jpg)\n\n- Combined with 8-bit weight and 4-bit KV cache quantization, DuoAttention can accommodate 3.3 million tokens on a single A100-80G GPU for the Llama-3-8B model.\n\n\u003Cp align=\"center\">\n\u003Cimg src=\"https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Fmit-han-lab_duo-attention_readme_537df4070350.jpg\" alt=\"kv_capacity\" width=\"400\"\u002F>\n\u003C\u002Fp>\n\n## Citation\n\nIf you find DuoAttention useful or relevant to your project and research, please kindly cite our paper:\n\n```bibtex\n@article{xiao2024duo,\n        title={DuoAttention: Efficient Long-Context LLM Inference with Retrieval and Streaming Heads},\n        author={Xiao, Guangxuan and Tang, Jiaming and Zuo, Jingwei and Guo, Junxian and Yang, Shang and Tang, Haotian and Fu, Yao and Han, Song},\n        journal={arXiv},\n        year={2024}\n}\n```\n","# DuoAttention：结合检索头与流式头的高效长上下文大模型推理\n[[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2410.10819)] 
[[幻灯片](figures\u002FDuoAttention.pdf)]\n\n![method1](https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Fmit-han-lab_duo-attention_readme_2c701b8ab49f.jpg)\n![method2](https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Fmit-han-lab_duo-attention_readme_38ec197f8f52.jpg)\n\n## 演示\n\nhttps:\u002F\u002Fgithub.com\u002Fuser-attachments\u002Fassets\u002Fb372882b-bf14-4c89-a610-22724d91a415\n\n## 简要概述\n我们在不牺牲长上下文能力的前提下，显著降低了长上下文大模型在预填充和解码阶段的内存占用与延迟。\n\n## 摘要\n部署长上下文大型语言模型至关重要，但同时也带来了巨大的计算和内存挑战。\n缓存所有注意力头中的键值状态会消耗大量内存。\n现有的键值缓存修剪方法要么会损害大模型的长上下文能力，要么只能带来有限的效率提升。\n在本文中，我们发现只有一小部分注意力头——即检索头——对于处理长上下文至关重要，需要对所有 token 进行全注意力计算。\n相比之下，其他主要关注近期 token 和注意力汇聚点的头——称为流式头——则不需要进行全注意力计算。\n基于这一洞察，我们提出了 DuoAttention 框架：仅对检索头应用完整的键值缓存，而对流式头使用轻量级的固定长度键值缓存。这种方法能够在不降低长上下文能力的情况下，同时减少大模型的解码和预填充阶段的内存占用与延迟。\nDuoAttention 使用一种基于优化的轻量级算法，并结合合成数据来准确识别检索头。\n我们的方法可将多头注意力（MHA）模型的长上下文推理内存最多减少 2.55 倍，将分组多头注意力（GQA）模型的长上下文推理内存最多减少 1.67 倍；同时，解码速度分别加快至多 2.18 倍和 1.50 倍，预填充速度分别加快至多 1.73 倍和 1.63 倍，且与全注意力相比，精度损失极小。\n值得注意的是，结合量化技术后，DuoAttention 使得 Llama-3-8B 在单张 A100 GPU 上即可实现 330 万上下文长度的解码。\n\n## 安装与使用\n\n### 环境搭建\n\n#### 训练与评估环境\n\n```bash\nconda create -yn duo python=3.10\nconda activate duo\n\nconda install -y git\nconda install -y nvidia\u002Flabel\u002Fcuda-12.4.0::cuda-toolkit\nconda install -y nvidia::cuda-cudart-dev\nconda install -y pytorch torchvision torchaudio pytorch-cuda=12.4 -c pytorch -c nvidia\n\npip install transformers==4.45.2 accelerate sentencepiece datasets wandb zstandard matplotlib huggingface_hub==0.25.2\npip install tensor_parallel==2.0.0\n\npip install ninja packaging\npip install flash-attn==2.6.3 --no-build-isolation\n\n# LongBench 评估\npip install seaborn rouge_score einops pandas\n\npip install flashinfer -i https:\u002F\u002Fflashinfer.ai\u002Fwhl\u002Fcu121\u002Ftorch2.4\u002F\n\n# 安装 DuoAttention\npip install -e .\n\n# 安装块稀疏流式注意力\ngit clone https:\u002F\u002Fgithub.com\u002Fmit-han-lab\u002FBlock-Sparse-Attention\ncd Block-Sparse-Attention\npython setup.py 
install\n```\n\n\n#### 演示环境\n```bash\nconda create -yn duo_demo python=3.10\nconda activate duo_demo\n\n# 安装 DuoAttention\npip install -e .\n\nconda install -y git\nconda install -y nvidia\u002Flabel\u002Fcuda-12.4.0::cuda-toolkit\nconda install -y nvidia::cuda-cudart-dev\n\n# 安装 QServe\ngit clone https:\u002F\u002Fgithub.com\u002Fmit-han-lab\u002Fqserve\ncd qserve\npip install -e .\npip install ninja packaging\npip install flash-attn==2.4.1 --no-build-isolation\ncd kernels\npython setup.py install\n\n# 安装 FlashInfer\npip install flashinfer -i https:\u002F\u002Fflashinfer.ai\u002Fwhl\u002Fcu121\u002Ftorch2.3\u002F\npip install tensor_parallel\n```\n\n### 数据集\n下载数据集：\n\n```bash\nmkdir -p datasets\ncd datasets\n\nwget https:\u002F\u002Fhuggingface.co\u002Fdatasets\u002Ftogethercomputer\u002FLong-Data-Collections\u002Fresolve\u002Fmain\u002Ffine-tune\u002Fbooksum.jsonl.zst\n```\n\n### 模型\n下载 DuoAttention 支持的模型：\n```bash\nmkdir -p models\ncd models\n\n# 目前 DuoAttention 支持用于评估的模型\nhuggingface-cli download togethercomputer\u002FLlama-2-7B-32K-Instruct --local-dir Llama-2-7B-32K-Instruct\nhuggingface-cli download gradientai\u002FLlama-3-8B-Instruct-Gradient-1048k --local-dir Llama-3-8B-Instruct-Gradient-1048k\nhuggingface-cli download gradientai\u002FLlama-3-8B-Instruct-Gradient-4194k --local-dir Llama-3-8B-Instruct-Gradient-4194k\nhuggingface-cli download mistralai\u002FMistral-7B-Instruct-v0.2 --local-dir Mistral-7B-Instruct-v0.2\nhuggingface-cli download mistralai\u002FMistral-7B-Instruct-v0.3 --local-dir Mistral-7B-Instruct-v0.3\n\n# 用于演示的 W8A8KV4 模型，采用 SmoothQuant 和 QServe 技术\nhuggingface-cli download mit-han-lab\u002FLlama-3-8B-Instruct-Gradient-1048k-w8a8kv4-per-channel --local-dir Llama-3-8B-Instruct-Gradient-1048k-w8a8kv4-per-channel\nhuggingface-cli download mit-han-lab\u002FLlama-3-8B-Instruct-Gradient-4194k-w8a8kv4-per-channel --local-dir Llama-3-8B-Instruct-Gradient-4194k-w8a8kv4-per-channel\n```\n\n## DuoAttention 快速入门\n我们提供了一个简单的单击补丁，可在 HuggingFace 模型上启用 
DuoAttention 优化，包括 Llama 和 Mistral 系列。六个长上下文模型的预训练检索头模式已存储在 `attn_patterns` 目录中：`Llama-2-7B-32K-Instruct`、`Llama-3-8B-Instruct-Gradient-1048k`、`Llama-3-8B-Instruct-Gradient-4194k`、`Mistral-7B-Instruct-v0.2`、`Mistral-7B-Instruct-v0.3` 以及 `Meta-Llama-3.1-8B-Instruct`。如果您希望训练自己的检索头模式，可以使用 scripts 目录中提供的训练脚本。以下是如何在 `Llama-3-8B-Instruct-Gradient-1048k` 模型上启用 DuoAttention 的示例。\n\n\n```python\nfrom duo_attn.utils import load_attn_pattern, sparsify_attention_heads\nfrom duo_attn.patch import enable_duo_attention_eval\nimport transformers\nimport torch\n\n# 加载模型\nmodel = transformers.AutoModelForCausalLM.from_pretrained(\n    \"models\u002FLlama-3-8B-Instruct-Gradient-1048k\",\n    torch_dtype=torch.bfloat16,\n    low_cpu_mem_usage=True,\n    attn_implementation=\"eager\",\n)\n\n# 加载注意力模式\nattn_heads, sink_size, recent_size = load_attn_pattern(\n    \"attn_patterns\u002FLlama-3-8B-Instruct-Gradient-1048k\u002Flr=0.02-reg=0.05-ctx=1000_32000-multi_passkey10\"\n)\n\n# 稀疏化注意力头\nattn_heads, sparsity = sparsify_attention_heads(attn_heads, sparsity=0.5)\n\n# 启用 DuoAttention\nenable_duo_attention_eval(\n    model,\n    attn_heads,\n    sink_size=64,\n    recent_size=256,\n)\n\n# 将模型转移到 GPU\nmodel = model.cuda()\n\n# 准备推理！\n```\n\n## 演示\n完成环境搭建后，您可以运行以下脚本，在 `Llama-3-8B-Instruct-Gradient-4194k` 模型上执行 W8A8KV4 结合 DuoAttention 的演示。该演示设计为在单张 A100 GPU 上运行，支持高达 330 万 token 的上下文长度。\n\n```bash\nbash scripts\u002Frun_demo.sh\n```\n\n## 结果\n\n### 检索头识别\n在准备好数据集和模型后，可以运行训练脚本以识别检索头。对于我们评估的模型，相应的注意力模式可在 `attn_patterns` 目录中找到。\n\n```bash\nbash scripts\u002Frun_train.sh\n```\n\n### 草堆里的针（NIAH）\nDuoAttention 在“草堆里的针”基准测试上，使用 MHA 模型时全注意力比例为 25%，GQA 模型时为 50%，即可达到与全注意力相当的准确率。\n\n```bash\nbash scripts\u002Frun_niah.sh\n```\n\n![niah](https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Fmit-han-lab_duo-attention_readme_8bae986001e0.jpg)\n\n### LongBench\n\n```bash\nbash scripts\u002Frun_longbench.sh\n```\n\nDuoAttention 在 LongBench 基准测试中提供了更好的 KV 
预算与准确率之间的权衡。\n\n![longbench](https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Fmit-han-lab_duo-attention_readme_08603755a46d.jpg)\n\n### 效率\n\n```bash\nbash scripts\u002Frun_efficiency.sh\n```\n\n- 不同上下文长度下，DuoAttention 相较于全注意力的每 token 解码延迟和内存占用情况。对于 Llama-2-7B（MHA），DuoAttention 使用 25% 的检索头比例；对于 Llama-3-8B（GQA），则使用 50% 的检索头比例。随着上下文长度的增加，DuoAttention 在 MHA 模型上可实现最高 2.45 倍的内存减少和 2.13 倍的延迟降低，在 GQA 模型上则分别达到 1.65 倍和 1.5 倍的减少效果。这些优化效果接近于检索头比例的倒数。内存不足（OOM）的结果基于实测数据进行线性外推。\n\n![efficiency_decoding](https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Fmit-han-lab_duo-attention_readme_08a5422499fe.jpg)\n\n- 不同预填充块大小下，DuoAttention 相较于全注意力的预填充延迟和内存占用情况。对于 Llama-2-7B（MHA），DuoAttention 使用 25% 的检索头比例，预填充 10 万 token 的上下文；而对于 Llama-3-8B（GQA），则使用 50% 的检索头比例，预填充 32 万 token 的上下文。随着预填充块大小的减小，DuoAttention 在 MHA 模型上可实现最高 1.73 倍的延迟降低，在 GQA 模型上则可达 1.63 倍；同时，内存占用方面，MHA 模型可减少至多 2.38 倍，GQA 模型则可减少至多 1.53 倍。\n\n![efficiency_prefilling](https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Fmit-han-lab_duo-attention_readme_5a35a0ec89f2.jpg)\n\n- DuoAttention 在固定上下文长度下的解码内存和延迟与 KV 预算的关系。当检索头比例降低时，内存和延迟会线性下降。DuoAttention 在 MHA 模型上可实现最高 2.55 倍的内存减少和 2.18 倍的延迟降低，在 GQA 模型上则分别达到 1.67 倍和 1.5 倍的减少效果。\n\n![efficiency_curve](https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Fmit-han-lab_duo-attention_readme_6846e2fce8db.jpg)\n\n- 结合 8 位权重和 4 位 KV 缓存量化技术，DuoAttention 可使 Llama-3-8B 模型在单张 A100-80G GPU 上容纳 330 万 token。\n\n\u003Cp align=\"center\">\n\u003Cimg src=\"https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Fmit-han-lab_duo-attention_readme_537df4070350.jpg\" alt=\"kv_capacity\" width=\"400\"\u002F>\n\u003C\u002Fp>\n\n## 引用\n\n如果您认为 DuoAttention 对您的项目或研究有用或具有参考价值，请引用我们的论文：\n\n```bibtex\n@article{xiao2024duo,\n        title={DuoAttention: Efficient Long-Context LLM Inference with Retrieval and Streaming Heads},\n        author={Xiao, Guangxuan and Tang, Jiaming and Zuo, Jingwei and Guo, Junxian and Yang, Shang and Tang, Haotian and Fu, Yao and Han, Song},\n        journal={arXiv},\n        
year={2024}\n}\n```","# DuoAttention 快速上手指南\n\nDuoAttention 是一个用于长上下文大语言模型（LLM）高效推理的框架。它通过区分“检索头”（Retrieval Heads）和“流式头”（Streaming Heads），仅对关键头部保留完整 KV 缓存，从而显著降低显存占用并提升推理速度，同时保持长上下文能力。\n\n## 1. 环境准备\n\n### 系统要求\n- **操作系统**: Linux (推荐 Ubuntu)\n- **GPU**: NVIDIA GPU (支持 CUDA 12.4)\n- **Python**: 3.10\n- **显存**: 建议 24GB+ (根据模型大小和上下文长度而定)\n\n### 前置依赖\n确保已安装 `conda` 和 `git`。本指南使用 Conda 管理环境以避免依赖冲突。\n\n## 2. 安装步骤\n\n以下命令将创建一个名为 `duo` 的独立环境并安装所有必要依赖。\n\n```bash\n# 1. 创建并激活 Conda 环境\nconda create -yn duo python=3.10\nconda activate duo\n\n# 2. 安装基础工具与 CUDA  toolkit (版本 12.4)\nconda install -y git\nconda install -y nvidia\u002Flabel\u002Fcuda-12.4.0::cuda-toolkit\nconda install -y nvidia::cuda-cudart-dev\n\n# 3. 安装 PyTorch (CUDA 12.4 版本)\n# 注意：pytorch-cuda 是 conda 包，-c 用于指定 conda channel，因此此步必须使用 conda 而非 pip\nconda install -y pytorch torchvision torchaudio pytorch-cuda=12.4 -c pytorch -c nvidia\n\n# 4. 安装 Python 依赖包\npip install transformers==4.45.2 accelerate sentencepiece datasets wandb zstandard matplotlib huggingface_hub==0.25.2\npip install tensor_parallel==2.0.0\npip install ninja packaging\n\n# 5. 安装 Flash Attention (需无隔离构建)\npip install flash-attn==2.6.3 --no-build-isolation\n\n# 6. 安装评估工具 (可选，如需运行 LongBench)\npip install seaborn rouge_score einops pandas\n\n# 7. 安装 FlashInfer (注意 CUDA 和 Torch 版本匹配)\npip install flashinfer -i https:\u002F\u002Fflashinfer.ai\u002Fwhl\u002Fcu121\u002Ftorch2.4\u002F\n\n# 8. 安装 DuoAttention\n# 先克隆仓库或下载源码后，在根目录执行：\npip install -e .\n\n# 9. 安装 Block Sparse Streaming Attention 组件\ngit clone https:\u002F\u002Fgithub.com\u002Fmit-han-lab\u002FBlock-Sparse-Attention\ncd Block-Sparse-Attention\npython setup.py install\ncd ..\n```\n\n> **提示**：如果在国内网络环境下安装 `pip` 包较慢，可在每条 `pip install` 命令后添加 `-i https:\u002F\u002Fpypi.tuna.tsinghua.edu.cn\u002Fsimple`。\n\n## 3. 
基本使用\n\nDuoAttention 提供了一键补丁功能，可快速应用于 HuggingFace 模型（如 Llama-3, Mistral 等）。以下是启用优化的最小代码示例。\n\n### 前置准备：下载模型与注意力模式\n在使用前，请确保已下载支持的模型和预训练的注意力头模式文件。\n\n```bash\nmkdir -p models attn_patterns\ncd models\n\n# 示例：下载 Llama-3-8B-Instruct-Gradient-1048k 模型\nhuggingface-cli download gradientai\u002FLlama-3-8B-Instruct-Gradient-1048k --local-dir Llama-3-8B-Instruct-Gradient-1048k\n\n# 注意：attn_patterns 目录需包含对应的模式文件，可从项目 releases 或仓库获取\n# 此处假设已存在文件：attn_patterns\u002FLlama-3-8B-Instruct-Gradient-1048k\u002Flr=0.02-reg=0.05-ctx=1000_32000-multi_passkey10\n```\n\n### Python 推理示例\n\n```python\nfrom duo_attn.utils import load_attn_pattern, sparsify_attention_heads\nfrom duo_attn.patch import enable_duo_attention_eval\nimport transformers\nimport torch\n\n# 1. 加载模型\nmodel = transformers.AutoModelForCausalLM.from_pretrained(\n    \"models\u002FLlama-3-8B-Instruct-Gradient-1048k\",\n    torch_dtype=torch.bfloat16,\n    low_cpu_mem_usage=True,\n    attn_implementation=\"eager\",\n)\n\n# 2. 加载预训练的注意力头模式\n# 路径需对应具体模型和训练配置\nattn_heads, sink_size, recent_size = load_attn_pattern(\n    \"attn_patterns\u002FLlama-3-8B-Instruct-Gradient-1048k\u002Flr=0.02-reg=0.05-ctx=1000_32000-multi_passkey10\"\n)\n\n# 3. 稀疏化注意力头 (例如设置 50% 的检索头比例)\nattn_heads, sparsity = sparsify_attention_heads(attn_heads, sparsity=0.5)\n\n# 4. 启用 DuoAttention 优化\nenable_duo_attention_eval(\n    model,\n    attn_heads,\n    sink_size=64,      # 注意力汇大小\n    recent_size=256,   # 近期 token 窗口大小\n)\n\n# 5. 将模型移至 GPU\nmodel = model.cuda()\n\n# 6. 
开始推理\ntokenizer = transformers.AutoTokenizer.from_pretrained(\"models\u002FLlama-3-8B-Instruct-Gradient-1048k\")\ninputs = tokenizer(\"你的长文本输入...\", return_tensors=\"pt\").to(model.device)\noutputs = model.generate(**inputs, max_new_tokens=100)\nprint(tokenizer.decode(outputs[0]))\n```\n\n### 运行官方 Demo\n若需体验极致长上下文（如 3.3M tokens），可运行官方提供的 Demo 脚本（需单卡 A100）：\n\n```bash\nbash scripts\u002Frun_demo.sh\n```","某法律科技团队正在构建基于 Llama-3-8B 的智能合同审查系统，需要处理长达数万字的复杂并购协议并实时提取风险条款。\n\n### 没有 duo-attention 时\n- **显存严重溢出**：全量缓存所有注意力头的 KV 状态导致单张 A100 显卡无法承载超过百万 token 的上下文，被迫将长文档切片处理，破坏了文档的整体逻辑连贯性。\n- **响应延迟极高**：在预填充（Prefilling）和解码阶段，模型需计算所有 token 的全局注意力，导致首字生成等待时间过长，无法满足律师交互式查询的需求。\n- **硬件成本高昂**：为了勉强运行长上下文推理，不得不部署多卡并行集群，显著推高了基础设施运维成本。\n- **精度与效率难两全**：若采用传统的 KV 剪枝策略加速，往往误删关键历史信息的注意力头，导致模型遗漏跨段落的风险关联，审查准确率大幅下降。\n\n### 使用 duo-attention 后\n- **显存占用骤降**：duo-attention 智能识别仅少数“检索头”需全量缓存，对其余“流式头”采用轻量定长缓存，使单卡可支持高达 330 万 token 的上下文，完整读入整本合同。\n- **推理速度倍增**：通过差异化处理机制，解码速度提升最高达 2.18 倍，预填充加速 1.73 倍，实现了近乎实时的长文档问答体验。\n- **单机即可部署**：结合量化技术，单张 A100 即可流畅运行超长上下文任务，无需多卡互联，大幅降低了硬件门槛和运营成本。\n- **能力无损保留**：在显著提升效率的同时，精准保留了模型对长距离依赖的捕捉能力，确保跨章节的法律风险点被准确识别，准确率与全注意力机制基本持平。\n\nduo-attention 通过区分关键检索头与普通流式头，打破了长上下文大模型在显存与速度上的瓶颈，让单机实时处理百万级文档成为现实。","https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Fmit-han-lab_duo-attention_83cb76b0.png","mit-han-lab","MIT HAN Lab","https:\u002F\u002Foss.gittoolsai.com\u002Favatars\u002Fmit-han-lab_65e6a38d.png","Efficient AI Computing. PI: Song Han",null,"songhan_mit","https:\u002F\u002Fhanlab.mit.edu","https:\u002F\u002Fgithub.com\u002Fmit-han-lab",[85,89],{"name":86,"color":87,"percentage":88},"Python","#3572A5",97.5,{"name":90,"color":91,"percentage":92},"Shell","#89e051",2.5,535,40,"2026-04-03T02:34:26","MIT",4,"Linux","必需 NVIDIA GPU。训练\u002F评估环境需 CUDA 12.4；Demo 环境同样安装 CUDA 12.4 工具包，其 FlashInfer 使用 cu121\u002Ftorch2.3 预编译包。官方演示在单张 A100 (80GB) 上运行，支持高达 330 万上下文长度（结合量化）。","未说明（取决于模型大小和上下文长度，长上下文场景建议高内存）",{"notes":102,"python":103,"dependencies":104},"1. 该项目主要面向 Linux 环境，依赖特定的 CUDA 版本（主要为 12.4），安装 flash-attn 和 flashinfer 时需严格匹配 PyTorch 和 CUDA 版本。2. 首次使用需下载特定长上下文模型（如 Llama-3-8B-Instruct-Gradient-1048k）及预训练的注意力头模式文件。3. 
训练\u002F评估环境依赖额外的 'Block-Sparse-Attention' 库，Demo 环境依赖额外的 'QServe' 库，两者均需单独克隆并编译安装。4. 若需自行训练检索头模式，需准备特定的数据集（如 booksum.jsonl.zst）。","3.10",[105,106,107,108,109,110,111,112,113,114],"torch==2.4.0 (CUDA 12.4)","transformers==4.45.2","flash-attn==2.6.3","flashinfer","accelerate","tensor_parallel==2.0.0","ninja","datasets","huggingface_hub==0.25.2","qserve (仅 Demo 环境)",[26,13],"2026-03-27T02:49:30.150509","2026-04-06T07:14:04.969537",[],[]]