[{"data":1,"prerenderedAt":-1},["ShallowReactive",2],{"similar-chengzeyi--stable-fast":3,"tool-chengzeyi--stable-fast":61},[4,18,28,37,45,53],{"id":5,"name":6,"github_repo":7,"description_zh":8,"stars":9,"difficulty_score":10,"last_commit_at":11,"category_tags":12,"status":17},4358,"openclaw","openclaw\u002Fopenclaw","OpenClaw 是一款专为个人打造的本地化 AI 助手，旨在让你在自己的设备上拥有完全可控的智能伙伴。它打破了传统 AI 助手局限于特定网页或应用的束缚，能够直接接入你日常使用的各类通讯渠道，包括微信、WhatsApp、Telegram、Discord、iMessage 等数十种平台。无论你在哪个聊天软件中发送消息，OpenClaw 都能即时响应，甚至支持在 macOS、iOS 和 Android 设备上进行语音交互，并提供实时的画布渲染功能供你操控。\n\n这款工具主要解决了用户对数据隐私、响应速度以及“始终在线”体验的需求。通过将 AI 部署在本地，用户无需依赖云端服务即可享受快速、私密的智能辅助，真正实现了“你的数据，你做主”。其独特的技术亮点在于强大的网关架构，将控制平面与核心助手分离，确保跨平台通信的流畅性与扩展性。\n\nOpenClaw 非常适合希望构建个性化工作流的技术爱好者、开发者，以及注重隐私保护且不愿被单一生态绑定的普通用户。只要具备基础的终端操作能力（支持 macOS、Linux 及 Windows WSL2），即可通过简单的命令行引导完成部署。如果你渴望拥有一个懂你",349277,3,"2026-04-06T06:32:30",[13,14,15,16],"Agent","开发框架","图像","数据工具","ready",{"id":19,"name":20,"github_repo":21,"description_zh":22,"stars":23,"difficulty_score":24,"last_commit_at":25,"category_tags":26,"status":17},9989,"n8n","n8n-io\u002Fn8n","n8n 是一款面向技术团队的公平代码（fair-code）工作流自动化平台，旨在让用户在享受低代码快速构建便利的同时，保留编写自定义代码的灵活性。它主要解决了传统自动化工具要么过于封闭难以扩展、要么完全依赖手写代码效率低下的痛点，帮助用户轻松连接 400 多种应用与服务，实现复杂业务流程的自动化。\n\nn8n 特别适合开发者、工程师以及具备一定技术背景的业务人员使用。其核心亮点在于“按需编码”：既可以通过直观的可视化界面拖拽节点搭建流程，也能随时插入 JavaScript 或 Python 代码、调用 npm 包来处理复杂逻辑。此外，n8n 原生集成了基于 LangChain 的 AI 能力，支持用户利用自有数据和模型构建智能体工作流。在部署方面，n8n 提供极高的自由度，支持完全自托管以保障数据隐私和控制权，也提供云端服务选项。凭借活跃的社区生态和数百个现成模板，n8n 让构建强大且可控的自动化系统变得简单高效。",184740,2,"2026-04-19T23:22:26",[16,14,13,15,27],"插件",{"id":29,"name":30,"github_repo":31,"description_zh":32,"stars":33,"difficulty_score":10,"last_commit_at":34,"category_tags":35,"status":17},10095,"AutoGPT","Significant-Gravitas\u002FAutoGPT","AutoGPT 是一个旨在让每个人都能轻松使用和构建 AI 的强大平台，核心功能是帮助用户创建、部署和管理能够自动执行复杂任务的连续型 AI 智能体。它解决了传统 AI 应用中需要频繁人工干预、难以自动化长流程工作的痛点，让用户只需设定目标，AI 即可自主规划步骤、调用工具并持续运行直至完成任务。\n\n无论是开发者、研究人员，还是希望提升工作效率的普通用户，都能从 AutoGPT 中受益。开发者可利用其低代码界面快速定制专属智能体；研究人员能基于开源架构探索多智能体协作机制；而非技术背景用户也可直接选用预置的智能体模板，立即投入实际工作场景。\n\nAutoGPT 的技术亮点在于其模块化“积木式”工作流设计——用户通过连接功能块即可构建复杂逻辑，每个块负责单一动作，灵活且易于调试。同时，平台支持本地自托管与云端部署两种模式，兼顾数据隐私与使用便捷性。配合完善的文档和一键安装脚本，即使是初次接触的用户也能在几分钟内启动自己的第一个 AI 智能体。AutoGPT 正致力于降低 AI 应用门槛，让人人都能成为 AI 的创造者与受益者。",183572,"2026-04-20T04:47:55",[13,36,27,14,15],"语言模型",{"id":38,"name":39,"github_repo":40,"description_zh":41,"stars":42,"difficulty_score":10,"last_commit_at":43,"category_tags":44,"status":17},3808,"stable-diffusion-webui","AUTOMATIC1111\u002Fstable-diffusion-webui","stable-diffusion-webui 是一个基于 Gradio 构建的网页版操作界面，旨在让用户能够轻松地在本地运行和使用强大的 Stable Diffusion 图像生成模型。它解决了原始模型依赖命令行、操作门槛高且功能分散的痛点，将复杂的 AI 绘图流程整合进一个直观易用的图形化平台。\n\n无论是希望快速上手的普通创作者、需要精细控制画面细节的设计师，还是想要深入探索模型潜力的开发者与研究人员，都能从中获益。其核心亮点在于极高的功能丰富度：不仅支持文生图、图生图、局部重绘（Inpainting）和外绘（Outpainting）等基础模式，还独创了注意力机制调整、提示词矩阵、负向提示词以及“高清修复”等高级功能。此外，它内置了 GFPGAN 和 CodeFormer 等人脸修复工具，支持多种神经网络放大算法，并允许用户通过插件系统无限扩展能力。即使是显存有限的设备，stable-diffusion-webui 也提供了相应的优化选项，让高质量的 AI 艺术创作变得触手可及。",162132,"2026-04-05T11:01:52",[14,15,13],{"id":46,"name":47,"github_repo":48,"description_zh":49,"stars":50,"difficulty_score":24,"last_commit_at":51,"category_tags":52,"status":17},1381,"everything-claude-code","affaan-m\u002Feverything-claude-code","everything-claude-code 是一套专为 AI 编程助手（如 Claude Code、Codex、Cursor 等）打造的高性能优化系统。它不仅仅是一组配置文件，而是一个经过长期实战打磨的完整框架，旨在解决 AI 代理在实际开发中面临的效率低下、记忆丢失、安全隐患及缺乏持续学习能力等核心痛点。\n\n通过引入技能模块化、直觉增强、记忆持久化机制以及内置的安全扫描功能，everything-claude-code 能显著提升 AI 在复杂任务中的表现，帮助开发者构建更稳定、更智能的生产级 AI 代理。其独特的“研究优先”开发理念和针对 Token 消耗的优化策略，使得模型响应更快、成本更低，同时有效防御潜在的攻击向量。\n\n这套工具特别适合软件开发者、AI 研究人员以及希望深度定制 AI 
工作流的技术团队使用。无论您是在构建大型代码库，还是需要 AI 协助进行安全审计与自动化测试，everything-claude-code 都能提供强大的底层支持。作为一个曾荣获 Anthropic 黑客大奖的开源项目，它融合了多语言支持与丰富的实战钩子（hooks），让 AI 真正成长为懂上下文的开发伙伴。",161692,"2026-04-20T11:33:57",[14,13,36],{"id":54,"name":55,"github_repo":56,"description_zh":57,"stars":58,"difficulty_score":24,"last_commit_at":59,"category_tags":60,"status":17},2271,"ComfyUI","Comfy-Org\u002FComfyUI","ComfyUI 是一款功能强大且高度模块化的视觉 AI 引擎，专为设计和执行复杂的 Stable Diffusion 图像生成流程而打造。它摒弃了传统的代码编写模式，采用直观的节点式流程图界面，让用户通过连接不同的功能模块即可构建个性化的生成管线。\n\n这一设计巧妙解决了高级 AI 绘图工作流配置复杂、灵活性不足的痛点。用户无需具备编程背景，也能自由组合模型、调整参数并实时预览效果，轻松实现从基础文生图到多步骤高清修复等各类复杂任务。ComfyUI 拥有极佳的兼容性，不仅支持 Windows、macOS 和 Linux 全平台，还广泛适配 NVIDIA、AMD、Intel 及苹果 Silicon 等多种硬件架构，并率先支持 SDXL、Flux、SD3 等前沿模型。\n\n无论是希望深入探索算法潜力的研究人员和开发者，还是追求极致创作自由度的设计师与资深 AI 绘画爱好者，ComfyUI 都能提供强大的支持。其独特的模块化架构允许社区不断扩展新功能，使其成为当前最灵活、生态最丰富的开源扩散模型工具之一，帮助用户将创意高效转化为现实。",109154,"2026-04-18T11:18:24",[14,15,13],{"id":62,"github_repo":63,"name":64,"description_en":65,"description_zh":66,"ai_summary_zh":67,"readme_en":68,"readme_zh":69,"quickstart_zh":70,"use_case_zh":71,"hero_image_url":72,"owner_login":73,"owner_name":74,"owner_avatar_url":75,"owner_bio":76,"owner_company":77,"owner_location":78,"owner_email":79,"owner_twitter":73,"owner_website":80,"owner_url":81,"languages":82,"stars":95,"forks":96,"last_commit_at":97,"license":98,"difficulty_score":10,"env_os":99,"env_gpu":100,"env_ram":101,"env_deps":102,"category_tags":110,"github_topics":112,"view_count":24,"oss_zip_url":123,"oss_zip_packed_at":123,"status":17,"created_at":124,"updated_at":125,"faqs":126,"releases":156},10173,"chengzeyi\u002Fstable-fast","stable-fast","https:\u002F\u002Fwavespeed.ai\u002F Best inference performance optimization framework for HuggingFace Diffusers on NVIDIA GPUs.","stable-fast 是一个专为 NVIDIA GPU 设计的推理优化框架，旨在大幅提升 HuggingFace Diffusers 库中各类生成式模型（如图像生成、视频生成模型）的运行速度。它主要解决了传统加速方案（如 TensorRT）编译耗时过长的问题：以往可能需要数十分钟才能完成模型编译，而 stable-fast 仅需数秒即可就绪，同时还能保持业界领先的推理性能。\n\n该工具特别适合需要高效部署扩散模型的开发者、研究人员以及追求本地快速生成的技术爱好者。无论是运行最新的 FLUX、Wan 2.1 视频模型，还是使用 Stable Video Diffusion，stable-fast 都能提供流畅的体验。其独特的技术亮点在于“开箱即用”地支持动态形状（dynamic shape）、LoRA 微调权重以及 ControlNet 控制网络，无需复杂的额外配置。这意味着用户在尝试不同分辨率或叠加多种插件时，无需重新编译模型，极大地提升了工作流的灵活性与效率。虽然目前针对最新架构的主动开发已转向新项目，但 stable-fast 依然是当前兼容旧款主流扩散模型、实现极速推理的可靠选择。","stable-fast 是一个专为 NVIDIA GPU 设计的推理优化框架，旨在大幅提升 HuggingFace Diffusers 库中各类生成式模型（如图像生成、视频生成模型）的运行速度。它主要解决了传统加速方案（如 TensorRT）编译耗时过长的问题：以往可能需要数十分钟才能完成模型编译，而 stable-fast 仅需数秒即可就绪，同时还能保持业界领先的推理性能。\n\n该工具特别适合需要高效部署扩散模型的开发者、研究人员以及追求本地快速生成的技术爱好者。无论是运行最新的 FLUX、Wan 2.1 视频模型，还是使用 Stable Video Diffusion，stable-fast 都能提供流畅的体验。其独特的技术亮点在于“开箱即用”地支持动态形状（dynamic shape）、LoRA 微调权重以及 ControlNet 控制网络，无需复杂的额外配置。这意味着用户在尝试不同分辨率或叠加多种插件时，无需重新编译模型，极大地提升了工作流的灵活性与效率。虽然目前针对最新架构的主动开发已转向新项目，但 stable-fast 依然是当前兼容旧款主流扩散模型、实现极速推理的可靠选择。","# 🚀Stable Fast\n\n[Blazing Fast FLUX-dev with LoRAs](https:\u002F\u002Fwavespeed.ai\u002Fmodels\u002Fwavespeed-ai\u002Fflux-dev-lora)\n\n[Blazing Fast Wan 2.1 T2V with LoRAs](https:\u002F\u002Fwavespeed.ai\u002Fmodels\u002Fwavespeed-ai\u002Fwan-2.1\u002Ft2v-480p)\n\n[Blazing Fast Wan 2.1 I2V with LoRAs](https:\u002F\u002Fwavespeed.ai\u002Fmodels\u002Fwavespeed-ai\u002Fwan-2.1\u002Fi2v-480p)\n\n## 🎉Important Announcement🎉\n\nAfter one year of delay, I am happy to announce I plan to build a new project [Comfy-WaveSpeed](https:\u002F\u002Fgithub.com\u002Fchengzeyi\u002FComfy-WaveSpeed) to provide the fastest inference speed for all models running with `ComfyUI`.\nIt has just started and I hope it will be a great project👏. 
Please keep an eye on it and give me feedback👍!\n\n[![wheels](https:\u002F\u002Fgithub.com\u002Fchengzeyi\u002Fstable-fast\u002Factions\u002Fworkflows\u002Fwheels.yml\u002Fbadge.svg?branch=main)](https:\u002F\u002Fgithub.com\u002Fchengzeyi\u002Fstable-fast\u002Factions\u002Fworkflows\u002Fwheels.yml)\n[![Upload Python Package](https:\u002F\u002Fgithub.com\u002Fchengzeyi\u002Fstable-fast\u002Factions\u002Fworkflows\u002Fpython-publish.yml\u002Fbadge.svg)](https:\u002F\u002Fgithub.com\u002Fchengzeyi\u002Fstable-fast\u002Factions\u002Fworkflows\u002Fpython-publish.yml)\n[![Open In Colab](https:\u002F\u002Fcolab.research.google.com\u002Fassets\u002Fcolab-badge.svg)](https:\u002F\u002Fcolab.research.google.com\u002Fgithub\u002Fchengzeyi\u002Fstable-fast-colab\u002Fblob\u002Fmain\u002Fstable_fast_colab.ipynb)\n\n__NOTE__\n\nActive development on `stable-fast` has been paused. I am currently working on a new `torch._dynamo` based project targeting new models such as `stable-cascade`, `SD3` and `Sora`-like models.\nIt will be faster and more flexible, and will support more hardware backends beyond `CUDA`.\n\nContact is welcome.\n\n[Discord Channel](https:\u002F\u002Fdiscord.gg\u002FkQFvfzM4SJ)\n\n`stable-fast` achieves SOTA inference performance on __ALL__ kinds of diffuser models, even with the latest `StableVideoDiffusionPipeline`.\nAnd unlike `TensorRT` or `AITemplate`, which take dozens of minutes to compile a model, `stable-fast` only takes a few seconds to compile a model.\n`stable-fast` also supports `dynamic shape`, `LoRA` and `ControlNet` out of the box.\n\n[![](https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Fchengzeyi_stable-fast_readme_b54e33e740dc.png)](https:\u002F\u002Fmermaid.live\u002Fedit#pako:eNpVUsFu2zAM_RVCQJCLnchOncQ-DBjWHXbYpc1hWNUDbdO2AFsyLKVNFvjfR8VFhwkwQT5LfOQjb6KyNYlCxHGsTGVNo9tCGeBzuX7rcPIfUTjvuvZdAUcp_2Ed6bbzBaQZg_ckq9VNG83Qbe07GmhdwLqxEzm_nmFerZS5XKuQOS7JI3R20n-s8dgr47XvCZQ46YGA30BLhib02rRgDYEesCUuw3fw_AjJJosgS9ILf-xIcJ5GF_FNeDr9ggd5lEowW4wX7eBFiTc0uu8RvJ2qTomIme7uprLDqHtaoK8_TjSMPfqPmImb3r4vwYmMs9PTCfKN3CQL5jyWPcUNOq_EqzLXhVCJgdm0I1a1dkqAhDj-AqkM8ilT4gQvyTE_RJBkiWSbZEe2Uu4iyPMkmOxVRGKgaUBd84xuQXauOaiqRMFuiY5CjzPfw7O3z1dTicJPZ4rEeay5h0eN7YTD_-D3WnPjomiwdwz2Fmvi8Cb8dQzL0GrnOeOyDgE_Tz3DnfejK7bb8HvT8hTOZZBu63Qd5tm95fvtPt0fMd3R_rDDbLerqzLJj036kDT1QSYpinmOxIjmt7XDZwF0r-fnson3hZz_AskE0h8)\n\n[![](https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Fchengzeyi_stable-fast_readme_19642c69aeb1.png)](https:\u002F\u002Fmermaid.live\u002Fedit#pako:eNpFUk1v2zAM_SuEgCAXu3HsfLg-7LL22MsaDMWqHmiLtgXYUmAxWTLD_310UrQ8SY8S3-MjR1V5Q6pQcRxrV3lX26bQDiQu158tDvx5m-OvNdwWkCfJN9aSbVouIE0FvBVZLEbrrEDjklvqaVnAsvYDBV5OMC0W2l2u1Vw5LokRWj_Yf94xdtqx5Y5Aq4PtCeQPNORoQLauAe8IztaQFxncwuvvp_jtEME6STeX7X4XQbqFesCeQiRv4dfhDTZJnmglfDFebIB3rc7obNchsB-qVqtIuG7Hh8r3R9vRHQqMZUdxjYG1-tDuev8vCRKHTNAKEojjH0IuTWtX4gDveRbBPolgk3-oSPU09GiNGDvOXgnNbIVWhRxLDDTLmuQdnti_Xl2lCh5OFKnT0SDTk8VGWlFFjV34Qp-NFbFfYOfRkFxHxdfjPMLGBpaS9yHO-GnoBG6Zj6FYreb0QyPencq53VWwZp5Ce37crXbpLsc0o90-w22WmapcP-Z1ulnXZp-sU1TTFKkjuj_ef6uim56X-_7c1mj6D1vdvkY)\n\n| Model       | torch | torch.compile | AIT  | oneflow | TensorRT | __stable-fast__ |\n| ----------- | ----- | ------------- | ---- | ------- | -------- | --------------- |\n| SD 1.5 (ms) | 1897  | 1510          | 1158 | 1003    | 991      | __995__         |\n| SVD-XT (s)  | 83    | 70            |      |         |          | __47__          |\n\n__NOTE__: During benchmarking, `TensorRT` is tested with `static batch size` and `CUDA Graph enabled` while `stable-fast` is running with dynamic shape.\n\n- [🚀Stable Fast](#stable-fast)\n  - [Introduction](#introduction)\n    - [What is this?](#what-is-this)\n    - [Differences With Other Acceleration Libraries](#differences-with-other-acceleration-libraries)\n  - [Installation](#installation)\n    - [Install Prebuilt Wheels](#install-prebuilt-wheels)\n    - [Install From Source](#install-from-source)\n  - [Usage](#usage)\n    - [Optimize StableDiffusionPipeline](#optimize-stablediffusionpipeline)\n    - [Optimize LCM Pipeline](#optimize-lcm-pipeline)\n    - [Optimize StableVideoDiffusionPipeline](#optimize-stablevideodiffusionpipeline)\n    - [Dynamically Switch LoRA](#dynamically-switch-lora)\n    - [Model Quantization](#model-quantization)\n    - [Some Common Methods To Speed Up PyTorch](#some-common-methods-to-speed-up-pytorch)\n  - [Performance Comparison](#performance-comparison)\n    - [RTX 4080 (512x512, batch size 1, fp16, in WSL2)](#rtx-4080-512x512-batch-size-1-fp16-in-wsl2)\n    - [H100](#h100)\n    - [A100](#a100)\n  - [Compatibility](#compatibility)\n  - [Troubleshooting](#troubleshooting)\n\n## Introduction\n\n### What is this?\n\n`stable-fast` is an ultra lightweight inference optimization framework for __HuggingFace Diffusers__ on __NVIDIA GPUs__.\n`stable-fast` provides super fast inference optimization by utilizing some key techniques and features:\n\n- __CUDNN Convolution Fusion__: `stable-fast` implements a series of fully-functional and fully-compatible CUDNN convolution fusion operators for all kinds of combinations of `Conv + Bias + Add + Act` computation patterns.\n- __Low Precision & Fused GEMM__: `stable-fast` implements a series of fused GEMM operators that compute with `fp16` precision, which is faster than PyTorch's defaults (read & write with `fp16` while computing with `fp32`).\n- __Fused Linear GEGLU__: `stable-fast` is able to fuse `GEGLU(x, W, V, b, c) = GELU(xW + b) ⊗ (xV + c)` into one CUDA kernel (see the reference sketch after this list).\n- __NHWC & Fused GroupNorm__: `stable-fast` implements a highly optimized fused NHWC `GroupNorm + Silu` operator with OpenAI's `Triton`, which eliminates the need of memory format permutation operators.\n- __Fully Traced Model__: `stable-fast` improves the `torch.jit.trace` interface to make it better suited to tracing complex models. Nearly every part of `StableDiffusionPipeline\u002FStableVideoDiffusionPipeline` can be traced and converted to __TorchScript__. It is more stable than `torch.compile`, has a significantly lower CPU overhead than `torch.compile`, and supports __ControlNet__ and __LoRA__.\n- __CUDA Graph__: `stable-fast` can capture the `UNet`, `VAE` and `TextEncoder` into CUDA Graph format, which can reduce the CPU overhead when the batch size is small. This implementation also supports dynamic shape.\n- __Fused Multihead Attention__: `stable-fast` just uses xformers and makes it compatible with __TorchScript__.\n\n
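For reference, here is a minimal unfused PyTorch sketch of the GEGLU computation that the fused kernel replaces (an illustration of the formula above, not the fused operator itself):\n\n```python\nimport torch\nimport torch.nn.functional as F\n\ndef geglu_reference(x, W, V, b, c):\n    # GEGLU(x, W, V, b, c) = GELU(xW + b) ⊗ (xV + c)\n    # stable-fast fuses the two matmuls, the bias adds, the GELU and the\n    # elementwise product below into a single CUDA kernel.\n    return F.gelu(x @ W + b) * (x @ V + c)\n```\n\n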
My next goal is to keep `stable-fast` as one of the fastest inference optimization frameworks for `diffusers` and also\nprovide both speedup and VRAM reduction for `transformers`.\nIn fact, I already use `stable-fast` to optimize LLMs and achieve a significant speedup.\nBut I still need to do some work to make it more stable and easy to use, and to provide a stable user interface.\n\n### Differences With Other Acceleration Libraries\n\n- __Fast__: `stable-fast` is specially optimized for __HuggingFace Diffusers__. It achieves high performance across many libraries. And it provides a very fast compilation speed within only a few seconds. 
It is significantly faster than `torch.compile`, `TensorRT` and `AITemplate` in compilation time.\n- __Minimal__: `stable-fast` works as a plugin framework for `PyTorch`. It utilizes existing `PyTorch` functionality and infrastructure and is compatible with other acceleration techniques, as well as popular fine-tuning techniques and deployment solutions.\n- __Maximum Compatibility__: `stable-fast` is compatible with all kinds of `HuggingFace Diffusers` and `PyTorch` versions. It is also compatible with `ControlNet` and `LoRA`. And it even supports the latest `StableVideoDiffusionPipeline` out of the box!\n\n## Installation\n\n__NOTE__: `stable-fast` is currently only tested on `Linux` and `WSL2 in Windows`.\nYou need to install PyTorch with CUDA support first (versions from 1.12 to 2.1 are suggested).\n\nI only test `stable-fast` with `torch>=2.1.0`, `xformers>=0.0.22` and `triton>=2.1.0` on `CUDA 12.1` and `Python 3.10`.\nOther versions might build and run successfully, but that's not guaranteed.\n\n### Install Prebuilt Wheels\n\nDownload the wheel corresponding to your system from the [Releases Page](https:\u002F\u002Fgithub.com\u002Fchengzeyi\u002Fstable-fast\u002Freleases) and install it with `pip3 install \u003Cwheel file>`.\n\nCurrently both __Linux__ and __Windows__ wheels are available.\n\n```bash\n# Change cu121 to your CUDA version and \u003Cwheel file> to the path of the wheel file.\n# And make sure the wheel file is compatible with your PyTorch version.\npip3 install --index-url https:\u002F\u002Fdownload.pytorch.org\u002Fwhl\u002Fcu121 \\\n    'torch>=2.1.0' 'xformers>=0.0.22' 'triton>=2.1.0' 'diffusers>=0.19.3' \\\n    '\u003Cwheel file>'\n```\n\n### Install From Source\n\n```bash\n# Make sure you have CUDNN\u002FCUBLAS installed.\n# https:\u002F\u002Fdeveloper.nvidia.com\u002Fcudnn\n# https:\u002F\u002Fdeveloper.nvidia.com\u002Fcublas\n\n# Install PyTorch with CUDA and the other packages first.\n# Windows users: Triton might not be available; you can skip it.\n# NOTE: 'wheel' is required or you will get a `No module named 'torch'` error when building.\npip3 install wheel 'torch>=2.1.0' 'xformers>=0.0.22' 'triton>=2.1.0' 'diffusers>=0.19.3'\n\n# (Optional) Makes the build much faster.\npip3 install ninja\n\n# Set TORCH_CUDA_ARCH_LIST if running and building on different GPU types.\n# You can also install the latest stable release from PyPI.\n# pip3 install -v -U stable-fast\npip3 install -v -U git+https:\u002F\u002Fgithub.com\u002Fchengzeyi\u002Fstable-fast.git@main#egg=stable-fast\n# (this can take dozens of minutes)\n```\n\n__NOTE__: Any usage outside `sfast.compilers` is not guaranteed to be backward compatible.\n\n__NOTE__: To get the best performance, `xformers` and OpenAI's `triton>=2.1.0` need to be installed and enabled.\nYou might need to build `xformers` from source to make it compatible with your `PyTorch`.\n\n
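The supported, backward-compatible surface lives under `sfast.compilers`. After installation, a quick import smoke test can confirm the package and its compiled extensions load correctly (using only the entry points shown in the usage example below):\n\n```python\n# If this import succeeds, stable-fast is installed and usable.\nfrom sfast.compilers.diffusion_pipeline_compiler import compile, CompilationConfig\n\nconfig = CompilationConfig.Default()\nprint(type(config).__name__)\n```\n\n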
## Usage\n\n### Optimize StableDiffusionPipeline\n\n`stable-fast` is able to optimize `StableDiffusionPipeline` and `StableDiffusionXLPipeline` directly.\n\n```python\nimport time\nimport torch\nfrom diffusers import (StableDiffusionPipeline,\n                       EulerAncestralDiscreteScheduler)\nfrom sfast.compilers.diffusion_pipeline_compiler import (compile,\n                                                         CompilationConfig)\n\ndef load_model():\n    model = StableDiffusionPipeline.from_pretrained(\n        'runwayml\u002Fstable-diffusion-v1-5',\n        torch_dtype=torch.float16)\n\n    model.scheduler = EulerAncestralDiscreteScheduler.from_config(\n        model.scheduler.config)\n    model.safety_checker = None\n    model.to(torch.device('cuda'))\n    return model\n\nmodel = load_model()\n\nconfig = CompilationConfig.Default()\n# xformers and Triton are suggested for achieving best performance.\ntry:\n    import xformers\n    config.enable_xformers = True\nexcept ImportError:\n    print('xformers not installed, skip')\ntry:\n    import triton\n    config.enable_triton = True\nexcept ImportError:\n    print('Triton not installed, skip')\n# CUDA Graph is suggested for small batch sizes and small resolutions to reduce CPU overhead.\n# But it can increase the amount of GPU memory used.\n# For StableVideoDiffusionPipeline it is not needed.\nconfig.enable_cuda_graph = True\n\nmodel = compile(model, config)\n\nkwarg_inputs = dict(\n    prompt=\n    '(masterpiece:1.2), best quality, masterpiece, best detailed face, a beautiful girl',\n    height=512,\n    width=512,\n    num_inference_steps=30,\n    num_images_per_prompt=1,\n)\n\n# NOTE: Warm it up.\n# The initial calls will trigger compilation and might be very slow.\n# After that, it should be very fast.\nfor _ in range(3):\n    output_image = model(**kwarg_inputs).images[0]\n\n# Let's see it!\n# Note: Progress bar might work incorrectly due to the async nature of CUDA.\nbegin = time.time()\noutput_image = model(**kwarg_inputs).images[0]\nprint(f'Inference time: {time.time() - begin:.3f}s')\n\n# Let's view it in terminal!\nfrom sfast.utils.term_image import print_image\n\nprint_image(output_image, max_width=80)\n```\n\nRefer to [examples\u002Foptimize_stable_diffusion_pipeline.py](examples\u002Foptimize_stable_diffusion_pipeline.py) for more details.\n\nYou can check this Colab to see how it works on T4 GPU: [![Open In Colab](https:\u002F\u002Fcolab.research.google.com\u002Fassets\u002Fcolab-badge.svg)](https:\u002F\u002Fcolab.research.google.com\u002Fgithub\u002Fchengzeyi\u002Fstable-fast-colab\u002Fblob\u002Fmain\u002Fstable_fast_colab.ipynb)\n\n
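Because dynamic shape is supported out of the box, the compiled pipeline can be called again at a different resolution without recompiling. An illustrative follow-up call, reusing the `model` from the example above (not from the README):\n\n```python\n# No recompilation is triggered; the same compiled graph serves the new shape.\nimage_768 = model(\n    prompt='a beautiful girl',\n    height=768,\n    width=768,\n    num_inference_steps=30,\n).images[0]\n```\n\n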
### Optimize LCM Pipeline\n\n`stable-fast` is able to optimize the newest `latent consistency model` pipeline and achieve a significant speedup.\n\nRefer to [examples\u002Foptimize_lcm_lora.py](examples\u002Foptimize_lcm_lora.py) for more details about how to optimize a normal SD model with LCM LoRA.\nRefer to [examples\u002Foptimize_lcm_pipeline.py](examples\u002Foptimize_lcm_pipeline.py) for more details about how to optimize the standalone LCM model.\n\n### Optimize StableVideoDiffusionPipeline\n\n`stable-fast` is able to optimize the newest `StableVideoDiffusionPipeline` and achieve a `2x` speedup.\n\nRefer to [examples\u002Foptimize_stable_video_diffusion_pipeline.py](examples\u002Foptimize_stable_video_diffusion_pipeline.py) for more details.\n\n### Dynamically Switch LoRA\n\nSwitching LoRA dynamically is supported, but you need to do some extra work.\nIt is possible because the compiled graph and `CUDA Graph` share the same\nunderlying data (pointers) with the original UNet model. So all you need to do\nis to update the original UNet model's parameters in place.\n\nThe following code assumes you have already loaded a LoRA and compiled the model,\nand you want to switch to another LoRA.\n\nIf you don't enable CUDA Graph and keep `preserve_parameters = True`, things can be much easier.\nThe following code might not even be needed.\n\n```python\n# load_state_dict with assign=True requires torch >= 2.1.0\n\ndef update_state_dict(dst, src):\n    for key, value in src.items():\n        # Do an in-place copy.\n        # As the traced forward function shares the same underlying data (pointers),\n        # this modification will be reflected in the traced forward function.\n        dst[key].copy_(value)\n\n# Switch \"another\" LoRA into the UNet\ndef switch_lora(unet, lora):\n    # Store the original UNet parameters\n    state_dict = unet.state_dict()\n    # Load another LoRA into the unet\n    unet.load_attn_procs(lora)\n    # In-place copy the current UNet parameters onto the original ones\n    update_state_dict(state_dict, unet.state_dict())\n    # Load the original UNet parameters back.\n    # We use assign=True because we still want to hold the references\n    # to the original UNet parameters\n    unet.load_state_dict(state_dict, assign=True)\n\nswitch_lora(compiled_model.unet, lora_b_path)\n```\n\n### Model Quantization\n\n`stable-fast` extends PyTorch's `quantize_dynamic` functionality and provides a dynamically quantized linear operator on the CUDA backend.\nBy enabling it, you could get a slight VRAM reduction for `diffusers` and a significant VRAM reduction for `transformers`,\nand could get a potential speedup (not always).\n\nFor `SD XL`, it is expected to see a VRAM reduction of `2GB` with an image size of `1024x1024`.\n\n```python\ndef quantize_unet(m):\n    from diffusers.utils import USE_PEFT_BACKEND\n    assert USE_PEFT_BACKEND\n    m = torch.quantization.quantize_dynamic(m, {torch.nn.Linear},\n                                            dtype=torch.qint8,\n                                            inplace=True)\n    return m\n\nmodel.unet = quantize_unet(model.unet)\nif hasattr(model, 'controlnet'):\n    model.controlnet = quantize_unet(model.controlnet)\n```\n\nRefer to [examples\u002Foptimize_stable_diffusion_pipeline.py](examples\u002Foptimize_stable_diffusion_pipeline.py) for more details.\n\n### Some Common Methods To Speed Up PyTorch\n\n```bash\n# TCMalloc is highly suggested to reduce CPU overhead\n# https:\u002F\u002Fgithub.com\u002Fgoogle\u002Ftcmalloc\nLD_PRELOAD=\u002Fpath\u002Fto\u002Flibtcmalloc.so python3 ...\n```\n\n```python\nimport packaging.version\nimport torch\n\nif packaging.version.parse(torch.__version__) >= packaging.version.parse('1.12.0'):\n    torch.backends.cuda.matmul.allow_tf32 = True\n```\n\n## Performance Comparison\n\nPerformance varies greatly across different hardware\u002Fsoftware\u002Fplatform\u002Fdriver configurations.\nIt is very hard to benchmark accurately. And preparing the environment for benchmarking is also a hard job.\nI have tested on some platforms before but the results may still be inaccurate.\nNote that when benchmarking, the progress bar shown by `tqdm` may be inaccurate because of the asynchronous nature of CUDA.\nTo solve this problem, I use `CUDA Event` to measure the speed of iterations per second accurately.\n\n
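A minimal sketch of that measurement approach with standard PyTorch CUDA events (standard `torch.cuda` APIs, not the project's benchmark harness):\n\n```python\nimport torch\n\nstart = torch.cuda.Event(enable_timing=True)\nend = torch.cuda.Event(enable_timing=True)\n\nstart.record()\n# ... run the denoising iterations being measured ...\nend.record()\n\n# Synchronize before reading the timer; CUDA work is asynchronous.\ntorch.cuda.synchronize()\nprint(f'{start.elapsed_time(end):.1f} ms')\n```\n\n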
`stable-fast` is expected to work better on newer GPUs and newer CUDA versions.\n__On older GPUs, the performance increase might be limited.__\n__During benchmarking, the progress bar might work incorrectly because of the asynchronous nature of CUDA.__\n\n### RTX 4080 (512x512, batch size 1, fp16, in WSL2)\n\nThis is my personal gaming PC😄. It has a more powerful CPU than those from cloud server providers.\n\n| Framework                                | SD 1.5        | SD XL (1024x1024) | SD 1.5 ControlNet |\n| ---------------------------------------- | ------------- | ----------------- | ----------------- |\n| Vanilla PyTorch (2.1.0)                  | 29.5 it\u002Fs     | 4.6 it\u002Fs          | 19.7 it\u002Fs         |\n| torch.compile (2.1.0, max-autotune)      | 40.0 it\u002Fs     | 6.1 it\u002Fs          | 21.8 it\u002Fs         |\n| AITemplate                               | 44.2 it\u002Fs     |                   |                   |\n| OneFlow                                  | 53.6 it\u002Fs     |                   |                   |\n| AUTO1111 WebUI                           | 17.2 it\u002Fs     | 3.6 it\u002Fs          |                   |\n| AUTO1111 WebUI (with SDPA)               | 24.5 it\u002Fs     | 4.3 it\u002Fs          |                   |\n| TensorRT (AUTO1111 WebUI)                | 40.8 it\u002Fs     |                   |                   |\n| TensorRT Official Demo                   | 52.6 it\u002Fs     |                   |                   |\n| __stable-fast (with xformers & Triton)__ | __51.6 it\u002Fs__ | __9.1 it\u002Fs__      | __36.7 it\u002Fs__     |\n\n### H100\n\nThanks to __@Consceleratus__ and __@harishp__ for their help, I have tested speed on H100.\n\n| Framework                                | SD 1.5         | SD XL (1024x1024) | SD 1.5 ControlNet |\n| ---------------------------------------- | -------------- | ----------------- | ----------------- |\n| Vanilla PyTorch (2.1.0)                  | 54.5 it\u002Fs      | 14.9 it\u002Fs         | 35.8 it\u002Fs         |\n| torch.compile (2.1.0, max-autotune)      | 66.0 it\u002Fs      | 18.5 it\u002Fs         |                   |\n| __stable-fast (with xformers & Triton)__ | __104.6 it\u002Fs__ | __21.6 it\u002Fs__     | __72.6 it\u002Fs__     |\n\n### A100\n\nThanks to __@SuperSecureHuman__ and __@jon-chuang__ for their help, benchmarking on A100 is available now.\n\n| Framework                                | SD 1.5        | SD XL (1024x1024) | SD 1.5 ControlNet |\n| ---------------------------------------- | ------------- | ----------------- | ----------------- |\n| Vanilla PyTorch (2.1.0)                  | 35.6 it\u002Fs     | 8.7 it\u002Fs          | 25.1 it\u002Fs         |\n| torch.compile (2.1.0, max-autotune)      | 41.9 it\u002Fs     | 10.0 it\u002Fs         |                   |\n| __stable-fast (with xformers & Triton)__ | __61.8 it\u002Fs__ | __11.9 it\u002Fs__     | __41.1 it\u002Fs__     |\n\n## Compatibility\n\n| Model                               | Supported |\n| ----------------------------------- | --------- |\n| Hugging Face Diffusers (1.5\u002F2.1\u002FXL) | Yes       |\n| With ControlNet                     | 
Yes       |\n| With LoRA                           | Yes       |\n| Latent Consistency Model            | Yes       |\n| SDXL Turbo                          | Yes       |\n| Stable Video Diffusion              | Yes       |\n\n| Functionality                       | Supported |\n| ----------------------------------- | --------- |\n| Dynamic Shape                       | Yes       |\n| Text to Image                       | Yes       |\n| Image to Image                      | Yes       |\n| Image Inpainting                    | Yes       |\n\n| UI Framework                        | Supported | Link                                                                    |\n| ----------------------------------- | --------- | ----------------------------------------------------------------------- |\n| AUTOMATIC1111                       | WIP       |                                                                         |\n| SD Next                             | Yes       | [`SD Next`](https:\u002F\u002Fgithub.com\u002Fvladmandic\u002Fautomatic)                    |\n| ComfyUI                             | Yes       | [`ComfyUI_stable_fast`](https:\u002F\u002Fgithub.com\u002Fgameltb\u002FComfyUI_stable_fast) |\n\n| Operating System                    | Supported |\n| ----------------------------------- | --------- |\n| Linux                               | Yes       |\n| Windows                             | Yes       |\n| Windows WSL                         | Yes       |\n\n## Troubleshooting\n\nRefer to [doc\u002Ftroubleshooting.md](doc\u002Ftroubleshooting.md) for more details.\n\nAnd you can join the [Discord Channel](https:\u002F\u002Fdiscord.gg\u002FkQFvfzM4SJ) to ask for help.\n","# 🚀稳定快速\n\n[使用LoRA的极速FLUX-dev](https:\u002F\u002Fwavespeed.ai\u002Fmodels\u002Fwavespeed-ai\u002Fflux-dev-lora)\n\n[使用LoRA的极速Wan 2.1 T2V](https:\u002F\u002Fwavespeed.ai\u002Fmodels\u002Fwavespeed-ai\u002Fwan-2.1\u002Ft2v-480p)\n\n[使用LoRA的极速Wan 2.1 I2V](https:\u002F\u002Fwavespeed.ai\u002Fmodels\u002Fwavespeed-ai\u002Fwan-2.1\u002Fi2v-480p)\n\n## 
🎉重要公告🎉\n\n经过一年的延迟，我很高兴地宣布我计划构建一个新项目[Comfy-WaveSpeed](https:\u002F\u002Fgithub.com\u002Fchengzeyi\u002FComfy-WaveSpeed)，以提供使用`ComfyUI`运行的所有模型的最快推理速度。\n它刚刚开始，我希望它会成为一个很棒的项目👏。请继续关注它，并给我反馈👍！\n\n[![轮子](https:\u002F\u002Fgithub.com\u002Fchengzeyi\u002Fstable-fast\u002Factions\u002Fworkflows\u002Fwheels.yml\u002Fbadge.svg?branch=main)](https:\u002F\u002Fgithub.com\u002Fchengzeyi\u002Fstable-fast\u002Factions\u002Fworkflows\u002Fwheels.yml)\n[![上传Python包](https:\u002F\u002Fgithub.com\u002Fchengzeyi\u002Fstable-fast\u002Factions\u002Fworkflows\u002Fpython-publish.yml\u002Fbadge.svg)](https:\u002F\u002Fgithub.com\u002Fchengzeyi\u002Fstable-fast\u002Factions\u002Fworkflows\u002Fpython-publish.yml)\n[![在Colab中打开](https:\u002F\u002Fcolab.research.google.com\u002Fassets\u002Fcolab-badge.svg)](https:\u002F\u002Fcolab.research.google.com\u002Fgithub\u002Fchengzeyi\u002Fstable-fast-colab\u002Fblob\u002Fmain\u002Fstable_fast_colab.ipynb)\n\n__注意__\n\n`stable-fast`的积极开发已被暂停。目前，我正在从事一个基于`torch._dynamo`的新项目，目标是针对像`stable-cascade`、`SD3`和`Sora`这样的新型模型。\n它将更快、更灵活，并且支持更多的硬件后端，而不仅仅是`CUDA`。\n\n欢迎联系。\n\n[Discord频道](https:\u002F\u002Fdiscord.gg\u002FkQFvfzM4SJ)\n\n`stable-fast`在__所有__类型的扩散模型上都实现了最先进的推理性能，即使是使用最新的`StableVideoDiffusionPipeline`也是如此。\n与需要花费数十分钟来编译模型的`TensorRT`或`AITemplate`不同，`stable-fast`只需几秒钟即可完成模型编译。\n`stable-fast`还开箱即用地支持`动态形状`、`LoRA`和`ControlNet`。\n\n[![](https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Fchengzeyi_stable-fast_readme_b54e33e740dc.png)](https:\u002F\u002Fmermaid.live\u002Fedit#pako:eNpVUsFu2zAM_RVCQJCLnchOncQ-DBjWHXbYpc1hWNUDbdO2AFsyLKVNFvjfR8VFhwkwQT5LfOQjb6KyNYlCxHGsTGVNo9tCGeBzuX7rcPIfUTjvuvZdAUcp_2Ed6bbzBaQZg_ckq9VNG83Qbe07GmhdwLqxEzm_nmFerZS5XKuQOS7JI3R20n-s8dgr47XvCZQ46YGA30BLhib02rRgDYEesCUuw3fw_AjJJosgS9ILf-xIcJ5GF_FNeDr9ggd5lEowW4wX7eBFiTc0uu8RvJ2qTomIme7uprLDqHtaoK8_TjSMPfqPmImb3r4vwYmMs9PTCfKN3CQL5jyWPcUNOq_EqzLXhVCJgdm0I1a1dkqAhDj-AqkM8ilT4gQvyTE_RJBkiWSbZEe2Uu4iyPMkmOxVRGKgaUBd84xuQXauOaiqRMFuiY5CjzPfw7O3z1dTicJPZ4rEeay5h0eN7YTD_-D3WnPjomiwdwz2Fmvi8Cb8dQzL0GrnOeOyDgE_Tz3DnfejK7bb8HvT8hTOZZBu63Qd5tm95fvtPt0fMd3R_rDDbLerqzLJj036kDT1QSYpinmOxIjmt7XDZwF0r-fnson3hZz_AskE0h8)\n\n[![](https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Fchengzeyi_stable-fast_readme_19642c69aeb1.png)](https:\u002F\u002Fmermaid.live\u002Fedit#pako:eNpFUk1v2zAM_SuEgCAXu3HsfLg-7LL22MsaDMWqHmiLtgXYUmAxWTLD_310UrQ8SY8S3-MjR1V5Q6pQcRxrV3lX26bQDiQu158tDvx5m-OvNdwWkCfJN9aSbVouIE0FvBVZLEbrrEDjklvqaVnAsvYDBV5OMC0W2l2u1Vw5LokRWj_Yf94xdtqx5Y5Aq4PtCeQPNORoQLauAe8IztaQFxncwuvvp_jtEME6STeX7X4XQbqFesCeQiRv4dfhDTZJnmglfDFebIB3rc7obNchsB-qVqtIuG7Hh8r3R9vRHQqMZUdxjYG1-tDuev8vCRKHTNAKEojjH0IuTWtX4gDveRbBPolgk3-oSPU09GiNGDvOXgnNbIVWhRxLDDTLmuQdnti_Xl2lCh5OFKnT0SDTk8VGWlFFjV34Qp-NFbFfYOfRkFxHxdfjPMLGBpaS9yHO-GnoBG6Zj6FYreb0QyPencq53VWwZp5Ce37crXbpLsc0o90-w22WmapcP-Z1ulnXZp-sU1TTFKkjuj_ef6uim56X-_7c1mj6D1vdvkY)\n\n| 模型       | torch | torch.compile | AIT  | oneflow | TensorRT | __stable-fast__ |\n| ----------- | ----- | ------------- | ---- | ------- | -------- | --------------- |\n| SD 1.5 (毫秒) | 1897  | 1510          | 1158 | 1003    | 991      | __995__         |\n| SVD-XT (秒)  | 83    | 70            |      |         |          | __47__          |\n\n__注意__: 在基准测试期间，`TensorRT`是在`静态批量大小`和`启用CUDA图`的情况下进行测试的，而`stable-fast`则是在动态形状下运行的。\n\n- [🚀稳定快速](#stable-fast)\n  - [简介](#introduction)\n    - [这是什么？](#what-is-this)\n    - [与其他加速库的区别](#differences-with-other-acceleration-libraries)\n  - [安装](#installation)\n    - [安装预构建的轮子](#install-prebuilt-wheels)\n    - [从源代码安装](#install-from-source)\n  - [使用方法](#usage)\n    - 
[优化StableDiffusionPipeline](#optimize-stablediffusionpipeline)\n    - [优化LCM Pipeline](#optimize-lcm-pipeline)\n    - [优化StableVideoDiffusionPipeline](#optimize-stablevideodiffusionpipeline)\n    - [动态切换LoRA](#dynamically-switch-lora)\n    - [模型量化](#model-quantization)\n    - [一些加快PyTorch速度的常用方法](#some-common-methods-to-speed-up-pytorch)\n  - [性能对比](#performance-comparison)\n    - [RTX 4080 (512x512, 批量大小1, fp16, 在WSL2中)](#rtx-4080-512x512-batch-size-1-fp16-in-wsl2)\n    - [H100](#h100)\n    - [A100](#a100)\n  - [兼容性](#compatibility)\n  - [故障排除](#troubleshooting)\n\n## 简介\n\n### 这是什么？\n\n`stable-fast` 是一个针对 __HuggingFace Diffusers__ 在 __NVIDIA GPU__ 上的超轻量级推理优化框架。\n`stable-fast` 通过利用一些关键技术与特性，提供了极快的推理优化：\n\n- __CUDNN 卷积融合__: `stable-fast` 实现了一系列功能完备且完全兼容的 CUDNN 卷积融合算子，适用于各种 `Conv + Bias + Add + Act` 计算模式的组合。\n- __低精度与融合 GEMM__: `stable-fast` 实现了一组以 `fp16` 精度进行计算的融合 GEMM 算子，其速度比 PyTorch 的默认设置更快（即以 `fp16` 读写数据，但以 `fp32` 进行计算）。\n- __融合线性 GEGLU__: `stable-fast` 能够将 `GEGLU(x, W, V, b, c) = GELU(xW + b) ⊗ (xV + c)` 融合为一个 CUDA 核心。\n- __NHWC 与融合 GroupNorm__: `stable-fast` 使用 OpenAI 的 `Triton` 实现了一个高度优化的 NHWC `GroupNorm + Silu` 融合算子，从而消除了对内存格式转换算子的需求。\n- __全追踪模型__: `stable-fast` 改进了 `torch.jit.trace` 接口，使其更适合追踪复杂模型。几乎 `StableDiffusionPipeline\u002FStableVideoDiffusionPipeline` 的每一个部分都可以被追踪并转换为 __TorchScript__。它比 `torch.compile` 更加稳定，且 CPU 开销显著低于 `torch.compile`，同时支持 __ControlNet__ 和 __LoRA__。\n- __CUDA 图__: `stable-fast` 可以将 `UNet`、`VAE` 和 `TextEncoder` 捕获为 CUDA 图格式，从而在小批量情况下降低 CPU 开销。该实现还支持动态形状。\n- __融合多头注意力__: `stable-fast` 直接使用 xformers，并使其与 __TorchScript__ 兼容。\n\n我的下一个目标是让 `stable-fast` 继续保持作为 `diffusers` 最快的推理优化框架之一，同时为 `transformers` 提供加速和显存占用减少的功能。\n事实上，我已经使用 `stable-fast` 来优化 LLM，并取得了显著的加速效果。\n不过，我还需要进一步完善，使其更加稳定、易于使用，并提供一个稳定的用户界面。\n\n### 与其他加速库的区别\n\n- __快速__: `stable-fast` 专为 __HuggingFace Diffusers__ 优化。它在众多库中都能达到高性能，并且仅需几秒钟即可完成非常快速的编译。在编译时间上，它显著快于 `torch.compile`、`TensorRT` 和 `AITemplate`。\n- __极简__: `stable-fast` 作为一个 PyTorch 的插件框架运行。它充分利用现有的 PyTorch 功能和基础设施，能够与其他加速技术以及流行的微调技术和部署方案兼容。\n- __最大兼容性__: `stable-fast` 兼容所有版本的 `HuggingFace Diffusers` 和 `PyTorch`。它同样兼容 `ControlNet` 和 `LoRA`，甚至开箱即用就支持最新的 `StableVideoDiffusionPipeline`！\n\n## 安装\n\n__注意__: `stable-fast` 目前仅在 `Linux` 和 `Windows` 的 WSL2 上进行了测试。\n您需要先安装支持 CUDA 的 PyTorch（建议使用 1.12 至 2.1 版本）。\n\n我仅在 `CUDA 12.1` 和 `Python 3.10` 环境下，使用 `torch>=2.1.0`、`xformers>=0.0.22` 和 `triton>=2.1.0` 对 `stable-fast` 进行了测试。\n其他版本或许也能成功构建和运行，但无法保证。\n\n### 安装预编译的 wheel 包\n\n从 [Releases 页面](https:\u002F\u002Fgithub.com\u002Fchengzeyi\u002Fstable-fast\u002Freleases) 下载适合您系统的 wheel 文件，并使用 `pip3 install \u003Cwheel file>` 进行安装。\n\n目前已有适用于 __Linux__ 和 __Windows__ 的 wheel 包。\n\n```bash\n# 将 cu121 替换为您使用的 CUDA 版本，\u003Cwheel file> 替换为 wheel 文件的路径。\n# 请确保 wheel 文件与您的 PyTorch 版本兼容。\npip3 install --index-url https:\u002F\u002Fdownload.pytorch.org\u002Fwhl\u002Fcu121 \\\n    'torch>=2.1.0' 'xformers>=0.0.22' 'triton>=2.1.0' 'diffusers>=0.19.3' \\\n    '\u003Cwheel file>'\n```\n\n### 从源码安装\n\n```bash\n# 请确保已安装 CUDNN\u002FCUBLAS。\n# https:\u002F\u002Fdeveloper.nvidia.com\u002Fcudnn\n# https:\u002F\u002Fdeveloper.nvidia.com\u002Fcublas\n\n# 首先安装带有 CUDA 支持的 PyTorch 及其他依赖包。\n# Windows 用户：Triton 可能不可用，可跳过。\n# 注意：必须安装 wheel 包，否则在构建时会遇到 `No module named 'torch'` 错误。\npip3 install wheel 'torch>=2.1.0' 'xformers>=0.0.22' 'triton>=2.1.0' 'diffusers>=0.19.3'\n\n# （可选）这会使构建过程更快。\npip3 install ninja\n\n# 如果您在不同类型的 GPU 上运行和构建，请设置 TORCH_CUDA_ARCH_LIST。\n# 您也可以从 PyPI 安装最新稳定版。\n# pip3 install -v -U stable-fast\npip3 install -v -U 
git+https:\u002F\u002Fgithub.com\u002Fchengzeyi\u002Fstable-fast.git@main#egg=stable-fast\n# （这可能需要数十分钟）\n```\n\n__注意__: 任何不在 `sfast.compilers` 中的用法均不保证向后兼容。\n\n__注意__: 为了获得最佳性能，需要安装并启用 `xformers` 和 OpenAI 的 `triton>=2.1.0`。\n您可能需要从源码构建 `xformers`，以使其与您的 `PyTorch` 兼容。\n\n## 使用方法\n\n### 优化 StableDiffusionPipeline\n\n`stable-fast` 能直接优化 `StableDiffusionPipeline` 和 `StableDiffusionXLPipeline`。\n\n```python\nimport time\nimport torch\nfrom diffusers import (StableDiffusionPipeline,\n                       EulerAncestralDiscreteScheduler)\nfrom sfast.compilers.diffusion_pipeline_compiler import (compile,\n                                                         CompilationConfig)\n\ndef load_model():\n    model = StableDiffusionPipeline.from_pretrained(\n        'runwayml\u002Fstable-diffusion-v1-5',\n        torch_dtype=torch.float16)\n\n    model.scheduler = EulerAncestralDiscreteScheduler.from_config(\n        model.scheduler.config)\n    model.safety_checker = None\n    model.to(torch.device('cuda'))\n    return model\n\nmodel = load_model()\n\nconfig = CompilationConfig.Default()\n# 建议启用 xformers 和 Triton 以获得最佳性能。\ntry:\n    import xformers\n    config.enable_xformers = True\nexcept ImportError:\n    print('xformers 未安装，跳过')\ntry:\n    import triton\n    config.enable_triton = True\nexcept ImportError:\n    print('Triton 未安装，跳过')\n# 对于小批量和低分辨率的情况，建议启用 CUDA 图以减少 CPU 开销。\n# 但这样做可能会增加 GPU 显存的使用量。\n# 对于 StableVideoDiffusionPipeline，则无需启用。\nconfig.enable_cuda_graph = True\n\nmodel = compile(model, config)\n\nkwarg_inputs = dict(\n    prompt=\n    '(masterpiece:1.2), best quality, masterpiece, best detailed face, a beautiful girl',\n    height=512,\n    width=512,\n    num_inference_steps=30,\n    num_images_per_prompt=1,\n)\n\n# 注意：请先预热。\n# 初始几次调用会触发编译，可能会非常慢。\n# 之后应该会非常快。\nfor _ in range(3):\n    output_image = model(**kwarg_inputs).images[0]\n\n# 让我们看看吧！\n# 注意：由于 CUDA 的异步特性，进度条可能会显示不准确。\nbegin = time.time()\noutput_image = model(**kwarg_inputs).images[0]\nprint(f'推理时间: {time.time() - begin:.3f}秒')\n\n# 让我们在终端中查看它！\nfrom sfast.utils.term_image import print_image\n\nprint_image(output_image, max_width=80)\n```\n\n更多详情请参考 [examples\u002Foptimize_stable_diffusion_pipeline.py](examples\u002Foptimize_stable_diffusion_pipeline.py)。\n\n您可以通过此 Colab 了解其在 T4 GPU 上的运行情况：[![Open In Colab](https:\u002F\u002Fcolab.research.google.com\u002Fassets\u002Fcolab-badge.svg)](https:\u002F\u002Fcolab.research.google.com\u002Fgithub\u002Fchengzeyi\u002Fstable-fast-colab\u002Fblob\u002Fmain\u002Fstable_fast_colab.ipynb)\n\n### 优化 LCM 流水线\n\n`stable-fast` 能够优化最新的 `latent consistency model` 流水线，并实现显著的速度提升。\n\n有关如何使用 LCM LoRA 优化普通 SD 模型的详细信息，请参阅 [examples\u002Foptimize_lcm_lora.py](examples\u002Foptimize_lcm_lora.py)。有关如何优化独立的 LCM 模型的详细信息，请参阅 [examples\u002Foptimize_lcm_pipeline.py](examples\u002Foptimize_lcm_pipeline.py)。\n\n### 优化 StableVideoDiffusionPipeline\n\n`stable-fast` 能够优化最新的 `StableVideoDiffusionPipeline`，并实现 `2 倍` 的速度提升。\n\n更多详情请参阅 [examples\u002Foptimize_stable_video_diffusion_pipeline.py](examples\u002Foptimize_stable_video_diffusion_pipeline.py)。\n\n### 动态切换 LoRA\n\n支持动态切换 LoRA，但需要进行一些额外的操作。\n之所以可行，是因为编译后的图和 `CUDA Graph` 与原始 UNet 模型共享相同的底层数据（指针）。因此，您只需就地更新原始 UNet 模型的参数即可。\n\n以下代码假设您已经加载了一个 LoRA 并编译了模型，现在想要切换到另一个 LoRA。\n\n如果您未启用 CUDA 图且保持 `preserve_parameters = True`，操作会简单得多。甚至可能不需要执行以下代码。\n\n```python\n# load_state_dict with assign=True 需要 torch >= 2.1.0\n\ndef update_state_dict(dst, src):\n    for key, value in src.items():\n        # 进行就地复制。\n        # 由于追踪的前向函数共享相同的底层数据（指针），此修改将反映在追踪的前向函数中。\n        dst[key].copy_(value)\n\n# 将“另一个”LoRA 切换到 UNet\ndef switch_lora(unet, lora):\n    # 存储原始 UNet 参数\n    state_dict = unet.state_dict()\n    # 加载另一个 LoRA 到 unet\n    unet.load_attn_procs(lora)\n    # 将当前 UNet 参数就地复制到原始 unet 参数\n    update_state_dict(state_dict, unet.state_dict())\n    # 将原始 UNet 参数重新加载回模型。\n    # 我们使用 assign=True，因为我们仍希望保留原始 UNet 参数的引用\n    unet.load_state_dict(state_dict, assign=True)\n\nswitch_lora(compiled_model.unet, lora_b_path)\n```\n\n
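切换完成后无需重新编译，下一次推理即会使用新的 LoRA 权重。以下是一个调用示意（`lora_a_path` 为假设的本地路径，仅作演示）：\n\n```python\n# 示意：lora_a_path 为假设的 LoRA 路径，再切换回原来的权重\nswitch_lora(compiled_model.unet, lora_a_path)\n# 直接推理即可，编译后的图会读取到就地更新后的参数\n```\n\n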
### 模型量化\n\n`stable-fast` 扩展了 PyTorch 的 `quantize_dynamic` 功能，在 CUDA 后端提供了一种动态量化的线性算子。\n启用后，您可以为 `diffusers` 获得轻微的 VRAM 减少，为 `transformers` 获得显著的 VRAM 减少，并有可能提高速度（但并非总是如此）。\n\n对于 `SD XL`，预计在图像尺寸为 `1024x1024` 时，VRAM 可减少 `2GB`。\n\n```python\ndef quantize_unet(m):\n    from diffusers.utils import USE_PEFT_BACKEND\n    assert USE_PEFT_BACKEND\n    m = torch.quantization.quantize_dynamic(m, {torch.nn.Linear},\n                                            dtype=torch.qint8,\n                                            inplace=True)\n    return m\n\nmodel.unet = quantize_unet(model.unet)\nif hasattr(model, 'controlnet'):\n    model.controlnet = quantize_unet(model.controlnet)\n```\n\n更多详情请参阅 [examples\u002Foptimize_stable_diffusion_pipeline.py](examples\u002Foptimize_stable_diffusion_pipeline.py)。\n\n### 提高 PyTorch 性能的一些常用方法\n\n```bash\n# 强烈建议使用 TCMalloc 来减少 CPU 开销\n# https:\u002F\u002Fgithub.com\u002Fgoogle\u002Ftcmalloc\nLD_PRELOAD=\u002Fpath\u002Fto\u002Flibtcmalloc.so python3 ...\n```\n\n```python\nimport packaging.version\nimport torch\n\nif packaging.version.parse(torch.__version__) >= packaging.version.parse('1.12.0'):\n    torch.backends.cuda.matmul.allow_tf32 = True\n```\n\n## 性能对比\n\n不同硬件\u002F软件\u002F平台\u002F驱动程序配置下的性能差异非常大。\n准确地进行基准测试非常困难，而准备基准测试环境本身也是一项艰巨的任务。\n我曾在一些平台上进行过测试，但结果仍然可能存在误差。\n需要注意的是，在进行基准测试时，由于 CUDA 的异步特性，`tqdm` 显示的进度条可能会不准确。\n为了解决这个问题，我使用 `CUDA Event` 来精确测量每秒迭代次数。\n\n`stable-fast` 在较新的 GPU 和较新版本的 CUDA 上表现更佳。\n__在较旧的 GPU 上，性能提升可能有限。__\n__在基准测试过程中，由于 CUDA 的异步特性，进度条可能会显示不准确。__\n\n### RTX 4080（512x512，批次大小 1，fp16，在 WSL2 中）\n\n这是我个人的游戏电脑😄。它的 CPU 比云服务器提供商的更强大。\n\n| 框架                                | SD 1.5        | SD XL (1024x1024) | SD 1.5 ControlNet |\n| ---------------------------------------- | ------------- | ----------------- | ----------------- |\n| Vanilla PyTorch (2.1.0)                  | 29.5 it\u002Fs     | 4.6 it\u002Fs          | 19.7 it\u002Fs         |\n| torch.compile (2.1.0, max-autotune)      | 40.0 it\u002Fs     | 6.1 it\u002Fs          | 21.8 it\u002Fs         |\n| AITemplate                               | 44.2 it\u002Fs     |                   |                   |\n| OneFlow                                  | 53.6 it\u002Fs     |                   |                   |\n| AUTO1111 WebUI                           | 17.2 it\u002Fs     | 3.6 it\u002Fs          |                   |\n| AUTO1111 WebUI（带 SDPA）               | 24.5 it\u002Fs     | 4.3 it\u002Fs          |                   |\n| TensorRT（AUTO1111 WebUI）                | 40.8 it\u002Fs     |                   |                   |\n| TensorRT 官方演示                       | 52.6 it\u002Fs     |                   |                   |\n| __stable-fast（带 xformers & Triton）__ | __51.6 it\u002Fs__ | __9.1 it\u002Fs__      | __36.7 it\u002Fs__     |\n\n### H100\n\n感谢 __@Consceleratus__ 和 __@harishp__ 的帮助，我在 H100 上进行了速度测试。\n\n
| 框架                                | SD 1.5         | SD XL (1024x1024) | SD 1.5 ControlNet |\n| ---------------------------------------- | -------------- | ----------------- | ----------------- |\n| Vanilla PyTorch (2.1.0)                  | 54.5 it\u002Fs      | 14.9 it\u002Fs         | 35.8 it\u002Fs         |\n| torch.compile (2.1.0, max-autotune)      | 66.0 it\u002Fs      | 18.5 it\u002Fs         |                   |\n| __stable-fast（带 xformers & Triton）__ | __104.6 it\u002Fs__ | __21.6 it\u002Fs__     | __72.6 it\u002Fs__     |\n\n### A100\n\n感谢 __@SuperSecureHuman__ 和 __@jon-chuang__ 的帮助，现在可以在 A100 上进行基准测试了。\n\n| 框架                                | SD 1.5        | SD XL (1024x1024) | SD 1.5 ControlNet |\n| ---------------------------------------- | ------------- | ----------------- | ----------------- |\n| 原生 PyTorch (2.1.0)                  | 35.6 it\u002Fs     | 8.7 it\u002Fs          | 25.1 it\u002Fs         |\n| torch.compile (2.1.0, max-autotune)      | 41.9 it\u002Fs     | 10.0 it\u002Fs         |                   |\n| __stable-fast (使用 xformers 和 Triton)__ | __61.8 it\u002Fs__ | __11.9 it\u002Fs__     | __41.1 it\u002Fs__     |\n\n## 兼容性\n\n| 模型                               | 支持 |\n| ----------------------------------- | ---- |\n| Hugging Face Diffusers (1.5\u002F2.1\u002FXL) | 是   |\n| 使用 ControlNet                     | 是   |\n| 使用 LoRA                           | 是   |\n| 潜在一致性模型                    | 是   |\n| SDXL Turbo                          | 是   |\n| Stable Video Diffusion              | 是   |\n\n| 功能                               | 支持 |\n| ----------------------------------- | ---- |\n| 动态形状                       | 是   |\n| 文本到图像                       | 是   |\n| 图像到图像                      | 是   |\n| 图像修复                        | 是   |\n\n| UI 框架                        | 支持 | 链接                                                                    |\n| ----------------------------------- | ---- | ----------------------------------------------------------------------- |\n| AUTOMATIC1111                       | 开发中       |                                                                         |\n| SD Next                             | 是       | [`SD Next`](https:\u002F\u002Fgithub.com\u002Fvladmandic\u002Fautomatic)                    |\n| ComfyUI                             | 是       | [`ComfyUI_stable_fast`](https:\u002F\u002Fgithub.com\u002Fgameltb\u002FComfyUI_stable_fast) |\n\n| 操作系统                    | 支持 |\n| ----------------------------------- | ---- |\n| Linux                               | 是   |\n| Windows                             | 是   |\n| Windows WSL                         | 是   |\n\n## 故障排除\n\n更多详情请参阅 [doc\u002Ftroubleshooting.md](doc\u002Ftroubleshooting.md)。\n\n您也可以加入 [Discord 频道](https:\u002F\u002Fdiscord.gg\u002FkQFvfzM4SJ) 寻求帮助。","# Stable Fast 快速上手指南\n\n`stable-fast` 是一个专为 NVIDIA GPU 设计的超轻量级推理优化框架，旨在加速 HuggingFace Diffusers 库。相比 TensorRT 或 AITemplate 需要数十分钟的编译时间，`stable-fast` 仅需数秒即可完成模型编译，并原生支持动态形状（Dynamic Shape）、LoRA 和 ControlNet。\n\n## 环境准备\n\n在开始之前，请确保满足以下系统和依赖要求：\n\n*   **操作系统**：Linux 或 Windows (WSL2)。\n*   **GPU**：NVIDIA GPU。\n*   **Python 版本**：推荐 Python 3.10。\n*   **CUDA 版本**：推荐 CUDA 12.1（支持范围 11.x - 12.x）。\n*   **核心依赖**：\n    *   `torch >= 2.1.0` (必须带 CUDA 支持)\n    *   `xformers >= 0.0.22` (强烈建议安装以获得最佳性能)\n    *   `triton >= 2.1.0` (强烈建议安装)\n    *   `diffusers >= 0.19.3`\n    *   `ninja` (可选，用于加速构建过程)\n\n> **注意**：虽然其他版本可能也能运行，但作者仅在上述特定版本组合下进行了充分测试。Windows 用户若无法安装 Triton 可跳过，但性能可能受影响。\n\n
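安装前可先确认本机环境满足上述要求（示意，仅使用标准 PyTorch 接口）：\n\n```python\nimport sys\nimport torch\n\n# 确认 Python 版本、CUDA 构建的 PyTorch 以及可见的 NVIDIA GPU\nprint(sys.version)\nprint(torch.__version__, torch.version.cuda, torch.cuda.is_available())\n```\n\n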
## 安装步骤\n\n### 方法一：安装预编译包（推荐）\n\n从 [Releases 页面](https:\u002F\u002Fgithub.com\u002Fchengzeyi\u002Fstable-fast\u002Freleases) 下载对应你系统环境和 PyTorch\u002FCUDA 版本的 `.whl` 文件，然后使用 pip 安装。\n\n```bash\n# 请将 \u003Cwheel file> 替换为实际下载的文件路径\n# 确保 --index-url 对应的 CUDA 版本与你安装的 PyTorch 版本一致 (例如 cu121)\npip3 install --index-url https:\u002F\u002Fdownload.pytorch.org\u002Fwhl\u002Fcu121 \\\n    'torch>=2.1.0' 'xformers>=0.0.22' 'triton>=2.1.0' 'diffusers>=0.19.3' \\\n    '\u003Cwheel file>'\n```\n\n### 方法二：从源码安装\n\n如果你需要最新功能或没有匹配的预编译包，可以从源码安装。\n\n```bash\n# 1. 安装基础依赖 (确保已安装 CUDNN 和 CUBLAS)\n# Windows 用户如果无法安装 triton 可移除该项\npip3 install wheel 'torch>=2.1.0' 'xformers>=0.0.22' 'triton>=2.1.0' 'diffusers>=0.19.3'\n\n# 2. (可选) 安装 ninja 以加速编译\npip3 install ninja\n\n# 3. (可选) 如果你的运行环境和编译环境 GPU 架构不同，需设置环境变量\n# export TORCH_CUDA_ARCH_LIST=\"8.0;8.6;9.0\" \n\n# 4. 从 GitHub 安装\npip3 install -v -U git+https:\u002F\u002Fgithub.com\u002Fchengzeyi\u002Fstable-fast.git@main#egg=stable-fast\n```\n\n## 基本使用\n\n`stable-fast` 可以直接优化 `StableDiffusionPipeline` 和 `StableDiffusionXLPipeline`。以下是优化 SD 1.5 模型的最简示例：\n\n```python\nimport torch\nfrom diffusers import StableDiffusionPipeline, EulerAncestralDiscreteScheduler\nfrom sfast.compilers.diffusion_pipeline_compiler import compile, CompilationConfig\n\ndef load_and_optimize_model():\n    # 1. 加载原始模型\n    model = StableDiffusionPipeline.from_pretrained(\n        'runwayml\u002Fstable-diffusion-v1-5',\n        torch_dtype=torch.float16\n    )\n    \n    model.scheduler = EulerAncestralDiscreteScheduler.from_config(\n        model.scheduler.config\n    )\n    model.safety_checker = None\n    model.to(torch.device('cuda'))\n\n    # 2. 配置编译选项\n    config = CompilationConfig.Default()\n    # 启用 xformers 和 Triton 以获得最佳性能\n    config.enable_xformers = True\n    config.enable_triton = True\n    \n    # 3. 编译模型 (仅需数秒)\n    # compile 会返回优化后的模型对象，后续直接使用该对象即可获得加速\n    compiled_model = compile(model, config)\n    \n    return compiled_model\n\n# 初始化优化后的模型\npipe = load_and_optimize_model()\n\n# 像往常一样使用\nprompt = \"a photo of an astronaut riding a horse on mars\"\nimage = pipe(prompt).images[0]\nimage.save(\"astronaut_rides_horse.png\")\n```\n\n
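如 README 所述，初始几次调用会触发编译，可能较慢；正式计时或部署前建议先预热（示意，复用上例中的 `pipe` 和 `prompt`）：\n\n```python\n# 预热：前几次调用触发编译，之后速度趋于稳定\nfor _ in range(3):\n    pipe(prompt).images[0]\n```\n\n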
12.1","未说明",{"notes":103,"python":104,"dependencies":105},"macOS 不支持。Windows 用户若从源码编译可能无法使用 Triton（可跳过），建议直接安装预编译的 Wheel 包。从源码编译可能需要数十分钟，建议安装 ninja 加速构建。为了获得最佳性能，必须启用 xformers 和 Triton。该工具主要针对 HuggingFace Diffusers 模型进行优化，支持动态形状、LoRA 和 ControlNet。","3.10",[106,107,108,109],"torch>=2.1.0","xformers>=0.0.22","triton>=2.1.0","diffusers>=0.19.3",[111,14,15],"视频",[113,114,115,116,117,118,119,120,121,122],"cuda","diffusers","pytorch","stable-diffusion","deeplearnng","inference-engines","openai-triton","performance-optimizations","torch","stable-video-diffusion",null,"2026-03-27T02:49:30.150509","2026-04-20T20:24:35.894146",[127,132,137,142,147,152],{"id":128,"question_zh":129,"answer_zh":130,"source_url":131},45665,"如何在 SD.Next (Automatic1111) 中安装并解决安装失败的问题？","如果在虚拟环境中运行安装脚本失败，尝试删除现有的 venv 文件夹并重新生成它，通常可以解决依赖冲突或路径错误问题。此外，如果更改过项目目录，请确保 Python 环境读取的是正确的最新目录路径，避免指向旧目录导致安装失败。","https:\u002F\u002Fgithub.com\u002Fchengzeyi\u002Fstable-fast\u002Fissues\u002F119",{"id":133,"question_zh":134,"answer_zh":135,"source_url":136},45666,"启用 xformers 或 Triton 后，IP-Adapter 为何不生效或无法参考风格？","当配置项 config.enable_triton = True 时，IP-Adapter 可能无法正确参考风格图像（结果看起来像只用了 ControlNet）。解决方法是注释掉或禁用该行配置（即设置 enable_triton = False），此时 IP-Adapter 即可正常引用风格。这可能是由于 Triton 优化与 IP-Adapter 的注意力处理器存在冲突。","https:\u002F\u002Fgithub.com\u002Fchengzeyi\u002Fstable-fast\u002Fissues\u002F81",{"id":138,"question_zh":139,"answer_zh":140,"source_url":141},45667,"在 ComfyUI 中使用 Stable Fast 时遇到 'RuntimeError: _Map_base::at' 错误如何解决？","该错误通常与 PyTorch 的 checkpoint 功能不兼容有关。解决方法是在 ComfyUI 中使用 checkpointLoader 节点，并在其配置文件中将 model.params.unet_config.params.use_checkpoint 属性设置为 False，以禁用 checkpoint 功能。","https:\u002F\u002Fgithub.com\u002Fchengzeyi\u002Fstable-fast\u002Fissues\u002F49",{"id":143,"question_zh":144,"answer_zh":145,"source_url":146},45668,"为什么在 A100 等高端显卡上，编译后的 SDXL 模型速度反而比原生 Diffusers 慢？","这是为了兼容性做出的权衡。为了实现极高的速度（如 60+ it\u002Fs），原本需要使用修改版的调度器以减少 CPU 开销，但这会限制用户自由选择调度器。为了让用户能动态切换 LoRA 和使用任意调度器，项目禁用了部分激进优化并使用了启发式规则代替 Triton 自动调优，因此在某些高配硬件上加速效果不明显甚至略慢。","https:\u002F\u002Fgithub.com\u002Fchengzeyi\u002Fstable-fast\u002Fissues\u002F16",{"id":148,"question_zh":149,"answer_zh":150,"source_url":151},45669,"在 Arch Linux 上安装时遇到 GCC 版本不兼容或找不到 'torch' 模块的错误怎么办？","在 Arch Linux 上构建 stable-fast 时，如果遇到 GCC 版本不兼容导致的编译错误，需要在构建前设置环境变量：export NVCC_PREPEND_FLAGS='-ccbin \u002Fusr\u002Fbin\u002Fg++-12'（根据实际安装的 g++ 版本调整路径）。这能强制 NVCC 使用指定版本的编译器，解决链接问题。","https:\u002F\u002Fgithub.com\u002Fchengzeyi\u002Fstable-fast\u002Fissues\u002F55",{"id":153,"question_zh":154,"answer_zh":155,"source_url":136},45670,"如何验证 IP-Adapter 与 ControlNet 
同时使用的优化脚本是否正确运行？","可以参考官方提供的社区示例脚本进行验证：https:\u002F\u002Fgithub.com\u002Fchengzeyi\u002Fstable-fast\u002Fblob\u002F9fd07ce57e0cedd62ee59cd78774a577e4f2967b\u002Fcommunity\u002Foptimize_sd15_with_controlnet_and_ip_adapter.py。如果脚本报错但官方运行正常，请检查具体的异常详情及环境配置差异。",[157,162,166,170,174,178,182,187,191,196,200,204,208,213,218,222,226,230,234,238],{"id":158,"version":159,"summary_zh":160,"released_at":161},360574,"nightly","待办：添加每日构建版本说明","2024-11-27T10:48:21",{"id":163,"version":164,"summary_zh":123,"released_at":165},360575,"v1.0.5","2024-05-09T16:27:00",{"id":167,"version":168,"summary_zh":123,"released_at":169},360576,"v1.0.4","2024-03-14T05:35:35",{"id":171,"version":172,"summary_zh":123,"released_at":173},360577,"v1.0.3","2024-01-31T06:08:25",{"id":175,"version":176,"summary_zh":123,"released_at":177},360578,"v1.0.2","2024-01-19T06:28:32",{"id":179,"version":180,"summary_zh":123,"released_at":181},360579,"v1.0.1","2023-12-28T11:50:42",{"id":183,"version":184,"summary_zh":185,"released_at":186},360580,"v1.0.0","🔥🔥🔥 `stable-fast` 现已支持 `StableVideoDiffusionPipeline`🔥🔥🔥\n\n我非常高兴地宣布 `stable-fast` 的 v1.0.0 版本正式发布。现在，它不仅在 `Stable Diffusion` 上取得了 SOTA 效果，还新增了对 `Stable Video Diffusion` 的支持！我相信，在视频生成速度方面，`stable-fast` 无疑是全球最快的！","2023-12-19T07:24:25",{"id":188,"version":189,"summary_zh":123,"released_at":190},360581,"v0.0.15.post1","2023-12-18T10:56:17",{"id":192,"version":193,"summary_zh":194,"released_at":195},360582,"v0.0.15","## 变更内容\n* 由 @chengzeyi 在 https:\u002F\u002Fgithub.com\u002Fchengzeyi\u002Fstable-fast\u002Fpull\u002F74 中优化了融合 GEGLU 的性能\n\n\n**完整变更日志**: https:\u002F\u002Fgithub.com\u002Fchengzeyi\u002Fstable-fast\u002Fcompare\u002Fv0.0.14...v0.0.15","2023-12-13T02:11:29",{"id":197,"version":198,"summary_zh":123,"released_at":199},360583,"v0.0.14","2023-12-11T12:51:45",{"id":201,"version":202,"summary_zh":123,"released_at":203},360584,"v0.0.13.post4","2023-12-11T09:22:10",{"id":205,"version":206,"summary_zh":123,"released_at":207},360585,"v0.0.13.post3","2023-12-05T12:10:47",{"id":209,"version":210,"summary_zh":211,"released_at":212},360586,"v0.0.13.post1","修复 T4 兼容性","2023-12-05T07:46:52",{"id":214,"version":215,"summary_zh":216,"released_at":217},360587,"v0.0.13","## 变更内容\n* 由 @chengzeyi 在 https:\u002F\u002Fgithub.com\u002Fchengzeyi\u002Fstable-fast\u002Fpull\u002F50 中修复了 aten::to 导致的 RuntimeError\n* 由 @chengzeyi 在 https:\u002F\u002Fgithub.com\u002Fchengzeyi\u002Fstable-fast\u002Fpull\u002F51 中进行开发\n* 关闭 #53；由 @chengzeyi 在 https:\u002F\u002Fgithub.com\u002Fchengzeyi\u002Fstable-fast\u002Fpull\u002F54 中修复了 SDXL 和 CUDA 图中参数被释放的问题\n* 由 @chengzeyi 在 https:\u002F\u002Fgithub.com\u002Fchengzeyi\u002Fstable-fast\u002Fpull\u002F61 中进行开发\n\n\n**完整变更日志**: https:\u002F\u002Fgithub.com\u002Fchengzeyi\u002Fstable-fast\u002Fcompare\u002Fv0.0.12...v0.0.13","2023-12-05T03:52:20",{"id":219,"version":220,"summary_zh":123,"released_at":221},360588,"v0.0.12.post6","2023-11-29T15:12:41",{"id":223,"version":224,"summary_zh":123,"released_at":225},360589,"v0.0.12.post5","2023-11-28T05:11:44",{"id":227,"version":228,"summary_zh":123,"released_at":229},360590,"v0.0.12.post4","2023-11-27T14:35:16",{"id":231,"version":232,"summary_zh":123,"released_at":233},360591,"v0.0.12.post3","2023-11-25T08:33:27",{"id":235,"version":236,"summary_zh":123,"released_at":237},360592,"v0.0.12.post2","2023-11-25T08:14:05",{"id":239,"version":240,"summary_zh":123,"released_at":241},360593,"v0.0.12.post1","2023-11-24T08:05:27"]