[{"data":1,"prerenderedAt":-1},["ShallowReactive",2],{"similar-PixArt-alpha--PixArt-alpha":3,"tool-PixArt-alpha--PixArt-alpha":62},[4,18,26,35,44,53],{"id":5,"name":6,"github_repo":7,"description_zh":8,"stars":9,"difficulty_score":10,"last_commit_at":11,"category_tags":12,"status":17},4358,"openclaw","openclaw\u002Fopenclaw","OpenClaw 是一款专为个人打造的本地化 AI 助手，旨在让你在自己的设备上拥有完全可控的智能伙伴。它打破了传统 AI 助手局限于特定网页或应用的束缚，能够直接接入你日常使用的各类通讯渠道，包括微信、WhatsApp、Telegram、Discord、iMessage 等数十种平台。无论你在哪个聊天软件中发送消息，OpenClaw 都能即时响应，甚至支持在 macOS、iOS 和 Android 设备上进行语音交互，并提供实时的画布渲染功能供你操控。\n\n这款工具主要解决了用户对数据隐私、响应速度以及“始终在线”体验的需求。通过将 AI 部署在本地，用户无需依赖云端服务即可享受快速、私密的智能辅助，真正实现了“你的数据，你做主”。其独特的技术亮点在于强大的网关架构，将控制平面与核心助手分离，确保跨平台通信的流畅性与扩展性。\n\nOpenClaw 非常适合希望构建个性化工作流的技术爱好者、开发者，以及注重隐私保护且不愿被单一生态绑定的普通用户。只要具备基础的终端操作能力（支持 macOS、Linux 及 Windows WSL2），即可通过简单的命令行引导完成部署。如果你渴望拥有一个懂你",349277,3,"2026-04-06T06:32:30",[13,14,15,16],"Agent","开发框架","图像","数据工具","ready",{"id":19,"name":20,"github_repo":21,"description_zh":22,"stars":23,"difficulty_score":10,"last_commit_at":24,"category_tags":25,"status":17},3808,"stable-diffusion-webui","AUTOMATIC1111\u002Fstable-diffusion-webui","stable-diffusion-webui 是一个基于 Gradio 构建的网页版操作界面，旨在让用户能够轻松地在本地运行和使用强大的 Stable Diffusion 图像生成模型。它解决了原始模型依赖命令行、操作门槛高且功能分散的痛点，将复杂的 AI 绘图流程整合进一个直观易用的图形化平台。\n\n无论是希望快速上手的普通创作者、需要精细控制画面细节的设计师，还是想要深入探索模型潜力的开发者与研究人员，都能从中获益。其核心亮点在于极高的功能丰富度：不仅支持文生图、图生图、局部重绘（Inpainting）和外绘（Outpainting）等基础模式，还独创了注意力机制调整、提示词矩阵、负向提示词以及“高清修复”等高级功能。此外，它内置了 GFPGAN 和 CodeFormer 等人脸修复工具，支持多种神经网络放大算法，并允许用户通过插件系统无限扩展能力。即使是显存有限的设备，stable-diffusion-webui 也提供了相应的优化选项，让高质量的 AI 艺术创作变得触手可及。",162132,"2026-04-05T11:01:52",[14,15,13],{"id":27,"name":28,"github_repo":29,"description_zh":30,"stars":31,"difficulty_score":32,"last_commit_at":33,"category_tags":34,"status":17},2271,"ComfyUI","Comfy-Org\u002FComfyUI","ComfyUI 是一款功能强大且高度模块化的视觉 AI 引擎，专为设计和执行复杂的 Stable Diffusion 图像生成流程而打造。它摒弃了传统的代码编写模式，采用直观的节点式流程图界面，让用户通过连接不同的功能模块即可构建个性化的生成管线。\n\n这一设计巧妙解决了高级 AI 绘图工作流配置复杂、灵活性不足的痛点。用户无需具备编程背景，也能自由组合模型、调整参数并实时预览效果，轻松实现从基础文生图到多步骤高清修复等各类复杂任务。ComfyUI 拥有极佳的兼容性，不仅支持 Windows、macOS 和 Linux 全平台，还广泛适配 NVIDIA、AMD、Intel 及苹果 Silicon 等多种硬件架构，并率先支持 SDXL、Flux、SD3 等前沿模型。\n\n无论是希望深入探索算法潜力的研究人员和开发者，还是追求极致创作自由度的设计师与资深 AI 绘画爱好者，ComfyUI 都能提供强大的支持。其独特的模块化架构允许社区不断扩展新功能，使其成为当前最灵活、生态最丰富的开源扩散模型工具之一，帮助用户将创意高效转化为现实。",109154,2,"2026-04-18T11:18:24",[14,15,13],{"id":36,"name":37,"github_repo":38,"description_zh":39,"stars":40,"difficulty_score":32,"last_commit_at":41,"category_tags":42,"status":17},6121,"gemini-cli","google-gemini\u002Fgemini-cli","gemini-cli 是一款由谷歌推出的开源 AI 命令行工具，它将强大的 Gemini 大模型能力直接集成到用户的终端环境中。对于习惯在命令行工作的开发者而言，它提供了一条从输入提示词到获取模型响应的最短路径，无需切换窗口即可享受智能辅助。\n\n这款工具主要解决了开发过程中频繁上下文切换的痛点，让用户能在熟悉的终端界面内直接完成代码理解、生成、调试以及自动化运维任务。无论是查询大型代码库、根据草图生成应用，还是执行复杂的 Git 操作，gemini-cli 都能通过自然语言指令高效处理。\n\n它特别适合广大软件工程师、DevOps 人员及技术研究人员使用。其核心亮点包括支持高达 100 万 token 的超长上下文窗口，具备出色的逻辑推理能力；内置 Google 搜索、文件操作及 Shell 命令执行等实用工具；更独特的是，它支持 MCP（模型上下文协议），允许用户灵活扩展自定义集成，连接如图像生成等外部能力。此外，个人谷歌账号即可享受免费的额度支持，且项目基于 Apache 2.0 协议完全开源，是提升终端工作效率的理想助手。",100752,"2026-04-10T01:20:03",[43,13,15,14],"插件",{"id":45,"name":46,"github_repo":47,"description_zh":48,"stars":49,"difficulty_score":10,"last_commit_at":50,"category_tags":51,"status":17},4487,"LLMs-from-scratch","rasbt\u002FLLMs-from-scratch","LLMs-from-scratch 是一个基于 PyTorch 的开源教育项目，旨在引导用户从零开始一步步构建一个类似 ChatGPT 的大型语言模型（LLM）。它不仅是同名技术著作的官方代码库，更提供了一套完整的实践方案，涵盖模型开发、预训练及微调的全过程。\n\n该项目主要解决了大模型领域“黑盒化”的学习痛点。许多开发者虽能调用现成模型，却难以深入理解其内部架构与训练机制。通过亲手编写每一行核心代码，用户能够透彻掌握 Transformer 
## hacksider/Deep-Live-Cam (★ 88,924)

Deep-Live-Cam is an open-source tool focused on real-time face swapping and video generation: with just one static photo and "one click", it swaps the face in a live webcam feed or produces a deepfake video. It removes the pain points of traditional face-swap pipelines, which were convoluted, demanded high-end hardware, and offered no real-time preview, making high-quality digital content creation broadly accessible.

Beyond developers and researchers exploring algorithmic limits, its minimal three-step workflow (pick a face, pick a camera, start) makes it suitable for ordinary users, content creators, designers, and streamers, whether for customizing animated characters, swapping models in clothing showcases, or producing playful short videos and interactive streams.

Its core technical highlights are strong real-time processing; a Mouth Mask that preserves the user's original mouth movements so expressions stay natural and precise; and Face Mapping, which can apply different faces to multiple subjects in the same frame. The project also ships a strict content-safety filter that automatically blocks nudity, violence, and other inappropriate material, and it urges users to obtain consent and clearly label generated content, balancing technical progress with ethical responsibility.

# PixArt-alpha/PixArt-alpha

**PixArt-α: Fast Training of Diffusion Transformer for Photorealistic Text-to-Image Synthesis**

PixArt-α is an open-source project focused on fast training of diffusion Transformer models for high-quality, photorealistic text-to-image synthesis. An ICLR 2024 Spotlight paper, it tackles the high training cost, slow convergence, and heavy compute requirements of conventional diffusion models. Through efficient architectural design and data strategies, PixArt-α generates richly detailed, highly realistic images while cutting training time and compute substantially.

The project suits AI researchers exploring efficient training regimes, and it gives developers a flexible PyTorch codebase plus pretrained weights for secondary development or integration into workflows such as ComfyUI. For companies and teams that want to deploy a high-performing generative model at low cost, PixArt-α is also a strong option. Its key technical contribution is the innovative combination of the Transformer architecture with diffusion models, optimized on large high-quality datasets (such as SAM-LLaVA-Captions10M), which improves training efficiency dramatically without sacrificing generation quality. Complete inference code, model weights, and an online demo are already available, and community support is active, so all kinds of users can get started quickly.

---

![PixArt-α logo](https://oss.gittoolsai.com/images/PixArt-alpha_PixArt-alpha_readme_2dab88eba56d.png)

### 👉 PixArt-α: Fast Training of Diffusion Transformer for Photorealistic Text-to-Image Synthesis
### ICLR 2024 Spotlight

[PixArt-Sigma Code](https://github.com/PixArt-alpha/PixArt-sigma/) ·
[Project Page](https://pixart-alpha.github.io/) ·
[SAM-LLaVA dataset (HF)](https://huggingface.co/datasets/PixArt-alpha/SAM-LLaVA-Captions10M) ·
[Paper: Alpha (arXiv)](https://arxiv.org/abs/2310.00426) ·
[Paper: Delta (arXiv)](https://arxiv.org/abs/2401.05252) ·
[Discuss (Discord)](https://discord.gg/rde6eaE5Ta) ·
[Usage (Diffusers)](https://huggingface.co/docs/diffusers/main/en/api/pipelines/pixart) ·
[App (ComfyUI)](https://github.com/city96/ComfyUI_ExtraModels) ·
[Demo PixArt (HuggingFace)](https://huggingface.co/spaces/PixArt-alpha/PixArt-alpha) ·
[Demo PixArt-LCM (HuggingFace)](https://huggingface.co/spaces/PixArt-alpha/PixArt-LCM) ·
[Demo PixArt (OpenXLab)](https://openxlab.org.cn/apps/detail/PixArt-alpha/PixArt-alpha) ·
[Demo PixArt-LCM (OpenXLab)](https://openxlab.org.cn/apps/detail/houshaowei/PixArt-LCM) ·
[Free Trial (Google Colab)](https://colab.research.google.com/drive/1jZ5UZXk7tcpTfVwnX33dDuefNMcnW9ME?usp=sharing)

---

This repo contains PyTorch model definitions, pre-trained weights, and inference/sampling code for our paper exploring fast training of diffusion models with Transformers. You can find more visualizations on our [project page](https://pixart-alpha.github.io/).

**PixArt-α Community**: join our [PixArt-α Discord channel](https://discord.gg/rde6eaE5Ta) for discussions. Coders are welcome to contribute.

> [**PixArt-α: Fast Training of Diffusion Transformer for Photorealistic Text-to-Image Synthesis**](https://pixart-alpha.github.io/)<br>
> [Junsong Chen*](https://lawrence-cj.github.io/), [Jincheng Yu*](https://lovesykun.cn/about.html), [Chongjian Ge*](https://chongjiange.github.io/), [Lewei Yao*](https://scholar.google.com/citations?user=hqDyTg8AAAAJ&hl=zh-CN&oi=ao), [Enze Xie](https://xieenze.github.io/)&#8224;, [Yue Wu](https://yuewuhkust.github.io/), [Zhongdao Wang](https://zhongdao.github.io/), [James Kwok](https://www.cse.ust.hk/~jamesk/), [Ping Luo](http://luoping.me/), [Huchuan Lu](https://scholar.google.com/citations?hl=en&user=D3nE0agAAAAJ), [Zhenguo Li](https://scholar.google.com/citations?user=XboZC1AAAAAJ)
> <br>Huawei Noah's Ark Lab, Dalian University of Technology, HKU, HKUST<br>

> [**PIXART-δ: Fast and Controllable Image Generation with Latent Consistency Models**](https://pixart-alpha.github.io/)<br>
> [Junsong Chen](https://lawrence-cj.github.io/), [Yue Wu](https://yuewuhkust.github.io/), [Simian Luo](https://luosiallen.github.io/), [Enze Xie](https://xieenze.github.io/)&#8224;, [Sayak Paul](https://sayak.dev/), [Ping Luo](http://luoping.me/), Hang Zhao, [Zhenguo Li](https://scholar.google.com/citations?user=XboZC1AAAAAJ)
> <br>Huawei Noah's Ark Lab, DLUT, Tsinghua University, HKU, Hugging Face<br>

---
## Breaking News 🔥🔥!!
- (🔥 New) Apr. 12, 2024. 💥 A better version: the [PixArt-Σ](https://github.com/PixArt-alpha/PixArt-sigma) training & inference code and checkpoints are all released!!! Welcome to collaborate and contribute. Star 🌟 us if you find it helpful!!
- (🔥 New) Jan. 19, 2024. 💥 [PixArt-δ](https://arxiv.org/abs/2401.05252) ControlNet [app_controlnet.py](app/app_controlnet.py) and [checkpoint](https://huggingface.co/PixArt-alpha/PixArt-ControlNet/tree/main) are released!!!
- (🔥 New) Jan. 16, 2024. 💥 Glad to announce that [PixArt-α](https://arxiv.org/abs/2310.00426) is accepted by ICLR 2024 (Spotlight).
- (🔥 New) Dec. 17, 2023. 💥 PixArt supports [ComfyUI](https://github.com/comfyanonymous/ComfyUI#manual-install-windows-linux). Thanks to [@city96](https://github.com/city96/ComfyUI_ExtraModels) for his great work.
- (🔥 New) Nov. 30, 2023. 💥 PixArt collaborates with the [LCMs](https://github.com/luosiallen/latent-consistency-model) team to make the **fastest** [training & inference text-to-image generation system](https://github.com/PixArt-alpha/PixArt-alpha). The [training code](train_scripts/train_pixart_lcm.py), [inference code](scripts/inference_lcm.py), [weights](https://huggingface.co/PixArt-alpha/PixArt-LCM-XL-2-1024-MS), [HF demo](https://huggingface.co/spaces/PixArt-alpha/PixArt-LCM), and [OpenXLab demo](https://openxlab.org.cn/apps/detail/houshaowei/PixArt-LCM) are all released; we hope users will enjoy them. Detailed **inference speed** numbers and **code guidance** can be found in the [docs](asset/docs/pixart_lcm.md). At the same time, we updated the codebase for a better user experience and fixed some bugs in the newest version.

---
## 🚩 **New Features/Updates**
- ✅ Jan. 11, 2024. 💥 [PixArt-δ](https://arxiv.org/abs/2401.05252): We are excited to announce the release of the PixArt-δ technical report!!! The report offers valuable insights into training LCM and ControlNet-like modules in Transformer models. Along with it, we have released all the training and inference code for LCM & ControlNet [in this repository](https://github.com/PixArt-alpha/PixArt-alpha). We encourage you to try them out and warmly welcome pull requests; your contributions and feedback are highly appreciated!
- ✅ Feb. 07, 2024. [train_diffusers.py](train_scripts/train_diffusers.py) can train directly with diffusers models and visualize during training.
- ✅ Jan. 26, 2024. 💥 All checkpoints of [PixArt-α](https://github.com/PixArt-alpha/PixArt-alpha), including the 256px checkpoints, are available here: [Download Models](#-download-models).
- ✅ Jan. 19, 2024. 💥 [PixArt-δ](https://arxiv.org/abs/2401.05252) ControlNet [app_controlnet.py](app/app_controlnet.py) and [checkpoint](https://huggingface.co/PixArt-alpha/PixArt-ControlNet/tree/main) are released!!!
- ✅ Jan. 12, 2024. 💥 We release the [SAM-LLaVA-Captions](https://huggingface.co/datasets/PixArt-alpha/SAM-LLaVA-Captions10M) dataset used in PixArt-α training.
- ✅ Dec. 27, 2023. [PixArt-α](https://github.com/PixArt-alpha/PixArt-alpha) is incorporated into [ControlLLM](https://github.com/OpenGVLab/ControlLLM)!
- ✅ Dec. 17, 2023. [PixArt-LCM-Lora](train_scripts/train_pixart_lcm_lora.py) & [PixArt-Lora](train_scripts/train_pixart_lora_hf.py) training scripts in Hugging Face style are released.
- ✅ Dec. 13, 2023. Added multi-scale VAE feature extraction in [tools/extract_features.py](https://github.com/PixArt-alpha/PixArt-alpha/blob/3b4f0afdbe39def80b41ab05c664c963edeebbcd/tools/extract_features.py#L276).
- ✅ Dec. 01, 2023. Added a [Notebook folder](./notebooks) to help users get started with PixArt quickly! Thanks to [@kopyl](https://github.com/kopyl) for his contribution!
- ✅ Nov. 27, 2023. 💥 **PixArt-α Community**: join our [PixArt-α Discord channel](https://discord.gg/rde6eaE5Ta) for discussions. Coders are welcome to contribute.
- ✅ Nov. 21, 2023. 💥 [SA-Solver](https://arxiv.org/abs/2309.05019) official code first released [here](asset/docs/sasolver.md).
- ✅ Nov. 19, 2023. Released the `PixArt + Dreambooth` training scripts.
- ✅ Nov. 16, 2023. Diffusers now supports `random resolution` and `batch images` generation. Besides, running `PixArt` in under 8GB GPU VRAM is available in 🧨 [diffusers](https://huggingface.co/docs/diffusers/main/en/api/pipelines/pixart).
- ✅ Nov. 10, 2023. Support for the DALL-E 3 Consistency Decoder in 🧨 diffusers.
- ✅ Nov. 06, 2023. Released pretrained weights with 🧨 diffusers integration, a Hugging Face demo, and a Google Colab example.
- ✅ Nov. 03, 2023. Released the LLaVA-captioning inference code.
- ✅ Oct. 27, 2023. Released the training & feature extraction code.
- ✅ Oct. 20, 2023. Collaborating with the Hugging Face & Diffusers team to co-release the code and weights. (Please stay tuned.)
- ✅ Oct. 15, 2023. Released the inference code.

---

## Contents
* [Training](#-how-to-train)
* [Inference](#-how-to-test)
* [Download Models](#-download-models)
* [Use diffusers](#1---using-in--diffusers)
* [Data Processing](#-how-to-extract-t5-and-vae-features)
* [PixArt-**α** Demo](#3---gradio-with-diffusers--faster-)
* [PixArt-**α** 8GB VRAM](asset/docs/pixart.md)
* [PixArt-**δ** (LCM)](asset/docs/pixart_lcm.md)
* [PixArt-**δ** (ControlNet)](asset/docs/pixart_controlnet.md)
* [PixArt-**δ** (Dreambooth)](asset/docs/pixart-dreambooth.md)
* [Acknowledgements](#acknowledgements)
* [Citation](#bibtex)
* [PixArt-**Σ** Releasing](https://github.com/PixArt-alpha/PixArt-sigma)

---

## 🐱 Abstract
<b>TL;DR: <font color="red">PixArt-α</font> is a Transformer-based T2I diffusion model whose image generation quality is competitive with state-of-the-art image generators (e.g., Imagen, SDXL, and even Midjourney), and whose training speed markedly surpasses existing large-scale T2I models, e.g., PixArt-α takes only 10.8% of Stable Diffusion v1.5's training time (675 vs. 6,250 A100 GPU days).</b>

<details><summary>CLICK for the full abstract</summary>
The most advanced text-to-image (T2I) models require significant training costs (e.g., millions of GPU hours), seriously hindering fundamental innovation in the AIGC community while increasing CO2 emissions. This paper introduces PixArt-α, a Transformer-based T2I diffusion model whose image generation quality is competitive with state-of-the-art image generators (e.g., Imagen, SDXL, and even Midjourney), reaching near-commercial application standards. Additionally, it supports high-resolution image synthesis up to 1024px resolution at low training cost. To achieve this goal, three core designs are proposed: (1) Training strategy decomposition: we devise three distinct training steps that separately optimize pixel dependency, text-image alignment, and image aesthetic quality; (2) Efficient T2I Transformer: we incorporate cross-attention modules into the Diffusion Transformer (DiT) to inject text conditions and streamline the computation-intensive class-condition branch; (3) High-informative data: we emphasize the significance of concept density in text-image pairs and leverage a large vision-language model to auto-label dense pseudo-captions to assist text-image alignment learning. As a result, PixArt-α's training speed markedly surpasses existing large-scale T2I models, e.g., PixArt-α only takes 10.8% of Stable Diffusion v1.5's training time (675 vs. 6,250 A100 GPU days), saving nearly $300,000 ($26,000 vs. $320,000) and reducing CO2 emissions by 90%. Moreover, compared with a larger SOTA model, RAPHAEL, our training cost is merely 1%. Extensive experiments demonstrate that PixArt-α excels in image quality, artistry, and semantic control. We hope PixArt-α will provide new insights to the AIGC community and startups, accelerating the building of their own high-quality yet low-cost generative models from scratch.
</details>

---
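Design (2) above is the architectural core: each DiT block gains a cross-attention layer so latent-image tokens can attend to T5 text embeddings, replacing the expensive class-condition branch. Below is a minimal, illustrative PyTorch sketch of such a block. The dimensions echo a DiT-XL backbone (1152) and T5-XXL embeddings (4096), but this is a simplified stand-in for the repository's implementation, with timestep (adaLN) conditioning omitted for brevity:

```python
import torch
import torch.nn as nn

class T2IDiTBlock(nn.Module):
    """One DiT-style block with an added cross-attention layer for text conditioning."""
    def __init__(self, dim: int = 1152, heads: int = 16, text_dim: int = 4096):
        super().__init__()
        self.norm1 = nn.LayerNorm(dim)
        self.self_attn = nn.MultiheadAttention(dim, heads, batch_first=True)
        self.norm2 = nn.LayerNorm(dim)
        # Image tokens (queries) attend to T5 text tokens (keys/values).
        self.cross_attn = nn.MultiheadAttention(dim, heads, kdim=text_dim, vdim=text_dim, batch_first=True)
        self.norm3 = nn.LayerNorm(dim)
        self.mlp = nn.Sequential(nn.Linear(dim, 4 * dim), nn.GELU(), nn.Linear(4 * dim, dim))

    def forward(self, x: torch.Tensor, text: torch.Tensor) -> torch.Tensor:
        # x: (B, N, dim) latent patch tokens; text: (B, L, text_dim) T5 encoder states
        h = self.norm1(x)
        x = x + self.self_attn(h, h, h)[0]
        x = x + self.cross_attn(self.norm2(x), text, text)[0]  # inject the text condition
        return x + self.mlp(self.norm3(x))

block = T2IDiTBlock()
out = block(torch.randn(2, 256, 1152), torch.randn(2, 120, 4096))
print(out.shape)  # torch.Size([2, 256, 1152])
```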
![A small cactus with a happy face in the Sahara desert.](https://oss.gittoolsai.com/images/PixArt-alpha_PixArt-alpha_readme_657cc1b8e1d0.png)

---

# 🔥🔥🔥 Why PixArt-α?
## Training Efficiency
PixArt-α takes only 12% of Stable Diffusion v1.5's training time (753 vs. 6,250 A100 GPU days), saving nearly $300,000 ($28,000 vs. $320,000) and reducing CO2 emissions by 90%. Moreover, compared with a larger SOTA model, RAPHAEL, our training cost is merely 1%.

![Training Efficiency.](https://oss.gittoolsai.com/images/PixArt-alpha_PixArt-alpha_readme_c4d918b0f706.png)

| Method    | Type | #Params | #Images | FID-30K ↓        | A100 GPU days |
|-----------|------|---------|---------|------------------|---------------|
| DALL·E    | Diff | 12.0B   | 250M    | 27.50            |               |
| GLIDE     | Diff | 5.0B    | 250M    | 12.24            |               |
| LDM       | Diff | 1.4B    | 400M    | 12.64            |               |
| DALL·E 2  | Diff | 6.5B    | 650M    | 10.39            | 41,667        |
| SDv1.5    | Diff | 0.9B    | 2000M   | 9.62             | 6,250         |
| GigaGAN   | GAN  | 0.9B    | 2700M   | 9.09             | 4,783         |
| Imagen    | Diff | 3.0B    | 860M    | 7.27             | 7,132         |
| RAPHAEL   | Diff | 3.0B    | 5000M+  | 6.61             | 60,000        |
| PixArt-α  | Diff | 0.6B    | 25M     | 7.32 (zero-shot) | 753           |
| PixArt-α  | Diff | 0.6B    | 25M     | 5.51 (COCO FT)   | 753           |

## Inference Efficiency
PIXART-δ generates **1024x1024 high-resolution** images within **0.5 seconds** on an A100. With 8-bit inference, PIXART-δ requires **less than 8GB of GPU VRAM**.

Let us stress again how liberating it is to explore image generation so easily with PixArt-LCM.

| Hardware                    | PIXART-δ (4 steps) | SDXL LoRA LCM (4 steps) | PixArt-α (14 steps) | SDXL standard (25 steps) |
|-----------------------------|--------------------|-------------------------|---------------------|--------------------------|
| T4 (Google Colab Free Tier) | 3.3s               | 8.4s                    | 16.0s               | 26.5s                    |
| V100 (32 GB)                | 0.8s               | 1.2s                    | 5.5s                | 7.7s                     |
| A100 (80 GB)                | 0.51s              | 1.2s                    | 2.2s                | 3.8s                     |

These tests were run with a batch size of 1 in all cases.

For cards with a lot of capacity, such as the A100, performance increases significantly when generating multiple images at once, which is usually the case for production workloads.
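To try something like the 4-step PIXART-δ rows above yourself, the diffusers `PixArtAlphaPipeline` can load the LCM checkpoint linked in this README. A short sketch; the checkpoint id comes from this README, while the step count and `guidance_scale=0.0` are the usual LCM settings rather than values quoted here:

```python
import torch
from diffusers import PixArtAlphaPipeline

# Load the LCM-distilled checkpoint referenced in this README.
pipe = PixArtAlphaPipeline.from_pretrained(
    "PixArt-alpha/PixArt-LCM-XL-2-1024-MS", torch_dtype=torch.float16
).to("cuda")

# LCM checkpoints are sampled with very few steps and no classifier-free guidance.
image = pipe(
    "A small cactus with a happy face in the Sahara desert.",
    num_inference_steps=4,
    guidance_scale=0.0,
).images[0]
image.save("pixart_lcm_4steps.png")
```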
## High-quality Generation from PixArt-α

- More samples

<div id="more-samples" style="display: flex; justify-content: center;">
  <img src="https://oss.gittoolsai.com/images/PixArt-alpha_PixArt-alpha_readme_a8d6a24e2bd8.png" style="width: 50%; height: auto; object-fit: contain; margin: 5px;">
  <img src="https://oss.gittoolsai.com/images/PixArt-alpha_PixArt-alpha_readme_675e82c96196.png" style="width: 43%; height: auto; object-fit: contain; margin: 5px;">
</div>

- PixArt + [Dreambooth](https://dreambooth.github.io/)

<div id="dreambooth" style="display: flex; justify-content: center;">
  <img src="asset/images/dreambooth/dreambooth_dog.svg" width="46%" style="margin: 5px;">
  <img src="asset/images/dreambooth/dreambooth_m5.svg" width="46%" style="margin: 5px;">
</div>

- PixArt + [ControlNet](https://github.com/lllyasviel/ControlNet)

<div id="ControlNet" style="display: flex; justify-content: center;">
  <img src="asset/images/controlnet/controlnet_huawei.svg" width="46%" style="margin: 5px;">
  <img src="asset/images/controlnet/controlnet_lenna.svg" width="46%" style="margin: 5px;">
</div>

# 🔧 Dependencies and Installation

- Python >= 3.9 (we recommend [Anaconda](https://www.anaconda.com/download/#linux) or [Miniconda](https://docs.conda.io/en/latest/miniconda.html))
- [PyTorch >= 1.13.0+cu11.7](https://pytorch.org/)

```bash
conda create -n pixart python=3.9
conda activate pixart
pip install torch==2.1.1 torchvision==0.16.1 torchaudio==2.1.1 --index-url https://download.pytorch.org/whl/cu118

git clone https://github.com/PixArt-alpha/PixArt-alpha.git
cd PixArt-alpha
pip install -r requirements.txt
```

# ⏬ Download Models
All models will be downloaded automatically. You can also choose to download them manually from this [url](https://huggingface.co/PixArt-alpha/PixArt-alpha).

| Model | #Params | URL | Download in OpenXLab |
|:------|:--------|:----|:---------------------|
| T5 | 4.3B | [T5](https://huggingface.co/PixArt-alpha/PixArt-alpha/tree/main/t5-v1_1-xxl) | [T5](https://download.openxlab.org.cn/models/PixArt-alpha/PixArt-alpha/weight/t5-v1_1-xxl.zip) |
| VAE | 80M | [VAE](https://huggingface.co/PixArt-alpha/PixArt-alpha/tree/main/sd-vae-ft-ema) | [VAE](https://download.openxlab.org.cn/models/PixArt-alpha/PixArt-alpha/weight/sd-vae-ft-ema.zip) |
| PixArt-α-SAM-256 | 0.6B | [PixArt-XL-2-SAM-256x256.pth](https://huggingface.co/PixArt-alpha/PixArt-alpha/resolve/main/PixArt-XL-2-SAM-256x256.pth) or [diffusers version](https://huggingface.co/PixArt-alpha/PixArt-XL-2-SAM-256x256) | [256-SAM](https://download.openxlab.org.cn/models/PixArt-alpha/PixArt-alpha/weight/PixArt-XL-2-SAM-256x256.pth) |
| PixArt-α-256 | 0.6B | [PixArt-XL-2-256x256.pth](https://huggingface.co/PixArt-alpha/PixArt-alpha/resolve/main/PixArt-XL-2-256x256.pth) or [diffusers version](https://huggingface.co/PixArt-alpha/PixArt-XL-2-256x256) | [256](https://download.openxlab.org.cn/models/PixArt-alpha/PixArt-alpha/weight/PixArt-XL-2-256x256.pth) |
| PixArt-α-256-MSCOCO-FID7.32 | 0.6B | [PixArt-XL-2-256x256-MSCOCO-FID732.pth](https://huggingface.co/PixArt-alpha/PixArt-alpha/resolve/main/PixArt-XL-2-256x256-MSCOCO-FID732.pth) | |
| PixArt-α-512 | 0.6B | [PixArt-XL-2-512x512.pth](https://huggingface.co/PixArt-alpha/PixArt-alpha/resolve/main/PixArt-XL-2-512x512.pth) or [diffusers version](https://huggingface.co/PixArt-alpha/PixArt-XL-2-512x512) | [512](https://download.openxlab.org.cn/models/PixArt-alpha/PixArt-alpha/weight/PixArt-XL-2-512x512.pth) |
| PixArt-α-1024 | 0.6B | [PixArt-XL-2-1024-MS.pth](https://huggingface.co/PixArt-alpha/PixArt-alpha/resolve/main/PixArt-XL-2-1024-MS.pth) or [diffusers version](https://huggingface.co/PixArt-alpha/PixArt-XL-2-1024-MS) | [1024](https://download.openxlab.org.cn/models/PixArt-alpha/PixArt-alpha/weight/PixArt-XL-2-1024-MS.pth) |
| PixArt-δ-1024-LCM | 0.6B | [diffusers version](https://huggingface.co/PixArt-alpha/PixArt-LCM-XL-2-1024-MS) | |
| ControlNet-HED-Encoder | 30M | [ControlNetHED.pth](https://huggingface.co/PixArt-alpha/PixArt-alpha/blob/main/ControlNetHED.pth) | |
| PixArt-δ-512-ControlNet | 0.9B | [PixArt-XL-2-512-ControlNet.pth](https://huggingface.co/PixArt-alpha/PixArt-ControlNet/tree/main) | [512](https://openxlab.org.cn/models/detail/PixArt-alpha/PixArt-ControlNet) |
| PixArt-δ-1024-ControlNet | 0.9B | [PixArt-XL-2-1024-ControlNet.pth](https://huggingface.co/PixArt-alpha/PixArt-ControlNet/tree/main) | [1024](https://openxlab.org.cn/models/detail/PixArt-alpha/PixArt-ControlNet) |

You can ALSO find all models in [OpenXLab_PixArt-alpha](https://openxlab.org.cn/models/detail/PixArt-alpha/PixArt-alpha).
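If you prefer to script the manual download, here is a hedged sketch using `huggingface_hub` (a standard API, not a tool from this repo; the repo id and file name come from the table above, so substitute whichever checkpoint you need):

```python
from huggingface_hub import hf_hub_download

# Fetch one checkpoint from the table above into the folder the demo expects.
ckpt_path = hf_hub_download(
    repo_id="PixArt-alpha/PixArt-alpha",
    filename="PixArt-XL-2-1024-MS.pth",
    local_dir="output/pretrained_models",
)
print(ckpt_path)
```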
# 🔥 How to Train
## 1. PixArt Training

**First of all.**

Thanks to [@kopyl](https://github.com/kopyl), you can reproduce the full fine-tuning training flow on the [Pokemon dataset](https://huggingface.co/datasets/lambdalabs/pokemon-blip-captions) from Hugging Face with notebooks:
1. Train with [notebooks/train.ipynb](https://github.com/PixArt-alpha/PixArt-alpha/blob/53dac066f60fe5fdbdde4f0360145ca96d4cc38c/notebooks/train.ipynb).
2. Convert to the Diffusers format with [notebooks/convert-checkpoint-to-diffusers.ipynb](https://github.com/PixArt-alpha/PixArt-alpha/blob/master/notebooks/convert-checkpoint-to-diffusers.ipynb).
3. Run inference with the checkpoint converted in step 2 using [notebooks/infer.ipynb](https://github.com/PixArt-alpha/PixArt-alpha/blob/master/notebooks/infer.ipynb).

**Then, for more details.**

Here we take the SAM dataset training config as an example, but of course you can also prepare your own dataset following this method. You **ONLY** need to change the **config** file in [config](./configs/pixart_config) and the **dataloader** in [dataset](./diffusion/data/datasets).

```bash
python -m torch.distributed.launch --nproc_per_node=2 --master_port=12345 train_scripts/train.py configs/pixart_config/PixArt_xl2_img256_SAM.py --work-dir output/train_SAM_256
```

The directory structure for the SAM dataset (under `./data`) is:

```
SA1B
├──images/  (images are saved here)
│  ├──sa_xxxxx.jpg
│  ├──sa_xxxxx.jpg
│  ├──......
├──captions/    (corresponding captions, same name as the images)
│  ├──sa_xxxxx.txt
│  ├──sa_xxxxx.txt
├──partition/   (all image names, stored in txt files with one image name per line)
│  ├──part0.txt
│  ├──part1.txt
│  ├──......
├──caption_feature_wmask/   (run tools/extract_caption_feature.py to generate caption T5 features; same name as the images, with a .npz extension)
│  ├──sa_xxxxx.npz
│  ├──sa_xxxxx.npz
│  ├──......
├──img_vae_feature/  (run tools/extract_img_vae_feature.py to generate image VAE features; same name as the images, with a .npy extension)
│  ├──train_vae_256/
│  │  ├──noflip/
│  │  │  ├──sa_xxxxx.npy
│  │  │  ├──sa_xxxxx.npy
│  │  │  ├──......
```

**Here we prepare `data_toy` for better understanding:**

```bash
cd ./data

git lfs install
git clone https://huggingface.co/datasets/PixArt-alpha/data_toy
```

Then, [here](https://huggingface.co/datasets/PixArt-alpha/data_toy/blob/main/part0.txt) is an example of a partition/part0.txt file.

Besides, for json-file-guided [training](https://github.com/PixArt-alpha/PixArt-alpha/blob/fe0cb78065d64c18ecd8955a04e4f29138d47946/configs/pixart_config/PixArt_xl2_img1024_internalms.py#L3C2-L3C2), [here](https://huggingface.co/datasets/PixArt-alpha/data_toy/blob/main/data_info.json) is a toy json file for better understanding.

---
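To sanity-check the layout, here is a small hedged sketch of how one precomputed sample pairs up under the structure above. The sample id is hypothetical ("sa_xxxxx" placeholders stand for real SA-1B ids), and the exact arrays stored inside the .npz are not documented here, so inspect your own files first:

```python
import numpy as np

name = "sa_000001"  # hypothetical example id

# T5 caption features (plus mask) written by tools/extract_caption_feature.py
caption = np.load(f"SA1B/caption_feature_wmask/{name}.npz")
# VAE image feature written by tools/extract_img_vae_feature.py
vae_latent = np.load(f"SA1B/img_vae_feature/train_vae_256/noflip/{name}.npy")

print(list(caption.keys()), vae_latent.shape)
```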
## 2. PixArt + DreamBooth Training

Follow the `PixArt + DreamBooth` [training guidance](asset/docs/pixart-dreambooth.md).

## 3. PixArt + LCM / LCM-LoRA Training

Follow the `PixArt + LCM` [training guidance](asset/docs/pixart_lcm.md).

## 4. PixArt + ControlNet Training

Follow the `PixArt + ControlNet` [training guidance](asset/docs/pixart_controlnet.md).

## 5. PixArt + LoRA Training

```bash
pip install peft==0.6.2

accelerate launch --num_processes=1 --main_process_port=36667 train_scripts/train_pixart_lora_hf.py --mixed_precision="fp16" \
  --pretrained_model_name_or_path=PixArt-alpha/PixArt-XL-2-1024-MS \
  --dataset_name=lambdalabs/pokemon-blip-captions --caption_column="text" \
  --resolution=1024 --random_flip \
  --train_batch_size=16 \
  --num_train_epochs=200 --checkpointing_steps=100 \
  --learning_rate=1e-06 --lr_scheduler="constant" --lr_warmup_steps=0 \
  --seed=42 \
  --output_dir="pixart-pokemon-model" \
  --validation_prompt="cute dragon creature" --report_to="tensorboard" \
  --gradient_checkpointing --checkpoints_total_limit=10 --validation_epochs=5 \
  --rank=16
```

# 💻 How to Test
Inference requires at least `23GB` of GPU memory using this repo, while the 🧨 [diffusers](#using-in--diffusers) integration needs only `11GB` (or even `8GB`).

Currently supported samplers:
- [x] [IDDPM](https://arxiv.org/abs/2102.09672)
- [x] [DPM-Solver](https://arxiv.org/abs/2206.00927)
- [x] [SA-Solver](https://arxiv.org/abs/2309.05019)
- [ ] [DPM-Solver-v3](https://arxiv.org/abs/2310.13268v2)

## 1. Quick start with [Gradio](https://www.gradio.app/guides/quickstart)

To get started, first install the required dependencies. Make sure you've downloaded the [models](https://huggingface.co/PixArt-alpha/PixArt-alpha) to the `output/pretrained_models` folder, and then run on your local machine:

```bash
DEMO_PORT=12345 python app/app.py
```

As an alternative, a sample [Dockerfile](Dockerfile) is provided to build a runtime container that starts the Gradio app.

```bash
docker build . -t pixart
docker run --gpus all -it -p 12345:12345 -v <path_to_huggingface_cache>:/root/.cache/huggingface pixart
```

Or use docker-compose. Note: to switch the app from the 1024 context to the 512 or LCM version, change the `APP_CONTEXT` env variable in the docker-compose.yml file (the default is 1024).

```bash
docker compose build
docker compose up
```

Then open `http://your-server-ip:12345` in your browser.
## 2. Integration in diffusers
### 1). Using in 🧨 diffusers

Make sure you have up-to-date versions of the following libraries:

```bash
pip install -U transformers accelerate diffusers SentencePiece ftfy beautifulsoup4
```

And then:

```python
import torch
from diffusers import PixArtAlphaPipeline, ConsistencyDecoderVAE, AutoencoderKL

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# You can replace the checkpoint id with "PixArt-alpha/PixArt-XL-2-512x512" too.
pipe = PixArtAlphaPipeline.from_pretrained("PixArt-alpha/PixArt-XL-2-1024-MS", torch_dtype=torch.float16, use_safetensors=True)

# If using the DALL-E 3 Consistency Decoder:
# pipe.vae = ConsistencyDecoderVAE.from_pretrained("openai/consistency-decoder", torch_dtype=torch.float16)

# If using the SA-Solver sampler:
# from diffusion.sa_solver_diffusers import SASolverScheduler
# pipe.scheduler = SASolverScheduler.from_config(pipe.scheduler.config, algorithm_type='data_prediction')

# If loading a LoRA model:
# transformer = Transformer2DModel.from_pretrained("PixArt-alpha/PixArt-LCM-XL-2-1024-MS", subfolder="transformer", torch_dtype=torch.float16)
# transformer = PeftModel.from_pretrained(transformer, "Your-LoRA-Model-Path")
# pipe = PixArtAlphaPipeline.from_pretrained("PixArt-alpha/PixArt-LCM-XL-2-1024-MS", transformer=transformer, torch_dtype=torch.float16, use_safetensors=True)
# del transformer

# Enable memory optimizations:
# pipe.enable_model_cpu_offload()

pipe.to(device)

prompt = "A small cactus with a happy face in the Sahara desert."
image = pipe(prompt).images[0]
image.save("./cactus.png")
```

Check out the [documentation](./asset/docs/sasolver.md) for more information about the SA-Solver sampler.

This integration allows running the pipeline with a batch size of 4 under 11 GB of GPU VRAM. Check out the [documentation](https://huggingface.co/docs/diffusers/main/en/api/pipelines/pixart) to learn more.

### 2). Running the `PixArtAlphaPipeline` in under 8GB GPU VRAM

GPU VRAM consumption under 8 GB is supported now; please refer to the [documentation](asset/docs/pixart.md) for more information.
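The linked documentation describes the project's own 8GB recipe. As a generic illustration of the idea, diffusers' sequential CPU offload is one common lever; this is a sketch of that technique, not the project's recommended script:

```python
import torch
from diffusers import PixArtAlphaPipeline

pipe = PixArtAlphaPipeline.from_pretrained(
    "PixArt-alpha/PixArt-XL-2-1024-MS", torch_dtype=torch.float16, use_safetensors=True
)
# Stream submodules to the GPU one at a time instead of keeping the whole
# pipeline resident; slower per image, but cuts peak VRAM substantially.
pipe.enable_sequential_cpu_offload()

image = pipe("A small cactus with a happy face in the Sahara desert.").images[0]
image.save("cactus_low_vram.png")
```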
### 3). Gradio with diffusers (Faster)

To get started, first install the required dependencies, then run on your local machine:

```bash
# diffusers version
DEMO_PORT=12345 python app/app.py
```

Then open `http://your-server-ip:12345` in your browser. You can also click [here](https://colab.research.google.com/drive/1jZ5UZXk7tcpTfVwnX33dDuefNMcnW9ME?usp=sharing) for a free trial on Google Colab.

### 4). Convert a .pth checkpoint into the diffusers version

```bash
python tools/convert_pixart_alpha_to_diffusers.py --image_size your_img_size --multi_scale_train (True if you use PixArtMS else False) --orig_ckpt_path path/to/pth --dump_path path/to/diffusers --only_transformer=True
```

## 3. Online Demo [![Hugging Face PixArt](https://img.shields.io/static/v1?label=Demo&message=HuggingFace%20Gradio&color=orange)](https://huggingface.co/spaces/PixArt-alpha/PixArt-alpha)

![Online Demo sample](https://oss.gittoolsai.com/images/PixArt-alpha_PixArt-alpha_readme_7d2c042d72a9.png)

# ✏️ How to caption with LLaVA
Thanks to the code base of [LLaVA-Lightning-MPT](https://huggingface.co/liuhaotian/LLaVA-Lightning-MPT-7B-preview), we can caption the LAION and SAM datasets with the following launch code:

```bash
python tools/VLM_caption_lightning.py --output output/dir/ --data-root data/root/path --index path/to/data.json
```

We present auto-labeling with custom prompts for LAION (left) and SAM (right). The words highlighted in green represent the original caption in LAION, while those marked in red indicate the detailed captions labeled by LLaVA.

![Dialog with LLaVA.](https://oss.gittoolsai.com/images/PixArt-alpha_PixArt-alpha_readme_14f604cd4d4e.png)

# ✏️ How to extract T5 and VAE features

Preparing the T5 text features and VAE image features in advance speeds up training and saves GPU memory.

```bash
python tools/extract_features.py --img_size=1024 \
    --json_path "data/data_info.json" \
    --t5_save_root "data/SA1B/caption_feature_wmask" \
    --vae_save_root "data/SA1B/img_vae_features" \
    --pretrained_models_dir "output/pretrained_models" \
    --dataset_root "data/SA1B/Images/"
```

## 💪 To-Do List (Congratulations🎉)

- [x] Inference code
- [x] Training code
- [x] T5 & VAE feature extraction code
- [x] LLaVA captioning code
- [x] Model zoo
- [x] Diffusers version & Hugging Face demo
- [x] Google Colab example
- [x] DALLE3 VAE integration
- [x] Inference under 8GB GPU VRAM with diffusers
- [x] Dreambooth training code
- [x] SA-Solver code
- [x] PixArt-α-LCM
- [x] Multi-scale VAE feature extraction code
- [x] PixArt-α-LCM-LoRA scripts
- [x] PixArt-α-LoRA training scripts
- [x] ControlNet code
- [x] SAM-LLaVA caption dataset
- [x] ControlNet checkpoint
- [x] 256px pre-trained models
- [x] PixArt-Σ: the next-version model with much better ability is training!

# Other Sources
We made a video comparing PixArt with the most powerful current text-to-image models.

[![Watch the video](https://oss.gittoolsai.com/images/PixArt-alpha_PixArt-alpha_readme_f3ddaae26443.jpg)](https://www.youtube.com/watch?v=7_6KsIITgWY)

# 📖 BibTeX

```
@misc{chen2023pixartalpha,
      title={PixArt-$\alpha$: Fast Training of Diffusion Transformer for Photorealistic Text-to-Image Synthesis},
      author={Junsong Chen and Jincheng Yu and Chongjian Ge and Lewei Yao and Enze Xie and Yue Wu and Zhongdao Wang and James Kwok and Ping Luo and Huchuan Lu and Zhenguo Li},
      year={2023},
      eprint={2310.00426},
      archivePrefix={arXiv},
      primaryClass={cs.CV}
}
@misc{chen2024pixartdelta,
      title={PIXART-{\delta}: Fast and Controllable Image Generation with Latent Consistency Models},
      author={Junsong Chen and Yue Wu and Simian Luo and Enze Xie and Sayak Paul and Ping Luo and Hang Zhao and Zhenguo Li},
      year={2024},
      eprint={2401.05252},
      archivePrefix={arXiv},
      primaryClass={cs.CV}
}
```

# 🤗 Acknowledgements
- Thanks to [Diffusers](https://github.com/huggingface/diffusers) for their wonderful technical support and awesome collaboration!
- Thanks to [Hugging Face](https://github.com/huggingface) for sponsoring the nice demo!
- Thanks to [DiT](https://github.com/facebookresearch/DiT) for their wonderful work and codebase!

## Star History

[![Star History Chart](https://oss.gittoolsai.com/images/PixArt-alpha_PixArt-alpha_readme_31c1ca7cfcfe.png)](https://star-history.com/#PixArt-alpha/PixArt-alpha&Date)
src=\"https:\u002F\u002Fimg.shields.io\u002Fstatic\u002Fv1?label=Demo PixArt&message=OpenXLab&color=purple\">\u003C\u002Fa> &ensp;\n  \u003Ca href=\"https:\u002F\u002Fopenxlab.org.cn\u002Fapps\u002Fdetail\u002Fhoushaowei\u002FPixArt-LCM\">\u003Cimg src=\"https:\u002F\u002Fimg.shields.io\u002Fstatic\u002Fv1?label=Demo PixArt-LCM&message=OpenXLab&color=purple\">\u003C\u002Fa> &ensp;\n  \u003Ca href=\"https:\u002F\u002Fcolab.research.google.com\u002Fdrive\u002F1jZ5UZXk7tcpTfVwnX33dDuefNMcnW9ME?usp=sharing\">\u003Cimg src=\"https:\u002F\u002Fimg.shields.io\u002Fstatic\u002Fv1?label=Free%20Trial&message=Google%20Colab&logo=google&color=orange\">\u003C\u002Fa> &ensp;\n\u003C\u002Fdiv>\n\n---\n\n本仓库包含我们论文中探索的使用Transformer进行快速训练扩散模型的PyTorch模型定义、预训练权重以及推理\u002F采样代码。您可以在我们的[项目页面](https:\u002F\u002Fpixart-alpha.github.io\u002F)上找到更多可视化内容。\n\n\u003Cimg src=\"https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002FPixArt-alpha_PixArt-alpha_readme_2dab88eba56d.png\" width=\"10%\" alt=\"\" \u002F> **PixArt-α 社区**：欢迎加入我们的PixArt-α Discord频道 \u003Ca href=\"https:\u002F\u002Fdiscord.gg\u002Frde6eaE5Ta\" style=\"text-decoration:none;\">\n\u003Cimg src=\"https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002FPixArt-alpha_PixArt-alpha_readme_6342e5371027.png\" width=\"3%\" alt=\"\" \u002F>\u003C\u002Fa>,参与讨论。欢迎各位开发者贡献代码。\n\n> [**PixArt-α：用于照片级逼真文生图的扩散Transformer快速训练**](https:\u002F\u002Fpixart-alpha.github.io\u002F)\u003Cbr>\n> [陈俊松*](https:\u002F\u002Flawrence-cj.github.io\u002F)、[于锦程*](https:\u002F\u002Flovesykun.cn\u002Fabout.html)、\n> [葛崇健*](https:\u002F\u002Fchongjiange.github.io\u002F)、[姚雷威*](https:\u002F\u002Fscholar.google.com\u002Fcitations?user=hqDyTg8AAAAJ&hl=zh-CN&oi=ao)、\n> [谢恩泽](https:\u002F\u002Fxieenze.github.io\u002F)&#8224;、\n> [吴岳](https:\u002F\u002Fyuewuhkust.github.io\u002F)、[王仲道](https:\u002F\u002Fzhongdao.github.io\u002F)、\n> [郭志伟](https:\u002F\u002Fwww.cse.ust.hk\u002F~jamesk\u002F)、[罗平](http:\u002F\u002Fluoping.me\u002F)、\n> [陆虎川](https:\u002F\u002Fscholar.google.com\u002Fcitations?hl=en&user=D3nE0agAAAAJ)、\n> [李振国](https:\u002F\u002Fscholar.google.com\u002Fcitations?user=XboZC1AAAAAJ)\n> \u003Cbr>华为诺亚方舟实验室、大连理工大学、香港大学、香港科技大学\u003Cbr>\n\n> [**PIXART-δ：基于潜在一致性模型的快速可控图像生成**](https:\u002F\u002Fpixart-alpha.github.io\u002F)\u003Cbr>\n> [陈俊松](https:\u002F\u002Flawrence-cj.github.io\u002F)、[吴岳](https:\u002F\u002Fyuewuhkust.github.io\u002F)、[罗思敏](https:\u002F\u002Fluosiallen.github.io\u002F)、[谢恩泽](https:\u002F\u002Fxieenze.github.io\u002F)&#8224;、\n> [萨亚克·保罗](https:\u002F\u002Fsayak.dev\u002F)、[罗平](http:\u002F\u002Fluoping.me\u002F)、[赵航]()、[李振国](https:\u002F\u002Fscholar.google.com\u002Fcitations?user=XboZC1AAAAAJ)\n> \u003Cbr>华为诺亚方舟实验室、大连理工大学、清华大学、香港大学、Hugging Face\u003Cbr>\n\n---\n## 最新消息 🔥🔥!!\n- (🔥 新) 2024年4月12日。💥 更优秀的[PixArt-Σ](https:\u002F\u002Fgithub.com\u002FPixArt-alpha\u002FPixArt-sigma)训练与推理代码及检查点已全部发布！！！\n欢迎大家合作与贡献。如果您觉得有用，请为我们点亮🌟！\n\n- (🔥 新) 2024年1月19日。💥 [PixArt-δ](https:\u002F\u002Farxiv.org\u002Fabs\u002F2401.05252) ControlNet [app_controlnet.py](app\u002Fapp_controlnet.py)和[检查点](https:\u002F\u002Fhuggingface.co\u002FPixArt-alpha\u002FPixArt-ControlNet\u002Ftree\u002Fmain)已发布！！！\n- (🔥 新) 2024年1月16日。💥 恭喜宣布，[PixArt-α](https:\u002F\u002Farxiv.org\u002Fabs\u002F2310.00426)已被ICLR 2024接受（Spotlight）。\n- (🔥 新) 2023年12月17日。💥 PixArt现已支持[ComfyUI](https:\u002F\u002Fgithub.com\u002Fcomfyanonymous\u002FComfyUI#manual-install-windows-linux)。感谢[@city96](https:\u002F\u002Fgithub.com\u002Fcity96\u002FComfyUI_ExtraModels)的杰出工作。\n- (🔥 新) 2023年11月30日。💥 
PixArt与[LCMs](https:\u002F\u002Fgithub.com\u002Fluosiallen\u002Flatent-consistency-model)团队合作，打造了**最快**的[文生图训练与推理系统](https:\u002F\u002Fgithub.com\u002FPixArt-alpha\u002FPixArt-alpha)。\n在此，[训练代码](train_scripts\u002Ftrain_pixart_lcm.py)、[推理代码](scripts\u002Finference_lcm.py)、[权重](https:\u002F\u002Fhuggingface.co\u002FPixArt-alpha\u002FPixArt-LCM-XL-2-1024-MS)以及[Hugging Face演示](https:\u002F\u002Fhuggingface.co\u002Fspaces\u002FPixArt-alpha\u002FPixArt-LCM)、[OpenXLab演示](https:\u002F\u002Fopenxlab.org.cn\u002Fapps\u002Fdetail\u002Fhoushaowei\u002FPixArt-LCM)均已发布，我们希望用户能够喜欢。详细的**推理速度**和**代码指南**可在[文档](asset\u002Fdocs\u002Fpixart_lcm.md)中找到。同时，我们还更新了代码库以提升用户体验，并修复了最新版本中的若干Bug。\n\n---\n\n## 🚩 **新功能\u002F更新**\n- ✅ 2024年1月11日。💥 [PixArt-δ](https:\u002F\u002Farxiv.org\u002Fabs\u002F2401.05252)：我们非常高兴地宣布发布 [PixArt-δ](https:\u002F\u002Farxiv.org\u002Fabs\u002F2401.05252) 技术报告！！！\n该报告深入探讨了Transformer模型中LCM和ControlNet类似模块的训练方法。同时，我们也在此仓库中发布了LCM与ControlNet的所有训练和推理代码 [在此仓库](https:\u002F\u002Fgithub.com\u002FPixArt-alpha\u002FPixArt-alpha)。\n我们鼓励大家尝试使用这些代码，并热烈欢迎用户的Pull Request。您的贡献和反馈对我们非常重要！\n- ✅ 2024年2月7日。[train_diffusers.py](train_scripts\u002Ftrain_diffusers.py) 可以直接使用diffusers模型进行训练，并在训练过程中进行可视化。\n- ✅ 2024年1月26日。💥 [PixArt-α](https:\u002F\u002Fgithub.com\u002FPixArt-alpha\u002FPixArt-alpha) 的所有检查点，包括256px分辨率的检查点，现在都可以在这里下载 [下载模型](#-download-models)。\n- ✅ 2024年1月19日。💥 [PixArt-δ](https:\u002F\u002Farxiv.org\u002Fabs\u002F2401.05252) 的ControlNet [app_controlnet.py](app\u002Fapp_controlnet.py) 和 [检查点](https:\u002F\u002Fhuggingface.co\u002FPixArt-alpha\u002FPixArt-ControlNet\u002Ftree\u002Fmain) 已发布！！！\n- ✅ 2024年1月12日。💥 我们发布了用于PixArt-α训练的 [SAM-LLaVA-Captions](https:\u002F\u002Fhuggingface.co\u002Fdatasets\u002FPixArt-alpha\u002FSAM-LLaVA-Captions10M) 数据集。\n- ✅ 2023年12月27日。[PixArt-α](https:\u002F\u002Fgithub.com\u002FPixArt-alpha\u002FPixArt-alpha) 已集成到 [ControlLLM](https:\u002F\u002Fgithub.com\u002FOpenGVLab\u002FControlLLM) 中！\n- ✅ 2023年12月17日。以Hugging Face风格发布的 [PixArt-LCM-Lora](train_scripts\u002Ftrain_pixart_lcm_lora.py) 和 [PixArt-Lora](train_scripts\u002Ftrain_pixart_lora_hf.py) 训练脚本已发布。\n- ✅ 2023年12月13日。在 [tools\u002Fextract_features.py](https:\u002F\u002Fgithub.com\u002FPixArt-alpha\u002FPixArt-alpha\u002Fblob\u002F3b4f0afdbe39def80b41ab05c664c963edeebbcd\u002Ftools\u002Fextract_features.py#L276) 中添加了多尺度VAE特征提取功能。\n- ✅ 2023年12月1日。新增了一个 [Notebook文件夹](.\u002Fnotebooks)，帮助用户快速上手PixArt！感谢 [@kopyl](https:\u002F\u002Fgithub.com\u002Fkopyl) 的贡献！\n- ✅ 2023年11月27日。💥 **PixArt-α社区**：加入我们的PixArt-α Discord频道 \u003Ca href=\"https:\u002F\u002Fdiscord.gg\u002Frde6eaE5Ta\" style=\"text-decoration:none;\">\n\u003Cimg src=\"https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002FPixArt-alpha_PixArt-alpha_readme_6342e5371027.png\" width=\"3%\" alt=\"\" \u002F>\u003C\u002Fa> 进行讨论。欢迎各位开发者参与贡献。\n- ✅ 2023年11月21日。💥 [SA-Sovler](https:\u002F\u002Farxiv.org\u002Fabs\u002F2309.05019) 官方代码首次发布 [此处](asset\u002Fdocs\u002Fsasolver.md)。\n- ✅ 2023年11月19日。发布 `PixArt + Dreambooth` 训练脚本。\n- ✅ 2023年11月16日。Diffusers 现在支持 `随机分辨率` 和 `批量生成图片` 功能。此外，\n在低于8GB显存的GPU上运行 `Pixart` 也已成为可能，在 🧨 [diffusers](https:\u002F\u002Fhuggingface.co\u002Fdocs\u002Fdiffusers\u002Fmain\u002Fen\u002Fapi\u002Fpipelines\u002Fpixart) 中有相关说明。\n- ✅ 2023年11月10日。在 🧨 diffusers 中支持 DALL-E 3 Consistency Decoder。\n- ✅ 2023年11月6日。发布预训练权重，并集成 🧨 diffusers、Hugging Face演示以及Google Colab示例。\n- ✅ 2023年11月3日。发布 LLaVA 字幕生成推理代码。\n- ✅ 2023年10月27日。发布训练及特征提取代码。\n- ✅ 2023年10月20日。与 Hugging Face 和 Diffusers 团队合作，共同发布代码和权重。（请继续关注。）\n- ✅ 2023年10月15日。发布推理代码。\n\n---\n\n## 目录\n* [训练](#-how-to-train)\n* 
[推理](#-how-to-test)\n* [下载模型](#-download-models)\n* [使用diffusers](#1---using-in--diffusers)\n* [数据处理](#-how-to-extract-t5-and-vae-features)\n* [PixArt-**α** 演示](#3---gradio-with-diffusers--faster-)\n* [PixArt-**α** 8GB VRAM](asset\u002Fdocs\u002Fpixart.md)\n* [PixArt-**δ** (LCM)](asset\u002Fdocs\u002Fpixart_lcm.md)\n* [PixArt-**δ** (ControlNet)](asset\u002Fdocs\u002Fpixart_controlnet.md)\n* [PixArt-**δ** (Dreambooth)](asset\u002Fdocs\u002Fpixart-dreambooth.md)\n* [致谢](#acknowledgements)\n* [引用](#bibtex)\n\n\n* [PixArt-**Σ** 发布](https:\u002F\u002Fgithub.com\u002FPixArt-alpha\u002FPixArt-sigma)\n\n---\n\n## 🐱 摘要\n\u003Cb>TL; DR: \u003Cfont color=\"red\">PixArt-α\u003C\u002Ffont> 是一种基于Transformer的T2I扩散模型，其图像生成质量可与当前最先进的图像生成器（如Imagen、SDXL，甚至Midjourney）相媲美，而训练速度则显著超越现有的大型T2I模型。例如，PixArt-α仅需675天的A100 GPU时间，而Stable Diffusion v1.5则需要6,250天。\u003C\u002Fb>\n\n\u003Cdetails>\u003Csummary>点击展开完整摘要\u003C\u002Fsummary>\n目前最先进的文本到图像（T2I）模型通常需要高昂的训练成本（例如数百万小时的GPU时间），这不仅严重阻碍了AIGC社区的基础创新，还增加了二氧化碳排放。本文介绍了一种名为PixArt-α的基于Transformer的T2I扩散模型，其图像生成质量可与当前最先进的图像生成器（如Imagen、SDXL，甚至Midjourney）相媲美，几乎达到了商业应用的标准。此外，它还支持高达1024px分辨率的高分辨率图像合成，且训练成本较低。为实现这一目标，我们提出了三个核心设计：\n(1) 训练策略分解：我们设计了三个独立的训练步骤，分别优化像素依赖性、文本与图像的对齐以及图像的美学质量；\n(2) 高效的T2I Transformer：我们在扩散Transformer（DiT）中引入了交叉注意力模块，以注入文本条件并简化计算密集型的类别条件分支；\n(3) 高信息量的数据：我们强调文本-图像对中概念密度的重要性，并利用大型视觉-语言模型自动标注密集的伪字幕，以辅助文本-图像对齐的学习。因此，PixArt-α的训练速度显著优于现有的大型T2I模型，例如，PixArt-α仅需675天的A100 GPU时间，而Stable Diffusion v1.5则需要6,250天，从而节省了近30万美元（26,000美元 vs. 320,000美元）的成本，并减少了90%的二氧化碳排放。此外，与更大的SOTA模型RAPHAEL相比，我们的训练成本仅为它的1%。大量实验表明，PixArt-α在图像质量、艺术性和语义控制方面表现出色。我们希望PixArt-α能为AIGC社区和初创企业带来新的启示，帮助他们从零开始快速构建高质量且低成本的生成模型。\n\u003C\u002Fdetails>\n\n---\n\n![撒哈拉沙漠中一棵带着笑脸的小仙人掌。](https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002FPixArt-alpha_PixArt-alpha_readme_657cc1b8e1d0.png)\n\n---\n\n# 🔥🔥🔥 为什么选择PixArt-α？\n\n## 训练效率\nPixArt-α 仅需 Stable Diffusion v1.5 训练时间的 12%（753 天 vs. 6,250 天 A100 GPU），节省近 30 万美元（2.8 万美元 vs. 
32 万美元），并减少 90% 的二氧化碳排放。此外，与更大的 SOTA 模型 RAPHAEL 相比，我们的训练成本仅为后者的 1%。\n![训练效率。](https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002FPixArt-alpha_PixArt-alpha_readme_c4d918b0f706.png)\n\n| 方法    | 类型 | 参数量 | 图像数量| FID-30K ↓        | A100 GPU 天数 |\n|-----------|------|---------|--------|------------------|---------------|\n| DALL·E    | 扩散 | 12.0B   | 250M   | 27.50            |               |\n| GLIDE     | 扩散 | 5.0B    | 250M   | 12.24            |               |\n| LDM       | 扩散 | 1.4B    | 400M   | 12.64            |               |\n| DALL·E 2  | 扩散 | 6.5B    | 650M   | 10.39            | 41,66         |\n| SDv1.5    | 扩散 | 0.9B    | 2000M  | 9.62             | 6,250         |\n| GigaGAN   | GAN  | 0.9B    | 2700M  | 9.09             | 4,783         |\n| Imagen    | 扩散 | 3.0B    | 860M   | 7.27             | 7,132         |\n| RAPHAEL   | 扩散 | 3.0B    | 5000M+ | 6.61             | 60,000        |\n| PixArt-α  | 扩散 | 0.6B    | 25M    | 7.32（零样本） | 753           |\n| PixArt-α  | 扩散 | 0.6B    | 25M    | 5.51（COCO 微调）   | 753           |\n\n## 推理效率\nPIXART-δ 在 A100 上成功生成 **1024×1024 高分辨率** 图像，耗时仅 **0.5 秒**。通过实施 8 位推理技术，PIXART-δ 仅需 **不到 8GB 的 GPU 显存**。\n\n让我们再次强调，使用 PixArt-LCM 如此轻松地探索图像生成是多么令人解放。\n\n| 硬件                    | PIXART-δ (4 步) | SDXL LoRA LCM (4 步) | PixArt-α (14 步) | SDXL 标准版 (25 步) |\n|-----------------------------|--------------------|-------------------------|---------------------|---------------------------|\n| T4（Google Colab 免费层） | 3.3s               | 8.4s                    | 16.0s               | 26.5s                     |\n| V100（32 GB）                | 0.8s               | 1.2s                    | 5.5s                | 7.7s                      |\n| A100（80 GB）                | 0.51s              | 1.2s                    | 2.2s                | 3.8s                      |\n\n所有测试均以批大小为 1 运行。\n\n对于像 A100 这样显存较大的显卡，在一次生成多张图像时性能会显著提升，而这通常是生产工作负载中的常见场景。\n\n## PixArt-α 的高质量生成\n\n- 更多样例\n\u003Cdiv id=\"more-samples\" style=\"display: flex; justify-content: center;\">\n  \u003Cimg src=\"https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002FPixArt-alpha_PixArt-alpha_readme_a8d6a24e2bd8.png\" style=\"width: 50%; height: auto; object-fit: contain; margin: 5px;\">\n  \u003Cimg src=\"https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002FPixArt-alpha_PixArt-alpha_readme_675e82c96196.png\" style=\"width: 43%; height: auto; object-fit: contain; margin: 5px;\">\n\u003C\u002Fdiv>\n\n- PixArt + [Dreambooth](https:\u002F\u002Fdreambooth.github.io\u002F)\n\u003Cdiv id=\"dreambooth\" style=\"display: flex; justify-content: center;\">\n  \u003Cimg src=\"asset\u002Fimages\u002Fdreambooth\u002Fdreambooth_dog.svg\" width=\"46%\" style=\"margin: 5px;\">\n  \u003Cimg src=\"asset\u002Fimages\u002Fdreambooth\u002Fdreambooth_m5.svg\" width=\"46%\" style=\"margin: 5px;\">\n\u003C\u002Fdiv>\n\n- PixArt + [ControlNet](https:\u002F\u002Fgithub.com\u002Flllyasviel\u002FControlNet)\n\u003Cdiv id=\"ControlNet\" style=\"display: flex; justify-content: center;\">\n  \u003Cimg src=\"asset\u002Fimages\u002Fcontrolnet\u002Fcontrolnet_huawei.svg\" width=\"46%\" style=\"margin: 5px;\">\n  \u003Cimg src=\"asset\u002Fimages\u002Fcontrolnet\u002Fcontrolnet_lenna.svg\" width=\"46%\" style=\"margin: 5px;\">\n\u003C\u002Fdiv>\n\n# 🔧 依赖与安装\n\n- Python >= 3.9（建议使用 [Anaconda](https:\u002F\u002Fwww.anaconda.com\u002Fdownload\u002F#linux) 或 [Miniconda](https:\u002F\u002Fdocs.conda.io\u002Fen\u002Flatest\u002Fminiconda.html)）\n- [PyTorch >= 1.13.0+cu11.7](https:\u002F\u002Fpytorch.org\u002F)\n```bash\nconda create 
-n pixart python=3.9\nconda activate pixart\npip install torch==2.1.1 torchvision==0.16.1 torchaudio==2.1.1 --index-url https:\u002F\u002Fdownload.pytorch.org\u002Fwhl\u002Fcu118\n\ngit clone https:\u002F\u002Fgithub.com\u002FPixArt-alpha\u002FPixArt-alpha.git\ncd PixArt-alpha\npip install -r requirements.txt\n```\n\n# ⏬ 下载模型\n所有模型将自动下载。你也可以从这个 [url](https:\u002F\u002Fhuggingface.co\u002FPixArt-alpha\u002FPixArt-alpha) 手动下载。\n\n| 模型                       | 参数量 | url                                                                                                                                                                                                          | 在 OpenXLab 中下载                                                                                            |\n|:----------------------------|:--------|:-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|:----------------------------------------------------------------------------------------------------------------|\n| T5                          | 4.3B    | [T5](https:\u002F\u002Fhuggingface.co\u002FPixArt-alpha\u002FPixArt-alpha\u002Ftree\u002Fmain\u002Ft5-v1_1-xxl)                                                                                                                                 | [T5](https:\u002F\u002Fdownload.openxlab.org.cn\u002Fmodels\u002FPixArt-alpha\u002FPixArt-alpha\u002Fweight\u002Ft5-v1_1-xxl.zip)                  |\n| VAE                         | 80M     | [VAE](https:\u002F\u002Fhuggingface.co\u002FPixArt-alpha\u002FPixArt-alpha\u002Ftree\u002Fmain\u002Fsd-vae-ft-ema)                                                                                                                              | [VAE](https:\u002F\u002Fdownload.openxlab.org.cn\u002Fmodels\u002FPixArt-alpha\u002FPixArt-alpha\u002Fweight\u002Fsd-vae-ft-ema.zip)               |\n| PixArt-α-SAM-256            | 0.6B    | [PixArt-XL-2-SAM-256x256.pth](https:\u002F\u002Fhuggingface.co\u002FPixArt-alpha\u002FPixArt-alpha\u002Fresolve\u002Fmain\u002FPixArt-XL-2-SAM-256x256.pth) 或 [diffusers 版本](https:\u002F\u002Fhuggingface.co\u002FPixArt-alpha\u002FPixArt-XL-2-SAM-256x256) | [256-SAM](https:\u002F\u002Fdownload.openxlab.org.cn\u002Fmodels\u002FPixArt-alpha\u002FPixArt-alpha\u002Fweight\u002FPixArt-XL-2-SAM-256x256.pth) |\n| PixArt-α-256                | 0.6B    | [PixArt-XL-2-256x256.pth](https:\u002F\u002Fhuggingface.co\u002FPixArt-alpha\u002FPixArt-alpha\u002Fresolve\u002Fmain\u002FPixArt-XL-2-256x256.pth) 或 [diffusers 版本](https:\u002F\u002Fhuggingface.co\u002FPixArt-alpha\u002FPixArt-XL-2-256x256)             | [256](https:\u002F\u002Fdownload.openxlab.org.cn\u002Fmodels\u002FPixArt-alpha\u002FPixArt-alpha\u002Fweight\u002FPixArt-XL-2-256x256.pth)         |\n| PixArt-α-256-MSCOCO-FID7.32 | 0.6B    | [PixArt-XL-2-256x256.pth](https:\u002F\u002Fhuggingface.co\u002FPixArt-alpha\u002FPixArt-alpha\u002Fresolve\u002Fmain\u002FPixArt-XL-2-256x256-MSCOCO-FID732.pth)                                                                               | [256]()                                                                                                         |\n| PixArt-α-512                | 0.6B    | [PixArt-XL-2-512x512.pth](https:\u002F\u002Fhuggingface.co\u002FPixArt-alpha\u002FPixArt-alpha\u002Fresolve\u002Fmain\u002FPixArt-XL-2-512x512.pth) 或 [diffusers 
版本](https:\u002F\u002Fhuggingface.co\u002FPixArt-alpha\u002FPixArt-XL-2-512x512) | [512](https:\u002F\u002Fdownload.openxlab.org.cn\u002Fmodels\u002FPixArt-alpha\u002FPixArt-alpha\u002Fweight\u002FPixArt-XL-2-512x512.pth) |\n| PixArt-α-1024 | 0.6B | [PixArt-XL-2-1024-MS.pth](https:\u002F\u002Fhuggingface.co\u002FPixArt-alpha\u002FPixArt-alpha\u002Fresolve\u002Fmain\u002FPixArt-XL-2-1024-MS.pth) 或 [diffusers 版本](https:\u002F\u002Fhuggingface.co\u002FPixArt-alpha\u002FPixArt-XL-2-1024-MS) | [1024](https:\u002F\u002Fdownload.openxlab.org.cn\u002Fmodels\u002FPixArt-alpha\u002FPixArt-alpha\u002Fweight\u002FPixArt-XL-2-1024-MS.pth) |\n| PixArt-δ-1024-LCM | 0.6B | [diffusers 版本](https:\u002F\u002Fhuggingface.co\u002FPixArt-alpha\u002FPixArt-LCM-XL-2-1024-MS) | |\n| ControlNet-HED-Encoder | 30M | [ControlNetHED.pth](https:\u002F\u002Fhuggingface.co\u002FPixArt-alpha\u002FPixArt-alpha\u002Fblob\u002Fmain\u002FControlNetHED.pth) | |\n| PixArt-δ-512-ControlNet | 0.9B | [PixArt-XL-2-512-ControlNet.pth](https:\u002F\u002Fhuggingface.co\u002FPixArt-alpha\u002FPixArt-ControlNet\u002Ftree\u002Fmain) | [512](https:\u002F\u002Fopenxlab.org.cn\u002Fmodels\u002Fdetail\u002FPixArt-alpha\u002FPixArt-ControlNet) |\n| PixArt-δ-1024-ControlNet | 0.9B | [PixArt-XL-2-1024-ControlNet.pth](https:\u002F\u002Fhuggingface.co\u002FPixArt-alpha\u002FPixArt-ControlNet\u002Ftree\u002Fmain) | [1024](https:\u002F\u002Fopenxlab.org.cn\u002Fmodels\u002Fdetail\u002FPixArt-alpha\u002FPixArt-ControlNet) |\n\n此外，你还可以在 [OpenXLab_PixArt-alpha](https:\u002F\u002Fopenxlab.org.cn\u002Fmodels\u002Fdetail\u002FPixArt-alpha\u002FPixArt-alpha) 中找到所有模型。
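\n\n例如，上表中的 PixArt-δ-1024-LCM（diffusers 版本）可以直接通过 🧨 diffusers 以极少的采样步数加载运行。下面是一个最小化的示意代码（假设已按下文 diffusers 集成部分安装好相关依赖；LCM 蒸馏权重通常配合约 4 步采样，并将 guidance_scale 设为 0 以关闭 CFG）：\n\n```python\nimport torch\nfrom diffusers import PixArtAlphaPipeline\n\n# 加载上表列出的 LCM 蒸馏权重（diffusers 版本）\npipe = PixArtAlphaPipeline.from_pretrained(\n    \"PixArt-alpha\u002FPixArt-LCM-XL-2-1024-MS\",\n    torch_dtype=torch.float16,\n    use_safetensors=True,\n).to(\"cuda\")\n\nprompt = \"A small cactus with a happy face in the Sahara desert.\"\n# LCM 只需约 4 步采样；guidance_scale=0.0 表示关闭 CFG\nimage = pipe(prompt, num_inference_steps=4, guidance_scale=0.0).images[0]\nimage.save(\".\u002Fpixart_lcm_4steps.png\")\n```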
\n\n# 🔥 如何训练\n\n## 1. PixArt 训练\n\n**首先，快速复现。**\n\n感谢 [@kopyl](https:\u002F\u002Fgithub.com\u002Fkopyl)，您可以通过 Hugging Face 上的笔记本重现 [Pokemon 数据集](https:\u002F\u002Fhuggingface.co\u002Fdatasets\u002Flambdalabs\u002Fpokemon-blip-captions) 的完整微调训练流程：\n1. 使用 [notebooks\u002Ftrain.ipynb](https:\u002F\u002Fgithub.com\u002FPixArt-alpha\u002FPixArt-alpha\u002Fblob\u002F53dac066f60fe5fdbdde4f0360145ca96d4cc38c\u002Fnotebooks\u002Ftrain.ipynb) 进行训练。\n2. 使用 [notebooks\u002Fconvert-checkpoint-to-diffusers.ipynb](https:\u002F\u002Fgithub.com\u002FPixArt-alpha\u002FPixArt-alpha\u002Fblob\u002Fmaster\u002Fnotebooks\u002Fconvert-checkpoint-to-diffusers.ipynb) 转换为 Diffusers 格式。\n3. 使用步骤 2 中转换后的检查点，通过 [notebooks\u002Finfer.ipynb](https:\u002F\u002Fgithub.com\u002FPixArt-alpha\u002FPixArt-alpha\u002Fblob\u002Fmaster\u002Fnotebooks\u002Finfer.ipynb) 运行推理。\n\n**然后，了解更多细节。**\n\n这里我们以 SAM 数据集的训练配置为例，您也可以按照同样的方法准备自己的数据集。\n\n您 **只需要** 修改 [config](.\u002Fconfigs\u002Fpixart_config) 中的 **配置文件** 和 [dataset](.\u002Fdiffusion\u002Fdata\u002Fdatasets) 中的 **数据加载器**。\n```bash\npython -m torch.distributed.launch --nproc_per_node=2 --master_port=12345 train_scripts\u002Ftrain.py configs\u002Fpixart_config\u002FPixArt_xl2_img256_SAM.py --work-dir output\u002Ftrain_SAM_256\n```\n\nSAM 数据集的目录结构如下：\n```\ncd .\u002Fdata\n\nSA1B\n├──images\u002F  (图像保存在此处)\n│  ├──sa_xxxxx.jpg\n│  ├──sa_xxxxx.jpg\n│  ├──......\n├──captions\u002F    (对应的标题保存在此处，与图像同名)\n│  ├──sa_xxxxx.txt\n│  ├──sa_xxxxx.txt\n├──partition\u002F   (所有图像名称存储在一个文本文件中，每行一个图像名称)\n│  ├──part0.txt\n│  ├──part1.txt\n│  ├──......\n├──caption_feature_wmask\u002F   (运行 tools\u002Fextract_caption_feature.py 生成标题 T5 特征，与图像同名但扩展名为 .npz)\n│  ├──sa_xxxxx.npz\n│  ├──sa_xxxxx.npz\n│  ├──......\n├──img_vae_feature\u002F  (运行 tools\u002Fextract_img_vae_feature.py 生成图像 VAE 特征，与图像同名但扩展名为 .npy)\n│  ├──train_vae_256\u002F\n│  │  ├──noflip\u002F\n│  │  │  ├──sa_xxxxx.npy\n│  │  │  ├──sa_xxxxx.npy\n│  │  │  ├──......\n\n```\n\n**为了更好地理解，我们准备了 data_toy 数据集**\n```bash\ncd .\u002Fdata\n\ngit lfs install\ngit clone https:\u002F\u002Fhuggingface.co\u002Fdatasets\u002FPixArt-alpha\u002Fdata_toy\n```\n然后，[这里](https:\u002F\u002Fhuggingface.co\u002Fdatasets\u002FPixArt-alpha\u002Fdata_toy\u002Fblob\u002Fmain\u002Fpart0.txt) 是 partition\u002Fpart0.txt 文件的一个示例。\n\n---\n\n此外，对于基于 JSON 文件指导的 [训练](https:\u002F\u002Fgithub.com\u002FPixArt-alpha\u002FPixArt-alpha\u002Fblob\u002Ffe0cb78065d64c18ecd8955a04e4f29138d47946\u002Fconfigs\u002Fpixart_config\u002FPixArt_xl2_img1024_internalms.py#L3C2-L3C2)，[这里](https:\u002F\u002Fhuggingface.co\u002Fdatasets\u002FPixArt-alpha\u002Fdata_toy\u002Fblob\u002Fmain\u002Fdata_info.json) 提供了一个便于理解的玩具 JSON 文件。\n\n---\n\n## 2. PixArt + DreamBooth 训练\n\n请参考 `PixArt + DreamBooth` 的 [训练指南](asset\u002Fdocs\u002Fpixart-dreambooth.md)\n\n## 3. PixArt + LCM \u002F LCM-LoRA 训练\n\n请参考 `PixArt + LCM` 的 [训练指南](asset\u002Fdocs\u002Fpixart_lcm.md)\n\n## 4. PixArt + ControlNet 训练\n\n请参考 `PixArt + ControlNet` 的 [训练指南](asset\u002Fdocs\u002Fpixart_controlnet.md)\n\n## 5. PixArt + LoRA 训练\n\n```bash\npip install peft==0.6.2\n\naccelerate launch --num_processes=1 --main_process_port=36667  train_scripts\u002Ftrain_pixart_lora_hf.py --mixed_precision=\"fp16\" \\\n  --pretrained_model_name_or_path=PixArt-alpha\u002FPixArt-XL-2-1024-MS \\\n  --dataset_name=lambdalabs\u002Fpokemon-blip-captions --caption_column=\"text\" \\\n  --resolution=1024 --random_flip \\\n  --train_batch_size=16 \\\n  --num_train_epochs=200 --checkpointing_steps=100 \\\n  --learning_rate=1e-06 --lr_scheduler=\"constant\" --lr_warmup_steps=0 \\\n  --seed=42 \\\n  --output_dir=\"pixart-pokemon-model\" \\\n  --validation_prompt=\"cute dragon creature\" --report_to=\"tensorboard\" \\\n  --gradient_checkpointing --checkpoints_total_limit=10 --validation_epochs=5 \\\n  --rank=16\n```
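\n\n训练结束后，可以把 LoRA 权重挂载到 diffusers 管线上进行推理。下面是一个基于 peft 的最小示意（与下文 diffusers 集成部分注释中的加载方式一致；假设 LoRA 输出目录就是上面命令中的 pixart-pokemon-model，实际可用性取决于所安装的 peft\u002Fdiffusers 版本）：\n\n```python\nimport torch\nfrom diffusers import PixArtAlphaPipeline, Transformer2DModel\nfrom peft import PeftModel\n\n# 先加载基础 transformer，再挂载训练得到的 LoRA 权重（路径为示意）\ntransformer = Transformer2DModel.from_pretrained(\n    \"PixArt-alpha\u002FPixArt-XL-2-1024-MS\", subfolder=\"transformer\", torch_dtype=torch.float16\n)\ntransformer = PeftModel.from_pretrained(transformer, \"pixart-pokemon-model\")\n\npipe = PixArtAlphaPipeline.from_pretrained(\n    \"PixArt-alpha\u002FPixArt-XL-2-1024-MS\", transformer=transformer, torch_dtype=torch.float16\n).to(\"cuda\")\n\nimage = pipe(\"cute dragon creature\").images[0]\nimage.save(\".\u002Flora_sample.png\")\n```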
\n\n# 💻 如何测试\n使用本仓库进行推理时，至少需要 `23GB` 的显存；而在 🧨 [diffusers](#using-in--diffusers) 中则只需 `11GB`，经优化后甚至仅需 `8GB`。\n\n目前支持：\n- [x] [IDDPM](https:\u002F\u002Farxiv.org\u002Fabs\u002F2102.09672)\n- [x] [DPM-Solver](https:\u002F\u002Farxiv.org\u002Fabs\u002F2206.00927)\n- [x] [SA-Solver](https:\u002F\u002Farxiv.org\u002Fabs\u002F2309.05019)\n- [ ] [DPM-Solver-v3](https:\u002F\u002Farxiv.org\u002Fabs\u002F2310.13268v2)\n\n## 1. 快速开始使用 [Gradio](https:\u002F\u002Fwww.gradio.app\u002Fguides\u002Fquickstart)\n\n要开始使用，首先安装所需的依赖项，并确保已将 [模型](https:\u002F\u002Fhuggingface.co\u002FPixArt-alpha\u002FPixArt-alpha) 下载到 output\u002Fpretrained_models 文件夹中，然后在本地机器上运行：\n\n```bash\nDEMO_PORT=12345 python app\u002Fapp.py\n```\n\n或者，您可以使用提供的示例 [Dockerfile](Dockerfile) 构建一个运行 Gradio 应用的容器。\n\n```bash\ndocker build . -t pixart\ndocker run --gpus all -it -p 12345:12345 -v \u003Cpath_to_huggingface_cache>:\u002Froot\u002F.cache\u002Fhuggingface pixart\n```\n\n或者使用 docker-compose。请注意，如果您想把应用从 1024 上下文切换为 512 或 LCM 版本，只需更改 docker-compose.yml 文件中的 APP_CONTEXT 环境变量即可，默认值为 1024。\n\n```bash\ndocker compose build\ndocker compose up\n```\n\n启动后，通过 `http:\u002F\u002Fyour-server-ip:12345` 即可访问一个简单的示例。\n\n## 2. 集成到 diffusers 中\n### 1). 在 🧨 diffusers 中使用\n\n请确保您已安装以下库的最新版本：\n\n```bash\npip install -U transformers accelerate diffusers SentencePiece ftfy beautifulsoup4\n```\n\n然后：\n\n```python\nimport torch\nfrom diffusers import PixArtAlphaPipeline, ConsistencyDecoderVAE, AutoencoderKL\n\ndevice = torch.device(\"cuda:0\" if torch.cuda.is_available() else \"cpu\")\n\n# 您也可以将检查点 ID 替换为 \"PixArt-alpha\u002FPixArt-XL-2-512x512\"。\npipe = PixArtAlphaPipeline.from_pretrained(\"PixArt-alpha\u002FPixArt-XL-2-1024-MS\", torch_dtype=torch.float16, use_safetensors=True)\n\n# 如果使用 DALL-E 3 一致性解码器\n# pipe.vae = ConsistencyDecoderVAE.from_pretrained(\"openai\u002Fconsistency-decoder\", torch_dtype=torch.float16)\n\n# 如果使用 SA-Solver 采样器\n# from diffusion.sa_solver_diffusers import SASolverScheduler\n# pipe.scheduler = SASolverScheduler.from_config(pipe.scheduler.config, algorithm_type='data_prediction')\n\n# 如果加载 LoRA 模型\n# transformer = Transformer2DModel.from_pretrained(\"PixArt-alpha\u002FPixArt-LCM-XL-2-1024-MS\", subfolder=\"transformer\", torch_dtype=torch.float16)\n# transformer = PeftModel.from_pretrained(transformer, \"Your-LoRA-Model-Path\")\n# pipe = PixArtAlphaPipeline.from_pretrained(\"PixArt-alpha\u002FPixArt-LCM-XL-2-1024-MS\", transformer=transformer, torch_dtype=torch.float16, use_safetensors=True)\n# del transformer\n\n# 启用内存优化（可选）。\n# pipe.enable_model_cpu_offload()\n\npipe.to(device)\n\n# 提示词建议使用英文（文本编码器为 T5）：撒哈拉沙漠中一棵长着笑脸的小仙人掌\nprompt = \"A small cactus with a happy face in the Sahara desert.\"\nimage = pipe(prompt).images[0]\nimage.save(\".\u002Fcactus.png\")\n```\n更多关于 SA-Solver 采样器的信息，请查看[文档](.\u002Fasset\u002Fdocs\u002Fsasolver.md)。\n\n通过此次集成，可以在 11GB 显存的 GPU 上以批量大小为 4 运行该流程。如需了解更多信息，请参阅[文档](https:\u002F\u002Fhuggingface.co\u002Fdocs\u002Fdiffusers\u002Fmain\u002Fen\u002Fapi\u002Fpipelines\u002Fpixart)。\n\n### 2). 在低于 8GB 显存的 GPU 上运行 `PixArtAlphaPipeline`\n\n现在已支持在 8GB 以下显存的 GPU 上运行，详情请参阅[文档](asset\u002Fdocs\u002Fpixart.md)。\n\n### 3). 使用 diffusers 的 Gradio（更快速）\n\n要开始使用，首先安装所需的依赖项，然后在本地机器上运行：\n\n```bash\n# diffusers 版本\nDEMO_PORT=12345 python app\u002Fapp.py\n```\n启动后，访问 `http:\u002F\u002Fyour-server-ip:12345` 即可看到一个简单的示例。\n\n你也可以点击[这里](https:\u002F\u002Fcolab.research.google.com\u002Fdrive\u002F1jZ5UZXk7tcpTfVwnX33dDuefNMcnW9ME?usp=sharing)在 Google Colab 上免费试用。\n\n### 4). 将 .pth 检查点转换为 diffusers 版本\n\n```bash\npython tools\u002Fconvert_pixart_alpha_to_diffusers.py --image_size your_img_size --multi_scale_train (如果你使用 PixArtMS 则为 True，否则为 False) --orig_ckpt_path pth 文件路径 --dump_path diffusers 文件路径 --only_transformer=True\n```
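\n\n转换完成后，`--dump_path` 指向的目录即可按 diffusers 的常规方式加载。下面是一个示意（假设转换时使用了 --only_transformer=True，且导出目录为 .\u002Fpixart_diffusers；目录结构以转换脚本的实际输出为准，其余组件沿用官方管线）：\n\n```python\nimport torch\nfrom diffusers import PixArtAlphaPipeline, Transformer2DModel\n\n# 加载转换脚本导出的 transformer 权重（路径为示意）\ntransformer = Transformer2DModel.from_pretrained(\".\u002Fpixart_diffusers\", torch_dtype=torch.float16)\n\n# 文本编码器、VAE 与调度器仍复用官方管线\npipe = PixArtAlphaPipeline.from_pretrained(\n    \"PixArt-alpha\u002FPixArt-XL-2-1024-MS\", transformer=transformer, torch_dtype=torch.float16\n).to(\"cuda\")\n\nimage = pipe(\"A small cactus with a happy face in the Sahara desert.\").images[0]\nimage.save(\".\u002Fconverted_sample.png\")\n```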
\n\n## 3. 在线演示 [![Hugging Face PixArt](https:\u002F\u002Fimg.shields.io\u002Fstatic\u002Fv1?label=Demo&message=HuggingFace%20Gradio&color=orange)](https:\u002F\u002Fhuggingface.co\u002Fspaces\u002FPixArt-alpha\u002FPixArt-alpha)\n![在线演示示例](https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002FPixArt-alpha_PixArt-alpha_readme_7d2c042d72a9.png)\n\n# ✏️ 如何使用 LLaVA 进行图像标注\n感谢 [LLaVA-Lightning-MPT](https:\u002F\u002Fhuggingface.co\u002Fliuhaotian\u002FLLaVA-Lightning-MPT-7B-preview) 的代码库，我们可以通过以下命令对 LAION 和 SAM 数据集进行标注：\n```bash\npython tools\u002FVLM_caption_lightning.py --output 输出目录 --data-root 数据根路径 --index 数据 JSON 文件路径\n```\n我们展示了使用自定义提示词对 LAION（左）和 SAM（右）数据集进行自动标注的结果。绿色高亮的文字代表 LAION 中的原始描述，而红色标记的部分则是由 LLaVA 添加的详细描述。\n\n![与 LLaVA 的对话。](https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002FPixArt-alpha_PixArt-alpha_readme_14f604cd4d4e.png)\n\n# ✏️ 如何提取 T5 和 VAE 特征\n\n提前准备好 T5 文本特征和 VAE 图像特征，可以加快训练速度并节省显存。\n```bash\npython tools\u002Fextract_features.py --img_size=1024 \\\n    --json_path \"data\u002Fdata_info.json\" \\\n    --t5_save_root \"data\u002FSA1B\u002Fcaption_feature_wmask\" \\\n    --vae_save_root \"data\u002FSA1B\u002Fimg_vae_features\" \\\n    --pretrained_models_dir \"output\u002Fpretrained_models\" \\\n    --dataset_root \"data\u002FSA1B\u002FImages\u002F\"\n```
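\n\n提取完成后，可以用下面的小脚本粗略验证特征文件（路径对应前文 SAM 数据集的目录结构，sa_xxxxx 文件名为示意；.npz 内的键名以脚本实际输出为准，这里直接枚举而不假设键名）：\n\n```python\nimport numpy as np\n\n# 查看某条样本的 T5 标题特征（.npz 会包含若干命名数组）\ncaption_feat = np.load(\"data\u002FSA1B\u002Fcaption_feature_wmask\u002Fsa_00001.npz\")\nfor key in caption_feat.files:\n    print(\"T5 特征\", key, caption_feat[key].shape)\n\n# 查看对应图像的 VAE 潜空间特征（.npy 为单个数组）\nvae_feat = np.load(\"data\u002FSA1B\u002Fimg_vae_feature\u002Ftrain_vae_256\u002Fnoflip\u002Fsa_00001.npy\")\nprint(\"VAE 特征形状:\", vae_feat.shape)\n```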
\n\n## 💪 待办事项清单（恭喜🎉）\n\n- [x] 推理代码\n- [x] 训练代码\n- [x] T5 & VAE 特征提取代码\n- [x] LLaVA 标注代码\n- [x] 模型库\n- [x] diffusers 版本及 Hugging Face 演示\n- [x] Google Colab 示例\n- [x] DALLE3 VAE 集成\n- [x] 在 8GB 以下显存的 GPU 上使用 diffusers 进行推理\n- [x] DreamBooth 训练代码\n- [x] SA-Solver 代码\n- [x] PixArt-α-LCM\n- [x] 多尺度 VAE 特征提取代码\n- [x] PixArt-α-LCM-LoRA 脚本\n- [x] PixArt-α-LoRA 训练脚本\n- [x] ControlNet 代码\n- [x] SAM-LLaVA 标注数据集\n- [x] ControlNet 检查点\n- [x] 256px 预训练模型\n- [x] PixArt-Σ：下一代性能更强的模型正在训练中！\n\n# 其他资源\n我们制作了一段视频，对比了 PixArt 与当前最强大的文生图模型。\n\n[![观看视频](https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002FPixArt-alpha_PixArt-alpha_readme_f3ddaae26443.jpg)](https:\u002F\u002Fwww.youtube.com\u002Fwatch?v=7_6KsIITgWY)\n\n# 📖 BibTeX\n    @misc{chen2023pixartalpha,\n          title={PixArt-$\\alpha$: Fast Training of Diffusion Transformer for Photorealistic Text-to-Image Synthesis},\n          author={Junsong Chen and Jincheng Yu and Chongjian Ge and Lewei Yao and Enze Xie and Yue Wu and Zhongdao Wang and James Kwok and Ping Luo and Huchuan Lu and Zhenguo Li},\n          year={2023},\n          eprint={2310.00426},\n          archivePrefix={arXiv},\n          primaryClass={cs.CV}\n    }\n    @misc{chen2024pixartdelta,\n          title={PIXART-{\\delta}: Fast and Controllable Image Generation with Latent Consistency Models},\n          author={Junsong Chen and Yue Wu and Simian Luo and Enze Xie and Sayak Paul and Ping Luo and Hang Zhao and Zhenguo Li},\n          year={2024},\n          eprint={2401.05252},\n          archivePrefix={arXiv},\n          primaryClass={cs.CV}\n    }\n\n# 🤗 致谢\n- 感谢 [Diffusers](https:\u002F\u002Fgithub.com\u002Fhuggingface\u002Fdiffusers) 提供的卓越技术支持和精彩合作！\n- 感谢 [Hugging Face](https:\u002F\u002Fgithub.com\u002Fhuggingface) 对精美演示的支持！\n- 感谢 [DiT](https:\u002F\u002Fgithub.com\u002Ffacebookresearch\u002FDiT) 的杰出工作和代码库！\n\n## 星标历史\n\n[![星标历史图表](https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002FPixArt-alpha_PixArt-alpha_readme_31c1ca7cfcfe.png)](https:\u002F\u002Fstar-history.com\u002F#PixArt-alpha\u002FPixArt-alpha&Date)","# PixArt-α 快速上手指南\n\nPixArt-α 是一个基于 Transformer 的文本到图像（T2I）扩散模型，生成质量媲美 SDXL 和 Midjourney，但训练速度极快。本指南将帮助你快速在本地部署并运行该模型。\n\n## 1. 环境准备\n\n### 系统要求\n- **操作系统**: Linux (推荐), Windows, macOS\n- **Python**: 3.9 或更高版本\n- **GPU**: 推荐 NVIDIA GPU，显存至少 8GB (使用 Diffusers 管道可在 8GB 显存下运行)\n- **CUDA**: 根据显卡驱动安装对应的 CUDA 版本 (推荐 11.8 或 12.1)\n\n### 前置依赖\n确保已安装 `git` 和 `conda` (推荐) 或 `pip`。\n\n## 2. 安装步骤\n\n### 方法 A：使用 Conda 创建虚拟环境（推荐）\n\n```bash\n# 创建虚拟环境\nconda create -n pixart python=3.10 -y\nconda activate pixart\n\n# 克隆仓库\ngit clone https:\u002F\u002Fgithub.com\u002FPixArt-alpha\u002FPixArt-alpha.git\ncd PixArt-alpha\n\n# 安装 PyTorch (根据你的 CUDA 版本选择，此处以 CUDA 11.8 为例)\npip install torch torchvision torchaudio --index-url https:\u002F\u002Fdownload.pytorch.org\u002Fwhl\u002Fcu118\n\n# 安装项目依赖\npip install -r requirements.txt\n\n# 安装 diffusers (可选，用于简化推理)\npip install diffusers transformers accelerate safetensors\n```\n\n> **国内加速提示**：如果下载依赖较慢，可使用清华或阿里镜像源：\n> `pip install -r requirements.txt -i https:\u002F\u002Fpypi.tuna.tsinghua.edu.cn\u002Fsimple`\n\n### 方法 B：直接使用 Diffusers (最简方式)\n\n如果你只想进行推理而不需要训练代码，可以直接安装 `diffusers` 库，无需克隆整个仓库：\n\n```bash\npip install diffusers transformers accelerate torch torchvision\n```\n\n## 3. 基本使用\n\n### 方式一：使用 Diffusers 管道（推荐，代码最简洁）\n\n这是最简单的使用方法，支持自动下载模型权重。\n\n```python\nimport torch\nfrom diffusers import PixArtAlphaPipeline\n\n# 加载模型 (首次运行会自动从 Hugging Face 下载权重)\n# 国内用户若无法连接 HF，可设置 mirror 或使用 OpenXLab 镜像\npipe = PixArtAlphaPipeline.from_pretrained(\n    \"PixArt-alpha\u002FPixArt-XL-2-1024-MS\",\n    torch_dtype=torch.float16\n).to(\"cuda\")\n\n# 生成图像（提示词建议使用英文，文本编码器为 T5）\n# 中文意图：一只在赛博朋克城市屋顶上晒太阳的猫，霓虹灯背景，高细节\nprompt = \"A cat basking in the sun on a cyberpunk city rooftop, neon lights, highly detailed\"\nimage = pipe(prompt).images[0]\n\n# 保存结果\nimage.save(\"output.png\")\n```\n\n**注意**：如果网络受限，可以从 OpenXLab 镜像手动下载权重后从本地路径加载，或在加载时指定 `variant=\"fp16\"` 以减小下载体积：\n```python\npipe = PixArtAlphaPipeline.from_pretrained(\n    \"PixArt-alpha\u002FPixArt-XL-2-1024-MS\",\n    torch_dtype=torch.float16,\n    variant=\"fp16\"\n).to(\"cuda\")\n# 或者手动下载权重后指定本地路径\n# pipe = PixArtAlphaPipeline.from_pretrained(\".\u002Flocal_model_path\", torch_dtype=torch.float16).to(\"cuda\")\n```\n\n### 方式二：使用官方仓库脚本推理\n\n如果你克隆了完整仓库，可以使用官方提供的推理脚本。\n\n1. **下载模型权重**：\n   从 [HuggingFace](https:\u002F\u002Fhuggingface.co\u002FPixArt-alpha\u002FPixArt-XL-2-1024-MS) 或 [OpenXLab](https:\u002F\u002Fopenxlab.org.cn\u002Fmodels\u002Fdetail\u002FPixArt-alpha\u002FPixArt-XL-2-1024-MS) 下载预训练权重到 `pretrained_models` 目录。\n\n2. **运行推理命令**：
\n   ```bash\n   python scripts\u002Finference.py \\\n       --model_path pretrained_models\u002FPixArt-XL-2-1024-MS.pth \\\n       --image_size 1024 \\\n       --prompt \"A small cactus with a happy face in the Sahara desert.\" \\\n       --sample_solver sa_solver \\\n       --guidance_scale 7.5 \\\n       --seed 0\n   ```\n\n### 低显存优化 (8GB VRAM)\n\n如果你的显存较小，在使用 Diffusers 时可以通过启用 `enable_attention_slicing` 和 `enable_vae_slicing` 来降低显存占用：\n\n```python\npipe.enable_attention_slicing()\npipe.enable_vae_slicing()\n# 如果仍显存不足，可在保持 torch.float16 的前提下启用 CPU offload\n# pipe.enable_model_cpu_offload()\n```\n\n现在你可以开始使用 PixArt-α 生成高质量的图像了！更多高级功能（如 ControlNet、LCM 加速、LoRA 训练）请参考项目仓库中的详细文档。","一家独立游戏开发团队正在为即将上线的奇幻 RPG 项目紧急制作大量高分辨率概念图，以统一美术风格并加速资产生产。\n\n### 没有 PixArt-alpha 时\n- **训练成本高昂**：团队若想微调模型以匹配独特画风，需耗费数周时间在昂贵 GPU 集群上训练传统扩散模型。\n- **生成速度缓慢**：使用现有开源模型生成单张 1024x1024 高清图需数十秒，难以满足快速迭代需求。\n- **细节表现不足**：生成的复杂场景（如光影交错的城堡）常出现结构扭曲或纹理模糊，缺乏照片级真实感。\n- **资源门槛过高**：高性能推理依赖顶级显卡，导致普通开发者的本地机器无法流畅运行。\n\n### 使用 PixArt-alpha 后\n- **训练效率飞跃**：借助 Diffusion Transformer 架构，团队仅需少量数据和数天即可在单卡上完成特定风格的高效微调。\n- **极速高清输出**：利用其优化的采样策略，生成同等分辨率图像的时间缩短至几秒，大幅提升了试错频率。\n- **画质显著提升**：模型对文本提示的理解更精准，能稳定输出光影自然、细节丰富的照片级奇幻场景。\n- **部署更加灵活**：得益于高效的推理性能，美术人员可直接在配置普通的工作站上实时预览和修改生成结果。\n\nPixArt-alpha 通过突破性的训练速度与卓越的成像质量，让中小团队也能以低成本实现电影级的视觉资产创作。","https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002FPixArt-alpha_PixArt-alpha_657cc1b8.png","PixArt","https:\u002F\u002Foss.gittoolsai.com\u002Favatars\u002FPixArt-alpha_09f5cd23.jpg","",null,"xie.enze@huawei.com","https:\u002F\u002Fpixart-alpha.github.io\u002F","https:\u002F\u002Fgithub.com\u002FPixArt-alpha",[81,85,89,93,96],{"name":82,"color":83,"percentage":84},"Python","#3572A5",82.7,{"name":86,"color":87,"percentage":88},"Jupyter Notebook","#DA5B0B",17.2,{"name":90,"color":91,"percentage":92},"Shell","#89e051",0.1,{"name":94,"color":95,"percentage":92},"Dockerfile","#384d54",{"name":97,"color":98,"percentage":99},"CSS","#663399",0,3292,202,"2026-04-18T03:29:26","Apache-2.0","Linux, Windows","需要 NVIDIA GPU。官方支持在 8GB 显存下运行（通过 Diffusers），推荐更高显存以支持高分辨率生成或训练。","未说明",{"notes":108,"python":106,"dependencies":109},"该工具基于 PyTorch 和 Diffusers 库。支持多种运行方式：包括原生 PyTorch 代码、Hugging Face Diffusers 管道、ComfyUI 插件以及 Google Colab 免费试用。官方文档特别指出，使用 Diffusers 集成可在 8GB 显存的 GPU 上运行推理。训练和高分辨率生成可能需要更多显存。项目提供了预训练权重、ControlNet 扩展及 LCM 加速版本。",[110,111,112,113,114,115],"torch","diffusers","transformers","accelerate","gradio","ComfyUI (可选)",[15],"2026-03-27T02:49:30.150509","2026-04-20T04:07:08.560763",[],[]]